arnoldi_vector.F
!--------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                               !
!   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                    !
!                                                                                                    !
!   SPDX-License-Identifier: GPL-2.0-or-later                                                        !
!--------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \brief operations for skinny matrices/vectors expressed in dbcsr form
!> \par History
!>      2014.10 created [Florian Schiffmann]
!>      2023.12 Removed support for single-precision [Ole Schuett]
!>      2024.12 Removed support for complex input matrices [Ole Schuett]
!> \author Florian Schiffmann
! **************************************************************************************************
MODULE arnoldi_vector
   USE cp_dbcsr_api, ONLY: &
      dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &
      dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_data_p, dbcsr_get_info, &
      dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, dbcsr_iterator_start, &
      dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_release, dbcsr_reserve_all_blocks, &
      dbcsr_set, dbcsr_type, dbcsr_type_antisymmetric, dbcsr_type_no_symmetry, dbcsr_type_symmetric
   USE kinds, ONLY: dp, &
                    real_8
   USE message_passing, ONLY: mp_comm_type
#include "../base/base_uses.f90"

!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads

   IMPLICIT NONE

   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'arnoldi_vector'

! **************************************************************************************************
!> \brief Types needed for the hashtable.
! **************************************************************************************************
   TYPE ele_type
      INTEGER :: c = 0
      INTEGER :: p = 0
   END TYPE ele_type

   TYPE hash_table_type
      TYPE(ele_type), DIMENSION(:), POINTER :: table => null()
      INTEGER :: nele = 0
      INTEGER :: nmax = 0
      INTEGER :: prime = 0
   END TYPE hash_table_type

! **************************************************************************************************
!> \brief Types needed for fast access to distributed dbcsr vectors.
! **************************************************************************************************
   TYPE block_ptr
      REAL(real_8), DIMENSION(:, :), POINTER :: ptr => null()
      INTEGER :: assigned_thread = -1
   END TYPE block_ptr

   TYPE fast_vec_access_type
      TYPE(hash_table_type) :: hash_table = hash_table_type()
      TYPE(block_ptr), DIMENSION(:), ALLOCATABLE :: blk_map
   END TYPE
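
! The fast_vec_access_type gives O(1) access to a vector block by its block-row (or
! block-col) index: the hash table maps the block index to a slot in blk_map, which
! holds a pointer into the dbcsr block data. A minimal lookup sketch (my annotation,
! with illustrative variable names):
!
!    iblock = hash_table_get(fast_vec%hash_table, row)   ! 0 if the block is absent
!    IF (ASSOCIATED(fast_vec%blk_map(iblock)%ptr)) THEN
!       block => fast_vec%blk_map(iblock)%ptr            ! direct pointer to the block data
!    END IF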

   PUBLIC :: create_col_vec_from_matrix, &
             create_row_vec_from_matrix, &
             create_replicated_col_vec_from_matrix, &
             create_replicated_row_vec_from_matrix, &
             dbcsr_matrix_colvec_multiply

CONTAINS

! **************************************************************************************************
!> \brief creates a dbcsr col vector like object which lives on proc_col 0
!>        and has the same row dist as the template matrix
!>        the returned matrix is fully allocated and all blocks are set to 0
!>        this is not a sparse object (and must never be)
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param ncol number of vectors in the dbcsr object (1 for vector, n for skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      TYPE(dbcsr_type) :: dbcsr_vec, matrix
      INTEGER :: ncol

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'

      INTEGER :: handle, npcols
      INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
                                        row_dist
      TYPE(dbcsr_distribution_type) :: dist, dist_col_vec

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)

      ALLOCATE (col_dist(1), col_blk_size(1))
      col_dist(1) = 0
      col_blk_size(1) = ncol
      CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, col_blk_size=col_blk_size)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_col_vec)
      DEALLOCATE (col_dist, col_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_col_vec_from_matrix
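
! A minimal usage sketch for the creation routines (my annotation; "mat" is an already
! initialized dbcsr matrix and all names are illustrative). The vector inherits its row
! distribution and row block sizes from the template matrix:
!
!    TYPE(dbcsr_type) :: mat, x
!    CALL create_col_vec_from_matrix(x, mat, 1)   ! column vector with a single column
!    CALL dbcsr_set(x, 1.0_real_8)                ! fully allocated, so this touches all blocks
!    ! ... use x ...
!    CALL dbcsr_release(x)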

! **************************************************************************************************
!> \brief creates a dbcsr row vector like object which lives on proc_row 0
!>        and has the same col dist as the template matrix
!>        the returned matrix is fully allocated and all blocks are set to 0
!>        this is not a sparse object (and must never be)
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param nrow number of vectors in the dbcsr object (1 for vector, n for skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
      TYPE(dbcsr_type) :: dbcsr_vec, matrix
      INTEGER :: nrow

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'

      INTEGER :: handle, nprows
      INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
                                        row_dist
      TYPE(dbcsr_distribution_type) :: dist, dist_row_vec

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, col_blk_size=col_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)

      ALLOCATE (row_dist(1), row_blk_size(1))
      row_dist(1) = 0
      row_blk_size(1) = nrow
      CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, col_blk_size=col_blk_size)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_row_vec)
      DEALLOCATE (row_dist, row_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_row_vec_from_matrix

! **************************************************************************************************
!> \brief creates a col vector like object whose blocks can be replicated
!>        along the processor row and has the same row dist as the template matrix
!>        the returned matrix is fully allocated and all blocks are set to 0
!>        this is not a sparse object (and must never be)
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param ncol number of vectors in the dbcsr object (1 for vector, n for skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      TYPE(dbcsr_type) :: dbcsr_vec, matrix
      INTEGER :: ncol

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'

      INTEGER :: handle, i, npcols
      INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
                                        row_dist
      TYPE(dbcsr_distribution_type) :: dist, dist_col_vec

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)

      ALLOCATE (col_dist(npcols), col_blk_size(npcols))
      col_blk_size(:) = ncol
      DO i = 0, npcols - 1
         col_dist(i + 1) = i
      END DO
      CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, col_blk_size=col_blk_size)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_col_vec)
      DEALLOCATE (col_dist, col_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_replicated_col_vec_from_matrix

! **************************************************************************************************
!> \brief creates a row vector like object whose blocks can be replicated
!>        along the processor col and has the same col dist as the template matrix
!>        the returned matrix is fully allocated and all blocks are set to 0
!>        this is not a sparse object (and must never be)
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param nrow number of vectors in the dbcsr object (1 for vector, n for skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
      TYPE(dbcsr_type) :: dbcsr_vec, matrix
      INTEGER :: nrow

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'

      INTEGER :: handle, i, nprows
      INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
                                        row_dist
      TYPE(dbcsr_distribution_type) :: dist, dist_row_vec

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size)
      CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)

      ALLOCATE (row_dist(nprows), row_blk_size(nprows))
      row_blk_size(:) = nrow
      DO i = 0, nprows - 1
         row_dist(i + 1) = i
      END DO
      CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, col_blk_size=col_blk_size)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_row_vec)
      DEALLOCATE (row_dist, row_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_replicated_row_vec_from_matrix

! **************************************************************************************************
!> \brief given a column vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
      TYPE(dbcsr_type) :: vec
      TYPE(fast_vec_access_type) :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'

      INTEGER :: col, handle, iblock, nblk_local, &
                 nthreads, row
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
      TYPE(dbcsr_iterator_type) :: iter

      CALL timeset(routineN, handle)

      ! figure out the number of threads
      nthreads = 1
!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
!$OMP MASTER
!$ nthreads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL

      CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
      ! 4 times makes sure the table is big enough to limit collisions.
      CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
      ! include zero for effective dealing with values not in the hash table (will return 0)
      ALLOCATE (fast_vec_access%blk_map(0:nblk_local))

      ! sanity check: a column vector must have a single block column
      CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
      IF (col .GT. 1) CPABORT("BUG")

      ! go through the blocks of the vector
      iblock = 0
      CALL dbcsr_iterator_start(iter, vec)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
         iblock = iblock + 1
         CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
         fast_vec_access%blk_map(iblock)%ptr => vec_bl
         fast_vec_access%blk_map(iblock)%assigned_thread = mod(iblock, nthreads)
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL timestop(handle)

   END SUBROUTINE create_fast_col_vec_access

! **************************************************************************************************
!> \brief given a row vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
      TYPE(dbcsr_type) :: vec
      TYPE(fast_vec_access_type) :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'

      INTEGER :: col, handle, iblock, nblk_local, &
                 nthreads, row
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
      TYPE(dbcsr_iterator_type) :: iter

      CALL timeset(routineN, handle)

      ! figure out the number of threads
      nthreads = 1
!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
!$OMP MASTER
!$ nthreads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL

      CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
      ! 4 times makes sure the table is big enough to limit collisions.
      CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
      ! include zero for effective dealing with values not in the hash table (will return 0)
      ALLOCATE (fast_vec_access%blk_map(0:nblk_local))

      ! sanity check: a row vector must have a single block row
      CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
      IF (row .GT. 1) CPABORT("BUG")

      ! go through the blocks of the vector
      iblock = 0
      CALL dbcsr_iterator_start(iter, vec)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
         iblock = iblock + 1
         CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
         fast_vec_access%blk_map(iblock)%ptr => vec_bl
         fast_vec_access%blk_map(iblock)%assigned_thread = mod(iblock, nthreads)
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL timestop(handle)

   END SUBROUTINE create_fast_row_vec_access

! **************************************************************************************************
!> \brief release all memory associated with the fast_vec_access type
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE release_fast_vec_access(fast_vec_access)
      TYPE(fast_vec_access_type) :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'

      INTEGER :: handle

      CALL timeset(routineN, handle)

      CALL hash_table_release(fast_vec_access%hash_table)
      DEALLOCATE (fast_vec_access%blk_map)

      CALL timestop(handle)

   END SUBROUTINE release_fast_vec_access

! --------------------------------------------------------------------------------------------------
! Beginning of hashtable.
! This file can be 'INCLUDE'ed verbatim in various places, where it needs to be
! part of the module to guarantee inlining.
! It hashes (c,p) pairs, where p is assumed to be > 0;
! on lookup, 0 is returned as a flag for 'not present'.
!
! **************************************************************************************************
!> \brief finds a prime equal to or larger than i, needed at creation
!> \param i ...
!> \return ...
! **************************************************************************************************
   FUNCTION matching_prime(i) RESULT(res)
      INTEGER, INTENT(IN) :: i
      INTEGER :: res

      INTEGER :: j

      res = i
      j = 0
      DO WHILE (j < res)
         DO j = 2, res - 1
            IF (mod(res, j) == 0) THEN
               res = res + 1
               EXIT
            END IF
         END DO
      END DO
   END FUNCTION
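
! A short worked example of matching_prime (my annotation): for i = 15, trial division
! finds 15 = 3*5 and bumps res to 16, then 16 = 2*8 bumps res to 17; for 17 the inner
! loop runs to completion, leaving j = res, so the outer DO WHILE exits and the function
! returns 17. Hence matching_prime(15) = 17, which is exactly how hash_table_create
! derives its prime from nmax = 2**j - 1.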

! **************************************************************************************************
!> \brief create a hash_table of given initial size.
!>        the hash table will expand as needed (but this requires rehashing)
!> \param hash_table ...
!> \param table_size ...
! **************************************************************************************************
   SUBROUTINE hash_table_create(hash_table, table_size)
      TYPE(hash_table_type) :: hash_table
      INTEGER, INTENT(IN) :: table_size

      INTEGER :: j

      ! guarantee a minimal hash table size (8), so that expansion works

      j = 3
      DO WHILE (2**j - 1 < table_size)
         j = j + 1
      END DO
      hash_table%nmax = 2**j - 1
      hash_table%prime = matching_prime(hash_table%nmax)
      hash_table%nele = 0
      ALLOCATE (hash_table%table(0:hash_table%nmax))
   END SUBROUTINE hash_table_create
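
! Sizing example (my annotation): table_size = 10 gives j = 4, so nmax = 2**4 - 1 = 15
! (slots 0..15) and prime = matching_prime(15) = 17. Because nmax + 1 is a power of two,
! the index computation iand(c*prime, nmax) used below is equivalent to
! mod(c*prime, nmax + 1), i.e. a cheap bit mask.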

! **************************************************************************************************
!> \brief ...
!> \param hash_table ...
! **************************************************************************************************
   SUBROUTINE hash_table_release(hash_table)
      TYPE(hash_table_type) :: hash_table

      hash_table%nmax = 0
      hash_table%nele = 0
      DEALLOCATE (hash_table%table)

   END SUBROUTINE hash_table_release

! **************************************************************************************************
!> \brief add a pair (c,p) to the hash table
!> \param hash_table ...
!> \param c this value is being hashed
!> \param p this is being stored
! **************************************************************************************************
   RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
      TYPE(hash_table_type), INTENT(INOUT) :: hash_table
      INTEGER, INTENT(IN) :: c, p

      REAL(kind=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
                                      inv_hash_table_fill = 2.5_real_8

      INTEGER :: i, j
      TYPE(ele_type), ALLOCATABLE, DIMENSION(:) :: tmp_hash

      ! if too small, make a copy and rehash in a larger table

      IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
         ALLOCATE (tmp_hash(lbound(hash_table%table, 1):ubound(hash_table%table, 1)))
         tmp_hash(:) = hash_table%table
         CALL hash_table_release(hash_table)
         CALL hash_table_create(hash_table, int((ubound(tmp_hash, 1) + 8)*hash_table_expand))
         DO i = lbound(tmp_hash, 1), ubound(tmp_hash, 1)
            IF (tmp_hash(i)%c .NE. 0) THEN
               CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
            END IF
         END DO
         DEALLOCATE (tmp_hash)
      END IF

      hash_table%nele = hash_table%nele + 1
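      ! Open addressing with linear probing (my annotation): the mask iand(..., nmax)
      ! works because nmax = 2**j - 1; the two DO loops below scan from the home slot
      ! to the end of the table and then wrap around from slot 0.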
      i = iand(c*hash_table%prime, hash_table%nmax)

      DO j = i, hash_table%nmax
         IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
            hash_table%table(j)%c = c
            hash_table%table(j)%p = p
            RETURN
         END IF
      END DO
      DO j = 0, i - 1
         IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
            hash_table%table(j)%c = c
            hash_table%table(j)%p = p
            RETURN
         END IF
      END DO

   END SUBROUTINE hash_table_add

! **************************************************************************************************
!> \brief ...
!> \param hash_table ...
!> \param c ...
!> \return ...
! **************************************************************************************************
   PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
      TYPE(hash_table_type), INTENT(IN) :: hash_table
      INTEGER, INTENT(IN) :: c
      INTEGER :: p

      INTEGER :: i, j

      i = iand(c*hash_table%prime, hash_table%nmax)

      ! catch the likely case first
      IF (hash_table%table(i)%c == c) THEN
         p = hash_table%table(i)%p
         RETURN
      END IF

      DO j = i, hash_table%nmax
         IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
            p = hash_table%table(j)%p
            RETURN
         END IF
      END DO
      DO j = 0, i - 1
         IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
            p = hash_table%table(j)%p
            RETURN
         END IF
      END DO

      ! we should never reach this point.
      p = huge(p)

   END FUNCTION hash_table_get
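
! A minimal usage sketch for the hashtable (my annotation; variables are illustrative):
!
!    TYPE(hash_table_type) :: ht
!    CALL hash_table_create(ht, 16)     ! rounds up to nmax = 2**j - 1 >= 16
!    CALL hash_table_add(ht, 42, 7)     ! store p = 7 under key c = 42
!    iblock = hash_table_get(ht, 42)    ! -> 7
!    iblock = hash_table_get(ht, 99)    ! -> 0, i.e. 'not present'
!    CALL hash_table_release(ht)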

! End of hashtable
! --------------------------------------------------------------------------------------------------

! **************************************************************************************************
!> \brief the real driver routine for the multiply, not all symmetries implemented yet
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_matrix_colvec_multiply(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      REAL(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col

      CHARACTER :: matrix_type

      CALL dbcsr_get_info(matrix, matrix_type=matrix_type)

      SELECT CASE (matrix_type)
      CASE (dbcsr_type_no_symmetry)
         CALL dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      CASE (dbcsr_type_symmetric)
         CALL dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      CASE (dbcsr_type_antisymmetric)
         ! Not yet implemented; should mainly be some prefactor magic, but it is unclear how antisymmetric matrices are stored
         CPABORT("NYI, antisymmetric matrix not permitted")
      CASE DEFAULT
         CPABORT("Unknown matrix type, ...")
      END SELECT

   END SUBROUTINE dbcsr_matrix_colvec_multiply
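
! A minimal end-to-end sketch of computing vec_out = alpha*matrix*vec_in + beta*vec_out
! (my annotation; "mat" is an existing dbcsr matrix and all names are illustrative).
! The replicated work vectors carry the communication pattern and can be reused across
! repeated multiplies, e.g. within an Arnoldi iteration:
!
!    TYPE(dbcsr_type) :: mat, x, y, work_row, work_col
!    CALL create_col_vec_from_matrix(x, mat, 1)
!    CALL create_col_vec_from_matrix(y, mat, 1)
!    CALL create_replicated_row_vec_from_matrix(work_row, mat, 1)
!    CALL create_replicated_col_vec_from_matrix(work_col, mat, 1)
!    ! ... fill x ...
!    CALL dbcsr_matrix_colvec_multiply(mat, x, y, 1.0_real_8, 0.0_real_8, work_row, work_col)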

! **************************************************************************************************
!> \brief low level routines for matrix vector multiplies
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      REAL(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'

      INTEGER :: col, handle, handle1, ithread, mypcol, &
                 myprow, ncols, nrows, pcol, prow, &
                 prow_handle, row
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
      TYPE(dbcsr_distribution_type) :: dist
      TYPE(dbcsr_iterator_type) :: iter
      TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row
      TYPE(mp_comm_type) :: prow_group

      CALL timeset(routineN, handle)
      ithread = 0

      ! Collect some data about the parallel environment. We will use it later to move the vector around
      CALL dbcsr_get_info(matrix, distribution=dist)
      CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)

      CALL create_fast_row_vec_access(work_row, fast_vec_row)
      CALL create_fast_col_vec_access(work_col, fast_vec_col)

      ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
      CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)

      ! Set the work vector for the results to 0
      CALL dbcsr_set(work_col, 0.0_real_8)

      ! Perform the local multiply. Here we exploit that the blocks are replicated on the MPI processes.
      ! Note that the input and result vectors are distributed differently (row wise and col wise, respectively)
      CALL timeset(routineN//"_local_mm", handle1)

!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
!$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
!$ ithread = omp_get_thread_num()
      CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_d)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (fast_vec_col%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
         pcol = hash_table_get(fast_vec_row%hash_table, col)
         IF (SIZE(fast_vec_col%blk_map(prow)%ptr, 1) .EQ. 0 .OR. &
             SIZE(fast_vec_col%blk_map(prow)%ptr, 2) .EQ. 0 .OR. &
             SIZE(data_d, 2) .EQ. 0) CYCLE
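         ! block update work_col(row) += matrix(row,col) * work_row(col)^T; the row-vector
         ! block stores the vector transposed, hence transb = 'T' (my annotation)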
         CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
                    SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
                    SIZE(data_d, 2), &
                    1.0_dp, &
                    data_d, &
                    SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
                    fast_vec_row%blk_map(pcol)%ptr, &
                    SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
                    1.0_dp, &
                    fast_vec_col%blk_map(prow)%ptr, &
                    SIZE(fast_vec_col%blk_map(prow)%ptr, 1))
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

      CALL timestop(handle1)

      ! sum all the data onto the first processor col where the original vector is stored
      data_vec => dbcsr_get_data_p(work_col)
      CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
      CALL prow_group%sum(data_vec(1:nrows*ncols))

      ! Local copy on the first mpi col (as this is the location of the vec_res blocks) of the result vector
      ! from the replicated to the original vector. Let's play it safe and use the iterator
      CALL dbcsr_iterator_start(iter, vec_out)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
            vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
         ELSE
            vec_res(:, :) = beta*vec_res(:, :)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL release_fast_vec_access(fast_vec_row)
      CALL release_fast_vec_access(fast_vec_col)

      CALL timestop(handle)

   END SUBROUTINE dbcsr_matrix_vector_mult

! **************************************************************************************************
!> \brief ...
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
!> \param skip_diag ...
! **************************************************************************************************
   SUBROUTINE dbcsr_matrixt_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      REAL(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col
      LOGICAL :: skip_diag

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'

      INTEGER :: col, col_size, handle, handle1, ithread, &
                 mypcol, myprow, ncols, nrows, pcol, &
                 pcol_handle, prow, prow_handle, row, &
                 row_size
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
      TYPE(dbcsr_distribution_type) :: dist
      TYPE(dbcsr_iterator_type) :: iter
      TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row
      TYPE(mp_comm_type) :: pcol_group, prow_group

      CALL timeset(routineN, handle)
      ithread = 0

      ! Collect some data about the parallel environment. We will use it later to move the vector around
      CALL dbcsr_get_info(matrix, distribution=dist)
      CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)
      CALL pcol_group%set_handle(pcol_handle)

      CALL create_fast_row_vec_access(work_row, fast_vec_row)
      CALL create_fast_col_vec_access(work_col, fast_vec_col)

      ! Set the work vector for the results to 0
      CALL dbcsr_set(work_row, 0.0_real_8)

      ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
      CALL dbcsr_iterator_start(iter, vec_in)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         fast_vec_col%blk_map(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
      END DO
      CALL dbcsr_iterator_stop(iter)
      ! Replicate the data on all processors in the row
      data_vec => dbcsr_get_data_p(work_col)
      CALL prow_group%bcast(data_vec, 0)

      ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
      CALL timeset(routineN//"local_mm", handle1)
      CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
!$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
!$ ithread = omp_get_thread_num()
      CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
         IF (skip_diag .AND. col == row) CYCLE
         prow = hash_table_get(fast_vec_col%hash_table, row)
         pcol = hash_table_get(fast_vec_row%hash_table, col)
         IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
             ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
            IF (fast_vec_row%blk_map(pcol)%assigned_thread .NE. ithread) CYCLE
            fast_vec_row%blk_map(pcol)%ptr = fast_vec_row%blk_map(pcol)%ptr + &
                                             matmul(transpose(fast_vec_col%blk_map(prow)%ptr), data_d)
         ELSE
            prow = hash_table_get(fast_vec_row%hash_table, row)
            pcol = hash_table_get(fast_vec_col%hash_table, col)
            IF (fast_vec_row%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
            fast_vec_row%blk_map(prow)%ptr = fast_vec_row%blk_map(prow)%ptr + &
                                             matmul(transpose(fast_vec_col%blk_map(pcol)%ptr), transpose(data_d))
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

      CALL timestop(handle1)

      ! sum all the data within a processor column to obtain the replicated result
      data_vec => dbcsr_get_data_p(work_row)
      ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
      CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
      CALL pcol_group%sum(data_vec(1:nrows*ncols))

      ! Convert the result to a column wise distribution
      CALL dbcsr_rep_row_to_rep_col_vec(work_col, work_row, fast_vec_row)

      ! Create the final vector by summing it into the result vector which lives on proc_col 0
      CALL dbcsr_iterator_start(iter, vec_out)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
            vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
         ELSE
            vec_res(:, :) = beta*vec_res(:, :)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL timestop(handle)

   END SUBROUTINE dbcsr_matrixt_vector_mult

! **************************************************************************************************
!> \brief ...
!> \param vec_in ...
!> \param rep_col_vec ...
!> \param rep_row_vec ...
!> \param fast_vec_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_col_vec_to_rep_row(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
      TYPE(dbcsr_type) :: vec_in, rep_col_vec, rep_row_vec
      TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'

      INTEGER :: col, handle, mypcol, myprow, ncols, &
                 nrows, pcol_handle, prow_handle, row
      INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec, data_vec_rep
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_row
      TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
      TYPE(dbcsr_iterator_type) :: iter
      TYPE(mp_comm_type) :: pcol_group, prow_group

      CALL timeset(routineN, handle)

      ! get information about the parallel environment
      CALL dbcsr_get_info(vec_in, distribution=dist_in)
      CALL dbcsr_distribution_get(dist_in, &
                                  prow_group=prow_handle, &
                                  pcol_group=pcol_handle, &
                                  myprow=myprow, &
                                  mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)
      CALL pcol_group%set_handle(pcol_handle)

      ! Get the vector which tells us which blocks are local to which processor row in the col vec
      CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
      CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)

      ! Copy the local vector to the replicated one on the first processor column (this is where vec_in lives)
      CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
      data_vec_rep => dbcsr_get_data_p(rep_col_vec)
      data_vec => dbcsr_get_data_p(vec_in)
      IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
      ! Replicate the data along the row
      CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)

      ! Here it gets a bit tricky, as we are dealing with two different parallel layouts:
      ! The rep_col_vec contains all blocks local to the row distribution of the vector.
      ! The rep_row_vec only needs the fraction which is local to the col distribution.
      ! However, in most cases this won't be the complete set of blocks obtainable from col_vector p_row i.
      ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec is available
      ! on every replica in the processor column; by summing along the column we end up with the complete vector everywhere.
      ! Hope this clarifies the idea
      CALL dbcsr_set(rep_row_vec, 0.0_real_8)
      CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
      CALL dbcsr_iterator_start(iter, rep_row_vec)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
         IF (row_dist(col) == myprow) THEN
            vec_row = transpose(fast_vec_col%blk_map(hash_table_get(fast_vec_col%hash_table, col))%ptr)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
      data_vec_rep => dbcsr_get_data_p(rep_row_vec)
      CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))

      CALL timestop(handle)

   END SUBROUTINE dbcsr_col_vec_to_rep_row

! **************************************************************************************************
!> \brief ...
!> \param rep_col_vec ...
!> \param rep_row_vec ...
!> \param fast_vec_row ...
!> \param fast_vec_col_add ...
! **************************************************************************************************
   SUBROUTINE dbcsr_rep_row_to_rep_col_vec(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
      TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
      TYPE(fast_vec_access_type) :: fast_vec_row
      TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'

      INTEGER :: col, handle, mypcol, myprow, ncols, &
                 nrows, prow_handle, row
      INTEGER, DIMENSION(:), POINTER :: col_dist
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec_rep
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_col
      TYPE(dbcsr_distribution_type) :: dist_rep_col, dist_rep_row
      TYPE(dbcsr_iterator_type) :: iter
      TYPE(mp_comm_type) :: prow_group

      CALL timeset(routineN, handle)

      ! get information about the parallel environment
      CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
      CALL dbcsr_distribution_get(dist_rep_col, &
                                  prow_group=prow_handle, &
                                  myprow=myprow, &
                                  mypcol=mypcol)

      CALL prow_group%set_handle(prow_handle)

      ! Get the vector which tells us which blocks are local to which processor col in the row vec
      CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
      CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)

      ! The same trick as described above, in the opposite direction
      CALL dbcsr_set(rep_col_vec, 0.0_real_8)
      CALL dbcsr_iterator_start(iter, rep_col_vec)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
         IF (col_dist(row) == mypcol) THEN
            vec_col = transpose(fast_vec_row%blk_map(hash_table_get(fast_vec_row%hash_table, row))%ptr)
         END IF
         ! this one is special: it allows adding the elements of a not yet summed replicated
         ! column vector, as it appears as the result of M*V(row_rep). This saves a parallel summation in the symmetric case
         IF (PRESENT(fast_vec_col_add)) THEN
            vec_col = vec_col + fast_vec_col_add%blk_map(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
      data_vec_rep => dbcsr_get_data_p(rep_col_vec)
      CALL prow_group%sum(data_vec_rep(1:nrows*ncols))

      CALL timestop(handle)

   END SUBROUTINE dbcsr_rep_row_to_rep_col_vec

! **************************************************************************************************
!> \brief ...
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
   SUBROUTINE dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
      TYPE(dbcsr_type) :: matrix, vec_in, vec_out
      REAL(kind=real_8) :: alpha, beta
      TYPE(dbcsr_type) :: work_row, work_col

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_matrix_vector_mult'

      INTEGER :: col, handle, handle1, ithread, mypcol, &
                 myprow, ncols, nrows, pcol, &
                 pcol_handle, prow, row, rpcol, rprow, &
                 vec_dim
      REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
      REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
      TYPE(dbcsr_distribution_type) :: dist
      TYPE(dbcsr_iterator_type) :: iter
      TYPE(dbcsr_type) :: result_col, result_row
      TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row, &
                                    res_fast_vec_col, res_fast_vec_row
      TYPE(mp_comm_type) :: pcol_group

      CALL timeset(routineN, handle)
      ithread = 0
      ! We need some work matrices, as we try to exploit operations on the replicated vectors which would otherwise be duplicated
      CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
      ! This is a performance hack, as creating a new replicated vector is a fair bit more expensive
      CALL dbcsr_set(work_col, 0.0_real_8)
      CALL dbcsr_copy(result_col, work_col)
      CALL dbcsr_set(work_row, 0.0_real_8)
      CALL dbcsr_copy(result_row, work_row)

      ! Collect some data about the parallel environment. We will use it later to move the vector around
      CALL dbcsr_get_info(matrix=matrix, distribution=dist)
      CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)

      CALL pcol_group%set_handle(pcol_handle)

      CALL create_fast_row_vec_access(work_row, fast_vec_row)
      CALL create_fast_col_vec_access(work_col, fast_vec_col)
      CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
      CALL create_fast_col_vec_access(result_col, res_fast_vec_col)

      ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
      ! (the routine delivers both the replicated row and column vector)
      CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)

      ! Perform the local multiply. Here we exploit that the blocks are replicated on the MPI processes.
      ! Note that the input and result vectors are distributed differently (row wise and col wise, respectively)
      CALL timeset(routineN//"_local_mm", handle1)

      !------ perform the multiplication; we have to take care to pick the correct blocks ----------

!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
!$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
!$ ithread = omp_get_thread_num()
      CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_d)
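         ! Only one triangle of the symmetric matrix is stored, so each stored block
         ! (row,col) contributes twice (my annotation): to the result at 'row' via
         ! M(row,col)*x(col) and, off the diagonal, to the result at 'col' via
         ! M(row,col)^T*x(row). The ELSE branch handles blocks whose replicated
         ! work-vector counterparts are associated in the opposite layout.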
         pcol = hash_table_get(fast_vec_row%hash_table, col)
         rprow = hash_table_get(res_fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
             ASSOCIATED(res_fast_vec_col%blk_map(rprow)%ptr)) THEN
            IF (res_fast_vec_col%blk_map(rprow)%assigned_thread .EQ. ithread) THEN
               res_fast_vec_col%blk_map(rprow)%ptr = res_fast_vec_col%blk_map(rprow)%ptr + &
                                                     matmul(data_d, transpose(fast_vec_row%blk_map(pcol)%ptr))
            END IF
            prow = hash_table_get(fast_vec_col%hash_table, row)
            rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
            IF (res_fast_vec_row%blk_map(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
               res_fast_vec_row%blk_map(rpcol)%ptr = res_fast_vec_row%blk_map(rpcol)%ptr + &
                                                     matmul(transpose(fast_vec_col%blk_map(prow)%ptr), data_d)
            END IF
         ELSE
            rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
            prow = hash_table_get(fast_vec_row%hash_table, row)
            IF (res_fast_vec_col%blk_map(rpcol)%assigned_thread .EQ. ithread) THEN
               res_fast_vec_col%blk_map(rpcol)%ptr = res_fast_vec_col%blk_map(rpcol)%ptr + &
                                                     transpose(matmul(fast_vec_row%blk_map(prow)%ptr, data_d))
            END IF
            rprow = hash_table_get(res_fast_vec_row%hash_table, row)
            pcol = hash_table_get(fast_vec_col%hash_table, col)
            IF (res_fast_vec_row%blk_map(rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
               res_fast_vec_row%blk_map(rprow)%ptr = res_fast_vec_row%blk_map(rprow)%ptr + &
                                                     transpose(matmul(data_d, fast_vec_col%blk_map(pcol)%ptr))
            END IF
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

      CALL timestop(handle1)

      ! sum all the data within a processor column to obtain the replicated result
      data_vec => dbcsr_get_data_p(result_row)
      CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)

      CALL pcol_group%sum(data_vec(1:nrows*ncols))

      ! Convert the results to a column wise distribution. This is a bit involved, as result_row is fully replicated
      ! while result_col still holds the partial results in parallel. The routine below takes care of that and saves a
      ! parallel summation: the res_row vectors are converted taking only the appropriate elements (0 otherwise), while the
      ! res_col partial results are locally added. The sum then yields the correct vector
      CALL dbcsr_rep_row_to_rep_col_vec(work_col, result_row, res_fast_vec_row, res_fast_vec_col)

      ! Create the final vector by summing it into the result vector which lives on proc_col 0
      CALL dbcsr_iterator_start(iter, vec_out)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
         prow = hash_table_get(fast_vec_col%hash_table, row)
         IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
            vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map(prow)%ptr(:, :))
         ELSE
            vec_res(:, :) = beta*vec_res(:, :)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)

      CALL release_fast_vec_access(fast_vec_row)
      CALL release_fast_vec_access(fast_vec_col)
      CALL release_fast_vec_access(res_fast_vec_row)
      CALL release_fast_vec_access(res_fast_vec_col)

      CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)

      CALL timestop(handle)

   END SUBROUTINE dbcsr_sym_matrix_vector_mult

END MODULE arnoldi_vector