d6/dc4/dbcsr__vector_8F_source.html

 !--------------------------------------------------------------------------------------------------!

 !   CP2K: A general program to perform molecular dynamics simulations                              !

 !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !

 !                                                                                                  !

 !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !

 !--------------------------------------------------------------------------------------------------!


 ! **************************************************************************************************

 !> \brief operations for skinny matrices/vectors expressed in dbcsr form

 !> \par History

 !>       2014.10 created [Florian Schiffmann]

 !> \author Florian Schiffmann

 ! **************************************************************************************************


 MODULE dbcsr_vector

    USE dbcsr_api, ONLY: dbcsr_copy, &

                         dbcsr_create, &

                         dbcsr_distribution_get, &

                         dbcsr_distribution_new, &

                         dbcsr_distribution_release, &

                         dbcsr_distribution_type, &

                         dbcsr_get_info, &

                         dbcsr_iterator_blocks_left, &

                         dbcsr_iterator_next_block, &

                         dbcsr_iterator_start, &

                         dbcsr_iterator_stop, &

                         dbcsr_iterator_type, &

                         dbcsr_release, &

                         dbcsr_reserve_all_blocks, &

                         dbcsr_set, dbcsr_get_data_p, &

                         dbcsr_type, &

                         dbcsr_type_antisymmetric, &

                         dbcsr_type_complex_8, &

                         dbcsr_type_complex_8, &

                         dbcsr_type_no_symmetry, &

                         dbcsr_type_real_8, &

                         dbcsr_type_real_8, &

                         dbcsr_type_symmetric

    USE kinds, ONLY: dp, &

                     real_8

    USE message_passing, ONLY: mp_comm_type


 #include "../base/base_uses.f90"


 !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads


    IMPLICIT NONE


    PRIVATE


    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_vector_operations'


 ! **************************************************************************************************

 !> \brief Types needed for the hashtable.

 ! **************************************************************************************************

    ! One slot of the hash table: c is the hashed key (a value of 0 marks the slot
    ! as unused), p is the value stored for that key.
    TYPE :: ele_type
       INTEGER :: c = 0, p = 0
    END TYPE ele_type


    ! Linear-probing hash table.
    !   table : the slots, allocated as (0:nmax) by hash_table_create
    !   nele  : fill-level counter maintained by hash_table_add
    !   nmax  : largest slot index; also used as bit mask of the hash function
    !   prime : multiplier of the hash function
    TYPE :: hash_table_type
       TYPE(ele_type), POINTER :: table(:) => NULL()
       INTEGER :: nele = 0, nmax = 0, prime = 0
    END TYPE hash_table_type


 ! **************************************************************************************************

 !> \brief Types needed for fast access to distributed dbcsr vectors.

 ! **************************************************************************************************

    ! Handle on one real-valued block of a vector: ptr refers to the block data,
    ! assigned_thread is the id of the thread allowed to work on the block
    ! (-1 until one is assigned).
    TYPE :: block_ptr_d
       REAL(KIND=real_8), POINTER :: ptr(:, :) => NULL()
       INTEGER :: assigned_thread = -1
    END TYPE block_ptr_d

    ! Handle on one complex-valued block of a vector: ptr refers to the block data,
    ! assigned_thread is the id of the thread allowed to work on the block
    ! (-1 until one is assigned).
    TYPE :: block_ptr_z
       COMPLEX(KIND=real_8), POINTER :: ptr(:, :) => NULL()
       INTEGER :: assigned_thread = -1
    END TYPE block_ptr_z


    ! Lookup container for the blocks of a vector: hash_table translates a block
    ! index into a position in blk_map_d (real data) or blk_map_z (complex data).
    TYPE :: fast_vec_access_type
       TYPE(hash_table_type) :: hash_table = hash_table_type()
       TYPE(block_ptr_d), ALLOCATABLE :: blk_map_d(:)
       TYPE(block_ptr_z), ALLOCATABLE :: blk_map_z(:)
    END TYPE fast_vec_access_type


    PUBLIC :: dbcsr_matrix_colvec_multiply, &

              create_col_vec_from_matrix, &

              create_row_vec_from_matrix, &

              create_replicated_col_vec_from_matrix, &

              create_replicated_row_vec_from_matrix


    ! Generic matrix * column-vector product; resolves to the real (_d) or the
    ! complex (_z) specific routine.
    INTERFACE dbcsr_matrix_colvec_multiply
       MODULE PROCEDURE dbcsr_matrix_colvec_multiply_d, &
          dbcsr_matrix_colvec_multiply_z
    END INTERFACE dbcsr_matrix_colvec_multiply


 CONTAINS


 ! **************************************************************************************************

 !> \brief creates a dbcsr col vector like object which lives on proc_col 0

 !>        and has the same row dist as the template matrix

 !>        the returned matrix is fully allocated and all blocks are set to 0

 !>        this is not a sparse object (and must never be)

 !> \param dbcsr_vec  the vector object to create must be allocated but not initialized

 !> \param matrix a dbcsr matrix used as template

 !> \param ncol number of vectors in the dbcsr object (1 for vector, n for skinny matrix)

 ! **************************************************************************************************

    SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
       ! dbcsr_vec : vector to be created
       ! matrix    : template providing data type, row block sizes and distribution
       ! ncol      : width of the single block column of the vector
       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
       INTEGER, INTENT(IN)                                :: ncol

       CHARACTER(LEN=*), PARAMETER :: routinen = 'create_col_vec_from_matrix'

       INTEGER                                            :: handle, data_type
       INTEGER, DIMENSION(:), POINTER                     :: row_blk_size, col_blk_size, row_dist, col_dist
       TYPE(dbcsr_distribution_type)                      :: dist_col_vec, dist

       CALL timeset(routinen, handle)

       ! Row blocking, row distribution and data type are taken over from the template.
       ! Only the row distribution is needed here, the process grid size is not.
       CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
       CALL dbcsr_distribution_get(dist, row_dist=row_dist)

       ! One block column of width ncol, assigned to process column 0
       ALLOCATE (col_dist(1), col_blk_size(1))
       col_dist(1) = 0
       col_blk_size(1) = ncol
       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
                         matrix_type=dbcsr_type_no_symmetry, &
                         row_blk_size=row_blk_size, &
                         col_blk_size=col_blk_size, &
                         nze=0, data_type=data_type)
       ! The vector is not a sparse object: reserve every block right away
       CALL dbcsr_reserve_all_blocks(dbcsr_vec)

       ! Release the temporary distribution and the locally allocated index arrays
       CALL dbcsr_distribution_release(dist_col_vec)
       DEALLOCATE (col_dist, col_blk_size)
       CALL timestop(handle)

    END SUBROUTINE create_col_vec_from_matrix


 ! **************************************************************************************************

 !> \brief creates a dbcsr row vector like object which lives on proc_row 0

 !>        and has the same col dist as the template matrix

 !>        the returned matrix is fully allocated and all blocks are set to 0

 !>        this is not a sparse object (and must never be)

 !> \param dbcsr_vec the vector object to create must be allocated but not initialized

 !> \param matrix a dbcsr matrix used as template

 !> \param nrow number of vectors in the dbcsr object (1 for vector, n for skinny matrix)

 ! **************************************************************************************************

    SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
       ! dbcsr_vec : vector to be created
       ! matrix    : template providing data type, column block sizes and distribution
       ! nrow      : height of the single block row of the vector
       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
       INTEGER, INTENT(IN)                                :: nrow

       CHARACTER(LEN=*), PARAMETER :: routinen = 'create_row_vec_from_matrix'

       INTEGER                                            :: handle, data_type
       INTEGER, DIMENSION(:), POINTER                     :: row_blk_size, col_blk_size, row_dist, col_dist
       TYPE(dbcsr_distribution_type)                      :: dist_row_vec, dist

       CALL timeset(routinen, handle)

       ! Column blocking, column distribution and data type are taken over from the template.
       ! Only the column distribution is needed here, the process grid size is not.
       CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist)
       CALL dbcsr_distribution_get(dist, col_dist=col_dist)

       ! One block row of height nrow, assigned to process row 0
       ALLOCATE (row_dist(1), row_blk_size(1))
       row_dist(1) = 0
       row_blk_size(1) = nrow
       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, &
                         matrix_type=dbcsr_type_no_symmetry, &
                         row_blk_size=row_blk_size, &
                         col_blk_size=col_blk_size, &
                         nze=0, data_type=data_type)
       ! The vector is not a sparse object: reserve every block right away
       CALL dbcsr_reserve_all_blocks(dbcsr_vec)

       ! Release the temporary distribution and the locally allocated index arrays
       CALL dbcsr_distribution_release(dist_row_vec)
       DEALLOCATE (row_dist, row_blk_size)
       CALL timestop(handle)

    END SUBROUTINE create_row_vec_from_matrix


 ! **************************************************************************************************

 !> \brief creates a col vector like object whose blocks can be replicated

 !>        along the processor row and has the same row dist as the template matrix

 !>        the returned matrix is fully allocated and all blocks are set to 0

 !>        this is not a sparse object (and must never be)

 !> \param dbcsr_vec the vector object to create must be allocated but not initialized

 !> \param matrix a dbcsr matrix used as template

 !> \param ncol number of vectors in the dbcsr object (1 for vector, n for skinny matrix)

 ! **************************************************************************************************

    SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
       ! dbcsr_vec : vector to be created
       ! matrix    : template providing data type, row block sizes and distribution
       ! ncol      : width of each block column of the vector
       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
       INTEGER                                            :: ncol

       CHARACTER(LEN=*), PARAMETER :: routinen = 'create_replicated_col_vec_from_matrix'

       INTEGER                                            :: data_type, handle, ipcol, npcols
       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, row_dist
       TYPE(dbcsr_distribution_type)                      :: template_dist, vec_dist

       CALL timeset(routinen, handle)

       CALL dbcsr_get_info(matrix, distribution=template_dist, row_blk_size=row_blk_size, data_type=data_type)
       CALL dbcsr_distribution_get(template_dist, row_dist=row_dist, npcols=npcols)

       ! One block column of width ncol per process column: block column k is
       ! assigned to process column k-1
       ALLOCATE (col_blk_size(npcols), col_dist(npcols))
       col_dist(:) = [(ipcol, ipcol=0, npcols - 1)]
       col_blk_size(:) = ncol
       CALL dbcsr_distribution_new(vec_dist, template=template_dist, row_dist=row_dist, col_dist=col_dist)

       CALL dbcsr_create(dbcsr_vec, "D", vec_dist, &
                         matrix_type=dbcsr_type_no_symmetry, &
                         row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
                         nze=0, data_type=data_type)
       CALL dbcsr_reserve_all_blocks(dbcsr_vec)

       ! Release the temporary distribution and the locally allocated index arrays
       CALL dbcsr_distribution_release(vec_dist)
       DEALLOCATE (col_dist, col_blk_size)

       CALL timestop(handle)

    END SUBROUTINE create_replicated_col_vec_from_matrix


 ! **************************************************************************************************

 !> \brief creates a row vector like object whose blocks can be replicated

 !>        along the processor col and has the same col dist as the template matrix

 !>        the returned matrix is fully allocated and all blocks are set to 0

 !>        this is not a sparse object (and must never be)

 !> \param dbcsr_vec the vector object to create must be allocated but not initialized

 !> \param matrix a dbcsr matrix used as template

 !> \param nrow number of vectors in the dbcsr object (1 for vector, n for skinny matrix)

 ! **************************************************************************************************

    SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
       ! dbcsr_vec : vector to be created
       ! matrix    : template providing data type, column block sizes and distribution
       ! nrow      : height of each block row of the vector
       TYPE(dbcsr_type)                                   :: dbcsr_vec
       TYPE(dbcsr_type)                                   :: matrix
       INTEGER                                            :: nrow

       CHARACTER(LEN=*), PARAMETER :: routinen = 'create_replicated_row_vec_from_matrix'

       INTEGER                                            :: data_type, handle, iprow, nprows
       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, row_dist
       TYPE(dbcsr_distribution_type)                      :: template_dist, vec_dist

       CALL timeset(routinen, handle)

       CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=template_dist)
       CALL dbcsr_distribution_get(template_dist, col_dist=col_dist, nprows=nprows)

       ! One block row of height nrow per process row: block row k is
       ! assigned to process row k-1
       ALLOCATE (row_blk_size(nprows), row_dist(nprows))
       row_dist(:) = [(iprow, iprow=0, nprows - 1)]
       row_blk_size(:) = nrow
       CALL dbcsr_distribution_new(vec_dist, template=template_dist, row_dist=row_dist, col_dist=col_dist)

       CALL dbcsr_create(dbcsr_vec, "D", vec_dist, &
                         matrix_type=dbcsr_type_no_symmetry, &
                         row_blk_size=row_blk_size, &
                         col_blk_size=col_blk_size, &
                         nze=0, data_type=data_type)
       CALL dbcsr_reserve_all_blocks(dbcsr_vec)

       ! Release the temporary distribution and the locally allocated index arrays
       CALL dbcsr_distribution_release(vec_dist)
       DEALLOCATE (row_dist, row_blk_size)

       CALL timestop(handle)

    END SUBROUTINE create_replicated_row_vec_from_matrix


 ! **************************************************************************************************

 !> \brief given a column vector, prepare the fast_vec_access container

 !> \param vec ...

 !> \param fast_vec_access ...

 ! **************************************************************************************************

    SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
       ! vec             : column vector whose blocks shall become quickly accessible
       ! fast_vec_access : container filled by the data-type specific routine
       TYPE(dbcsr_type)                                   :: vec
       TYPE(fast_vec_access_type)                         :: fast_vec_access

       CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_col_vec_access'

       INTEGER                                            :: handle, data_type

       CALL timeset(routinen, handle)

       CALL dbcsr_get_info(vec, data_type=data_type)

       ! Dispatch on the data type of the vector
       SELECT CASE (data_type)
       CASE (dbcsr_type_real_8)
          CALL create_fast_col_vec_access_d(vec, fast_vec_access)
       CASE (dbcsr_type_complex_8)
          CALL create_fast_col_vec_access_z(vec, fast_vec_access)
       CASE DEFAULT
          ! Fail here instead of handing back a container that was never built
          cpabort("Unsupported data type, only real_8 and complex_8 vectors are handled")
       END SELECT

       CALL timestop(handle)

    END SUBROUTINE create_fast_col_vec_access


 ! **************************************************************************************************

 !> \brief given a row vector, prepare the fast_vec_access_container

 !> \param vec ...

 !> \param fast_vec_access ...

 ! **************************************************************************************************

    SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
       ! vec             : row vector whose blocks shall become quickly accessible
       ! fast_vec_access : container filled by the data-type specific routine
       TYPE(dbcsr_type)                                   :: vec
       TYPE(fast_vec_access_type)                         :: fast_vec_access

       CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_row_vec_access'

       INTEGER                                            :: handle, data_type

       CALL timeset(routinen, handle)

       CALL dbcsr_get_info(vec, data_type=data_type)

       ! Dispatch on the data type of the vector
       SELECT CASE (data_type)
       CASE (dbcsr_type_real_8)
          CALL create_fast_row_vec_access_d(vec, fast_vec_access)
       CASE (dbcsr_type_complex_8)
          CALL create_fast_row_vec_access_z(vec, fast_vec_access)
       CASE DEFAULT
          ! Fail here instead of handing back a container that was never built
          cpabort("Unsupported data type, only real_8 and complex_8 vectors are handled")
       END SELECT

       CALL timestop(handle)

    END SUBROUTINE create_fast_row_vec_access


 ! **************************************************************************************************

 !> \brief release all memory associated with the fast_vec_access type

 !> \param fast_vec_access ...

 ! **************************************************************************************************

    SUBROUTINE release_fast_vec_access(fast_vec_access)
       ! fast_vec_access : container whose hash table and block maps are freed
       TYPE(fast_vec_access_type)                         :: fast_vec_access

       CHARACTER(LEN=*), PARAMETER :: routinen = 'release_fast_vec_access'

       INTEGER                                            :: timer_handle

       CALL timeset(routinen, timer_handle)

       CALL hash_table_release(fast_vec_access%hash_table)

       ! Either block map may be unallocated, so each one is checked before it is freed
       IF (ALLOCATED(fast_vec_access%blk_map_z)) THEN
          DEALLOCATE (fast_vec_access%blk_map_z)
       END IF
       IF (ALLOCATED(fast_vec_access%blk_map_d)) THEN
          DEALLOCATE (fast_vec_access%blk_map_d)
       END IF

       CALL timestop(timer_handle)

    END SUBROUTINE release_fast_vec_access


 ! --------------------------------------------------------------------------------------------------

 ! Beginning of hashtable.

 ! this file can be 'INCLUDE'ed verbatim in various places, where it needs to be

 ! part of the module to guarantee inlining

 ! hashes (c,p) pairs, where p is assumed to be >0

 ! on return (0 is used as a flag for not present)

 !

 !

 ! **************************************************************************************************

 !> \brief finds a prime equal or larger than i, needed at creation

 !> \param i ...

 !> \return ...

 ! **************************************************************************************************

    PURE FUNCTION matching_prime(i) RESULT(res)
       ! i   : lower bound for the prime
       ! res : smallest prime that is >= i (2 for every i <= 2)
       INTEGER, INTENT(IN)                      :: i
       INTEGER                                  :: res

       INTEGER                                  :: j
       LOGICAL                                  :: is_prime

       ! 2 is the smallest prime; without the clamp a non-prime i <= 1 would be returned
       res = MAX(i, 2)
       DO
          is_prime = .TRUE.
          j = 2
          ! Trial division up to sqrt(res); "j <= res/j" is "j*j <= res" without overflow
          DO WHILE (j <= res/j)
             IF (MOD(res, j) == 0) THEN
                is_prime = .FALSE.
                EXIT
             END IF
             j = j + 1
          END DO
          IF (is_prime) EXIT
          res = res + 1
       END DO

    END FUNCTION matching_prime


 ! **************************************************************************************************

 !> \brief create a hash_table of given initial size.

 !>        the hash table will expand as needed (but this requires rehashing)

 !> \param hash_table ...

 !> \param table_size ...

 ! **************************************************************************************************

    SUBROUTINE hash_table_create(hash_table, table_size)
       ! hash_table : table to set up; its slot array is freshly allocated
       ! table_size : requested minimal value of the largest slot index
       TYPE(hash_table_type)                    :: hash_table
       INTEGER, INTENT(IN)                      :: table_size

       INTEGER                                  :: mask

       ! The largest slot index is always of the form 2**k - 1 so that it can serve
       ! as bit mask. Starting from 7 guarantees a minimal table of 8 slots, which
       ! the expansion logic relies on.
       mask = 7
       DO
          IF (mask >= table_size) EXIT
          mask = 2*mask + 1
       END DO

       ALLOCATE (hash_table%table(0:mask))
       hash_table%nele = 0
       hash_table%nmax = mask
       hash_table%prime = matching_prime(mask)

    END SUBROUTINE hash_table_create


 ! **************************************************************************************************

 !> \brief ...

 !> \param hash_table ...

 ! **************************************************************************************************

    SUBROUTINE hash_table_release(hash_table)
       ! hash_table : table whose slot array is freed and whose counters are reset.
       !              Releasing a table that was never created (or was already
       !              released) is a no-op instead of a run time error.
       TYPE(hash_table_type)                    :: hash_table

       hash_table%nmax = 0
       hash_table%nele = 0
       IF (ASSOCIATED(hash_table%table)) THEN
          DEALLOCATE (hash_table%table)
       END IF
       NULLIFY (hash_table%table)

    END SUBROUTINE hash_table_release


 ! **************************************************************************************************

 !> \brief add a pair (c,p) to the hash table

 !> \param hash_table ...

 !> \param c this value is being hashed

 !> \param p this is being stored

 ! **************************************************************************************************

    RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
       ! hash_table : table to insert into; it is enlarged and rehashed when too full
       ! c          : key (0 is reserved as the marker of an unused slot)
       ! p          : value stored for the key; replaces the old value if c is present
       TYPE(hash_table_type), INTENT(INOUT)     :: hash_table
       INTEGER, INTENT(IN)                      :: c, p

       REAL(kind=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
                                       inv_hash_table_fill = 2.5_real_8

       INTEGER                                  :: i, j
       TYPE(ele_type), ALLOCATABLE, &
          DIMENSION(:)                           :: tmp_hash

       ! if too small, make a copy and rehash in a larger table
       IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
          ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
          tmp_hash(:) = hash_table%table
          CALL hash_table_release(hash_table)
          CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
          DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
             IF (tmp_hash(i)%c .NE. 0) THEN
                CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
             END IF
          END DO
          DEALLOCATE (tmp_hash)
       END IF

       ! home slot of the key; nmax has the form 2**k - 1 and acts as bit mask
       i = IAND(c*hash_table%prime, hash_table%nmax)

       ! linear probing from the home slot to the end of the table ...
       DO j = i, hash_table%nmax
          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
             ! only a previously unused slot raises the fill level; overwriting
             ! the value of an existing key must not be counted again
             IF (hash_table%table(j)%c == 0) hash_table%nele = hash_table%nele + 1
             hash_table%table(j)%c = c
             hash_table%table(j)%p = p
             RETURN
          END IF
       END DO
       ! ... then wrapping around to the slots in front of the home slot
       DO j = 0, i - 1
          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
             IF (hash_table%table(j)%c == 0) hash_table%nele = hash_table%nele + 1
             hash_table%table(j)%c = c
             hash_table%table(j)%p = p
             RETURN
          END IF
       END DO

    END SUBROUTINE hash_table_add


 ! **************************************************************************************************

 !> \brief ...

 !> \param hash_table ...

 !> \param c ...

 !> \return ...

 ! **************************************************************************************************

    PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
       ! hash_table : table to search
       ! c          : key to look up
       ! p          : value stored for c; the p of the first unused slot met while
       !              probing if c is absent; HUGE(p) if every slot was visited
       !              without a match
       TYPE(hash_table_type), INTENT(IN)        :: hash_table
       INTEGER, INTENT(IN)                      :: c
       INTEGER                                  :: p

       INTEGER                                  :: home, probe, slot

       home = IAND(c*hash_table%prime, hash_table%nmax)

       ! fast path: the key sits in its home slot
       IF (hash_table%table(home)%c == c) THEN
          p = hash_table%table(home)%p
          RETURN
       END IF

       ! linear probing over all nmax+1 slots, starting at the home slot and
       ! wrapping around at the end of the table
       slot = home
       DO probe = 0, hash_table%nmax
          IF (hash_table%table(slot)%c == c .OR. hash_table%table(slot)%c == 0) THEN
             p = hash_table%table(slot)%p
             RETURN
          END IF
          IF (slot == hash_table%nmax) THEN
             slot = 0
          ELSE
             slot = slot + 1
          END IF
       END DO

       ! only reached for a completely full table that does not hold c
       p = HUGE(p)

    END FUNCTION hash_table_get


 ! End of hashtable

 ! --------------------------------------------------------------------------------------------------


 ! **************************************************************************************************

 !> \brief the real driver routine for the multiply, not all symmetries implemented yet

 !> \param matrix ...

 !> \param vec_in ...

 !> \param vec_out ...

 !> \param alpha ...

 !> \param beta ...

 !> \param work_row ...

 !> \param work_col ...

 ! **************************************************************************************************

       SUBROUTINE dbcsr_matrix_colvec_multiply_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
          ! Selects the multiplication kernel that matches the symmetry of matrix
          ! and forwards all arguments unchanged.
          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
          REAL(kind=real_8)                                  :: alpha, beta
          TYPE(dbcsr_type)                          :: work_row, work_col

          CHARACTER                                :: symmetry

          CALL dbcsr_get_info(matrix, matrix_type=symmetry)

          IF (symmetry == dbcsr_type_no_symmetry) THEN
             CALL dbcsr_matrix_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
          ELSE IF (symmetry == dbcsr_type_symmetric) THEN
             CALL dbcsr_sym_matrix_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
          ELSE IF (symmetry == dbcsr_type_antisymmetric) THEN
             ! No kernel exists for antisymmetric matrices yet
             cpabort("NYI, antisymmetric matrix not permitted")
          ELSE
             cpabort("Unknown matrix type, ...")
          END IF

       END SUBROUTINE dbcsr_matrix_colvec_multiply_d


 ! **************************************************************************************************

 !> \brief low level routines for matrix vector multiplies

 !> \param matrix ...

 !> \param vec_in ...

 !> \param vec_out ...

 !> \param alpha ...

 !> \param beta ...

 !> \param work_row ...

 !> \param work_col ...

 ! **************************************************************************************************

       SUBROUTINE dbcsr_matrix_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
 ! Kernel for matrices without symmetry. On exit every block of vec_out holds
 ! beta*vec_out + alpha*(block accumulated in work_col), where work_col collects the
 ! products of the local matrix blocks with the blocks of work_row.
 !   matrix             : matrix whose local blocks are multiplied
 !   vec_in             : input vector, handed to dbcsr_col_vec_to_rep_row_d
 !   vec_out            : vector that is scaled and updated in place
 !   alpha, beta        : scaling factors, see the formula above
 !   work_row, work_col : work vectors; both are overwritten
 ! NOTE(review): work_row/work_col presumably have to be created with the replicated
 ! row/col vector constructors of this module - confirm against the callers.
          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
          REAL(kind=real_8)                                  :: alpha, beta
          TYPE(dbcsr_type)                          :: work_row, work_col

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_matrix_vector_mult'

          INTEGER                                  :: col, mypcol, &
                                                      myprow, prow_handle, &
                                                      ncols, nrows, &
                                                      row, &
                                                      handle, handle1, ithread
          TYPE(mp_comm_type) :: prow_group
          REAL(kind=real_8), DIMENSION(:), POINTER          :: data_vec
          REAL(kind=real_8), DIMENSION(:, :), POINTER       :: data_d, vec_res
          TYPE(dbcsr_distribution_type)            :: dist
          TYPE(dbcsr_iterator_type)                :: iter
          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col
          INTEGER                                  :: prow, pcol

          CALL timeset(routinen, handle)
 ! Thread id used when the code is compiled without OpenMP
          ithread = 0

 ! Collect some data about the parallel environment. We will use them later to move the vector around
 ! (myprow and mypcol are queried but not used in this routine)
          CALL dbcsr_get_info(matrix, distribution=dist)
          CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)

 ! Wrap the handle of the process row group in a communicator object
          CALL prow_group%set_handle(prow_handle)

 ! Build the lookup containers (hash table + block map) for both work vectors
          CALL create_fast_row_vec_access(work_row, fast_vec_row)
          CALL create_fast_col_vec_access(work_col, fast_vec_col)

 ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
          CALL dbcsr_col_vec_to_rep_row_d (vec_in, work_col, work_row, fast_vec_col)

 ! Set the work vector for the results to 0
          CALL dbcsr_set(work_col, 0.0_real_8)

 ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
 ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
          CALL timeset(routinen//"_local_mm", handle1)

 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
 !$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
 !$       ithread = omp_get_thread_num()
 ! Every thread runs its own (non-shared) iterator over the matrix blocks
          CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, data_d)
 ! Position of the result block belonging to this block row
             prow = hash_table_get(fast_vec_col%hash_table, row)
 ! A result block is only updated by the thread it is assigned to
             IF (fast_vec_col%blk_map_d (prow)%assigned_thread .NE. ithread) cycle
 ! Position of the input block belonging to this block column
             pcol = hash_table_get(fast_vec_row%hash_table, col)
 ! Nothing to do for empty blocks
                IF (SIZE(fast_vec_col%blk_map_d (prow)%ptr, 1) .EQ. 0 .OR. &
                    SIZE(fast_vec_col%blk_map_d (prow)%ptr, 2) .EQ. 0 .OR. &
                    SIZE(data_d, 2) .EQ. 0) cycle
 ! result block = result block + data_d * TRANSPOSE(input block)
 ! (op(A)='N', op(B)='T', alpha and beta of the dgemm call are both 1)
                CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map_d (prow)%ptr, 1), &
                           SIZE(fast_vec_col%blk_map_d (prow)%ptr, 2), &
                           SIZE(data_d, 2), &
                           1.0_dp, &
                           data_d, &
                           SIZE(fast_vec_col%blk_map_d (prow)%ptr, 1), &
                           fast_vec_row%blk_map_d (pcol)%ptr, &
                           SIZE(fast_vec_col%blk_map_d (prow)%ptr, 2), &
                           1.0_dp, &
                           fast_vec_col%blk_map_d (prow)%ptr, &
                           SIZE(fast_vec_col%blk_map_d (prow)%ptr, 1))
          END DO
          CALL dbcsr_iterator_stop(iter)
 !$OMP END PARALLEL

          CALL timestop(handle1)

 ! sum all the data onto the first processor col where the original vector is stored
          data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
          CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
          CALL prow_group%sum(data_vec(1:nrows*ncols))

 ! Local copy on the first mpi col (as this is the location of the vec_res blocks) of the result vector
 ! from the replicated to the original vector. Let's play it safe and use the iterator
          CALL dbcsr_iterator_start(iter, vec_out)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
             prow = hash_table_get(fast_vec_col%hash_table, row)
 ! Without an associated work block there is no contribution: only scale by beta
             IF (ASSOCIATED(fast_vec_col%blk_map_d (prow)%ptr)) THEN
                vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_d (prow)%ptr(:, :)
             ELSE
                vec_res(:, :) = beta*vec_res(:, :)
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)

 ! Free the lookup containers again
          CALL release_fast_vec_access(fast_vec_row)
          CALL release_fast_vec_access(fast_vec_col)

          CALL timestop(handle)

       END SUBROUTINE dbcsr_matrix_vector_mult_d


 ! **************************************************************************************************
 !> \brief multiplies the transpose of a matrix with a skinny vector (real(8) data):
 !>        vec_out = beta*vec_out + alpha*TRANSPOSE(matrix)*vec_in, evaluated block by block
 !> \param matrix matrix whose blocks are traversed in the local multiply
 !> \param vec_in input vector; its blocks are copied into work_col before the multiply
 !> \param vec_out result vector; scaled by beta and incremented by alpha times the product
 !> \param alpha prefactor of the matrix-vector product
 !> \param beta prefactor applied to the incoming content of vec_out
 !> \param work_row row work vector; zeroed here, accumulates the local block products
 !> \param work_col column work vector; holds the broadcast input first, the converted result afterwards
 !> \param skip_diag if .TRUE., blocks with row == col are left out of the local multiply
 !> \note the fast access containers created here are released before returning, in the same
 !>       way as the other multiply routines of this module do
 ! **************************************************************************************************
       SUBROUTINE dbcsr_matrixt_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
          REAL(kind=real_8)                                  :: alpha, beta
          TYPE(dbcsr_type)                          :: work_row, work_col
          LOGICAL                                   :: skip_diag

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_matrixT_vector_mult'

          INTEGER                                  :: col, col_size, mypcol, &
                                                      myprow, pcol_handle, prow_handle, &
                                                      ncols, nrows, &
                                                      row, row_size, &
                                                      handle, handle1, ithread
          TYPE(mp_comm_type) :: pcol_group, prow_group
          REAL(kind=real_8), DIMENSION(:), POINTER          :: data_vec
          REAL(kind=real_8), DIMENSION(:, :), POINTER       :: data_d, vec_bl, vec_res
          TYPE(dbcsr_distribution_type)            :: dist
          TYPE(dbcsr_iterator_type)                :: iter

          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col
          INTEGER                                  :: prow, pcol

          CALL timeset(routinen, handle)
          ! thread id for builds without OpenMP; set per thread inside the parallel region below
          ithread = 0

          ! Collect some data about the parallel environment. We will use them later to move the vector around
          CALL dbcsr_get_info(matrix, distribution=dist)
          CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)

          CALL prow_group%set_handle(prow_handle)
          CALL pcol_group%set_handle(pcol_handle)

          ! Lookup containers: block index -> pointer to the block data of the work vectors.
          ! Both are released again at the end of this routine.
          CALL create_fast_row_vec_access(work_row, fast_vec_row)
          CALL create_fast_col_vec_access(work_col, fast_vec_col)

          ! Set the work vector for the results to 0
          CALL dbcsr_set(work_row, 0.0_real_8)

          ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
          CALL dbcsr_iterator_start(iter, vec_in)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
             prow = hash_table_get(fast_vec_col%hash_table, row)
             fast_vec_col%blk_map_d (prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
          END DO
          CALL dbcsr_iterator_stop(iter)

          ! Replicate the data on all processors in the row (broadcast from rank 0 of prow_group)
          data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
          CALL prow_group%bcast(data_vec, 0)

          ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
          ! NOTE(review): the timer label lacks the underscore used by the sibling routines ("_local_mm");
          ! left unchanged so that existing timing reports keep their entry name.
          CALL timeset(routinen//"local_mm", handle1)
          CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
 !$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
 !$       ithread = omp_get_thread_num()
          CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
             IF (skip_diag .AND. col == row) cycle
             prow = hash_table_get(fast_vec_col%hash_table, row)
             pcol = hash_table_get(fast_vec_row%hash_table, col)
             IF (ASSOCIATED(fast_vec_row%blk_map_d (pcol)%ptr) .AND. &
                 ASSOCIATED(fast_vec_col%blk_map_d (prow)%ptr)) THEN
                ! Each result block is updated only by the thread it was assigned to,
                ! so no two threads write to the same block.
                IF (fast_vec_row%blk_map_d (pcol)%assigned_thread .NE. ithread) cycle
                fast_vec_row%blk_map_d (pcol)%ptr = fast_vec_row%blk_map_d (pcol)%ptr + &
                                                    matmul(transpose(fast_vec_col%blk_map_d (prow)%ptr), data_d)
             ELSE
                ! One of the two lookups found no block: swap the roles of row and col and
                ! multiply with the transposed matrix block instead.
                ! NOTE(review): presumably covers blocks stored at the transposed position - confirm.
                prow = hash_table_get(fast_vec_row%hash_table, row)
                pcol = hash_table_get(fast_vec_col%hash_table, col)
                IF (fast_vec_row%blk_map_d (prow)%assigned_thread .NE. ithread) cycle
                fast_vec_row%blk_map_d (prow)%ptr = fast_vec_row%blk_map_d (prow)%ptr + &
                                                    matmul(transpose(fast_vec_col%blk_map_d (pcol)%ptr), transpose(data_d))
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)
 !$OMP END PARALLEL

          CALL timestop(handle1)

          ! sum all the data within a processor column to obtain the replicated result
          data_vec => dbcsr_get_data_p(work_row, select_data_type=0.0_real_8)
          ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
          CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
          CALL pcol_group%sum(data_vec(1:nrows*ncols))

          ! Convert the result to a column wise distribution (overwrites work_col)
          CALL dbcsr_rep_row_to_rep_col_vec_d (work_col, work_row, fast_vec_row)

          ! Create the final vector by summing it to the result vector which lives on proc_col 0
          CALL dbcsr_iterator_start(iter, vec_out)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
             prow = hash_table_get(fast_vec_col%hash_table, row)
             IF (ASSOCIATED(fast_vec_col%blk_map_d (prow)%ptr)) THEN
                vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_d (prow)%ptr(:, :)
             ELSE
                ! no product contribution available for this block: only the scaling applies
                vec_res(:, :) = beta*vec_res(:, :)
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)

          ! Free the lookup containers created above; they were previously never released here.
          ! Done after the last use of fast_vec_col in the loop above.
          CALL release_fast_vec_access(fast_vec_row)
          CALL release_fast_vec_access(fast_vec_col)

          CALL timestop(handle)

       END SUBROUTINE dbcsr_matrixt_vector_mult_d


 ! **************************************************************************************************
 !> \brief copies a column vector into the replicated column work vector and derives from it the
 !>        row vector that is replicated along the processor columns (real(8) data)
 !> \param vec_in source vector; its raw data is copied into rep_col_vec where mypcol == 0
 !> \param rep_col_vec column work vector; receives the data of vec_in by a broadcast over prow_group
 !> \param rep_row_vec row work vector; zeroed, filled with transposed blocks and summed over pcol_group
 !> \param fast_vec_col block lookup container used to fetch the blocks copied into rep_row_vec
 !>        (the visible callers build it on the vector they pass as rep_col_vec)
 ! **************************************************************************************************
       SUBROUTINE dbcsr_col_vec_to_rep_row_d (vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
          TYPE(dbcsr_type)                          :: vec_in, rep_col_vec, &
                                                       rep_row_vec
          TYPE(fast_vec_access_type), INTENT(IN)   :: fast_vec_col

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_col_vec_to_rep_row'

          INTEGER                                  :: blk_col, blk_row, handle, iblk, &
                                                      mypcol, myprow, n_elem, &
                                                      nfull_cols, nfull_rows, &
                                                      pcol_handle, prow_handle
          INTEGER, DIMENSION(:), POINTER           :: local_cols, row_dist
          REAL(kind=real_8), DIMENSION(:), POINTER          :: rep_data, src_data
          REAL(kind=real_8), DIMENSION(:, :), POINTER       :: row_blk
          TYPE(dbcsr_distribution_type)            :: dist_in, dist_rep_col
          TYPE(dbcsr_iterator_type)                :: iter
          TYPE(mp_comm_type) :: pcol_group, prow_group

          CALL timeset(routinen, handle)

          ! Communicators and process grid coordinates, taken from the distribution of vec_in
          CALL dbcsr_get_info(vec_in, distribution=dist_in)
          CALL dbcsr_distribution_get(dist_in, prow_group=prow_handle, pcol_group=pcol_handle, &
                                      myprow=myprow, mypcol=mypcol)
          CALL prow_group%set_handle(prow_handle)
          CALL pcol_group%set_handle(pcol_handle)

          ! row_dist maps a block row of the column vector to the processor row holding it
          CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
          CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)

          ! Step 1: raw copy vec_in -> rep_col_vec on the first processor column,
          !         followed by a broadcast from rank 0 of the processor row
          CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nfull_rows, nfullcols_local=nfull_cols)
          n_elem = nfull_rows*nfull_cols
          rep_data => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
          src_data => dbcsr_get_data_p(vec_in, select_data_type=0.0_real_8)
          IF (mypcol == 0) THEN
             rep_data(1:n_elem) = src_data(1:n_elem)
          END IF
          CALL prow_group%bcast(rep_data(1:n_elem), 0)

          ! Step 2: two different parallel layouts meet here.
          ! rep_col_vec holds every block local to the row distribution of the vector, while
          ! rep_row_vec needs the blocks local to the column distribution. A single processor row
          ! can usually supply only part of them. Since blocks do not repeat in the column vector,
          ! each replica in the processor column fills a different part of the zeroed row vector,
          ! and the sum over the processor column yields the complete row vector everywhere.
          CALL dbcsr_set(rep_row_vec, 0.0_real_8)
          CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nfull_rows, local_cols=local_cols, &
                              nfullcols_local=nfull_cols)
          CALL dbcsr_iterator_start(iter, rep_row_vec)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, row_blk)
             ! only blocks owned by this processor row are available here; the rest stays zero
             IF (row_dist(blk_col) /= myprow) CYCLE
             iblk = hash_table_get(fast_vec_col%hash_table, blk_col)
             row_blk = TRANSPOSE(fast_vec_col%blk_map_d (iblk)%ptr)
          END DO
          CALL dbcsr_iterator_stop(iter)

          ! Step 3: complete the row vector by summing the partial fills along the processor column
          CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nfull_rows, nfullcols_local=nfull_cols)
          rep_data => dbcsr_get_data_p(rep_row_vec, select_data_type=0.0_real_8)
          CALL pcol_group%sum(rep_data(1:nfull_cols*nfull_rows))

          CALL timestop(handle)

       END SUBROUTINE dbcsr_col_vec_to_rep_row_d


 ! **************************************************************************************************
 !> \brief converts a replicated row vector into a replicated column vector (real(8) data),
 !>        optionally adding not yet summed column contributions on the way
 !> \param rep_col_vec column vector to fill; zeroed here and finally summed over prow_group
 !> \param rep_row_vec row vector; only its distribution (col_dist) is queried in this routine
 !> \param fast_vec_row block lookup container from which the row blocks are read
 !> \param fast_vec_col_add optional lookup container whose blocks are added to every block of
 !>        rep_col_vec before the summation
 ! **************************************************************************************************
       SUBROUTINE dbcsr_rep_row_to_rep_col_vec_d (rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
          TYPE(dbcsr_type)                          :: rep_col_vec, rep_row_vec
          TYPE(fast_vec_access_type), OPTIONAL     :: fast_vec_col_add
          TYPE(fast_vec_access_type)               :: fast_vec_row

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_rep_row_to_rep_col_vec'

          INTEGER                                  :: blk_col, blk_row, handle, i_add, i_src, &
                                                      mypcol, myprow, nfull_cols, nfull_rows, &
                                                      prow_handle
          INTEGER, DIMENSION(:), POINTER           :: col_dist
          LOGICAL                                  :: add_partial
          REAL(kind=real_8), DIMENSION(:), POINTER          :: rep_data
          REAL(kind=real_8), DIMENSION(:, :), POINTER       :: col_blk
          TYPE(dbcsr_distribution_type)            :: dist_rep_row, dist_rep_col
          TYPE(dbcsr_iterator_type)                :: iter
          TYPE(mp_comm_type) :: prow_group

          CALL timeset(routinen, handle)

          ! Processor row communicator and grid coordinates from the column vector's distribution
          CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
          CALL dbcsr_distribution_get(dist_rep_col, prow_group=prow_handle, &
                                      myprow=myprow, mypcol=mypcol)
          CALL prow_group%set_handle(prow_handle)

          ! col_dist maps a block column of the row vector to the processor column holding it
          CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
          CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)

          add_partial = PRESENT(fast_vec_col_add)

          ! Same idea as in the column -> row conversion, in the opposite direction: every process
          ! fills only the blocks it owns into the zeroed vector, the sum below completes it.
          CALL dbcsr_set(rep_col_vec, 0.0_real_8)
          CALL dbcsr_iterator_start(iter, rep_col_vec)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, col_blk)
             IF (col_dist(blk_row) == mypcol) THEN
                i_src = hash_table_get(fast_vec_row%hash_table, blk_row)
                col_blk = TRANSPOSE(fast_vec_row%blk_map_d (i_src)%ptr)
             END IF
             ! Special case: add the elements of a not yet summed replicated column vector, as it
             ! appears as result of M*V(row_rep). This saves a parallel summation in the symmetric case.
             IF (add_partial) THEN
                i_add = hash_table_get(fast_vec_col_add%hash_table, blk_row)
                col_blk = col_blk + fast_vec_col_add%blk_map_d (i_add)%ptr(:, :)
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)

          ! Sum the partial vectors along the processor row
          CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nfull_rows, nfullcols_local=nfull_cols)
          rep_data => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
          CALL prow_group%sum(rep_data(1:nfull_rows*nfull_cols))

          CALL timestop(handle)

       END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_d


 ! **************************************************************************************************
 !> \brief given a column vector, prepare the fast_vec_access container: a hash table from
 !>        block row to a slot, and per slot a pointer to the block plus an assigned thread
 !> \param vec column vector; aborts if it has more than one local block column
 !> \param fast_vec_access container to fill (hash table is created, blk_map_d is allocated)
 ! **************************************************************************************************
       SUBROUTINE create_fast_col_vec_access_d (vec, fast_vec_access)
          TYPE(dbcsr_type)                          :: vec
          TYPE(fast_vec_access_type)               :: fast_vec_access

          CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_col_vec_access_d'

          INTEGER                                  :: blk_col, blk_row, handle, islot, &
                                                      n_blk_cols, n_blk_rows, nthreads
          REAL(kind=real_8), DIMENSION(:, :), POINTER       :: blk_data
          TYPE(dbcsr_iterator_type)                :: iter

          CALL timeset(routinen, handle)

          ! Number of threads of a parallel region; stays 1 when compiled without OpenMP
          nthreads = 1
 !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
 !$OMP MASTER
 !$       nthreads = OMP_GET_NUM_THREADS()
 !$OMP END MASTER
 !$OMP END PARALLEL

          CALL dbcsr_get_info(matrix=vec, nblkrows_local=n_blk_rows)
          ! Oversize the table by a factor of 4 to limit collisions
          CALL hash_table_create(fast_vec_access%hash_table, 4*n_blk_rows)
          ! Slot 0 is included for keys that are not in the hash table (the lookup is said to return 0)
          ALLOCATE (fast_vec_access%blk_map_d (0:n_blk_rows))

          ! Sanity check: a column vector must not have more than one local block column
          CALL dbcsr_get_info(matrix=vec, nblkcols_local=n_blk_cols)
          IF (n_blk_cols > 1) cpabort("BUG")

          ! Register every local block: key is its block row, slots are numbered in iteration order
          islot = 0
          CALL dbcsr_iterator_start(iter, vec)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, blk_data)
             islot = islot + 1
             CALL hash_table_add(fast_vec_access%hash_table, blk_row, islot)
             fast_vec_access%blk_map_d (islot)%ptr => blk_data
             ! distribute the blocks round-robin over the threads
             fast_vec_access%blk_map_d (islot)%assigned_thread = MOD(islot, nthreads)
          END DO
          CALL dbcsr_iterator_stop(iter)

          CALL timestop(handle)

       END SUBROUTINE create_fast_col_vec_access_d


 ! **************************************************************************************************
 !> \brief given a row vector, prepare the fast_vec_access container: a hash table from
 !>        block column to a slot, and per slot a pointer to the block plus an assigned thread
 !> \param vec row vector; aborts if it has more than one local block row
 !> \param fast_vec_access container to fill (hash table is created, blk_map_d is allocated)
 ! **************************************************************************************************
       SUBROUTINE create_fast_row_vec_access_d (vec, fast_vec_access)
          TYPE(dbcsr_type)                          :: vec
          TYPE(fast_vec_access_type)                :: fast_vec_access

          CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_row_vec_access_d'

          INTEGER                                  :: blk_col, blk_row, handle, islot, &
                                                      n_blk_cols, n_blk_rows, nthreads
          REAL(kind=real_8), DIMENSION(:, :), POINTER       :: blk_data
          TYPE(dbcsr_iterator_type)                :: iter

          CALL timeset(routinen, handle)

          ! Number of threads of a parallel region; stays 1 when compiled without OpenMP
          nthreads = 1
 !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
 !$OMP MASTER
 !$       nthreads = OMP_GET_NUM_THREADS()
 !$OMP END MASTER
 !$OMP END PARALLEL

          CALL dbcsr_get_info(matrix=vec, nblkcols_local=n_blk_cols)
          ! Oversize the table by a factor of 4 to limit collisions
          CALL hash_table_create(fast_vec_access%hash_table, 4*n_blk_cols)
          ! Slot 0 is included for keys that are not in the hash table (the lookup is said to return 0)
          ALLOCATE (fast_vec_access%blk_map_d (0:n_blk_cols))

          ! Sanity check: a row vector must not have more than one local block row
          CALL dbcsr_get_info(matrix=vec, nblkrows_local=n_blk_rows)
          IF (n_blk_rows > 1) cpabort("BUG")

          ! Register every local block: key is its block column, slots are numbered in iteration order
          islot = 0
          CALL dbcsr_iterator_start(iter, vec)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, blk_data)
             islot = islot + 1
             CALL hash_table_add(fast_vec_access%hash_table, blk_col, islot)
             fast_vec_access%blk_map_d (islot)%ptr => blk_data
             ! distribute the blocks round-robin over the threads
             fast_vec_access%blk_map_d (islot)%assigned_thread = MOD(islot, nthreads)
          END DO
          CALL dbcsr_iterator_stop(iter)

          CALL timestop(handle)

       END SUBROUTINE create_fast_row_vec_access_d


 ! **************************************************************************************************

 !> \brief matrix-vector multiply for a symmetric matrix (real(8) data):
 !>        vec_out = beta*vec_out + alpha*matrix*vec_in.
 !>        Every stored block is used directly and, if it is off-diagonal (row /= col),
 !>        a second time in transposed form.

 !> \param matrix the matrix whose stored blocks are traversed

 !> \param vec_in input vector, distributed to the work vectors before the multiply

 !> \param vec_out result vector; scaled by beta and incremented by alpha times the product

 !> \param alpha prefactor of the matrix-vector product

 !> \param beta prefactor applied to the incoming content of vec_out

 !> \param work_row row work vector; zeroed here, then holds the replicated input

 !> \param work_col column work vector; zeroed here, holds the input first and the final product afterwards

 ! **************************************************************************************************

       SUBROUTINE dbcsr_sym_matrix_vector_mult_d (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)

          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out

          REAL(kind=real_8)                                  :: alpha, beta

          TYPE(dbcsr_type)                          :: work_row, work_col


          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_sym_m_v_mult'


          INTEGER                                  :: col, mypcol, &

                                                      myprow, &

                                                      nrows, ncols, &

                                                      row, pcol_handle, &

                                                      handle, handle1, ithread, vec_dim

          REAL(kind=real_8), DIMENSION(:), POINTER          :: data_vec

          REAL(kind=real_8), DIMENSION(:, :), POINTER       :: data_d, vec_res

          TYPE(dbcsr_distribution_type)            :: dist

          TYPE(dbcsr_iterator_type)                :: iter

          TYPE(dbcsr_type)                         :: result_row, result_col

          TYPE(mp_comm_type) :: pcol_group


          ! fast_vec_row/fast_vec_col index the input work vectors,
          ! res_fast_vec_row/res_fast_vec_col index the local result copies
          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col

          INTEGER                                  :: prow, pcol, rprow, rpcol


          CALL timeset(routinen, handle)

          ! thread id for builds without OpenMP; set per thread inside the parallel region below
          ithread = 0

 ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise

          ! NOTE(review): vec_dim is queried here but not used anywhere else in this routine
          CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)

 ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive

          ! The work vectors are zeroed first, so the copies used for the results start from zero

          CALL dbcsr_set(work_col, 0.0_real_8)

          CALL dbcsr_copy(result_col, work_col)

          CALL dbcsr_set(work_row, 0.0_real_8)

          CALL dbcsr_copy(result_row, work_row)


 ! Collect some data about the parallel environment. We will use them later to move the vector around

          CALL dbcsr_get_info(matrix=matrix, distribution=dist)

          CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)


          CALL pcol_group%set_handle(pcol_handle)


          ! Block lookup containers for the input vectors and for the result copies;
          ! all four are released at the end of the routine

          CALL create_fast_row_vec_access(work_row, fast_vec_row)

          CALL create_fast_col_vec_access(work_col, fast_vec_col)

          CALL create_fast_row_vec_access(result_row, res_fast_vec_row)

          CALL create_fast_col_vec_access(result_col, res_fast_vec_col)


 ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply

          CALL dbcsr_col_vec_to_rep_row_d (vec_in, work_col, work_row, fast_vec_col)


 ! Probably I should rename the routine above as it delivers both the replicated row and column vector


 ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes

 ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)

          CALL timeset(routinen//"_local_mm", handle1)


 !------ perform the multiplication, we have to take care to take the correct blocks ----------

 ! Each result block is only updated by the thread stored in its assigned_thread entry,
 ! so no two threads write to the same block.


 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &

 !$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)

 !$       ithread = omp_get_thread_num()

          CALL dbcsr_iterator_start(iter, matrix, shared=.false.)

          DO WHILE (dbcsr_iterator_blocks_left(iter))

             CALL dbcsr_iterator_next_block(iter, row, col, data_d)

             pcol = hash_table_get(fast_vec_row%hash_table, col)

             rprow = hash_table_get(res_fast_vec_col%hash_table, row)

             ! Case 1: input row block for 'col' and result column block for 'row' are both available

             IF (ASSOCIATED(fast_vec_row%blk_map_d (pcol)%ptr) .AND. &

                 ASSOCIATED(res_fast_vec_col%blk_map_d (rprow)%ptr)) THEN

                ! direct contribution: result_col(row) += block * TRANSPOSE(input_row(col))

                IF (res_fast_vec_col%blk_map_d (rprow)%assigned_thread .EQ. ithread) THEN

                   res_fast_vec_col%blk_map_d (rprow)%ptr = res_fast_vec_col%blk_map_d (rprow)%ptr + &

                                                              matmul(data_d, transpose(fast_vec_row%blk_map_d (pcol)%ptr))

                END IF

                prow = hash_table_get(fast_vec_col%hash_table, row)

                rpcol = hash_table_get(res_fast_vec_row%hash_table, col)

                ! transposed contribution, off-diagonal blocks only:
                ! result_row(col) += TRANSPOSE(input_col(row)) * block

                IF (res_fast_vec_row%blk_map_d (rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN

                   res_fast_vec_row%blk_map_d (rpcol)%ptr = res_fast_vec_row%blk_map_d (rpcol)%ptr + &

                                                              matmul(transpose(fast_vec_col%blk_map_d (prow)%ptr), data_d)

                END IF

             ELSE

                ! Case 2: same two contributions with the roles of row and col exchanged.
                ! Note that rpcol/prow and rprow/pcol index different containers than in case 1.
                ! NOTE(review): presumably for blocks stored at the transposed position - confirm.

                rpcol = hash_table_get(res_fast_vec_col%hash_table, col)

                prow = hash_table_get(fast_vec_row%hash_table, row)

                ! result_col(col) += TRANSPOSE(input_row(row) * block)

                IF (res_fast_vec_col%blk_map_d (rpcol)%assigned_thread .EQ. ithread) THEN

                   res_fast_vec_col%blk_map_d (rpcol)%ptr = res_fast_vec_col%blk_map_d (rpcol)%ptr + &

                                                              transpose(matmul(fast_vec_row%blk_map_d (prow)%ptr, data_d))

                END IF

                rprow = hash_table_get(res_fast_vec_row%hash_table, row)

                pcol = hash_table_get(fast_vec_col%hash_table, col)

                ! off-diagonal blocks only: result_row(row) += TRANSPOSE(block * input_col(col))

                IF (res_fast_vec_row%blk_map_d (rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN

                   res_fast_vec_row%blk_map_d (rprow)%ptr = res_fast_vec_row%blk_map_d (rprow)%ptr + &

                                                              transpose(matmul(data_d, fast_vec_col%blk_map_d (pcol)%ptr))

                END IF

             END IF

          END DO

          CALL dbcsr_iterator_stop(iter)

 !$OMP END PARALLEL


          CALL timestop(handle1)


          ! sum all the data within a processor column to obtain the replicated result from lower

          data_vec => dbcsr_get_data_p(result_row, select_data_type=0.0_real_8)

          CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)


          CALL pcol_group%sum(data_vec(1:nrows*ncols))

 !

 !! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated

 !! while the result_col still has the partial results in parallel. The routine below takes care of that and saves a

 !! parallel summation. From the res_row vectors only the appropriate elements are taken (0 otherwise) while the res_col

 !! parallel bits are locally added. The sum over the processor row then creates the correct vector in work_col

          CALL dbcsr_rep_row_to_rep_col_vec_d (work_col, result_row, res_fast_vec_row, res_fast_vec_col)


 !    ! Create the final vector by summing it to the result vector which lives on proc_col 0 lower

          ! fast_vec_col points into work_col, which now holds the summed product

          CALL dbcsr_iterator_start(iter, vec_out)

          DO WHILE (dbcsr_iterator_blocks_left(iter))

             CALL dbcsr_iterator_next_block(iter, row, col, vec_res)

             prow = hash_table_get(fast_vec_col%hash_table, row)

             IF (ASSOCIATED(fast_vec_col%blk_map_d (prow)%ptr)) THEN

                vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map_d (prow)%ptr(:, :))

             ELSE

                ! no product contribution available for this block: only the scaling applies

                vec_res(:, :) = beta*vec_res(:, :)

             END IF

          END DO

          CALL dbcsr_iterator_stop(iter)


          ! Free the lookup containers and the two temporary result vectors

          CALL release_fast_vec_access(fast_vec_row)

          CALL release_fast_vec_access(fast_vec_col)

          CALL release_fast_vec_access(res_fast_vec_row)

          CALL release_fast_vec_access(res_fast_vec_col)


          CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)


          CALL timestop(handle)


       END SUBROUTINE dbcsr_sym_matrix_vector_mult_d


 ! **************************************************************************************************
 !> \brief driver for the complex(8) matrix-vector multiply: dispatches on the matrix type,
 !>        not all symmetries are implemented yet
 !> \param matrix matrix whose type selects the low level routine
 !> \param vec_in input vector, passed through to the selected routine
 !> \param vec_out result vector, passed through to the selected routine
 !> \param alpha prefactor, passed through to the selected routine
 !> \param beta prefactor, passed through to the selected routine
 !> \param work_row row work vector, passed through to the selected routine
 !> \param work_col column work vector, passed through to the selected routine
 ! **************************************************************************************************
       SUBROUTINE dbcsr_matrix_colvec_multiply_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
          COMPLEX(kind=real_8)                                  :: alpha, beta
          TYPE(dbcsr_type)                          :: work_row, work_col

          CHARACTER                                :: symmetry_flag

          CALL dbcsr_get_info(matrix, matrix_type=symmetry_flag)

          IF (symmetry_flag == dbcsr_type_no_symmetry) THEN
             ! general matrix: plain multiply
             CALL dbcsr_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
          ELSE IF (symmetry_flag == dbcsr_type_symmetric) THEN
             ! symmetric matrix: dedicated routine
             CALL dbcsr_sym_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
          ELSE IF (symmetry_flag == dbcsr_type_antisymmetric) THEN
             ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
             cpabort("NYI, antisymmetric matrix not permitted")
          ELSE
             ! any other type flag is rejected
             cpabort("Unknown matrix type, ...")
          END IF

       END SUBROUTINE dbcsr_matrix_colvec_multiply_z


 ! **************************************************************************************************
 !> \brief low level matrix-vector multiply for complex(8) data:
 !>        vec_out = beta*vec_out + alpha*matrix*vec_in, evaluated block by block
 !> \param matrix matrix whose blocks are traversed in the local multiply
 !> \param vec_in input vector, distributed to the work vectors before the multiply
 !> \param vec_out result vector; scaled by beta and incremented by alpha times the product
 !> \param alpha prefactor of the matrix-vector product
 !> \param beta prefactor applied to the incoming content of vec_out
 !> \param work_row row work vector, holds the replicated input
 !> \param work_col column work vector; holds the input first, is then zeroed and accumulates the result
 ! **************************************************************************************************
       SUBROUTINE dbcsr_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
          COMPLEX(kind=real_8)                                  :: alpha, beta
          TYPE(dbcsr_type)                          :: work_row, work_col

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_matrix_vector_mult'

          INTEGER                                  :: col, handle, handle1, idx_in, idx_out, &
                                                      ithread, mypcol, myprow, &
                                                      ncols, nrows, prow_handle, row
          COMPLEX(kind=real_8), DIMENSION(:), POINTER          :: data_vec
          COMPLEX(kind=real_8), DIMENSION(:, :), POINTER       :: mat_blk, vec_res
          TYPE(dbcsr_distribution_type)            :: dist
          TYPE(dbcsr_iterator_type)                :: iter
          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col
          TYPE(mp_comm_type) :: prow_group

          CALL timeset(routinen, handle)
          ! thread id for builds without OpenMP; set per thread inside the parallel region below
          ithread = 0

          ! Parallel environment: the processor row communicator is needed for the final sum
          CALL dbcsr_get_info(matrix, distribution=dist)
          CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
          CALL prow_group%set_handle(prow_handle)

          ! Block lookup containers for both work vectors, released at the end
          CALL create_fast_row_vec_access(work_row, fast_vec_row)
          CALL create_fast_col_vec_access(work_col, fast_vec_col)

          ! Move the input vector to the places where the local multiply needs it
          CALL dbcsr_col_vec_to_rep_row_z (vec_in, work_col, work_row, fast_vec_col)

          ! work_col is reused as accumulator for the result, so clear it
          CALL dbcsr_set(work_col, CMPLX(0.0, 0.0, real_8))

          ! Local multiply on the replicated blocks. Input and result vector are distributed
          ! differently (row wise and column wise, respectively).
          CALL timeset(routinen//"_local_mm", handle1)

 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,mat_blk,ithread,idx_in,idx_out) &
 !$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
 !$       ithread = omp_get_thread_num()
          CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, mat_blk)
             idx_out = hash_table_get(fast_vec_col%hash_table, row)
             ! a result block is only updated by the thread it was assigned to
             IF (fast_vec_col%blk_map_z (idx_out)%assigned_thread == ithread) THEN
                idx_in = hash_table_get(fast_vec_row%hash_table, col)
                fast_vec_col%blk_map_z (idx_out)%ptr = fast_vec_col%blk_map_z (idx_out)%ptr + &
                                                       MATMUL(mat_blk, TRANSPOSE(fast_vec_row%blk_map_z (idx_in)%ptr))
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)
 !$OMP END PARALLEL

          CALL timestop(handle1)

          ! Sum the partial results along the processor row
          data_vec => dbcsr_get_data_p(work_col, select_data_type=CMPLX(0.0, 0.0, real_8))
          CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
          CALL prow_group%sum(data_vec(1:nrows*ncols))

          ! Combine the replicated result with the blocks of vec_out. The iterator is used to
          ! stay independent of the data layout of vec_out.
          CALL dbcsr_iterator_start(iter, vec_out)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
             idx_out = hash_table_get(fast_vec_col%hash_table, row)
             IF (.NOT. ASSOCIATED(fast_vec_col%blk_map_z (idx_out)%ptr)) THEN
                ! no product contribution available for this block: only the scaling applies
                vec_res(:, :) = beta*vec_res(:, :)
             ELSE
                vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_z (idx_out)%ptr(:, :)
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)

          CALL release_fast_vec_access(fast_vec_row)
          CALL release_fast_vec_access(fast_vec_col)

          CALL timestop(handle)

       END SUBROUTINE dbcsr_matrix_vector_mult_z


 ! **************************************************************************************************
 !> \brief Multiplies the transpose of a matrix with a skinny vector:
 !>        vec_out = beta*vec_out + alpha*matrix^T*vec_in
 !>        (plain transpose, no complex conjugation is applied)
 !> \param matrix matrix whose local blocks enter the product
 !> \param vec_in input vector; its blocks are copied into work_col
 !> \param vec_out result vector; scaled by beta and incremented by alpha times the product
 !> \param alpha prefactor of the matrix-vector product
 !> \param beta prefactor of the incoming content of vec_out
 !> \param work_row replicated row work vector; zeroed here, accumulates the local products
 !> \param work_col replicated column work vector; receives the input blocks before the multiply
 !>        and is overwritten with the column-wise result afterwards
 !> \param skip_diag if .TRUE. blocks with row == col are left out of the local multiply
 ! **************************************************************************************************
       SUBROUTINE dbcsr_matrixt_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
          COMPLEX(kind=real_8)                                  :: alpha, beta
          TYPE(dbcsr_type)                          :: work_row, work_col
          LOGICAL                                   :: skip_diag

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_matrixT_vector_mult'

          INTEGER                                  :: col, col_size, mypcol, &
                                                      myprow, pcol_handle, prow_handle, &
                                                      ncols, nrows, &
                                                      row, row_size, &
                                                      handle, handle1, ithread
          TYPE(mp_comm_type) :: pcol_group, prow_group
          COMPLEX(kind=real_8), DIMENSION(:), POINTER          :: data_vec
          COMPLEX(kind=real_8), DIMENSION(:, :), POINTER       :: data_d, vec_bl, vec_res
          TYPE(dbcsr_distribution_type)            :: dist
          TYPE(dbcsr_iterator_type)                :: iter

          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col
          INTEGER                                  :: prow, pcol

          CALL timeset(routinen, handle)
          ! Thread id for builds without OpenMP; each thread overwrites its private copy below
          ithread = 0

 ! Collect some data about the parallel environment. We will use them later to move the vector around
          CALL dbcsr_get_info(matrix, distribution=dist)
          CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)

          CALL prow_group%set_handle(prow_handle)
          CALL pcol_group%set_handle(pcol_handle)

          ! Block lookup containers for the two work vectors; released at the end of the routine
          CALL create_fast_row_vec_access(work_row, fast_vec_row)
          CALL create_fast_col_vec_access(work_col, fast_vec_col)

 ! Set the work vector for the results to 0
          CALL dbcsr_set(work_row, cmplx(0.0, 0.0, real_8))

 ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
          CALL dbcsr_iterator_start(iter, vec_in)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
             prow = hash_table_get(fast_vec_col%hash_table, row)
             fast_vec_col%blk_map_z (prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
          END DO
          CALL dbcsr_iterator_stop(iter)

 ! Replicate the data on all processors in the row
          data_vec => dbcsr_get_data_p(work_col, select_data_type=cmplx(0.0, 0.0, real_8))
          CALL prow_group%bcast(data_vec, 0)

 ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
          CALL timeset(routinen//"local_mm", handle1)
          CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
 ! Every thread walks over all matrix blocks (shared=.false.) and only updates the result blocks
 ! whose assigned_thread equals its own thread id
 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
 !$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
 !$       ithread = omp_get_thread_num()
          CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
             IF (skip_diag .AND. col == row) cycle
             prow = hash_table_get(fast_vec_col%hash_table, row)
             pcol = hash_table_get(fast_vec_row%hash_table, col)
             IF (ASSOCIATED(fast_vec_row%blk_map_z (pcol)%ptr) .AND. &
                 ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
                ! result(col) += vec(row)^T * block(row, col)
                IF (fast_vec_row%blk_map_z (pcol)%assigned_thread .NE. ithread) cycle
                fast_vec_row%blk_map_z (pcol)%ptr = fast_vec_row%blk_map_z (pcol)%ptr + &
                                                              matmul(transpose(fast_vec_col%blk_map_z (prow)%ptr), data_d)
             ELSE
                ! Same update with the roles of row and col exchanged and the block transposed
                ! NOTE(review): presumably needed for blocks stored in transposed position - confirm
                prow = hash_table_get(fast_vec_row%hash_table, row)
                pcol = hash_table_get(fast_vec_col%hash_table, col)
                IF (fast_vec_row%blk_map_z (prow)%assigned_thread .NE. ithread) cycle
                fast_vec_row%blk_map_z (prow)%ptr = fast_vec_row%blk_map_z (prow)%ptr + &
                                                   matmul(transpose(fast_vec_col%blk_map_z (pcol)%ptr), transpose(data_d))
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)
 !$OMP END PARALLEL

          CALL timestop(handle1)

 ! sum all the data within a processor column to obtain the replicated result
          data_vec => dbcsr_get_data_p(work_row, select_data_type=cmplx(0.0, 0.0, real_8))
 ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
          CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
          CALL pcol_group%sum(data_vec(1:nrows*ncols))

 ! Convert the result to a column wise distribution
          CALL dbcsr_rep_row_to_rep_col_vec_z (work_col, work_row, fast_vec_row)

 ! Create the final vector by summing it to the result vector which lives on proc_col 0
          CALL dbcsr_iterator_start(iter, vec_out)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
             prow = hash_table_get(fast_vec_col%hash_table, row)
             IF (ASSOCIATED(fast_vec_col%blk_map_z (prow)%ptr)) THEN
                vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_z (prow)%ptr(:, :)
             ELSE
                ! No product contribution available for this block: only the scaling is applied
                vec_res(:, :) = beta*vec_res(:, :)
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)

          ! Free the lookup containers created above, as the other multiply routines of this
          ! module do; without this the hash tables and block maps are never released
          CALL release_fast_vec_access(fast_vec_row)
          CALL release_fast_vec_access(fast_vec_col)

          CALL timestop(handle)

       END SUBROUTINE dbcsr_matrixt_vector_mult_z


 ! **************************************************************************************************
 !> \brief Replicates a column vector along the processor rows and builds from it the
 !>        corresponding row vector, replicated along the processor columns
 !> \param vec_in column vector to replicate; its data is read on processor column 0
 !> \param rep_col_vec receives the data of vec_in, broadcast within the processor row
 !> \param rep_row_vec receives the transposed blocks, summed within the processor column
 !> \param fast_vec_col block lookup used to read the column blocks; presumably created from
 !>        rep_col_vec (this is what the symmetric multiply in this module passes) - confirm
 ! **************************************************************************************************
       SUBROUTINE dbcsr_col_vec_to_rep_row_z (vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
          TYPE(dbcsr_type)                          :: vec_in, rep_col_vec, &
                                                       rep_row_vec
          TYPE(fast_vec_access_type), INTENT(IN)   :: fast_vec_col

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_col_vec_to_rep_row'

          INTEGER                                  :: blk_col, blk_row, handle, iblk, mypcol, &
                                                      myprow, n_elem, ncols, nrows, &
                                                      pcol_handle, prow_handle
          INTEGER, DIMENSION(:), POINTER           :: local_cols, row_dist
          COMPLEX(kind=real_8), DIMENSION(:), POINTER          :: rep_data, src_data
          COMPLEX(kind=real_8), DIMENSION(:, :), POINTER       :: row_blk
          TYPE(dbcsr_distribution_type)            :: dist_in, dist_rep_col
          TYPE(dbcsr_iterator_type)                :: iter
          TYPE(mp_comm_type) :: pcol_group, prow_group

          CALL timeset(routinen, handle)

          ! Communicators and grid coordinates come from the distribution of the input vector
          CALL dbcsr_get_info(vec_in, distribution=dist_in)
          CALL dbcsr_distribution_get(dist_in, prow_group=prow_handle, pcol_group=pcol_handle, &
                                      myprow=myprow, mypcol=mypcol)
          CALL prow_group%set_handle(prow_handle)
          CALL pcol_group%set_handle(pcol_handle)

          ! row_dist tells which processor row holds a given block of the replicated column vector
          CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
          CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)

          ! Stage 1: processor column 0 copies the local data of vec_in into rep_col_vec,
          ! afterwards the buffer is broadcast (root 0) within the processor row
          CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
          n_elem = nrows*ncols
          rep_data => dbcsr_get_data_p(rep_col_vec, select_data_type=cmplx(0.0, 0.0, real_8))
          src_data => dbcsr_get_data_p(vec_in, select_data_type=cmplx(0.0, 0.0, real_8))
          IF (mypcol == 0) THEN
             rep_data(1:n_elem) = src_data(1:n_elem)
          END IF
          CALL prow_group%bcast(rep_data(1:n_elem), 0)

          ! Stage 2: two different layouts meet here. rep_col_vec holds the blocks local to the
          ! row distribution, rep_row_vec needs those local to the column distribution. Each
          ! processor row fills in only the blocks it owns (all others stay zero); as no block
          ! is owned twice, the sum along the processor column yields the complete row vector
          CALL dbcsr_set(rep_row_vec, cmplx(0.0, 0.0, real_8))
          CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, &
                              local_cols=local_cols, nfullcols_local=ncols)
          CALL dbcsr_iterator_start(iter, rep_row_vec)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, row_blk)
             ! blocks owned by another processor row keep their zero content
             IF (row_dist(blk_col) /= myprow) CYCLE
             iblk = hash_table_get(fast_vec_col%hash_table, blk_col)
             row_blk = transpose(fast_vec_col%blk_map_z(iblk)%ptr)
          END DO
          CALL dbcsr_iterator_stop(iter)

          ! Combine the partial row vectors of all processor rows
          CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
          rep_data => dbcsr_get_data_p(rep_row_vec, select_data_type=cmplx(0.0, 0.0, real_8))
          CALL pcol_group%sum(rep_data(1:ncols*nrows))

          CALL timestop(handle)

       END SUBROUTINE dbcsr_col_vec_to_rep_row_z


 ! **************************************************************************************************
 !> \brief Converts a replicated row vector into a replicated column vector by transposing
 !>        the locally owned blocks and summing the pieces within the processor row
 !> \param rep_col_vec column vector to fill; zeroed first, then overwritten with the result
 !> \param rep_row_vec row vector providing the column distribution (col_dist) of the blocks
 !> \param fast_vec_row block lookup through which the row vector blocks are read
 !> \param fast_vec_col_add optional block lookup of a column vector whose blocks are added
 !>        locally before the summation (partial results that were not yet summed)
 ! **************************************************************************************************
       SUBROUTINE dbcsr_rep_row_to_rep_col_vec_z (rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
          TYPE(dbcsr_type)                          :: rep_col_vec, rep_row_vec
          TYPE(fast_vec_access_type), OPTIONAL     :: fast_vec_col_add
          TYPE(fast_vec_access_type)               :: fast_vec_row

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_rep_row_to_rep_col_vec'

          INTEGER                                  :: blk_col, blk_row, handle, i_add, i_src, &
                                                      mypcol, myprow, ncols, nrows, prow_handle
          INTEGER, DIMENSION(:), POINTER           :: col_dist
          COMPLEX(kind=real_8), DIMENSION(:), POINTER          :: rep_data
          COMPLEX(kind=real_8), DIMENSION(:, :), POINTER       :: col_blk
          TYPE(dbcsr_distribution_type)            :: dist_rep_row, dist_rep_col
          TYPE(dbcsr_iterator_type)                :: iter
          TYPE(mp_comm_type) :: prow_group

          CALL timeset(routinen, handle)

          ! Row communicator and grid coordinates come from the column vector distribution
          CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
          CALL dbcsr_distribution_get(dist_rep_col, prow_group=prow_handle, &
                                      myprow=myprow, mypcol=mypcol)
          CALL prow_group%set_handle(prow_handle)

          ! col_dist tells which processor column holds a given block of the row vector
          CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
          CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)

          ! Start from zero: every process fills in only the blocks it owns in the row vector,
          ! the sum along the processor row then assembles the complete column vector
          CALL dbcsr_set(rep_col_vec, cmplx(0.0, 0.0, real_8))
          CALL dbcsr_iterator_start(iter, rep_col_vec)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, col_blk)
             IF (col_dist(blk_row) == mypcol) THEN
                i_src = hash_table_get(fast_vec_row%hash_table, blk_row)
                col_blk = transpose(fast_vec_row%blk_map_z(i_src)%ptr)
             END IF
             ! Optionally add the matching block of a not yet summed column vector, so that
             ! the single summation below covers both contributions
             IF (PRESENT(fast_vec_col_add)) THEN
                i_add = hash_table_get(fast_vec_col_add%hash_table, blk_row)
                col_blk = col_blk + fast_vec_col_add%blk_map_z(i_add)%ptr(:, :)
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)

          ! Sum the partial column vectors of all processes in the processor row
          CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
          rep_data => dbcsr_get_data_p(rep_col_vec, select_data_type=cmplx(0.0, 0.0, real_8))
          CALL prow_group%sum(rep_data(1:nrows*ncols))

          CALL timestop(handle)

       END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_z


 ! **************************************************************************************************
 !> \brief Builds the fast_vec_access container of a column vector: a hash table mapping the
 !>        block row to a local index, plus for every index a pointer to the block data and
 !>        the id of the thread assigned to that block
 !> \param vec column vector whose local blocks are registered
 !> \param fast_vec_access container to fill (hash table and block map are allocated here)
 ! **************************************************************************************************
       SUBROUTINE create_fast_col_vec_access_z (vec, fast_vec_access)
          TYPE(dbcsr_type)                          :: vec
          TYPE(fast_vec_access_type)               :: fast_vec_access

          CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_col_vec_access_z'

          INTEGER                                  :: blk_col, blk_row, handle, i_blk, &
                                                      n_blk_cols, n_blk_rows, n_threads
          COMPLEX(kind=real_8), DIMENSION(:, :), POINTER       :: blk_data
          TYPE(dbcsr_iterator_type)                :: iter

          CALL timeset(routinen, handle)

          ! Thread count of a parallel region; stays 1 when OpenMP is not enabled
          n_threads = 1
 !$OMP PARALLEL DEFAULT(NONE) SHARED(n_threads)
 !$OMP MASTER
 !$       n_threads = OMP_GET_NUM_THREADS()
 !$OMP END MASTER
 !$OMP END PARALLEL

          CALL dbcsr_get_info(matrix=vec, nblkrows_local=n_blk_rows)
          ! Four times the number of local blocks keeps the number of collisions low
          CALL hash_table_create(fast_vec_access%hash_table, 4*n_blk_rows)
          ! Index 0 is part of the map so that a failed lookup (which returns 0) is a valid index
          ALLOCATE (fast_vec_access%blk_map_z(0:n_blk_rows))

          ! Sanity check: more than one local block column is treated as a bug
          CALL dbcsr_get_info(matrix=vec, nblkcols_local=n_blk_cols)
          IF (n_blk_cols > 1) cpabort("BUG")

          ! Register the blocks in iteration order, numbering them from 1
          i_blk = 0
          CALL dbcsr_iterator_start(iter, vec)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, blk_data)
             i_blk = i_blk + 1
             CALL hash_table_add(fast_vec_access%hash_table, blk_row, i_blk)
             fast_vec_access%blk_map_z(i_blk)%ptr => blk_data
             ! blocks are dealt out to the threads round robin
             fast_vec_access%blk_map_z(i_blk)%assigned_thread = MOD(i_blk, n_threads)
          END DO
          CALL dbcsr_iterator_stop(iter)

          CALL timestop(handle)

       END SUBROUTINE create_fast_col_vec_access_z


 ! **************************************************************************************************
 !> \brief Builds the fast_vec_access container of a row vector: a hash table mapping the
 !>        block column to a local index, plus for every index a pointer to the block data
 !>        and the id of the thread assigned to that block
 !> \param vec row vector whose local blocks are registered
 !> \param fast_vec_access container to fill (hash table and block map are allocated here)
 ! **************************************************************************************************
       SUBROUTINE create_fast_row_vec_access_z (vec, fast_vec_access)
          TYPE(dbcsr_type)                          :: vec
          TYPE(fast_vec_access_type)                :: fast_vec_access

          CHARACTER(LEN=*), PARAMETER :: routinen = 'create_fast_row_vec_access_z'

          INTEGER                                  :: blk_col, blk_row, handle, i_blk, &
                                                      n_blk_cols, n_blk_rows, n_threads
          COMPLEX(kind=real_8), DIMENSION(:, :), POINTER       :: blk_data
          TYPE(dbcsr_iterator_type)                :: iter

          CALL timeset(routinen, handle)

          ! Thread count of a parallel region; stays 1 when OpenMP is not enabled
          n_threads = 1
 !$OMP PARALLEL DEFAULT(NONE) SHARED(n_threads)
 !$OMP MASTER
 !$       n_threads = OMP_GET_NUM_THREADS()
 !$OMP END MASTER
 !$OMP END PARALLEL

          CALL dbcsr_get_info(matrix=vec, nblkcols_local=n_blk_cols)
          ! Four times the number of local blocks keeps the number of collisions low
          CALL hash_table_create(fast_vec_access%hash_table, 4*n_blk_cols)
          ! Index 0 is part of the map so that a failed lookup (which returns 0) is a valid index
          ALLOCATE (fast_vec_access%blk_map_z(0:n_blk_cols))

          ! Sanity check: more than one local block row is treated as a bug
          CALL dbcsr_get_info(matrix=vec, nblkrows_local=n_blk_rows)
          IF (n_blk_rows > 1) cpabort("BUG")

          ! Register the blocks in iteration order, numbering them from 1
          i_blk = 0
          CALL dbcsr_iterator_start(iter, vec)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, blk_data)
             i_blk = i_blk + 1
             CALL hash_table_add(fast_vec_access%hash_table, blk_col, i_blk)
             fast_vec_access%blk_map_z(i_blk)%ptr => blk_data
             ! blocks are dealt out to the threads round robin
             fast_vec_access%blk_map_z(i_blk)%assigned_thread = MOD(i_blk, n_threads)
          END DO
          CALL dbcsr_iterator_stop(iter)

          CALL timestop(handle)

       END SUBROUTINE create_fast_row_vec_access_z


 ! **************************************************************************************************
 !> \brief Matrix-vector product for a symmetric matrix,
 !>        vec_out = beta*vec_out + alpha*matrix*vec_in,
 !>        in which every off-diagonal block of the matrix is used twice: once as stored and
 !>        once for the transposed contribution
 !> \param matrix matrix whose local blocks enter the product
 !> \param vec_in input vector
 !> \param vec_out result vector; scaled by beta and incremented by alpha times the product
 !> \param alpha prefactor of the matrix-vector product
 !> \param beta prefactor of the incoming content of vec_out
 !> \param work_row replicated row work vector; zeroed here, then holds the replicated input
 !> \param work_col replicated column work vector; zeroed here, holds the replicated input
 !>        during the multiply and the column-wise result afterwards
 ! **************************************************************************************************
       SUBROUTINE dbcsr_sym_matrix_vector_mult_z (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
          TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
          COMPLEX(kind=real_8)                                  :: alpha, beta
          TYPE(dbcsr_type)                          :: work_row, work_col

          CHARACTER(LEN=*), PARAMETER :: routinen = 'dbcsr_sym_m_v_mult'

          INTEGER                                  :: col, handle, handle1, ithread, mypcol, &
                                                      myprow, ncols, nrows, pcol, pcol_handle, &
                                                      prow, row, rpcol, rprow, vec_dim
          COMPLEX(kind=real_8), DIMENSION(:), POINTER          :: sum_buf
          COMPLEX(kind=real_8), DIMENSION(:, :), POINTER       :: mat_blk, out_blk
          TYPE(dbcsr_distribution_type)            :: dist
          TYPE(dbcsr_iterator_type)                :: iter
          TYPE(dbcsr_type)                         :: result_row, result_col
          TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col, &
                                                      res_fast_vec_row, res_fast_vec_col
          TYPE(mp_comm_type) :: pcol_group

          CALL timeset(routinen, handle)
          ! Thread id for builds without OpenMP; each thread overwrites its private copy below
          ithread = 0

          CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)

          ! The result vectors are obtained as copies of the zeroed work vectors, which is
          ! cheaper than creating new replicated vectors
          CALL dbcsr_set(work_col, cmplx(0.0, 0.0, real_8))
          CALL dbcsr_copy(result_col, work_col)
          CALL dbcsr_set(work_row, cmplx(0.0, 0.0, real_8))
          CALL dbcsr_copy(result_row, work_row)

          ! Column communicator (and grid coordinates) of the matrix distribution
          CALL dbcsr_get_info(matrix=matrix, distribution=dist)
          CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
          CALL pcol_group%set_handle(pcol_handle)

          ! Block lookups for the input (work_*) and the result (result_*) vectors
          CALL create_fast_row_vec_access(work_row, fast_vec_row)
          CALL create_fast_col_vec_access(work_col, fast_vec_col)
          CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
          CALL create_fast_col_vec_access(result_col, res_fast_vec_col)

          ! Fills both replicated input vectors: work_col (along the rows) and work_row
          ! (along the columns)
          CALL dbcsr_col_vec_to_rep_row_z (vec_in, work_col, work_row, fast_vec_col)

          ! Local multiply on the replicated blocks. Input and result vectors are distributed
          ! differently (row wise and column wise respectively). Every thread visits all
          ! matrix blocks (shared=.false.) and updates only the result blocks assigned to it
          CALL timeset(routinen//"_local_mm", handle1)

 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,mat_blk,ithread,pcol,prow,rpcol,rprow) &
 !$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
 !$       ithread = omp_get_thread_num()
          CALL dbcsr_iterator_start(iter, matrix, shared=.false.)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, mat_blk)
             pcol = hash_table_get(fast_vec_row%hash_table, col)
             rprow = hash_table_get(res_fast_vec_col%hash_table, row)
             IF (ASSOCIATED(fast_vec_row%blk_map_z(pcol)%ptr) .AND. &
                 ASSOCIATED(res_fast_vec_col%blk_map_z(rprow)%ptr)) THEN
                ! contribution of the block as stored: res_col(row) += block * in_row(col)^T
                IF (res_fast_vec_col%blk_map_z(rprow)%assigned_thread == ithread) THEN
                   res_fast_vec_col%blk_map_z(rprow)%ptr = res_fast_vec_col%blk_map_z(rprow)%ptr + &
                                                  matmul(mat_blk, transpose(fast_vec_row%blk_map_z(pcol)%ptr))
                END IF
                ! transposed contribution (off-diagonal only): res_row(col) += in_col(row)^T * block
                prow = hash_table_get(fast_vec_col%hash_table, row)
                rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
                IF (res_fast_vec_row%blk_map_z(rpcol)%assigned_thread == ithread .AND. row /= col) THEN
                   res_fast_vec_row%blk_map_z(rpcol)%ptr = res_fast_vec_row%blk_map_z(rpcol)%ptr + &
                                                  matmul(transpose(fast_vec_col%blk_map_z(prow)%ptr), mat_blk)
                END IF
             ELSE
                ! same two contributions with the roles of row and col exchanged
                rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
                prow = hash_table_get(fast_vec_row%hash_table, row)
                IF (res_fast_vec_col%blk_map_z(rpcol)%assigned_thread == ithread) THEN
                   res_fast_vec_col%blk_map_z(rpcol)%ptr = res_fast_vec_col%blk_map_z(rpcol)%ptr + &
                                                  transpose(matmul(fast_vec_row%blk_map_z(prow)%ptr, mat_blk))
                END IF
                rprow = hash_table_get(res_fast_vec_row%hash_table, row)
                pcol = hash_table_get(fast_vec_col%hash_table, col)
                IF (res_fast_vec_row%blk_map_z(rprow)%assigned_thread == ithread .AND. row /= col) THEN
                   res_fast_vec_row%blk_map_z(rprow)%ptr = res_fast_vec_row%blk_map_z(rprow)%ptr + &
                                                  transpose(matmul(mat_blk, fast_vec_col%blk_map_z(pcol)%ptr))
                END IF
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)
 !$OMP END PARALLEL

          CALL timestop(handle1)

          ! Sum the row result within the processor column to obtain its replicated form
          sum_buf => dbcsr_get_data_p(result_row, select_data_type=cmplx(0.0, 0.0, real_8))
          CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
          CALL pcol_group%sum(sum_buf(1:nrows*ncols))

          ! Convert to a column wise distribution. result_row is fully replicated while
          ! result_col still holds partial results; the routine takes from the row result
          ! only the locally owned blocks (zero otherwise), adds the partial column blocks
          ! locally and performs one summation, which saves a separate parallel sum
          CALL dbcsr_rep_row_to_rep_col_vec_z (work_col, result_row, res_fast_vec_row, res_fast_vec_col)

          ! Combine the product (now in work_col, reached through fast_vec_col) with vec_out
          CALL dbcsr_iterator_start(iter, vec_out)
          DO WHILE (dbcsr_iterator_blocks_left(iter))
             CALL dbcsr_iterator_next_block(iter, row, col, out_blk)
             prow = hash_table_get(fast_vec_col%hash_table, row)
             IF (ASSOCIATED(fast_vec_col%blk_map_z(prow)%ptr)) THEN
                out_blk(:, :) = beta*out_blk(:, :) + alpha*(fast_vec_col%blk_map_z(prow)%ptr(:, :))
             ELSE
                ! no product contribution available for this block: only the scaling is applied
                out_blk(:, :) = beta*out_blk(:, :)
             END IF
          END DO
          CALL dbcsr_iterator_stop(iter)

          ! Free the lookup containers and the temporary result vectors
          CALL release_fast_vec_access(fast_vec_row)
          CALL release_fast_vec_access(fast_vec_col)
          CALL release_fast_vec_access(res_fast_vec_row)
          CALL release_fast_vec_access(res_fast_vec_col)

          CALL dbcsr_release(result_row)
          CALL dbcsr_release(result_col)

          CALL timestop(handle)

       END SUBROUTINE dbcsr_sym_matrix_vector_mult_z


 END MODULE dbcsr_vector

dgemm
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Definition: grid_cpu_task_list.c:195

dbcsr_vector
operations for skinny matrices/vectors expressed in dbcsr form
Definition: dbcsr_vector.F:15

dbcsr_vector::create_replicated_row_vec_from_matrix
subroutine, public create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
creates a row vector like object whose blocks can be replicated along the processor col and has the s...
Definition: dbcsr_vector.F:236

dbcsr_vector::create_col_vec_from_matrix
subroutine, public create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
creates a dbcsr col vector like object which lives on proc_col 0 and has the same row dist as the tem...
Definition: dbcsr_vector.F:109

dbcsr_vector::create_replicated_col_vec_from_matrix
subroutine, public create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
creates a col vector like object whose blocks can be replicated along the processor row and has the s...
Definition: dbcsr_vector.F:193

dbcsr_vector::create_row_vec_from_matrix
subroutine, public create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
creates a dbcsr row vector like object which lives on proc_row 0 and has the same row dist as the tem...
Definition: dbcsr_vector.F:151

kinds
Defines the basic variable types.
Definition: kinds.F:23

kinds::dp
integer, parameter, public dp
Definition: kinds.F:34

kinds::real_8
integer, parameter, public real_8
Definition: kinds.F:41

message_passing
Interface to the message passing library MPI.
Definition: message_passing.F:23