smeagol_matrix_utils.F
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Routines to convert sparse matrices between the DBCSR (distributed blocked compressed sparse row)
10!> and SIESTA (distributed compressed sparse column) formats.
11!> \author Sergey Chulkov
12!> \author Christian Ahart
13!> \author Clotilde Cucinotta
14! **************************************************************************************************
16 USE cell_types, ONLY: cell_type, &
23 USE kinds, ONLY: dp, &
24 dp_size, &
25 int_8
30#if defined(__SMEAGOL)
31 USE parallel, ONLY: getnodeorbs, &
32 globaltolocalorb, &
33 localtoglobalorb, &
34 whichnodeorb
35#endif
44 USE qs_subsys_types, ONLY: qs_subsys_get, &
45 qs_subsys_type
46 USE util, ONLY: sort
47#include "./base/base_uses.f90"
48
49 IMPLICIT NONE
50 PRIVATE
51
52 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'smeagol_matrix_utils'
53 LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .false.
54
55 INTEGER, PARAMETER, PRIVATE :: neighbor_list_iatom_index = 1
56 INTEGER, PARAMETER, PRIVATE :: neighbor_list_jatom_index = 2
57 INTEGER, PARAMETER, PRIVATE :: neighbor_list_dbcsr_image_index = 3
58 INTEGER, PARAMETER, PRIVATE :: neighbor_list_siesta_image_index = 4
59 INTEGER, PARAMETER, PRIVATE :: neighbor_list_siesta_transp_image_index = 5
60 INTEGER, PARAMETER, PRIVATE :: neighbor_list_dim1 = neighbor_list_siesta_transp_image_index
61
62 PUBLIC :: siesta_distrib_csc_struct_type
63 PUBLIC :: siesta_struct_create, siesta_struct_release
64 PUBLIC :: convert_dbcsr_to_distributed_siesta, convert_distributed_siesta_to_dbcsr
65
66 PRIVATE :: get_negf_cell_ijk, index_in_canonical_enumeration, number_from_canonical_enumeration, pbc_0_1
67 PRIVATE :: get_number_of_mpi_sendrecv_requests, assign_nonzero_elements_to_requests
68
69 !> number of DBCSR matrix elements to receive from a given rank
70 INTEGER, PARAMETER, PRIVATE :: nelements_dbcsr_recv = 1
71 !> number of DBCSR matrix elements to send to a given rank
72 INTEGER, PARAMETER, PRIVATE :: nelements_dbcsr_send = 2
73 INTEGER, PARAMETER, PRIVATE :: nelements_dbcsr_dim2 = nelements_dbcsr_send
74
75 INTEGER(kind=int_8), PARAMETER, PRIVATE :: max_mpi_packet_size_bytes = 134217728 ! 128 MiB (to limit memory usage for matrix redistribution)
76 INTEGER(kind=int_8), PARAMETER, PRIVATE :: max_mpi_packet_size_dp = max_mpi_packet_size_bytes/int(dp_size, kind=int_8)
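! (with dp_size == 8 bytes, that is 134217728/8 = 16777216 double-precision numbers per MPI message)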
77
78 ! a portable way to determine the upper bound for the tag value is to call
79 ! MPI_COMM_GET_ATTR(comm, MPI_TAG_UB, max_mpi_rank, flag, ierror).
80 ! The MPI specification guarantees a value of at least 32767.
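! A sketch of that query, for illustration only ('comm', 'tag_ub', 'flag' and 'ierror' are assumed local variables):
!    INTEGER(kind=MPI_ADDRESS_KIND) :: tag_ub
!    LOGICAL :: flag
!    INTEGER :: ierror
!    CALL MPI_Comm_get_attr(comm, MPI_TAG_UB, tag_ub, flag, ierror)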
81 INTEGER, PARAMETER, PRIVATE :: max_mpi_rank = 32767
82
83! **************************************************************************************************
84!> \brief Sparsity pattern of replicated SIESTA compressed sparse column (CSC) matrices
85! **************************************************************************************************
86 TYPE siesta_distrib_csc_struct_type
87 !> gather all non-zero matrix elements on the given MPI process.
88 !> Distribute the elements across MPI processes if gather_root < 0.
89 !> The matrix elements should be located on the I/O process in case of bulk transport,
90 !> and should be distributed in case of a SMEAGOL calculation.
91 INTEGER :: gather_root = 0
92 !> Based on a full (.FALSE.) or upper-triangular (.TRUE.) DBCSR matrix.
93 !> It is used in CPASSERTs to allow access to lower-triangular matrix elements of non-symmetric DBCSR matrices.
94 !> If there are no bugs in this module, these CPASSERTs should never trigger.
95 !> Therefore the 'symmetric' variable, along with the relevant CPASSERT calls, is redundant and could be removed.
96 LOGICAL :: symmetric = .true.
97 !> number of neighbour list nodes for each MPI process (0:num_pe-1).
98 !> If do_merge == .TRUE., nodes for different cell images along the k cell vector are merged into one node.
99 INTEGER, ALLOCATABLE, DIMENSION(:) :: nnodes_per_proc
100
101 !> replicated neighbour list (1:neighbor_list_dim1, 1:SUM(nnodes_per_proc)).
102 !> Neighbour list nodes are ordered according to their MPI ranks.
103 !> Thus, the first nnodes_per_proc(0) nodes are stored on MPI rank 0,
104 !> the next nnodes_per_proc(1) nodes reside on MPI rank 1, etc.
105 !> Nodes for cell images along the transport direction are merged into one node.
106 !> The number of non-zero DBCSR matrix blocks and their DBCSR cell image indices
107 !> are stored into 'n_dbcsr_cell_images_to_merge' and 'dbcsr_cell_image_to_merge' arrays
108 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: nl_repl
109
110 !> number of DBCSR images for each local merged neighbour list node (1:nnodes_per_proc(para_env%mepos))
111 INTEGER, ALLOCATABLE, DIMENSION(:) :: n_dbcsr_cell_images_to_merge
112 !> list of DBCSR image indices to merge; (1:SUM(n_dbcsr_cell_images_to_merge))
113 INTEGER, ALLOCATABLE, DIMENSION(:) :: dbcsr_cell_image_to_merge
114
115 !> number of DBCSR non-zero matrix elements that should be received/sent from each MPI rank
116 !> (0:num_pe-1, 1:nelements_dbcsr_dim2)
117 INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:, :) :: nelements_per_proc
118
119 !> number of non-zero matrix elements local to this MPI rank.
120 INTEGER(kind=int_8) :: n_nonzero_elements = 0_int_8
121 INTEGER :: nrows = 0, ncols = 0
122 !> Number of non-zero matrix elements (columns) on each row
123 !> n_nonzero_cols(1:nrows); same as 'numh' in SMEAGOL code.
124 INTEGER, ALLOCATABLE, DIMENSION(:) :: n_nonzero_cols
125 !> offset of the first non-zero matrix element on each row.
126 !> row_offset(1:nrows); same as 'listhptr' in the SMEAGOL code.
127 !> It should be declared as INTEGER(kind=int_8), but SMEAGOL expects it to be INTEGER
128 INTEGER, ALLOCATABLE, DIMENSION(:) :: row_offset
129 !> column index of each non-zero matrix element.
130 !> col_index(1:n_nonzero_elements); same as 'listh' in SMEAGOL code
131 !> the column index of the first non-zero matrix element of row irow is col_index(row_offset(irow)+1)
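!> For illustration: the column indices of all non-zero elements of row irow are
!> col_index(row_offset(irow)+1 : row_offset(irow)+n_nonzero_cols(irow))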
132 INTEGER, ALLOCATABLE, DIMENSION(:) :: col_index
133 !> index of the non-zero matrix element in a communication buffer for each rank
134 INTEGER, ALLOCATABLE, DIMENSION(:) :: packed_index
135 !> R_atom_col - R_atom_row
136 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: xij
137 !> equivalent atomic orbitals
138 INTEGER, ALLOCATABLE, DIMENSION(:) :: indxuo
139 !> atomic index on which the orbital is centred
140 INTEGER, ALLOCATABLE, DIMENSION(:) :: iaorb
141 !> coordinates of all atoms in the supercell
142 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: xa
143 END TYPE siesta_distrib_csc_struct_type
144
145CONTAINS
146
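! A minimal usage sketch of the public API in this module (caller-side names such as
! 'matrix_s_kp' and 'matrix_s_siesta' are hypothetical; the surrounding QS objects are assumed to exist):
!    TYPE(siesta_distrib_csc_struct_type) :: siesta_struct
!    REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: matrix_s_siesta
!    CALL siesta_struct_create(siesta_struct, matrix_s_kp, subsys, cell_to_index, sab_nl, &
!                              para_env, max_ij_cell_image, do_merge=.FALSE., gather_root=-1)
!    ALLOCATE (matrix_s_siesta(siesta_struct%n_nonzero_elements))
!    CALL convert_dbcsr_to_distributed_siesta(matrix_s_siesta, matrix_s_kp, siesta_struct, para_env)
!    ! ... pass matrix_s_siesta to SMEAGOL, possibly update it ...
!    CALL convert_distributed_siesta_to_dbcsr(matrix_s_kp, matrix_s_siesta, siesta_struct, para_env)
!    CALL siesta_struct_release(siesta_struct)
!    DEALLOCATE (matrix_s_siesta)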
147! **************************************************************************************************
148!> \brief Map non-zero matrix blocks between sparse matrices in DBCSR and SIESTA formats.
149!> \param siesta_struct structure that stores metadata (sparsity pattern) of sparse SIESTA matrices
150!> \param matrix_dbcsr_kp DBCSR matrices for each cell image
151!> \param subsys QuickStep molecular system
152!> \param cell_to_index array to convert 3-D cell indices to 1-D DBCSR image indices
153!> \param sab_nl pair-wise neighbour list
154!> \param para_env MPI parallel environment
155!> \param max_ij_cell_image largest index of cell images along i and j cell vectors (e.g. (2,0) in
156!> case of 5 cell images (0,0), (1,0), (-1,0), (2,0), and (-2,0))
157!> \param do_merge merge DBCSR images along transport direction (k cell vector)
158!> \param gather_root distribute non-zero matrix elements of SIESTA matrices across all
159!> parallel processes (-1), or gather them on the given MPI rank (>= 0).
160! **************************************************************************************************
161 SUBROUTINE siesta_struct_create(siesta_struct, matrix_dbcsr_kp, subsys, cell_to_index, &
162 sab_nl, para_env, max_ij_cell_image, do_merge, gather_root)
163 TYPE(siesta_distrib_csc_struct_type), &
164 INTENT(inout) :: siesta_struct
165 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
166 TYPE(qs_subsys_type), POINTER :: subsys
167 INTEGER, DIMENSION(:, :, :), POINTER :: cell_to_index
168 TYPE(neighbor_list_set_p_type), DIMENSION(:), &
169 POINTER :: sab_nl
170 TYPE(mp_para_env_type), POINTER :: para_env
171 INTEGER, DIMENSION(2), INTENT(inout) :: max_ij_cell_image
172 LOGICAL, INTENT(in) :: do_merge
173 INTEGER, INTENT(in) :: gather_root
174
175 CHARACTER(len=*), PARAMETER :: routinen = 'siesta_struct_create'
176
177 CHARACTER(len=20) :: str_nelem, str_nelem_max
178 INTEGER :: handle, iatom, icol, icol_blk, icol_local, image, image_j, image_k, irow, &
179 irow_local, natoms, ncells_siesta_total, ncols_blk, ncols_total, nrows_local, &
180 nrows_total, offset
181 INTEGER(kind=int_8) :: n_nonzero_elements_local
182 INTEGER, DIMENSION(3) :: max_ijk_cell_image, ncells_siesta
183 INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size
184 LOGICAL :: do_distribute, is_root_rank
185 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: particle_coords
186 REAL(kind=dp), DIMENSION(3) :: real_cell_shift, scaled_cell_shift
187 TYPE(cell_type), POINTER :: cell
188 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
189
190 CALL timeset(routinen, handle)
191 do_distribute = gather_root < 0
192 is_root_rank = gather_root == para_env%mepos
193
194 ! here row_blk_offset / col_blk_offset are global indices of the first row / column of a given non-zero block.
195 ! They are not offsets (index-1) but the actual indices.
196 CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
197 nfullrows_total=nrows_total, nfullcols_total=ncols_total, &
198 nblkcols_total=ncols_blk, col_blk_size=col_blk_size, col_blk_offset=col_blk_offset)
199 IF (debug_this_module) THEN
200 cpassert(nrows_total == ncols_total)
201 cpassert(gather_root < para_env%num_pe)
202 END IF
203
204 siesta_struct%gather_root = gather_root
205 CALL get_neighbor_list_set_p(neighbor_list_sets=sab_nl, symmetric=siesta_struct%symmetric)
206
207 ! apply periodic boundary conditions to atomic coordinates
208 CALL qs_subsys_get(subsys, cell=cell, particle_set=particle_set, nparticle=natoms)
209 ALLOCATE (particle_coords(3, natoms))
210 DO iatom = 1, natoms
211 CALL pbc_0_1(particle_coords(1:3, iatom), particle_set(iatom)%r(1:3), cell)
212 END DO
213
214 ! Note: if we wanted to limit the number of cell images along the transport direction (k cell vector)
215 ! via the 'BulkTransvCellSizeZ' keyword, we would need to pass the full max_ijk_cell_image(1:3) vector
216 ! to the subroutine instead of the reduced vector max_ij_cell_image(1:2)
217 max_ijk_cell_image(1:2) = max_ij_cell_image(1:2)
218
219 ! determine the actual number of cell images along the k cell vector. Unless the third element is also passed
220 ! via subroutine arguments, an extra MPI_Allreduce operation is needed each time we call
221 ! replicate_neighbour_list() / get_nnodes_local().
222 max_ijk_cell_image(3) = -1
223 IF (.NOT. do_merge) max_ijk_cell_image(3) = 1 ! a bulk-transport calculation expects exactly 3 cell images along the transport direction
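! (a largest k-image index of 1 gives 2*1+1 = 3 images: 0, +1 and -1)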
224
225 ! replicate the pair-wise neighbour list. Identical non-zero matrix blocks from cell images along the transport direction
226 ! are grouped together if do_merge == .TRUE.
227 ALLOCATE (siesta_struct%nnodes_per_proc(0:para_env%num_pe - 1))
228 CALL replicate_neighbour_list(siesta_struct%nl_repl, &
229 siesta_struct%n_dbcsr_cell_images_to_merge, &
230 siesta_struct%dbcsr_cell_image_to_merge, &
231 siesta_struct%nnodes_per_proc, &
232 max_ijk_cell_image, sab_nl, para_env, particle_coords, cell, cell_to_index, do_merge)
233 max_ij_cell_image(1:2) = max_ijk_cell_image(1:2)
234
235 ! count the number of non-zero matrix elements that need to be sent to and received from other parallel processes
236 ALLOCATE (siesta_struct%nelements_per_proc(0:para_env%num_pe - 1, nelements_dbcsr_dim2))
237 CALL count_remote_dbcsr_elements(siesta_struct%nelements_per_proc, siesta_struct%nnodes_per_proc, &
238 siesta_struct%nl_repl, matrix_dbcsr_kp, siesta_struct%symmetric, para_env, gather_root)
239
240 ! number of SIESTA non-zero matrix elements that are going to be stored on this parallel process
241 n_nonzero_elements_local = sum(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
242 siesta_struct%n_nonzero_elements = n_nonzero_elements_local
243
244 ! as SMEAGOL uses 32-bit integers, the number of non-zero matrix elements is limited to 2^31 - 1 per MPI rank.
245 ! Abort CP2K if we are about to exceed this limit.
246 IF (n_nonzero_elements_local > int(huge(0), kind=int_8)) THEN
247 WRITE (str_nelem, '(I0)') n_nonzero_elements_local
248 WRITE (str_nelem_max, '(I0)') huge(0)
249 CALL cp_abort(__location__, &
250 "The number of non-zero matrix elements per MPI process "//trim(str_nelem)// &
251 " cannot exceed "//trim(str_nelem_max)// &
252 ". Please increase the number of MPI processes to satisfy this SMEAGOL limitation.")
253 END IF
254
255 ! if no non-zero matrix elements are stored on this process, allocate one-element arrays to avoid a SEGFAULT
256 IF (n_nonzero_elements_local == 0) n_nonzero_elements_local = 1
257
258 ! number of SIESTA-matrix rows local to the given parallel process
259 IF (do_distribute) THEN
260#if defined(__SMEAGOL)
261 CALL getnodeorbs(nrows_total, para_env%mepos, para_env%num_pe, nrows_local)
262#else
263 CALL cp_abort(__location__, &
264 "CP2K was compiled with no SMEAGOL support.")
265#endif
266 ELSE
267 IF (is_root_rank) THEN
268 nrows_local = nrows_total
269 ELSE
270 nrows_local = 0
271 END IF
272 END IF
273
274 ! number of cell images along each cell vector. It is 2*m+1, as SIESTA images are ordered as 0, 1, -1, ..., m, -m
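! e.g. for m = 2 the SIESTA enumeration of cell images is 1 -> 0, 2 -> +1, 3 -> -1, 4 -> +2, 5 -> -2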
275 ncells_siesta(1:3) = 2*max_ijk_cell_image(1:3) + 1
276 ! in case of merged cell images along the transport direction, there will be just 1 'merged' cell image along it
277 IF (do_merge) ncells_siesta(3) = 1
278
279 ncells_siesta_total = ncells_siesta(1)*ncells_siesta(2)*ncells_siesta(3)
280
281 ! number of rows local to the given parallel process. Rows are distributed in a block-cyclic manner
282 siesta_struct%nrows = nrows_local
283
284 ! number of columns of the matrix in its dense form. SIESTA uses 1-D (rows) block-cyclic distribution.
285 ! All non-zero matrix elements on a given row are stored on the same parallel process
286 siesta_struct%ncols = nrows_total*ncells_siesta_total
287
288 ! allocate at least one array element to avoid a SEGFAULT when passing otherwise unallocated arrays to subroutines
289 IF (nrows_local == 0) nrows_local = 1
290
291 ALLOCATE (siesta_struct%n_nonzero_cols(nrows_local))
292 ALLOCATE (siesta_struct%row_offset(nrows_local))
293 ALLOCATE (siesta_struct%col_index(n_nonzero_elements_local))
294 ALLOCATE (siesta_struct%packed_index(n_nonzero_elements_local))
295
296 ! restore the actual number of local rows
297 nrows_local = siesta_struct%nrows
298
299 ! get the number of non-zero matrix elements on each local row (n_nonzero_cols),
300 ! offset of the first non-zero matrix element for each local row (row_offset),
301 ! global column indices of all local non-zero matrix elements (col_index), and
302 ! the indices of all local non-zero matrix elements in the communication buffer (packed_index)
303 CALL get_nonzero_element_indices(siesta_struct%n_nonzero_cols, siesta_struct%row_offset, &
304 siesta_struct%col_index, siesta_struct%packed_index, &
305 siesta_struct%nl_repl, matrix_dbcsr_kp, &
306 siesta_struct%symmetric, para_env, gather_root)
307
308 ! indices of equivalent atomic orbitals
309 ALLOCATE (siesta_struct%indxuo(siesta_struct%ncols))
310 DO icol = 1, ncols_total
311 siesta_struct%indxuo(icol) = icol
312 END DO
313 DO image = 2, ncells_siesta_total
314 siesta_struct%indxuo((image - 1)*ncols_total + 1:image*ncols_total) = siesta_struct%indxuo(1:ncols_total)
315 END DO
316
317 ! particle index on which the orbital is centred
318 ALLOCATE (siesta_struct%iaorb(siesta_struct%ncols))
319 DO icol_blk = 1, ncols_blk
320 ! col_blk_offset() is not an offset but the index of the first atomic orbital in the column block
321 siesta_struct%iaorb(col_blk_offset(icol_blk):col_blk_offset(icol_blk) + col_blk_size(icol_blk) - 1) = icol_blk
322 END DO
323 DO image = 2, ncells_siesta_total
324 siesta_struct%iaorb((image - 1)*ncols_total + 1:image*ncols_total) = siesta_struct%iaorb(1:ncols_total) + (image - 1)*natoms
325 END DO
326
327 ! coordinates of all particles in each cell image
328 ALLOCATE (siesta_struct%xa(3, natoms*ncells_siesta_total))
329 DO image_k = 1, ncells_siesta(3)
330 !icell_siesta(3) = image_k
331 scaled_cell_shift(3) = real(number_from_canonical_enumeration(image_k), kind=dp) ! SIESTA -> actual cell index
332 DO image_j = 1, ncells_siesta(2)
333 scaled_cell_shift(2) = real(number_from_canonical_enumeration(image_j), kind=dp)
334 DO image = 1, ncells_siesta(1)
335 scaled_cell_shift(1) = real(number_from_canonical_enumeration(image), kind=dp)
336 CALL scaled_to_real(real_cell_shift, scaled_cell_shift, cell)
337 offset = (((image_k - 1)*ncells_siesta(2) + image_j - 1)*ncells_siesta(1) + image - 1)*natoms
338 DO iatom = 1, natoms
339 siesta_struct%xa(1:3, offset + iatom) = particle_set(iatom)%r(1:3) + real_cell_shift(1:3)
340 END DO
341 END DO
342 END DO
343 END DO
344
345 ! inter-atomic distance vectors
346 ALLOCATE (siesta_struct%xij(3, n_nonzero_elements_local))
347 DO irow_local = 1, nrows_local
348 IF (do_distribute) THEN
349#if defined(__SMEAGOL)
350 CALL localtoglobalorb(irow_local, para_env%mepos, para_env%num_pe, irow)
351#else
352 CALL cp_abort(__location__, &
353 "CP2K was compiled with no SMEAGOL support.")
354#endif
355 ELSE
356 irow = irow_local
357 IF (debug_this_module) THEN
358 cpassert(is_root_rank)
359 END IF
360 END IF
361 offset = siesta_struct%row_offset(irow_local)
362 DO icol_local = offset + 1, offset + siesta_struct%n_nonzero_cols(irow_local)
363 icol = siesta_struct%col_index(icol_local)
364 siesta_struct%xij(1:3, icol_local) = siesta_struct%xa(1:3, siesta_struct%iaorb(icol)) - &
365 siesta_struct%xa(1:3, siesta_struct%iaorb(irow))
366 END DO
367 END DO
368
369 DEALLOCATE (particle_coords)
370
371 CALL timestop(handle)
372 END SUBROUTINE siesta_struct_create
373
374! **************************************************************************************************
375!> \brief Release a SIESTA matrix structure
376!> \param siesta_struct structure to release
377! **************************************************************************************************
378 SUBROUTINE siesta_struct_release(siesta_struct)
379 TYPE(siesta_distrib_csc_struct_type), &
380 INTENT(inout) :: siesta_struct
381
382 CHARACTER(len=*), PARAMETER :: routinen = 'siesta_struct_release'
383
384 INTEGER :: handle
385
386 CALL timeset(routinen, handle)
387
388 siesta_struct%gather_root = -1
389
390 IF (ALLOCATED(siesta_struct%nnodes_per_proc)) DEALLOCATE (siesta_struct%nnodes_per_proc)
391 IF (ALLOCATED(siesta_struct%nl_repl)) DEALLOCATE (siesta_struct%nl_repl)
392 IF (ALLOCATED(siesta_struct%n_dbcsr_cell_images_to_merge)) DEALLOCATE (siesta_struct%n_dbcsr_cell_images_to_merge)
393 IF (ALLOCATED(siesta_struct%dbcsr_cell_image_to_merge)) DEALLOCATE (siesta_struct%dbcsr_cell_image_to_merge)
394 IF (ALLOCATED(siesta_struct%nelements_per_proc)) DEALLOCATE (siesta_struct%nelements_per_proc)
395
396 siesta_struct%n_nonzero_elements = 0
397 siesta_struct%nrows = 0
398 siesta_struct%ncols = 0
399
400 IF (ALLOCATED(siesta_struct%n_nonzero_cols)) DEALLOCATE (siesta_struct%n_nonzero_cols)
401 IF (ALLOCATED(siesta_struct%row_offset)) DEALLOCATE (siesta_struct%row_offset)
402 IF (ALLOCATED(siesta_struct%col_index)) DEALLOCATE (siesta_struct%col_index)
403 IF (ALLOCATED(siesta_struct%packed_index)) DEALLOCATE (siesta_struct%packed_index)
404
405 IF (ALLOCATED(siesta_struct%xij)) DEALLOCATE (siesta_struct%xij)
406 IF (ALLOCATED(siesta_struct%indxuo)) DEALLOCATE (siesta_struct%indxuo)
407 IF (ALLOCATED(siesta_struct%iaorb)) DEALLOCATE (siesta_struct%iaorb)
408 IF (ALLOCATED(siesta_struct%xa)) DEALLOCATE (siesta_struct%xa)
409
410 CALL timestop(handle)
411 END SUBROUTINE siesta_struct_release
412
413! **************************************************************************************************
414!> \brief Convert matrix from DBCSR to sparse SIESTA format.
415!> \param matrix_siesta matrix in SIESTA format [out]
416!> \param matrix_dbcsr_kp DBCSR matrix [in]
417!> \param siesta_struct structure to map matrix blocks between formats
418!> \param para_env MPI parallel environment
419! **************************************************************************************************
420 SUBROUTINE convert_dbcsr_to_distributed_siesta(matrix_siesta, matrix_dbcsr_kp, siesta_struct, para_env)
421 REAL(kind=dp), DIMENSION(:), INTENT(out) :: matrix_siesta
422 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
423 TYPE(siesta_distrib_csc_struct_type), INTENT(in) :: siesta_struct
424 TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
425
426 CHARACTER(len=*), PARAMETER :: routinen = 'convert_dbcsr_to_distributed_siesta'
427
428 INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
429 image_dbcsr, image_ind, image_ind_offset, image_siesta, image_siesta_transp, inode, &
430 inode_proc, iproc, irequest, irow_blk, irow_local, irow_proc, mepos, n_image_ind, &
431 ncols_blk, ncols_local, nnodes_proc, node_offset, nprocs, nrequests_recv, &
432 nrequests_total, nrows_blk, nrows_local
433 INTEGER(kind=int_8) :: n_nonzero_elements_dbcsr, &
434 n_nonzero_elements_siesta, &
435 offset_recv_mepos, offset_send_mepos
436 INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:) :: n_packed_elements_per_proc, &
437 nelements_per_request, &
438 offset_per_proc, offset_per_request
439 INTEGER, ALLOCATABLE, DIMENSION(:) :: next_nonzero_element_offset, peer_rank, &
440 request_tag
441 INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size, &
442 row_blk_offset, row_blk_size
443 LOGICAL :: do_distribute, found, is_root_rank, &
444 symmetric
445 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recv_buffer, reorder_recv_buffer, &
446 send_buffer
447 REAL(kind=dp), DIMENSION(:, :), POINTER :: sm_block, sm_block_merged
448 TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: requests
449
450 CALL timeset(routinen, handle)
451 matrix_siesta(:) = 0.0_dp
452
453 mepos = para_env%mepos
454 nprocs = para_env%num_pe
455 do_distribute = siesta_struct%gather_root < 0
456 is_root_rank = siesta_struct%gather_root == mepos
457
458 CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
459 nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
460 row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
461 row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
462 symmetric = siesta_struct%symmetric
463
464 ! number of locally stored SIESTA non-zero matrix elements
465 n_nonzero_elements_siesta = sum(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
466 ! number of locally stored DBCSR non-zero matrix elements
467 n_nonzero_elements_dbcsr = sum(siesta_struct%nelements_per_proc(:, nelements_dbcsr_send))
468
469 ! number of concurrent MPI isend / irecv operations
470 nrequests_recv = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
471 max_mpi_packet_size_dp)
472 nrequests_total = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
473 max_mpi_packet_size_dp) + nrequests_recv
474
475 IF (nrequests_total > 0) THEN
476 ! allocate MPI-related arrays. request_tag is not strictly needed, as the MPI standard guarantees that
477 ! point-to-point messages with the same tag between the same pair of processes are received in the order they were sent
478 ALLOCATE (requests(nrequests_total))
479 ALLOCATE (peer_rank(nrequests_total), request_tag(nrequests_total))
480 ALLOCATE (offset_per_request(nrequests_total), nelements_per_request(nrequests_total))
481 !requests(:) = mp_request_null
482
483 ! split large messages into a number of smaller messages. This is only really needed
484 ! if we were to send more than 2^31 matrix elements in a single MPI request
485 IF (nrequests_recv > 0) THEN
486 CALL assign_nonzero_elements_to_requests(offset_per_request(1:nrequests_recv), &
487 nelements_per_request(1:nrequests_recv), &
488 peer_rank(1:nrequests_recv), &
489 request_tag(1:nrequests_recv), &
490 mepos, &
491 siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
492 max_mpi_packet_size_dp)
493 END IF
494 IF (nrequests_total > nrequests_recv) THEN
495 CALL assign_nonzero_elements_to_requests(offset_per_request(nrequests_recv + 1:nrequests_total), &
496 nelements_per_request(nrequests_recv + 1:nrequests_total), &
497 peer_rank(nrequests_recv + 1:nrequests_total), &
498 request_tag(nrequests_recv + 1:nrequests_total), &
499 mepos, &
500 siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
501 max_mpi_packet_size_dp)
502 END IF
503 END IF
504
505 ! point-to-point recv/send can be replaced with alltoallv, if data to distribute per rank is < 2^31 elements (should be OK due to SMEAGOL limitation)
506 ! in principle, it is possible to overcome this limit by using derived datatypes (which would require additional wrapper functions, though).
507 !
508 ! pre-post non-blocking receive operations
509 IF (n_nonzero_elements_siesta > 0) THEN
510 ALLOCATE (recv_buffer(n_nonzero_elements_siesta))
511 END IF
512 DO irequest = 1, nrequests_recv
513 CALL para_env%irecv(recv_buffer(offset_per_request(irequest) + 1: &
514 offset_per_request(irequest) + nelements_per_request(irequest)), &
515 peer_rank(irequest), requests(irequest), request_tag(irequest))
516 END DO
517
518 ! pack local DBCSR non-zero matrix elements ordered by their target parallel process
519 ALLOCATE (offset_per_proc(0:nprocs - 1), n_packed_elements_per_proc(0:nprocs - 1))
520 offset_per_proc(0) = 0
521 DO iproc = 1, nprocs - 1
522 offset_per_proc(iproc) = offset_per_proc(iproc - 1) + siesta_struct%nelements_per_proc(iproc - 1, nelements_dbcsr_send)
523 END DO
524 n_packed_elements_per_proc(:) = 0
525
526 ! number of local neighbour-list nodes and offset of the first local neighbour-list node
527 nnodes_proc = siesta_struct%nnodes_per_proc(mepos)
528 !node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos)) - siesta_struct%nnodes_per_proc(mepos)
529 node_offset = sum(siesta_struct%nnodes_per_proc(0:mepos)) - nnodes_proc
530
531 ! if do_distribute == .FALSE., send all matrix elements to the MPI process with rank gather_root
532 ! if do_distribute == .TRUE., iproc is determined by calling WhichNodeOrb()
533 iproc = siesta_struct%gather_root
534
535 IF (n_nonzero_elements_dbcsr > 0) THEN
536 ALLOCATE (send_buffer(n_nonzero_elements_dbcsr))
537 send_buffer(:) = 0.0_dp
538
539 ! iterate over locally-stored DBCSR matrix blocks (local neighbour-list nodes).
540 ! inode_proc is the index of the neighbour-list node local to this parallel process
541 image_ind_offset = 0
542 DO inode_proc = 1, nnodes_proc
543 n_image_ind = siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
544 IF (n_image_ind > 0) THEN
545 inode = node_offset + inode_proc
546
547 irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
548 icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
549 cpassert(irow_blk <= icol_blk .OR. (.NOT. symmetric))
550 image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
551 image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
552
553 nrows_local = row_blk_size(irow_blk)
554 ncols_local = col_blk_size(icol_blk)
555 first_row_minus_one = row_blk_offset(irow_blk) - 1
556 first_col_minus_one = col_blk_offset(icol_blk) - 1
557
558 ! merging cell images along transport direction
559 IF (n_image_ind == 1) THEN
560 ! the most common case. Nothing to merge, so there is no need to allocate memory for a merged block
561 image_dbcsr = siesta_struct%dbcsr_cell_image_to_merge(image_ind_offset + 1)
562 CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
563 row=irow_blk, col=icol_blk, block=sm_block_merged, found=found)
564 cpassert(found)
565 ELSE ! n_image_ind > 1
566 ALLOCATE (sm_block_merged(nrows_local, ncols_local))
567 sm_block_merged(:, :) = 0.0_dp ! zero the accumulator before merging cell images
568 DO image_ind = 1, n_image_ind
569 image_dbcsr = siesta_struct%dbcsr_cell_image_to_merge(image_ind + image_ind_offset)
570
571 CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
572 row=irow_blk, col=icol_blk, block=sm_block, found=found)
573 cpassert(found)
574 sm_block_merged(1:nrows_local, 1:ncols_local) = sm_block_merged(1:nrows_local, 1:ncols_local) + &
575 sm_block(1:nrows_local, 1:ncols_local)
576 END DO
577 END IF
578
579 ! pack matrix elements for the 'normal' SIESTA matrix block
580 IF (image_siesta > 0) THEN
581 DO irow_local = 1, nrows_local
582 IF (do_distribute) THEN
583#if defined(__SMEAGOL)
584 CALL whichnodeorb(irow_local + first_row_minus_one, nprocs, iproc)
585#else
586 CALL cp_abort(__location__, &
587 "CP2K was compiled with no SMEAGOL support.")
588#endif
589 END IF
590
591 ! CPASSERT
592 IF (debug_this_module) THEN
593 cpassert(iproc >= 0 .AND. iproc < nprocs)
594 IF (n_packed_elements_per_proc(iproc) + ncols_local > &
595 siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
596 CALL cp__a(__short_file__, __line__)
597 END IF
598 END IF
599
600 offset_send_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
601 send_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = sm_block_merged(irow_local, 1:ncols_local)
602
603 n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + ncols_local
604 END DO
605 END IF
606
607 ! pack matrix elements of the transposed SIESTA matrix block
608 IF (image_siesta_transp > 0) THEN
609 DO icol_local = 1, ncols_local
610 IF (do_distribute) THEN
611#if defined(__SMEAGOL)
612 CALL whichnodeorb(icol_local + first_col_minus_one, nprocs, iproc) ! iproc_orb
613#else
614 CALL cp_abort(__location__, &
615 "CP2K was compiled with no SMEAGOL support.")
616#endif
617 END IF
618
619 ! CPASSERT
620 IF (debug_this_module) THEN
621 cpassert(iproc >= 0 .AND. iproc < nprocs)
622 IF (n_packed_elements_per_proc(iproc) + nrows_local > &
623 siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
624 CALL cp__a(__short_file__, __line__)
625 END IF
626 END IF
627
628 offset_send_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
629 send_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = sm_block_merged(1:nrows_local, icol_local)
630
631 n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + nrows_local
632 END DO
633 END IF
634
635 IF (n_image_ind > 1) THEN
636 DEALLOCATE (sm_block_merged)
637 END IF
638 END IF
639
640 image_ind_offset = image_ind_offset + siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
641 END DO
642
643 IF (debug_this_module) THEN
644 DO iproc = 0, nprocs - 1
645 IF (n_packed_elements_per_proc(iproc) /= siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
646 CALL cp__a(__short_file__, __line__)
647 END IF
648 END DO
649 END IF
650
651 ! send packed data to other parallel processes
652 DO irequest = nrequests_recv + 1, nrequests_total
653 CALL para_env%isend(send_buffer(offset_per_request(irequest) + 1: &
654 offset_per_request(irequest) + nelements_per_request(irequest)), &
655 peer_rank(irequest), requests(irequest), request_tag(irequest))
656 END DO
657
658 ! copy locally the data that stays on the same process.
659 IF (mepos > 0) THEN
660 offset_recv_mepos = sum(siesta_struct%nelements_per_proc(0:mepos - 1, nelements_dbcsr_recv))
661 ELSE
662 offset_recv_mepos = 0
663 END IF
664 offset_send_mepos = offset_per_proc(mepos)
665
666 IF (debug_this_module) THEN
667 IF (n_packed_elements_per_proc(mepos) /= siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_recv)) THEN
668 CALL cp__a(__short_file__, __line__)
669 END IF
670 END IF
671
672 IF (n_packed_elements_per_proc(mepos) > 0) THEN
673 recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + n_packed_elements_per_proc(mepos)) = &
674 send_buffer(offset_send_mepos + 1:offset_send_mepos + n_packed_elements_per_proc(mepos))
675 END IF
676 END IF
677
678 IF (nrequests_total > 0) THEN
679 ! wait for pending isend/irecv requests
680 CALL mp_waitall(requests)
681 DEALLOCATE (nelements_per_request, offset_per_request, peer_rank, requests, request_tag)
682 END IF
683
684 ! release send buffers
685 IF (ALLOCATED(send_buffer)) DEALLOCATE (send_buffer)
686 DEALLOCATE (offset_per_proc, n_packed_elements_per_proc)
687
688 ! non-zero matrix elements in 'recv_buffer' array are grouped by their source MPI rank, local row index, and column index (in this order).
689 ! Reorder the matrix elements ('reorder_recv_buffer') so they are grouped by their local row index, source MPI rank, and column index.
690 ! (column indices are in ascending order within each (row index, source MPI rank) block).
691 ! The array 'packed_index' maps matrix elements between this intermediate order and the SIESTA order (local row index, then column index)
692 IF (n_nonzero_elements_siesta > 0) THEN
693 ALLOCATE (reorder_recv_buffer(n_nonzero_elements_siesta))
694 ALLOCATE (next_nonzero_element_offset(siesta_struct%nrows))
695 next_nonzero_element_offset(:) = 0
696 offset_recv_mepos = 0
697
698 DO inode = 1, SIZE(siesta_struct%nl_repl, 2)
699 irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
700 icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
701 cpassert(irow_blk <= icol_blk .OR. (.NOT. symmetric))
702 image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
703 image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
704
705 nrows_local = row_blk_size(irow_blk)
706 ncols_local = col_blk_size(icol_blk)
707 first_row_minus_one = row_blk_offset(irow_blk) - 1
708 first_col_minus_one = col_blk_offset(icol_blk) - 1
709
710 ! normal block
711 IF (image_siesta > 0) THEN
712 DO irow_local = 1, nrows_local
713 IF (do_distribute) THEN
714#if defined(__SMEAGOL)
715 CALL globaltolocalorb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
716#else
717 CALL cp_abort(__location__, &
718 "CP2K was compiled with no SMEAGOL support.")
719#endif
720 ELSE
721 IF (is_root_rank) THEN
722 irow_proc = irow_local + first_row_minus_one
723 ELSE
724 irow_proc = 0
725 END IF
726 END IF
727 IF (irow_proc > 0) THEN
728 offset_send_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
729 reorder_recv_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = &
730 recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
731 offset_recv_mepos = offset_recv_mepos + ncols_local
732 next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + ncols_local
733 END IF
734 END DO
735 END IF
736
737 ! transposed block
738 IF (image_siesta_transp > 0) THEN
739 DO icol_local = 1, ncols_local
740 IF (do_distribute) THEN
741#if defined(__SMEAGOL)
742 CALL globaltolocalorb(icol_local + first_col_minus_one, mepos, nprocs, irow_proc)
743#else
744 CALL cp_abort(__location__, &
745 "CP2K was compiled with no SMEAGOL support.")
746#endif
747 ELSE
748 IF (is_root_rank) THEN
749 irow_proc = icol_local + first_col_minus_one
750 ELSE
751 irow_proc = 0
752 END IF
753 END IF
754 IF (irow_proc > 0) THEN
755 offset_send_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
756 reorder_recv_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = &
757 recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
758 offset_recv_mepos = offset_recv_mepos + nrows_local
759 next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + nrows_local
760 END IF
761 END DO
762 END IF
763 END DO
764
765 IF (debug_this_module) THEN
766 DO irow_local = 1, siesta_struct%nrows
767 IF (siesta_struct%n_nonzero_cols(irow_local) /= next_nonzero_element_offset(irow_local)) THEN
768 CALL cp__a(__short_file__, __line__)
769 END IF
770 END DO
771 END IF
772
773 DEALLOCATE (next_nonzero_element_offset)
774 DEALLOCATE (recv_buffer)
775
776 ! Map non-zero matrix elements between the intermediate order and the SIESTA order
777 DO irow_local = 1, siesta_struct%nrows
778 offset_recv_mepos = siesta_struct%row_offset(irow_local)
779 DO icol_local = 1, siesta_struct%n_nonzero_cols(irow_local)
780 matrix_siesta(offset_recv_mepos + icol_local) = &
781 reorder_recv_buffer(offset_recv_mepos + siesta_struct%packed_index(offset_recv_mepos + icol_local))
782 END DO
783 END DO
784 DEALLOCATE (reorder_recv_buffer)
785 END IF
786
787 CALL timestop(handle)
788 END SUBROUTINE convert_dbcsr_to_distributed_siesta
789
790! **************************************************************************************************
791!> \brief Convert matrix from sparse SIESTA format to DBCSR format.
792!> \param matrix_dbcsr_kp DBCSR matrix [out]. The argument is declared as INTENT(in) because the pointers to
793!> the DBCSR matrices remain intact; the matrix elements themselves, however, are updated
794!> \param matrix_siesta matrix in SIESTA format [in]
795!> \param siesta_struct structure to map matrix blocks between formats
796!> \param para_env MPI parallel environment
797! **************************************************************************************************
798 SUBROUTINE convert_distributed_siesta_to_dbcsr(matrix_dbcsr_kp, matrix_siesta, siesta_struct, para_env)
799 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
800 REAL(kind=dp), DIMENSION(:), INTENT(in) :: matrix_siesta
801 TYPE(siesta_distrib_csc_struct_type), INTENT(in) :: siesta_struct
802 TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
803
804 CHARACTER(len=*), PARAMETER :: routinen = 'convert_distributed_siesta_to_dbcsr'
805
806 INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
807 image_dbcsr, image_siesta, image_siesta_transp, inode, inode_proc, iproc, irequest, &
808 irow_blk, irow_local, irow_proc, mepos, n_image_ind, ncols_blk, ncols_local, nnodes_proc, &
809 node_offset, nprocs, nrequests_recv, nrequests_total, nrows_blk, nrows_local
810 INTEGER(kind=int_8) :: n_nonzero_elements_dbcsr, &
811 n_nonzero_elements_siesta, &
812 offset_recv_mepos, offset_send_mepos
813 INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:) :: n_packed_elements_per_proc, &
814 nelements_per_request, &
815 offset_per_proc, offset_per_request
816 INTEGER, ALLOCATABLE, DIMENSION(:) :: next_nonzero_element_offset, peer_rank, &
817 request_tag
818 INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size, &
819 row_blk_offset, row_blk_size
820 LOGICAL :: do_distribute, found, is_root_rank, &
821 symmetric
822 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recv_buffer, reorder_send_buffer, &
823 send_buffer
824 REAL(kind=dp), DIMENSION(:, :), POINTER :: sm_block
825 TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: requests
826
827 CALL timeset(routinen, handle)
828 DO image_dbcsr = 1, SIZE(matrix_dbcsr_kp)
829 CALL dbcsr_set(matrix_dbcsr_kp(image_dbcsr)%matrix, 0.0_dp)
830 END DO
831
832 mepos = para_env%mepos
833 nprocs = para_env%num_pe
834 do_distribute = siesta_struct%gather_root < 0
835 is_root_rank = siesta_struct%gather_root == mepos
836
837 CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
838 nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
839 row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
840 row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
841 symmetric = siesta_struct%symmetric
842
843 n_nonzero_elements_siesta = sum(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
844 n_nonzero_elements_dbcsr = sum(siesta_struct%nelements_per_proc(:, nelements_dbcsr_send))
845
846 nrequests_recv = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
847 max_mpi_packet_size_dp)
848 nrequests_total = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
849 max_mpi_packet_size_dp) + nrequests_recv
850 IF (nrequests_total > 0) THEN
851 ALLOCATE (requests(nrequests_total))
852 ALLOCATE (peer_rank(nrequests_total), request_tag(nrequests_total))
853 ALLOCATE (offset_per_request(nrequests_total), nelements_per_request(nrequests_total))
854 !requests(:) = mp_request_null
855 IF (nrequests_recv > 0) THEN
856 CALL assign_nonzero_elements_to_requests(offset_per_request(1:nrequests_recv), &
857 nelements_per_request(1:nrequests_recv), &
858 peer_rank(1:nrequests_recv), &
859 request_tag(1:nrequests_recv), &
860 mepos, &
861 siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
862 max_mpi_packet_size_dp)
863 END IF
864 IF (nrequests_total > nrequests_recv) THEN
865 CALL assign_nonzero_elements_to_requests(offset_per_request(nrequests_recv + 1:nrequests_total), &
866 nelements_per_request(nrequests_recv + 1:nrequests_total), &
867 peer_rank(nrequests_recv + 1:nrequests_total), &
868 request_tag(nrequests_recv + 1:nrequests_total), &
869 mepos, &
870 siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
871 max_mpi_packet_size_dp)
872 END IF
873 END IF
874
875 IF (n_nonzero_elements_dbcsr > 0) THEN
876 ALLOCATE (recv_buffer(n_nonzero_elements_dbcsr))
877 END IF
878 DO irequest = 1, nrequests_recv
879 CALL para_env%irecv(recv_buffer(offset_per_request(irequest) + 1: &
880 offset_per_request(irequest) + nelements_per_request(irequest)), &
881 peer_rank(irequest), requests(irequest), request_tag(irequest))
882 END DO
883
884 ALLOCATE (offset_per_proc(0:nprocs - 1), n_packed_elements_per_proc(0:nprocs - 1))
885 offset_per_proc(0) = 0
886 DO iproc = 1, nprocs - 1
887 offset_per_proc(iproc) = offset_per_proc(iproc - 1) + siesta_struct%nelements_per_proc(iproc - 1, nelements_dbcsr_send)
888 END DO
889 n_packed_elements_per_proc(:) = 0
890
891 IF (mepos > 0) THEN
892 node_offset = sum(siesta_struct%nnodes_per_proc(0:mepos - 1))
893 ELSE
894 node_offset = 0
895 END IF
896 nnodes_proc = siesta_struct%nnodes_per_proc(mepos)
897
898 IF (n_nonzero_elements_siesta > 0) THEN
899 ALLOCATE (send_buffer(n_nonzero_elements_siesta))
900
901 ALLOCATE (reorder_send_buffer(n_nonzero_elements_siesta))
902 DO irow_local = 1, siesta_struct%nrows
903 offset_send_mepos = siesta_struct%row_offset(irow_local)
904 DO icol_local = 1, siesta_struct%n_nonzero_cols(irow_local)
905 reorder_send_buffer(offset_send_mepos + siesta_struct%packed_index(offset_send_mepos + icol_local)) = &
906 matrix_siesta(offset_send_mepos + icol_local)
907 END DO
908 END DO
909
910 ALLOCATE (next_nonzero_element_offset(siesta_struct%nrows))
911 next_nonzero_element_offset(:) = 0
912 offset_send_mepos = 0
913
914 DO inode = 1, SIZE(siesta_struct%nl_repl, 2)
915 irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
916 icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
917 cpassert(irow_blk <= icol_blk .OR. (.NOT. symmetric))
918 image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
919 image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
920
921 nrows_local = row_blk_size(irow_blk)
922 ncols_local = col_blk_size(icol_blk)
923 first_row_minus_one = row_blk_offset(irow_blk) - 1
924 first_col_minus_one = col_blk_offset(icol_blk) - 1
925
926 IF (image_siesta > 0) THEN
927 DO irow_local = 1, nrows_local
928 IF (do_distribute) THEN
929#if defined(__SMEAGOL)
930 CALL globaltolocalorb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
931#else
932 CALL cp_abort(__location__, &
933 "CP2K was compiled with no SMEAGOL support.")
934#endif
935 ELSE
936 IF (is_root_rank) THEN
937 irow_proc = irow_local + first_row_minus_one
938 ELSE
939 irow_proc = 0
940 END IF
941 END IF
942 IF (irow_proc > 0) THEN
943 offset_recv_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
944 send_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = &
945 reorder_send_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
946 offset_send_mepos = offset_send_mepos + ncols_local
947 next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + ncols_local
948 END IF
949 END DO
950 END IF
951
952 ! transposed block
953 IF (image_siesta_transp > 0) THEN
954 DO icol_local = 1, ncols_local
955 IF (do_distribute) THEN
956#if defined(__SMEAGOL)
957 CALL globaltolocalorb(icol_local + first_col_minus_one, mepos, nprocs, irow_proc)
958#else
959 CALL cp_abort(__location__, &
960 "CP2K was compiled with no SMEAGOL support.")
961#endif
962 ELSE
963 IF (is_root_rank) THEN
964 irow_proc = icol_local + first_col_minus_one
965 ELSE
966 irow_proc = 0
967 END IF
968 END IF
969 IF (irow_proc > 0) THEN
970 offset_recv_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
971 send_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = &
972 reorder_send_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
973 offset_send_mepos = offset_send_mepos + nrows_local
974 next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + nrows_local
975 END IF
976 END DO
977 END IF
978 END DO
979
980 IF (debug_this_module) THEN
981 DO irow_local = 1, siesta_struct%nrows
982 IF (siesta_struct%n_nonzero_cols(irow_local) /= next_nonzero_element_offset(irow_local)) THEN
983 CALL cp__a(__short_file__, __line__)
984 END IF
985 END DO
986 END IF
987
988 DEALLOCATE (next_nonzero_element_offset)
989 DEALLOCATE (reorder_send_buffer)
990
991 DO irequest = nrequests_recv + 1, nrequests_total
992 CALL para_env%isend(send_buffer(offset_per_request(irequest) + 1: &
993 offset_per_request(irequest) + nelements_per_request(irequest)), &
994 peer_rank(irequest), requests(irequest), request_tag(irequest))
995 END DO
996
997 ! copy locally the data that stays on the same process.
998 IF (mepos > 0) THEN
999 offset_send_mepos = sum(siesta_struct%nelements_per_proc(0:mepos - 1, nelements_dbcsr_recv))
1000 ELSE
1001 offset_send_mepos = 0
1002 END IF
1003 offset_recv_mepos = offset_per_proc(mepos)
1004
1005 IF (debug_this_module) THEN
1006 IF (siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_recv) /= &
1007 siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send)) THEN
1008 CALL cp__a(__short_file__, __line__)
1009 END IF
1010 END IF
1011
1012 IF (siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send) > 0) THEN
1013 recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send)) = &
1014 send_buffer(offset_send_mepos + 1:offset_send_mepos + siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send))
1015 END IF
1016 END IF
1017
1018 IF (nrequests_total > 0) THEN
1019 ! wait for pending isend/irecv requests
1020 CALL mp_waitall(requests)
1021 DEALLOCATE (nelements_per_request, offset_per_request, peer_rank, requests, request_tag)
1022 END IF
1023
1024 IF (ALLOCATED(send_buffer)) DEALLOCATE (send_buffer)
1025
1026 iproc = siesta_struct%gather_root ! if do_distribute == .FALSE., collect matrix elements from MPI process with rank gather_root
1027 IF (n_nonzero_elements_dbcsr > 0) THEN
1028 DO inode_proc = 1, nnodes_proc
1029 n_image_ind = siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
1030 IF (n_image_ind > 0) THEN
1031 inode = node_offset + inode_proc
1032
1033 irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
1034 icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
1035 image_dbcsr = siesta_struct%nl_repl(neighbor_list_dbcsr_image_index, inode)
1036 cpassert(irow_blk <= icol_blk .OR. (.NOT. symmetric))
1037 image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
1038 image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
1039
1040 nrows_local = row_blk_size(irow_blk)
1041 ncols_local = col_blk_size(icol_blk)
1042 first_row_minus_one = row_blk_offset(irow_blk) - 1
1043 first_col_minus_one = col_blk_offset(icol_blk) - 1
1044
1045 CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
1046 row=irow_blk, col=icol_blk, block=sm_block, found=found)
1047 cpassert(found)
1048
1049 IF (image_siesta > 0) THEN
1050 DO irow_local = 1, nrows_local
1051 IF (do_distribute) THEN
1052#if defined(__SMEAGOL)
1053 CALL whichnodeorb(irow_local + first_row_minus_one, nprocs, iproc) ! iproc_orb
1054#else
1055 CALL cp_abort(__location__, &
1056 "CP2K was compiled with no SMEAGOL support.")
1057#endif
1058 END IF
1059 ! CPASSERT
1060 IF (debug_this_module) THEN
1061 cpassert(iproc >= 0 .AND. iproc < nprocs)
1062 IF (n_packed_elements_per_proc(iproc) + ncols_local > &
1063 siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
1064 CALL cp__a(__short_file__, __line__)
1065 END IF
1066 END IF
1067
1068 offset_recv_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
1069 sm_block(irow_local, 1:ncols_local) = recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
1070
1071 n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + ncols_local
1072 END DO
1073 END IF
1074
1075 ! transposed block
1076 IF (image_siesta_transp > 0) THEN
1077 DO icol_local = 1, ncols_local
1078 IF (do_distribute) THEN
1079#if defined(__SMEAGOL)
1080 CALL whichnodeorb(icol_local + first_col_minus_one, nprocs, iproc) ! iproc_orb
1081#else
1082 CALL cp_abort(__location__, &
1083 "CP2K was compiled with no SMEAGOL support.")
1084#endif
1085 END IF
1086 ! CPASSERT
1087 IF (debug_this_module) THEN
1088 cpassert(iproc >= 0 .AND. iproc < nprocs)
1089 IF (n_packed_elements_per_proc(iproc) + nrows_local > &
1090 siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
1091 CALL cp__a(__short_file__, __line__)
1092 END IF
1093 END IF
1094
1095 offset_recv_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
1096 sm_block(1:nrows_local, icol_local) = recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
1097
1098 n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + nrows_local
1099 END DO
1100 END IF
1101 END IF
1102 END DO
1103
1104 IF (debug_this_module) THEN
1105 DO iproc = 0, nprocs - 1
1106 IF (n_packed_elements_per_proc(iproc) /= siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
1107 CALL cp__a(__short_file__, __line__)
1108 END IF
1109 END DO
1110 END IF
1111
1112 DEALLOCATE (recv_buffer)
1113 END IF
1114
1115 DEALLOCATE (offset_per_proc, n_packed_elements_per_proc)
1116
1117 CALL timestop(handle)
1118 END SUBROUTINE convert_distributed_siesta_to_dbcsr
1119
1120 ! *** PRIVATE SUBROUTINES ***
1121
1122! **************************************************************************************************
1123!> \brief Computes number of neighbour-list nodes on the current parallel process.
1124!> \param nnodes_local number of nodes [out]
1125!> \param max_ijk_cell_image_local largest index of cell images along i, j and k cell vectors
1126!> on this parallel process [out]
1127!> \param max_ijk_cell_image largest index of cell images along i, j and k cell vectors [inout]
1128!> \param sab_nl pair-wise neighbour list [in]
1129!> \param para_env MPI parallel environment [in]
1130!> \param particle_coords list of atomic coordinates subject to periodic boundary conditions [in]
1131!> \param cell simulation unit cell [in]
1132! **************************************************************************************************
1133 SUBROUTINE get_nnodes_local(nnodes_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, para_env, particle_coords, cell)
1134 INTEGER, INTENT(out) :: nnodes_local
1135 INTEGER, DIMENSION(3), INTENT(out) :: max_ijk_cell_image_local
1136 INTEGER, DIMENSION(3), INTENT(inout) :: max_ijk_cell_image
1137 TYPE(neighbor_list_set_p_type), DIMENSION(:), &
1138 INTENT(in), POINTER :: sab_nl
1139 TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
1140 REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
1141 INTENT(in) :: particle_coords
1142 TYPE(cell_type), INTENT(in), POINTER :: cell
1143
1144 CHARACTER(len=*), PARAMETER :: routinen = 'get_nnodes_local'
1145
1146 INTEGER :: handle, iatom, icoord, jatom
1147 INTEGER, DIMENSION(3) :: cell_ijk, max_ijk_cell_image_tmp
1148 LOGICAL :: update_ncells
1149 REAL(kind=dp), DIMENSION(3) :: r_ij
1150 TYPE(neighbor_list_iterator_p_type), &
1151 DIMENSION(:), POINTER :: nl_iterator
1152
1153 CALL timeset(routinen, handle)
1154
1155 update_ncells = .false.
1156 DO icoord = 1, 3 ! x, y, z
1157 IF (max_ijk_cell_image(icoord) >= 0) THEN
1158 max_ijk_cell_image_tmp(icoord) = max_ijk_cell_image(icoord)
1159 ELSE
1160 max_ijk_cell_image_tmp(icoord) = huge(max_ijk_cell_image_tmp(icoord))
1161 update_ncells = .true.
1162 END IF
1163 END DO
1164
1165 nnodes_local = 0
1166 max_ijk_cell_image_local(:) = 0
1167
1168 CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
1169 DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
1170 CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, r=r_ij)
1171 CALL get_negf_cell_ijk(cell_ijk, r_ij, r_i=particle_coords(1:3, iatom), r_j=particle_coords(1:3, jatom), cell=cell)
1172 cell_ijk(1:3) = abs(cell_ijk(1:3))
1173
1174 IF (cell_ijk(1) <= max_ijk_cell_image_tmp(1) .AND. cell_ijk(2) <= max_ijk_cell_image_tmp(2) .AND. &
1175 cell_ijk(3) <= max_ijk_cell_image_tmp(3)) THEN
1176 nnodes_local = nnodes_local + 1
1177 max_ijk_cell_image_local(1:3) = max(max_ijk_cell_image_local(1:3), cell_ijk(1:3))
1178 END IF
1179 END DO
1180 CALL neighbor_list_iterator_release(nl_iterator)
1181
1182 IF (update_ncells) THEN
1183 max_ijk_cell_image_tmp(1:3) = max_ijk_cell_image_local(1:3)
1184 CALL para_env%max(max_ijk_cell_image_tmp)
1185 DO icoord = 1, 3
1186 IF (max_ijk_cell_image(icoord) < 0) THEN
1187 max_ijk_cell_image(icoord) = max_ijk_cell_image_tmp(icoord)
1188 END IF
1189 END DO
1190 END IF
1191
1192 CALL timestop(handle)
1193 END SUBROUTINE get_nnodes_local
1194
1195! **************************************************************************************************
1196!> \brief Construct the list of neighbour-list nodes on the current parallel process.
1197!> \param nl_local non-merged local neighbour-list nodes [out]
1198!> \param max_ijk_cell_image_local largest index of cell images along i, j and k cell vectors
1199!> on this parallel process [in]
1200!> \param max_ijk_cell_image largest index of cell images along i, j and k cell vectors [in]
1201!> \param sab_nl pair-wise neighbour list [in]
1202!> \param particle_coords list of atomic coordinates subject to periodic boundary conditions [in]
1203!> \param cell simulation unit cell [in]
1204!> \param cell_to_index array to convert 3-D cell indices to 1-D DBCSR image indices
1205!> \param do_merge merge cell images along transport direction [in]
1206!> \param node_merged_indices node-related indices. Nodes with identical indices will be merged [out]
1207!> \param k_cells list of cell image indices along the transport direction. Nodes to
1208!> be merged have the same 'node_merged_indices' but different 'k_cells' indices [out]
1209! **************************************************************************************************
1210 SUBROUTINE get_nl_nodes_local(nl_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, particle_coords, &
1211 cell, cell_to_index, do_merge, node_merged_indices, k_cells)
1212 INTEGER, DIMENSION(:, :), INTENT(out) :: nl_local
1213 INTEGER, DIMENSION(3), INTENT(in) :: max_ijk_cell_image_local, &
1214 max_ijk_cell_image
1215 TYPE(neighbor_list_set_p_type), DIMENSION(:), &
1216 INTENT(in), POINTER :: sab_nl
1217 REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
1218 INTENT(in) :: particle_coords
1219 TYPE(cell_type), INTENT(in), POINTER :: cell
1220 INTEGER, DIMENSION(:, :, :), INTENT(in), POINTER :: cell_to_index
1221 LOGICAL, INTENT(in) :: do_merge
1222 INTEGER(kind=int_8), DIMENSION(:), INTENT(out) :: node_merged_indices
1223 INTEGER, DIMENSION(:), INTENT(out) :: k_cells
1224
1225 CHARACTER(len=*), PARAMETER :: routinen = 'get_nl_nodes_local'
1226
1227 INTEGER :: handle, iatom, icol_blk, image, inode, &
1228 irow_blk, jatom, natoms
1229 INTEGER(kind=int_8), DIMENSION(2) :: ncells_siesta_local
1230 INTEGER, DIMENSION(2) :: ncells_siesta
1231 INTEGER, DIMENSION(3) :: cell_ijk_abs, cell_ijk_dbcsr, &
1232 cell_ijk_siesta
1233 LOGICAL :: do_symmetric
1234 REAL(kind=dp), DIMENSION(3) :: r_ij
1235 TYPE(neighbor_list_iterator_p_type), &
1236 DIMENSION(:), POINTER :: nl_iterator
1237
1238 CALL timeset(routinen, handle)
1239 ! natoms is only used to compute a merged 1-D index for each DBCSR block
1240 natoms = SIZE(particle_coords, 2)
1241 CALL get_neighbor_list_set_p(neighbor_list_sets=sab_nl, symmetric=do_symmetric)
1242 ncells_siesta(1:2) = 2*max_ijk_cell_image(1:2) + 1
1243
1244 ncells_siesta_local(1:2) = int(2*max_ijk_cell_image_local(1:2) + 1, kind=int_8)
1245
1246 inode = 0
1247 CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
1248 DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
1249 CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, cell=cell_ijk_dbcsr, r=r_ij)
1250 CALL get_negf_cell_ijk(cell_ijk_abs, r_ij, r_i=particle_coords(1:3, iatom), r_j=particle_coords(1:3, jatom), cell=cell)
1251
1252 IF (abs(cell_ijk_abs(1)) <= max_ijk_cell_image(1) .AND. abs(cell_ijk_abs(2)) <= max_ijk_cell_image(2) .AND. &
1253 abs(cell_ijk_abs(3)) <= max_ijk_cell_image(3)) THEN
1254
1255 inode = inode + 1
1256
1257 image = get_index_by_cell(cell_ijk_dbcsr, cell_to_index)
1258 cpassert(image > 0)
1259 nl_local(neighbor_list_dbcsr_image_index, inode) = image
1260
1261 IF (do_symmetric .AND. iatom > jatom) THEN
1262 irow_blk = jatom
1263 icol_blk = iatom
1264 cell_ijk_abs(1:3) = -cell_ijk_abs(1:3)
1265 ELSE
1266 irow_blk = iatom
1267 icol_blk = jatom
1268 END IF
1269
1270 nl_local(neighbor_list_iatom_index, inode) = irow_blk
1271 nl_local(neighbor_list_jatom_index, inode) = icol_blk
1272
1273 cell_ijk_siesta(1:3) = index_in_canonical_enumeration(cell_ijk_abs(1:3)) ! absolute -> SIESTA
1274
1275 IF (do_merge) THEN
1276 node_merged_indices(inode) = (((cell_ijk_siesta(2) - 1)*ncells_siesta_local(1) + cell_ijk_siesta(1) - 1)* &
1277 int(natoms, kind=int_8) + icol_blk - 1)*int(natoms, kind=int_8) + int(irow_blk - 1, kind=int_8)
1278 image = cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1)
1279 ELSE
1280 node_merged_indices(inode) = ((((cell_ijk_siesta(3) - 1)*ncells_siesta_local(2) + &
1281 cell_ijk_siesta(2) - 1)*ncells_siesta_local(1) + cell_ijk_siesta(1) - 1)* &
1282 int(natoms, kind=int_8) + icol_blk - 1)*int(natoms, kind=int_8) + int(irow_blk - 1, kind=int_8)
1283 image = cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1 + ncells_siesta(2)*(cell_ijk_siesta(3) - 1))
1284 END IF
1285 k_cells(inode) = cell_ijk_siesta(3)
1286 nl_local(neighbor_list_siesta_image_index, inode) = image
1287
1288 IF (do_symmetric .AND. irow_blk /= icol_blk) THEN
1289 cell_ijk_abs(1:3) = -cell_ijk_abs(1:3)
1290 cell_ijk_siesta(1:3) = index_in_canonical_enumeration(cell_ijk_abs(1:3)) ! absolute -> SIESTA
1291 IF (do_merge) cell_ijk_siesta(3) = 1
1292 nl_local(neighbor_list_siesta_transp_image_index, inode) = &
1293 cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1 + ncells_siesta(2)*(cell_ijk_siesta(3) - 1))
1294 ELSE
1295 nl_local(neighbor_list_siesta_transp_image_index, inode) = 0
1296 END IF
1297 END IF
1298 END DO
1299 CALL neighbor_list_iterator_release(nl_iterator)
1300
1301 IF (debug_this_module) THEN
1302 cpassert(SIZE(nl_local, 2) == inode)
1303 END IF
1304
1305 CALL timestop(handle)
1306 END SUBROUTINE get_nl_nodes_local
1307
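! Worked example of the SIESTA image index composition above (illustration only).
! Assume max_ijk_cell_image = (/1, 1, 1/), so that ncells_siesta(1:2) = (/3, 3/),
! and a node with cell_ijk_abs = (/0, 1, -1/):
!   cell_ijk_siesta = index_in_canonical_enumeration(cell_ijk_abs) = (/1, 2, 3/)
!   do_merge = .FALSE. : image = 1 + 3*(2 - 1 + 3*(3 - 1)) = 22
!   do_merge = .TRUE.  : image = 1 + 3*(2 - 1) = 4, while k_cells(inode) = 3 keeps
!                        track of the cell image index along the transport direction.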
1308! **************************************************************************************************
1309!> \brief Replicate (and optionally merge) pair-wise neighbour list.
1310!> \param repl_nl replicated neighbour list. It needs to be deallocated elsewhere [allocated]
1311!> \param n_dbcsr_cell_images_to_merge number of merged blocks per neighbour-list node [allocated]
1312!> \param dbcsr_cell_image_to_merge list of DBCSR image indices to merge [allocated]
1313!> \param nnodes_per_proc number of merged nodes on each parallel process [out]
1314!> \param max_ijk_cell_image largest index of cell images along i, j and k cell vectors [inout]
1315!> \param sab_nl pair-wise neighbour list [in]
1316!> \param para_env MPI parallel environment [in]
1317!> \param particle_coords list of atomic coordinates subject to periodic boundary conditions [in]
1318!> \param cell simulation unit cell [in]
1319!> \param cell_to_index array to convert 3-D cell indices to 1-D DBCSR image indices [in]
1320!> \param do_merge merge cell images along transport direction [in]
1321! **************************************************************************************************
1322 SUBROUTINE replicate_neighbour_list(repl_nl, n_dbcsr_cell_images_to_merge, dbcsr_cell_image_to_merge, &
1323 nnodes_per_proc, max_ijk_cell_image, sab_nl, para_env, particle_coords, &
1324 cell, cell_to_index, do_merge)
1325 INTEGER, ALLOCATABLE, DIMENSION(:, :), &
1326 INTENT(inout) :: repl_nl
1327 INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(inout) :: n_dbcsr_cell_images_to_merge, &
1328 dbcsr_cell_image_to_merge
1329 INTEGER, DIMENSION(0:), INTENT(out) :: nnodes_per_proc
1330 INTEGER, DIMENSION(3), INTENT(inout) :: max_ijk_cell_image
1331 TYPE(neighbor_list_set_p_type), DIMENSION(:), &
1332 INTENT(in), POINTER :: sab_nl
1333 TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
1334 REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
1335 INTENT(in) :: particle_coords
1336 TYPE(cell_type), INTENT(in), POINTER :: cell
1337 INTEGER, DIMENSION(:, :, :), INTENT(in), POINTER :: cell_to_index
1338 LOGICAL, INTENT(in) :: do_merge
1339
1340 CHARACTER(len=*), PARAMETER :: routinen = 'replicate_neighbour_list'
1341
1342 INTEGER :: handle, inode, iproc, kcell_closest, &
1343 nnodes_local, nnodes_merged, &
1344 nnodes_repl, offset_inode
1345 INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:) :: node_merged_indices
1346 INTEGER, ALLOCATABLE, DIMENSION(:) :: inodes_orig, k_cells
1347 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: nl_local
1348 INTEGER, DIMENSION(3) :: max_ijk_cell_image_local
1349
1350 CALL timeset(routinen, handle)
1351 cpassert(.NOT. ALLOCATED(repl_nl))
1352 cpassert(.NOT. ALLOCATED(n_dbcsr_cell_images_to_merge))
1353 cpassert(.NOT. ALLOCATED(dbcsr_cell_image_to_merge))
1354
1355 CALL get_nnodes_local(nnodes_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, para_env, particle_coords, cell)
1356
1357 nnodes_per_proc(:) = 0
1358
1359 IF (nnodes_local > 0) THEN
1360 ALLOCATE (nl_local(neighbor_list_dim1, nnodes_local))
1361 ALLOCATE (node_merged_indices(nnodes_local))
1362 ALLOCATE (k_cells(nnodes_local))
1363 CALL get_nl_nodes_local(nl_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, particle_coords, cell, &
1364 cell_to_index, do_merge, node_merged_indices, k_cells)
1365
1366 ALLOCATE (inodes_orig(nnodes_local))
1367 CALL sort(node_merged_indices, nnodes_local, inodes_orig)
1368
1369 nnodes_merged = 1
1370 DO inode = 2, nnodes_local
1371 IF (node_merged_indices(inode) > node_merged_indices(inode - 1)) nnodes_merged = nnodes_merged + 1
1372 END DO
1373 ELSE
1374 nnodes_merged = 0
1375 END IF
1376
1377 nnodes_per_proc(para_env%mepos) = nnodes_merged
1378 CALL para_env%sum(nnodes_per_proc)
1379
1380 nnodes_repl = sum(nnodes_per_proc(:))
1381 ALLOCATE (repl_nl(neighbor_list_dim1, nnodes_repl))
1382
1383 IF (nnodes_local > 0) THEN
1384 IF (para_env%mepos > 0) THEN
1385 offset_inode = sum(nnodes_per_proc(0:para_env%mepos - 1))
1386 ELSE
1387 offset_inode = 0
1388 END IF
1389
1390 ALLOCATE (n_dbcsr_cell_images_to_merge(nnodes_merged))
1391 ALLOCATE (dbcsr_cell_image_to_merge(nnodes_local))
1392 n_dbcsr_cell_images_to_merge(:) = 0
1393
1394 nnodes_merged = 1 !offset_inode + 1
1395 repl_nl(:, offset_inode + 1) = nl_local(:, inodes_orig(1))
1396 n_dbcsr_cell_images_to_merge(1) = 1
1397 dbcsr_cell_image_to_merge(1) = nl_local(neighbor_list_dbcsr_image_index, inodes_orig(1))
1398 kcell_closest = k_cells(inodes_orig(1))
1399 DO inode = 2, nnodes_local
1400 IF (node_merged_indices(inode) > node_merged_indices(inode - 1)) THEN
1401 nnodes_merged = nnodes_merged + 1
1402 repl_nl(:, offset_inode + nnodes_merged) = nl_local(:, inodes_orig(inode))
1403 !n_dbcsr_cell_images_to_merge(nnodes_merged) = 1
1404 kcell_closest = k_cells(inodes_orig(inode))
1405 ELSE
1406 IF (abs(k_cells(inodes_orig(inode))) < abs(kcell_closest) .OR. &
1407 (abs(k_cells(inodes_orig(inode))) == abs(kcell_closest) .AND. kcell_closest < 0)) THEN
1408 repl_nl(:, offset_inode + nnodes_merged) = nl_local(:, inodes_orig(inode))
1409 kcell_closest = k_cells(inodes_orig(inode))
1410 END IF
1411 END IF
1412 dbcsr_cell_image_to_merge(inode) = nl_local(neighbor_list_dbcsr_image_index, inodes_orig(inode))
1413 n_dbcsr_cell_images_to_merge(nnodes_merged) = n_dbcsr_cell_images_to_merge(nnodes_merged) + 1
1414 END DO
1415
1416 IF (debug_this_module) THEN
1417 cpassert(sum(n_dbcsr_cell_images_to_merge) == nnodes_local)
1418 END IF
1419
1420 DEALLOCATE (inodes_orig)
1421 DEALLOCATE (node_merged_indices, k_cells)
1422 DEALLOCATE (nl_local)
1423 END IF
1424
1425 IF (para_env%num_pe > 1) THEN
1426 offset_inode = 0
1427 DO iproc = 0, para_env%num_pe - 1
1428 IF (nnodes_per_proc(iproc) > 0) THEN
1429 CALL para_env%bcast(repl_nl(:, offset_inode + 1:offset_inode + nnodes_per_proc(iproc)), iproc)
1430 offset_inode = offset_inode + nnodes_per_proc(iproc)
1431 END IF
1432 END DO
1433 END IF
1434
1435 CALL timestop(handle)
1436 END SUBROUTINE replicate_neighbour_list
1437
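! Usage sketch (hypothetical caller; the variable names and surrounding declarations
! are illustrative assumptions, not taken from the original code):
!   INTEGER, ALLOCATABLE, DIMENSION(:, :) :: repl_nl
!   INTEGER, ALLOCATABLE, DIMENSION(:)    :: n_images_to_merge, images_to_merge, nnodes_per_proc
!   ALLOCATE (nnodes_per_proc(0:para_env%num_pe - 1))
!   CALL replicate_neighbour_list(repl_nl, n_images_to_merge, images_to_merge, &
!                                 nnodes_per_proc, max_ijk_cell_image, sab_nl, para_env, &
!                                 particle_coords, cell, cell_to_index, do_merge=.TRUE.)
!   ! ... use the replicated list, then deallocate the arrays where allocated ...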
1438! **************************************************************************************************
1439!> \brief Count number of DBCSR matrix elements that should be received from (*,nelements_dbcsr_recv)
1440!> and sent to (*,nelements_dbcsr_send) each parallel process.
1441!> \param nelements_per_proc number of non-zero matrix elements for each MPI process
1442!> \param nnodes_per_proc number of non-zero DBCSR matrix blocks (neighbour-list nodes) on each parallel process
1443!> \param nl_repl replicated neighbour list
1444!> \param matrix_dbcsr_kp DBCSR matrix
1445!> \param symmetric whether the DBCSR matrix is a symmetric one
1446!> \param para_env parallel environment
1447!> \param gather_root if >=0, gather all non-zero matrix elements on the MPI process with
1448!> gather_root rank (useful for bulk transport calculation).
1449!> If <0, distribute non-zero matrix elements across all MPI processes
1450! **************************************************************************************************
1451 SUBROUTINE count_remote_dbcsr_elements(nelements_per_proc, nnodes_per_proc, nl_repl, matrix_dbcsr_kp, &
1452 symmetric, para_env, gather_root)
1453 INTEGER(kind=int_8), DIMENSION(0:, :), INTENT(out) :: nelements_per_proc
1454 INTEGER, DIMENSION(0:), INTENT(in) :: nnodes_per_proc
1455 INTEGER, DIMENSION(:, :), INTENT(in) :: nl_repl
1456 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
1457 LOGICAL, INTENT(in) :: symmetric
1458 TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
1459 INTEGER, INTENT(in) :: gather_root
1460
1461 CHARACTER(len=*), PARAMETER :: routinen = 'count_remote_dbcsr_elements'
1462
1463 INTEGER :: first_row_minus_one, handle, icol_blk, image, image_transp, inode, inode_proc, &
1464 iproc, iproc_orb, irow_blk, irow_local, mepos, ncols_blk, ncols_local, nnodes_proc, &
1465 nprocs, nrows_blk, nrows_local, offset_inode
1466 INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size, &
1467 row_blk_offset, row_blk_size
1468 LOGICAL :: do_distribute
1469
1470 CALL timeset(routinen, handle)
1471 nelements_per_proc(:, :) = 0
1472 mepos = para_env%mepos
1473 nprocs = para_env%num_pe
1474 do_distribute = gather_root < 0
1475 IF (debug_this_module) THEN
1476 cpassert(SIZE(nnodes_per_proc) == nprocs)
1477 END IF
1478
1479 CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
1480 nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
1481 row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
1482 row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
1483
1484 offset_inode = 0
1485 iproc_orb = gather_root ! if do_distribute == .FALSE., send all matrix elements to MPI process with rank gather_root
1486 DO iproc = lbound(nnodes_per_proc, 1), ubound(nnodes_per_proc, 1)
1487 nnodes_proc = nnodes_per_proc(iproc)
1488 DO inode_proc = 1, nnodes_proc
1489 inode = inode_proc + offset_inode
1490
1491 irow_blk = nl_repl(neighbor_list_iatom_index, inode)
1492 icol_blk = nl_repl(neighbor_list_jatom_index, inode)
1493 cpassert(irow_blk <= icol_blk .OR. (.NOT. symmetric))
1494 image = nl_repl(neighbor_list_siesta_image_index, inode)
1495 image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
1496
1497 IF (image > 0) THEN
1498 nrows_local = row_blk_size(irow_blk)
1499 first_row_minus_one = row_blk_offset(irow_blk) - 1
1500 ncols_local = col_blk_size(icol_blk)
1501 DO irow_local = 1, nrows_local
1502 IF (do_distribute) THEN
1503#if defined(__SMEAGOL)
1504 CALL whichnodeorb(irow_local + first_row_minus_one, nprocs, iproc_orb)
1505#else
1506 CALL cp_abort(__location__, &
1507 "CP2K was compiled with no SMEAGOL support.")
1508#endif
1509 END IF
1510 IF (iproc_orb == mepos) THEN
1511 nelements_per_proc(iproc, nelements_dbcsr_recv) = nelements_per_proc(iproc, nelements_dbcsr_recv) + &
1512 ncols_local
1513 END IF
1514
1515 IF (iproc == mepos) THEN
1516 nelements_per_proc(iproc_orb, nelements_dbcsr_send) = nelements_per_proc(iproc_orb, nelements_dbcsr_send) + &
1517 ncols_local
1518 END IF
1519 END DO
1520 END IF
1521
1522 ! transposed block
1523 IF (image_transp > 0) THEN
1524 nrows_local = col_blk_size(icol_blk)
1525 first_row_minus_one = col_blk_offset(icol_blk) - 1
1526 ncols_local = row_blk_size(irow_blk)
1527 DO irow_local = 1, nrows_local
1528 IF (do_distribute) THEN
1529#if defined(__SMEAGOL)
1530 CALL whichnodeorb(irow_local + first_row_minus_one, nprocs, iproc_orb)
1531#else
1532 CALL cp_abort(__location__, &
1533 "CP2K was compiled with no SMEAGOL support.")
1534#endif
1535 END IF
1536 IF (iproc_orb == mepos) THEN
1537 nelements_per_proc(iproc, nelements_dbcsr_recv) = nelements_per_proc(iproc, nelements_dbcsr_recv) + &
1538 ncols_local
1539 END IF
1540
1541 IF (iproc == mepos) THEN
1542 nelements_per_proc(iproc_orb, nelements_dbcsr_send) = nelements_per_proc(iproc_orb, nelements_dbcsr_send) + &
1543 ncols_local
1544 END IF
1545 END DO
1546 END IF
1547 END DO
1548 offset_inode = offset_inode + nnodes_proc
1549 END DO
1550 CALL timestop(handle)
1551 END SUBROUTINE count_remote_dbcsr_elements
1552
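! Worked example (hypothetical distribution, illustration only). Consider a DBCSR block
! (irow_blk = 2, icol_blk = 5) originating from parallel process 3 (iproc = 3), with
! row_blk_size(2) = 4 and col_blk_size(5) = 3, and assume whichnodeorb() maps the first
! two rows of the block to rank 0 and the last two to rank 1. Then ranks 0 and 1 each add
! 2*3 = 6 to nelements_per_proc(3, nelements_dbcsr_recv), while rank 3 adds 6 to both
! nelements_per_proc(0, nelements_dbcsr_send) and nelements_per_proc(1, nelements_dbcsr_send).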
1553! **************************************************************************************************
1554!> \brief Construct list of non-zero matrix elements' indices in SIESTA format.
1555!> \param n_nonzero_cols number of non-zero matrix elements on each matrix row local to the current
1556!> MPI process
1557!> \param row_offset offset of the first non-zero matrix element for each locally-stored row
1558!> \param col_index sorted list of column indices of non-zero matrix elements
1559!> \param packed_index original (pre-sort) positions of the sorted column indices
1560!> \param nl_repl replicated neighbour list
1561!> \param matrix_dbcsr_kp DBCSR matrix
1562!> \param symmetric whether the DBCSR matrix is a symmetric one
1563!> \param para_env parallel environment
1564!> \param gather_root if >=0, gather all non-zero matrix elements on the MPI process with
1565!> gather_root rank (useful for bulk transport calculation).
1566!> If <0, distribute non-zero matrix elements across all MPI processes
1567! **************************************************************************************************
1568 SUBROUTINE get_nonzero_element_indices(n_nonzero_cols, row_offset, col_index, packed_index, &
1569 nl_repl, matrix_dbcsr_kp, symmetric, para_env, gather_root)
1570 INTEGER, DIMENSION(:), INTENT(out) :: n_nonzero_cols, row_offset, col_index, &
1571 packed_index
1572 INTEGER, DIMENSION(:, :), INTENT(in) :: nl_repl
1573 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
1574 LOGICAL, INTENT(in) :: symmetric
1575 TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
1576 INTEGER, INTENT(in) :: gather_root
1577
1578 CHARACTER(len=*), PARAMETER :: routinen = 'get_nonzero_element_indices'
1579
1580 INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
1581 icol_offset, image, image_transp, inode, irow_blk, irow_local, irow_proc, mepos, &
1582 ncols_blk, ncols_local, ncols_total, nnodes, nprocs, nrows_blk, nrows_local, nrows_total
1583 INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size, &
1584 row_blk_offset, row_blk_size
1585 LOGICAL :: do_distribute, is_root_rank
1586
1587 CALL timeset(routinen, handle)
1588 n_nonzero_cols(:) = 0
1589 mepos = para_env%mepos
1590 nprocs = para_env%num_pe
1591 do_distribute = gather_root < 0
1592 is_root_rank = gather_root == mepos
1593
1594 CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
1595 nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
1596 nfullrows_total=nrows_total, nfullcols_total=ncols_total, &
1597 row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
1598 row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
1599
1600 nnodes = SIZE(nl_repl, 2)
1601 DO inode = 1, nnodes
1602 irow_blk = nl_repl(neighbor_list_iatom_index, inode)
1603 icol_blk = nl_repl(neighbor_list_jatom_index, inode)
1604 cpassert(irow_blk <= icol_blk .OR. (.NOT. symmetric))
1605 image = nl_repl(neighbor_list_siesta_image_index, inode)
1606 image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
1607
1608 IF (image > 0) THEN
1609 nrows_local = row_blk_size(irow_blk)
1610 first_row_minus_one = row_blk_offset(irow_blk) - 1
1611 ncols_local = col_blk_size(icol_blk)
1612 DO irow_local = 1, nrows_local
1613 IF (do_distribute) THEN
1614#if defined(__SMEAGOL)
1615 CALL globaltolocalorb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
1616#else
1617 CALL cp_abort(__location__, &
1618 "CP2K was compiled with no SMEAGOL support.")
1619#endif
1620 ELSE
1621 IF (is_root_rank) THEN
1622 irow_proc = irow_local + first_row_minus_one
1623 ELSE
1624 irow_proc = 0
1625 END IF
1626 END IF
1627 IF (irow_proc > 0) THEN
1628 n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
1629 END IF
1630 END DO
1631 END IF
1632
1633 ! transposed block
1634 IF (image_transp > 0) THEN
1635 nrows_local = col_blk_size(icol_blk)
1636 first_row_minus_one = col_blk_offset(icol_blk) - 1
1637 ncols_local = row_blk_size(irow_blk)
1638 DO irow_local = 1, nrows_local
1639 IF (do_distribute) THEN
1640#if defined(__SMEAGOL)
1641 CALL globaltolocalorb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
1642#else
1643 CALL cp_abort(__location__, &
1644 "CP2K was compiled with no SMEAGOL support.")
1645#endif
1646 ELSE
1647 IF (is_root_rank) THEN
1648 irow_proc = irow_local + first_row_minus_one
1649 ELSE
1650 irow_proc = 0
1651 END IF
1652 END IF
1653 IF (irow_proc > 0) THEN
1654 n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
1655 END IF
1656 END DO
1657 END IF
1658 END DO
1659
1660 row_offset(1) = 0
1661 DO irow_local = 1, SIZE(n_nonzero_cols) - 1
1662 row_offset(irow_local + 1) = row_offset(irow_local) + n_nonzero_cols(irow_local)
1663 END DO
1664
1665 n_nonzero_cols(:) = 0
1666 col_index(:) = 0
1667 DO inode = 1, nnodes
1668 irow_blk = nl_repl(neighbor_list_iatom_index, inode)
1669 icol_blk = nl_repl(neighbor_list_jatom_index, inode)
1670 cpassert(irow_blk <= icol_blk .OR. (.NOT. symmetric))
1671 image = nl_repl(neighbor_list_siesta_image_index, inode)
1672 image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
1673
1674 IF (image > 0) THEN
1675 nrows_local = row_blk_size(irow_blk)
1676 first_row_minus_one = row_blk_offset(irow_blk) - 1
1677 ncols_local = col_blk_size(icol_blk)
1678 first_col_minus_one = col_blk_offset(icol_blk) + (image - 1)*ncols_total - 1
1679 DO irow_local = 1, nrows_local
1680 IF (do_distribute) THEN
1681#if defined(__SMEAGOL)
1682 CALL globaltolocalorb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
1683#else
1684 CALL cp_abort(__location__, &
1685 "CP2K was compiled with no SMEAGOL support.")
1686#endif
1687 ELSE
1688 IF (is_root_rank) THEN
1689 irow_proc = irow_local + first_row_minus_one
1690 ELSE
1691 irow_proc = 0
1692 END IF
1693 END IF
1694 IF (irow_proc > 0) THEN
1695 icol_offset = row_offset(irow_proc) + n_nonzero_cols(irow_proc)
1696 DO icol_local = 1, ncols_local
1697 col_index(icol_offset + icol_local) = first_col_minus_one + icol_local
1698 END DO
1699 n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
1700 END IF
1701 END DO
1702 END IF
1703
1704 ! transposed block
1705 IF (image_transp > 0) THEN
1706 nrows_local = col_blk_size(icol_blk)
1707 first_row_minus_one = col_blk_offset(icol_blk) - 1
1708 ncols_local = row_blk_size(irow_blk)
1709 first_col_minus_one = row_blk_offset(irow_blk) + (image_transp - 1)*nrows_total - 1
1710 DO irow_local = 1, nrows_local
1711 IF (do_distribute) THEN
1712#if defined(__SMEAGOL)
1713 CALL globaltolocalorb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
1714#else
1715 CALL cp_abort(__location__, &
1716 "CP2K was compiled with no SMEAGOL support.")
1717#endif
1718 ELSE
1719 IF (is_root_rank) THEN
1720 irow_proc = irow_local + first_row_minus_one
1721 ELSE
1722 irow_proc = 0
1723 END IF
1724 END IF
1725 IF (irow_proc > 0) THEN
1726 icol_offset = row_offset(irow_proc) + n_nonzero_cols(irow_proc)
1727 DO icol_local = 1, ncols_local
1728 col_index(icol_offset + icol_local) = first_col_minus_one + icol_local
1729 END DO
1730 n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
1731 END IF
1732 END DO
1733 END IF
1734 END DO
1735
1736 IF (SIZE(n_nonzero_cols) > 0) THEN
1737 DO irow_local = 1, SIZE(n_nonzero_cols)
1738 CALL sort(col_index(row_offset(irow_local) + 1:row_offset(irow_local) + n_nonzero_cols(irow_local)), &
1739 n_nonzero_cols(irow_local), &
1740 packed_index(row_offset(irow_local) + 1:row_offset(irow_local) + n_nonzero_cols(irow_local)))
1741 END DO
1742 END IF
1743
1744 CALL timestop(handle)
1745 END SUBROUTINE get_nonzero_element_indices
1746
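! Illustration of the output arrays (hypothetical numbers). For a local row irow whose
! non-zero supercell columns are encountered in the order 7, 3, 5:
!   n_nonzero_cols(irow) = 3
!   col_index(row_offset(irow)+1 : row_offset(irow)+3)    = (/3, 5, 7/)  (sorted)
!   packed_index(row_offset(irow)+1 : row_offset(irow)+3) = (/2, 3, 1/)  (pre-sort positions)
! so that matrix elements packed in traversal order can be mapped onto the sorted
! SIESTA column order.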
1747! **************************************************************************************************
1748!> \brief Get absolute i, j, and k indices of cell image for the given DBCSR matrix block.
1749!> Effective cell image recorded in the neighbour-list (where the matrix block is actually
1750!> stored) depends on atomic coordinates and can be significantly different.
1751!> \param cell_ijk array with 3 indices along the cell's vectors
1752!> \param r_ij actual interatomic vector (R_j - r_i), where R_j are the coordinates
1753!> of the j-th atom in the supercell
1754!> \param r_i coordinates of the i-th atom in the primary unit cell
1755!> \param r_j coordinates of the j-th atom in the primary unit cell
1756!> \param cell unit cell
1757! **************************************************************************************************
1758 SUBROUTINE get_negf_cell_ijk(cell_ijk, r_ij, r_i, r_j, cell)
1759 INTEGER, DIMENSION(3), INTENT(out) :: cell_ijk
1760 REAL(kind=dp), DIMENSION(3), INTENT(in) :: r_ij, r_i, r_j
1761 TYPE(cell_type), INTENT(in), POINTER :: cell
1762
1763 REAL(kind=dp), DIMENSION(3) :: coords_scaled, r
1764
1765 r(:) = r_ij(:) + r_i(:) - r_j(:)
1766 CALL real_to_scaled(coords_scaled, r, cell)
1767 cell_ijk(:) = nint(coords_scaled(:))
1768 END SUBROUTINE get_negf_cell_ijk
1769
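! Worked example (illustration only). For an orthorhombic cell with hmat = diag(10, 10, 10),
! r_i = (/1, 1, 1/), r_j = (/9, 1, 1/) and an iterator vector r_ij = (/-2, 0, 0/):
!   r = r_ij + r_i - r_j = (/-10, 0, 0/)  ->  coords_scaled = (/-1, 0, 0/)
!   cell_ijk = (/-1, 0, 0/), i.e. the interacting image of atom j lies one lattice
!   vector below the primary unit cell along the first cell vector.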
1770! **************************************************************************************************
1771!> \brief Return the index of an integer number in the sequence 0, 1, -1, ..., n, -n, ...
1772!> (canonical enumeration of integers, oeis.org/A001057).
1773!> \param inum integer number [in]
1774!> \return index of 'inum' in A001057
1775!> \note Cell images in SMEAGOL / SIESTA are ordered according to A001057. Therefore this
1776!> function converts the absolute index of a cell replica along some (x/y/z) dimension
1777!> into the corresponding SIESTA index.
1778! **************************************************************************************************
1779 ELEMENTAL FUNCTION index_in_canonical_enumeration(inum) RESULT(ind)
1780 INTEGER, INTENT(in) :: inum
1781 INTEGER :: ind
1782
1783 INTEGER :: inum_abs, is_non_positive
1784
1785 inum_abs = abs(inum)
1786 !IF (inum <= 0) THEN; is_non_positive = 1; ELSE; is_non_positive = 0; END IF
1787 is_non_positive = merge(1, 0, inum <= 0)
1788
1789 ! inum = 0 -> inum_abs = 0, is_non_positive = 1 -> ind = 1
1790 ! inum = 1 -> inum_abs = 1, is_non_positive = 0 -> ind = 2
1791 ! inum = -1 -> inum_abs = 1, is_non_positive = 1 -> ind = 3
1792 ind = 2*inum_abs + is_non_positive
1793 END FUNCTION index_in_canonical_enumeration
1794
1795! **************************************************************************************************
1796!> \brief Return an integer number according to its index in the sequence 0, 1, -1, ..., n, -n, ...
1797!> (canonical enumeration of integers, oeis.org/A001057)
1798!> \param ind index in A001057 starting from 1
1799!> \return integer number according to its position 'ind' in A001057
1800!> \note Cell images in SMEAGOL / SIESTA are ordered according to A001057. Therefore this
1801!> function converts SIESTA's index of a cell replica along some (x/y/z) dimension
1802!> into the corresponding absolute index.
1803! **************************************************************************************************
1804 ELEMENTAL FUNCTION number_from_canonical_enumeration(ind) RESULT(inum)
1805 INTEGER, INTENT(in) :: ind
1806 INTEGER :: inum
1807
1808 ! ind < 1 is invalid
1809 ! ind = 1 -> SIGN(0, -1) = 0
1810 ! ind = 2 -> SIGN(1, 0) = 1
1811 ! ind = 3 -> SIGN(1, -1) = -1
1812 inum = sign(ind/2, -mod(ind, 2))
1813 END FUNCTION number_from_canonical_enumeration
1814
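! Mapping between integers and their positions in A001057 (for reference):
!   inum :  0   1  -1   2  -2   3  -3 ...
!   ind  :  1   2   3   4   5   6   7 ...
! The two functions are mutually inverse:
!   number_from_canonical_enumeration(index_in_canonical_enumeration(n)) == n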
1815! **************************************************************************************************
1816!> \brief Apply periodic boundary conditions defined by a simulation cell to a position
1817!> vector r. Similar to pbc1 from cell_types.F but wraps the scaled coordinates into
1818!> the range [0, 1) instead of [-0.5, 0.5) before converting them back to Cartesian coordinates
1819!> \param r_pbc position vector subject to the periodic boundary conditions [out]
1820!> \param r initial position vector [in]
1821!> \param cell simulation unit cell [in]
1822! **************************************************************************************************
1823 PURE SUBROUTINE pbc_0_1(r_pbc, r, cell)
1824 REAL(kind=dp), DIMENSION(3), INTENT(out) :: r_pbc
1825 REAL(kind=dp), DIMENSION(3), INTENT(in) :: r
1826 TYPE(cell_type), INTENT(in), POINTER :: cell
1827
1828 REAL(kind=dp), DIMENSION(3) :: s
1829
1830 IF (cell%orthorhombic) THEN
1831 r_pbc(1) = r(1) - cell%hmat(1, 1)*cell%perd(1)*real(floor(cell%h_inv(1, 1)*r(1)), dp)
1832 r_pbc(2) = r(2) - cell%hmat(2, 2)*cell%perd(2)*real(floor(cell%h_inv(2, 2)*r(2)), dp)
1833 r_pbc(3) = r(3) - cell%hmat(3, 3)*cell%perd(3)*real(floor(cell%h_inv(3, 3)*r(3)), dp)
1834 ELSE
1835 s(1) = cell%h_inv(1, 1)*r(1) + cell%h_inv(1, 2)*r(2) + cell%h_inv(1, 3)*r(3)
1836 s(2) = cell%h_inv(2, 1)*r(1) + cell%h_inv(2, 2)*r(2) + cell%h_inv(2, 3)*r(3)
1837 s(3) = cell%h_inv(3, 1)*r(1) + cell%h_inv(3, 2)*r(2) + cell%h_inv(3, 3)*r(3)
1838 s(1) = s(1) - cell%perd(1)*real(floor(s(1)), dp)
1839 s(2) = s(2) - cell%perd(2)*real(floor(s(2)), dp)
1840 s(3) = s(3) - cell%perd(3)*real(floor(s(3)), dp)
1841 r_pbc(1) = cell%hmat(1, 1)*s(1) + cell%hmat(1, 2)*s(2) + cell%hmat(1, 3)*s(3)
1842 r_pbc(2) = cell%hmat(2, 1)*s(1) + cell%hmat(2, 2)*s(2) + cell%hmat(2, 3)*s(3)
1843 r_pbc(3) = cell%hmat(3, 1)*s(1) + cell%hmat(3, 2)*s(2) + cell%hmat(3, 3)*s(3)
1844 END IF
1845 END SUBROUTINE pbc_0_1
1846
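! Worked example (illustration only). For a fully periodic orthorhombic cell with
! hmat = diag(10, 10, 10) and r = (/-2.5, 11.0, 3.0/):
!   r_pbc = (/7.5, 1.0, 3.0/)   (scaled coordinates wrapped into [0, 1))
! whereas the [-0.5, 0.5) convention would leave the first component at -2.5.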
1847! **************************************************************************************************
1848!> \brief Computes the number of send requests from this MPI process to all the other processes.
1849!> Alternatively, computes the number of recv requests that the given process
1850!> expects from all the other processes.
1851!> \param mepos MPI rank of the given process
1852!> \param nelements_per_proc number of elements to send / receive
1853!> \param max_nelements_per_packet maximum number of elements per single MPI request
1854!> \return number of MPI requests
1855! **************************************************************************************************
1856 PURE FUNCTION get_number_of_mpi_sendrecv_requests(mepos, nelements_per_proc, max_nelements_per_packet) RESULT(nrequests)
1857 INTEGER, INTENT(in) :: mepos
1858 INTEGER(kind=int_8), DIMENSION(0:), INTENT(in) :: nelements_per_proc
1859 INTEGER(kind=int_8), INTENT(in) :: max_nelements_per_packet
1860 INTEGER :: nrequests
1861
1862 INTEGER :: iproc
1863
1864 nrequests = 0
1865 DO iproc = lbound(nelements_per_proc, 1), ubound(nelements_per_proc, 1)
1866 ! there is no need to send data to the same MPI process
1867 IF (iproc /= mepos) THEN
1868 nrequests = nrequests + int(nelements_per_proc(iproc)/max_nelements_per_packet)
1869 IF (mod(nelements_per_proc(iproc), max_nelements_per_packet) > 0) &
1870 nrequests = nrequests + 1
1871 END IF
1872 END DO
1873 END FUNCTION get_number_of_mpi_sendrecv_requests
1874
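! Worked example (illustration only). With max_mpi_packet_size_dp = 134217728/8 = 16777216
! matrix elements per packet, a peer process holding 40000000 elements requires
!   40000000 / 16777216 = 2 full packets plus a non-empty remainder  ->  3 requests.
! The process' own elements (iproc == mepos) do not generate any requests.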
1875! **************************************************************************************************
1876!> \brief Map non-zero matrix elements onto MPI requests.
1877!> \param element_offset offset (index-1) of the first element [out]
1878!> \param nelements_per_request number of elements for each request [out]
1879!> \param peer_rank rank of the peer MPI process for each request
1880!> \param tag MPI tag
1881!> \param mepos MPI rank of a given MPI process
1882!> \param nelements_per_proc number of elements to be sent / received by the current MPI process
1883!> \param max_nelements_per_packet maximum number of elements per single MPI request
1884! **************************************************************************************************
1885 SUBROUTINE assign_nonzero_elements_to_requests(element_offset, nelements_per_request, peer_rank, tag, &
1886 mepos, nelements_per_proc, max_nelements_per_packet)
1887 INTEGER(kind=int_8), DIMENSION(:), INTENT(out) :: element_offset, nelements_per_request
1888 INTEGER, DIMENSION(:), INTENT(out) :: peer_rank, tag
1889 INTEGER, INTENT(in) :: mepos
1890 INTEGER(kind=int_8), DIMENSION(0:), INTENT(in) :: nelements_per_proc
1891 INTEGER(kind=int_8), INTENT(in) :: max_nelements_per_packet
1892
1893 INTEGER :: iproc, irequest, nrequests, &
1894 request_offset
1895 INTEGER(kind=int_8) :: element_offset_tmp, nelements
1896
1897 request_offset = 0
1898 element_offset_tmp = 0
1899 DO iproc = lbound(nelements_per_proc, 1), ubound(nelements_per_proc, 1)
1900 IF (iproc /= mepos) THEN
1901 nrequests = int(nelements_per_proc(iproc)/max_nelements_per_packet)
1902 IF (mod(nelements_per_proc(iproc), max_nelements_per_packet) > 0) nrequests = nrequests + 1
1903 cpassert(nrequests <= max_mpi_rank + 1)
1904 IF (nrequests > 0) THEN
1905 nelements = nelements_per_proc(iproc)/nrequests
1906 IF (nelements_per_proc(iproc) - nelements*nrequests > 0) nelements = nelements + 1
1907 cpassert(nelements <= max_nelements_per_packet)
1908
1909 DO irequest = 1, nrequests
1910 element_offset(request_offset + irequest) = (irequest - 1)*nelements + element_offset_tmp
1911 IF (irequest < nrequests) THEN
1912 nelements_per_request(request_offset + irequest) = nelements
1913 ELSE
1914 nelements_per_request(request_offset + irequest) = nelements_per_proc(iproc) - nelements*(nrequests - 1)
1915 END IF
1916 peer_rank(request_offset + irequest) = iproc
1917 tag(request_offset + irequest) = irequest - 1
1918 END DO
1919 END IF
1920 request_offset = request_offset + nrequests
1921 END IF
1922 element_offset_tmp = element_offset_tmp + nelements_per_proc(iproc)
1923 END DO
1924
1925 IF (debug_this_module) THEN
1926 cpassert(SIZE(element_offset) == request_offset)
1927 cpassert(SIZE(nelements_per_request) == request_offset)
1928 cpassert(SIZE(peer_rank) == request_offset)
1929 cpassert(SIZE(tag) == request_offset)
1930 END IF
1931 END SUBROUTINE assign_nonzero_elements_to_requests
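! Worked example (illustration only), continuing the case above: 40000000 elements for one
! peer process are split into nrequests = 3, with nelements = 40000000/3 + 1 = 13333334,
! giving per-request sizes (/13333334, 13333334, 13333332/), element offsets (relative to
! the start of that peer's data) of (/0, 13333334, 26666668/), and tags (/0, 1, 2/).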
1932
1933END MODULE smeagol_matrix_utils