(git:ccc2433)
submatrix_dissection.F
Go to the documentation of this file.
1 !--------------------------------------------------------------------------------------------------!
2 ! CP2K: A general program to perform molecular dynamics simulations !
3 ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 ! !
5 ! SPDX-License-Identifier: GPL-2.0-or-later !
6 !--------------------------------------------------------------------------------------------------!
7 
8 ! **************************************************************************************************
9 !> \note
10 !> This module implements a modified version of the submatrix method, proposed in
11 !> M. Lass, S. Mohr, H. Wiebeler, T. Kuehne, C. Plessl
12 !> A Massively Parallel Algorithm for the Approximate Calculation of Inverse p-th Roots of Large Sparse Matrices
13 !> Proc. Platform for Advanced Scientific Computing (PASC) Conference, ACM, 2018
14 !>
15 !> The method is extended to minimize the required data transfers and floating-point operations under the assumption that non-zero
16 !> blocks of the matrix are close to its diagonal.
17 !>
18 !> Submatrices can be constructed not for single columns of the matrix but for a set of w consecutive submatrices. The underlying
19 !> assumption is that columns next to each other have relatively similar occupation patterns, i.e., constructing a submatrix from
20 !> columns x and x+1 should not lead to a much bigger submatrix than contructing it only from column x.
21 !>
22 !> The construction of the submatrices requires communication between all ranks. It is crucial to implement this communication as
23 !> efficient as possible, i.e., data should only ever be transferred once between two ranks and message sizes need to be
24 !> sufficiently large to utilize the communication bandwidth. To achieve this, we communicate the required blocks for all
25 !> submatrices at once and copy them to large buffers before transmitting them via MPI.
26 !>
27 !> Note on multi-threading:
28 !> Submatrices can be constructed and processed in parallel by multiple threads. However, generate_submatrix, get_sm_ids_for_rank
29 !> and copy_resultcol are the only thread-safe routines in this module. All other routines involve MPI communication or operate on
30 !> common, non-protected data and are hence not thread-safe.
31 !>
32 !> TODO:
33 !> * generic types (for now only dp supported)
34 !> * optimization of threaded initialization
35 !> * sanity checks at the beginning of all methods
36 !>
37 !> \author Michael Lass
38 ! **************************************************************************************************
39 
41 
42  USE bibliography, ONLY: lass2018,&
43  cite_reference
44  USE dbcsr_api, ONLY: &
45  dbcsr_distribution_get, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
46  dbcsr_get_info, dbcsr_get_stored_coordinates, dbcsr_iterator_blocks_left, &
47  dbcsr_iterator_next_block, dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, &
48  dbcsr_put_block, dbcsr_type
49  USE kinds, ONLY: dp
50  USE message_passing, ONLY: mp_comm_type
51  USE submatrix_types, ONLY: buffer_type,&
52  bufptr_type,&
54  set_type,&
56 
57 !$ USE omp_lib, ONLY: omp_get_max_threads, omp_get_thread_num
58 
59 #include "./base/base_uses.f90"
60 
61  IMPLICIT NONE
62  PRIVATE
63 
64  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'submatrix_dissection'
65 
66  TYPE, PUBLIC :: submatrix_dissection_type
67  TYPE(dbcsr_type) :: dbcsr_mat
68  TYPE(dbcsr_distribution_type) :: dist
69  LOGICAL :: initialized = .false.
70  TYPE(mp_comm_type) :: group
71  INTEGER :: numnodes, myrank, nblkcols, nblkrows, nblks, local_blocks, &
72  cols_per_sm, number_of_submatrices
73  INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size
74  INTEGER, DIMENSION(:), ALLOCATABLE :: coo_cols, coo_rows, coo_col_offsets, coo_cols_local, coo_rows_local, &
75  coo_col_offsets_local, submatrix_owners, submatrix_sizes
76  TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: recvbufs, result_sendbufs ! Indexing starts with 0 to match rank ids!
77  TYPE(set_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank, result_blocks_from_rank
78  TYPE(bufptr_type), DIMENSION(:), ALLOCATABLE :: coo_dptr
79  TYPE(intbuffer_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank_buf_offsets
80  CONTAINS
81  PROCEDURE :: init => submatrix_dissection_init
82  PROCEDURE :: final => submatrix_dissection_final
83  PROCEDURE :: get_sm_ids_for_rank => submatrix_get_sm_ids_for_rank
84  PROCEDURE :: generate_submatrix => submatrix_generate_sm
85  PROCEDURE :: copy_resultcol => submatrix_cpy_resultcol
86  PROCEDURE :: communicate_results => submatrix_communicate_results
87  PROCEDURE :: get_relevant_sm_columns => submatrix_get_relevant_sm_columns
89 
90 CONTAINS
91 
92 ! **************************************************************************************************
93 !> \brief determine which columns of the submatrix are relevant for the result matrix
94 !> \param this - object of class submatrix_dissection_type
95 !> \param sm_id - id of the submatrix
96 !> \param first - first column of submatrix that is relevant
97 !> \param last - last column of submatrix that is relevant
98 ! **************************************************************************************************
99  SUBROUTINE submatrix_get_relevant_sm_columns(this, sm_id, first, last)
100  CLASS(submatrix_dissection_type), INTENT(IN) :: this
101  INTEGER, INTENT(IN) :: sm_id
102  INTEGER, INTENT(OUT) :: first, last
103  INTEGER :: i, j, blkid
104  TYPE(set_type) :: nonzero_rows
105 
106  ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
107  DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
108  DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
109  CALL nonzero_rows%insert(this%coo_rows(j))
110  END DO
111  END DO
112 
113  first = 1
114  DO i = 1, nonzero_rows%elements
115  blkid = nonzero_rows%get(i)
116  IF (blkid .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
117  ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
118  ! Now add up block sizes to determine the last one as well
119  last = first - 1
120  DO j = i, nonzero_rows%elements
121  blkid = nonzero_rows%get(j)
122  last = last + this%col_blk_size(blkid)
123  IF (blkid .EQ. (sm_id)*this%cols_per_sm) EXIT
124  END DO
125  EXIT
126  END IF
127  first = first + this%col_blk_size(blkid)
128  END DO
129 
130  CALL nonzero_rows%reset
131 
132  END SUBROUTINE submatrix_get_relevant_sm_columns
133 
134 ! **************************************************************************************************
135 !> \brief initialize submatrix dissection and communicate, needs to be called before constructing any submatrices.
136 !> \param this - object of class submatrix_dissection_type
137 !> \param matrix_p - dbcsr input matrix
138 !> \par History
139 !> 2020.02 created [Michael Lass]
140 !> 2020.05 add time measurements [Michael Lass]
141 ! **************************************************************************************************
142  SUBROUTINE submatrix_dissection_init(this, matrix_p) ! Should be PURE but the iterator routines are not
143  CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
144  TYPE(dbcsr_type), INTENT(IN) :: matrix_p
145 
146  INTEGER :: cur_row, cur_col, cur_blk, i, j, k, l, m, l_limit_left, l_limit_right, &
147  bufsize, bufsize_next
148  INTEGER, DIMENSION(:), ALLOCATABLE :: blocks_per_rank, coo_dsplcmnts, num_blockids_send, num_blockids_recv
149  TYPE(dbcsr_iterator_type) :: iter
150  TYPE(set_type) :: nonzero_rows
151  REAL(KIND=dp) :: flops_total, flops_per_rank, flops_per_sm, flops_threshold, &
152  flops_current, flops_remaining
153 
154  ! Indexing for the following arrays starts with 0 to match rank ids
155  TYPE(set_type), DIMENSION(:), ALLOCATABLE :: blocks_from_rank
156  TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: sendbufs
157  TYPE(intbuffer_type), DIMENSION(:), ALLOCATABLE :: blocks_for_rank
158 
159  ! Additional structures for threaded parts
160  INTEGER :: numthreads, mytid, group_handle
161  ! Indexing starts at 0 to match thread ids
162  TYPE(setarray_type), DIMENSION(:), ALLOCATABLE :: nonzero_rows_t
163  TYPE(setarray_type), DIMENSION(:), ALLOCATABLE :: result_blocks_from_rank_t, result_blocks_for_rank_t, &
164  blocks_from_rank_t
165 
166  LOGICAL :: valid
167  REAL(KIND=dp), DIMENSION(:), POINTER :: blockp
168 
169  CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_dissection_init'
170  INTEGER :: handle
171 
172  CALL timeset(routinen, handle)
173  CALL cite_reference(lass2018)
174 
175  this%dbcsr_mat = matrix_p
176  CALL dbcsr_get_info(matrix=this%dbcsr_mat, nblkcols_total=this%nblkcols, nblkrows_total=this%nblkrows, &
177  row_blk_size=this%row_blk_size, col_blk_size=this%col_blk_size, group=group_handle, distribution=this%dist)
178  CALL dbcsr_distribution_get(dist=this%dist, mynode=this%myrank, numnodes=this%numnodes)
179 
180  CALL this%group%set_handle(group_handle)
181 
182  IF (this%nblkcols .NE. this%nblkrows) THEN
183  cpabort("Number of block rows and cols need to be identical")
184  END IF
185 
186  DO i = 1, this%nblkcols
187  IF (this%col_blk_size(i) .NE. this%row_blk_size(i)) THEN
188  cpabort("Block row sizes and col sizes need to be identical")
189  END IF
190  END DO
191 
192  ! TODO: We could do even more sanity checks here, e.g., the matrix must not be stored symmetrically
193 
194  ! For the submatrix method, we need global knwoledge about which blocks are actually used. Therefore, we create a COO
195  ! representation of the blocks (without their contents) on all ranks.
196 
197  ! TODO: Right now, the COO contains all blocks. Also we transfer all blocks. We could skip half of them due to the matrix
198  ! being symmetric (however, we need to transpose the blocks). This can increase performance only by a factor of 2 and is
199  ! therefore deferred.
200 
201  ! Determine number of locally stored blocks
202  this%local_blocks = 0
203  CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.true.)
204  DO WHILE (dbcsr_iterator_blocks_left(iter))
205  CALL dbcsr_iterator_next_block(iter, cur_row, cur_col, cur_blk)
206  this%local_blocks = this%local_blocks + 1
207  END DO
208  CALL dbcsr_iterator_stop(iter)
209 
210  ALLOCATE (this%coo_cols_local(this%local_blocks), this%coo_rows_local(this%local_blocks), blocks_per_rank(this%numnodes), &
211  coo_dsplcmnts(this%numnodes), this%coo_col_offsets_local(this%nblkcols + 1), &
212  blocks_for_rank(0:this%numnodes - 1), blocks_from_rank(0:this%numnodes - 1), &
213  sendbufs(0:this%numnodes - 1), this%recvbufs(0:this%numnodes - 1), this%result_sendbufs(0:this%numnodes - 1), &
214  this%result_blocks_for_rank(0:this%numnodes - 1), this%result_blocks_from_rank(0:this%numnodes - 1), &
215  this%result_blocks_for_rank_buf_offsets(0:this%numnodes - 1))
216 
217  i = 1
218  CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.true.)
219  DO WHILE (dbcsr_iterator_blocks_left(iter))
220  CALL dbcsr_iterator_next_block(iter, cur_row, cur_col, cur_blk)
221  this%coo_cols_local(i) = cur_col
222  this%coo_rows_local(i) = cur_row
223  i = i + 1
224  END DO
225  CALL dbcsr_iterator_stop(iter)
226 
227  ! We only know how many blocks we own. What's with the other ranks?
228  CALL this%group%allgather(msgout=this%local_blocks, msgin=blocks_per_rank)
229  coo_dsplcmnts(1) = 0
230  DO i = 1, this%numnodes - 1
231  coo_dsplcmnts(i + 1) = coo_dsplcmnts(i) + blocks_per_rank(i)
232  END DO
233 
234  ! Get a global view on the matrix
235  this%nblks = sum(blocks_per_rank)
236  ALLOCATE (this%coo_cols(this%nblks), this%coo_rows(this%nblks), this%coo_col_offsets(this%nblkcols + 1), &
237  this%coo_dptr(this%nblks))
238  CALL this%group%allgatherv(msgout=this%coo_rows_local, msgin=this%coo_rows, rcount=blocks_per_rank, &
239  rdispl=coo_dsplcmnts)
240  CALL this%group%allgatherv(msgout=this%coo_cols_local, msgin=this%coo_cols, rcount=blocks_per_rank, &
241  rdispl=coo_dsplcmnts)
242 
243  DEALLOCATE (blocks_per_rank, coo_dsplcmnts)
244 
245  ! Sort COO arrays according to their columns
246  CALL qsort_two(this%coo_cols_local, this%coo_rows_local, 1, this%local_blocks)
247  CALL qsort_two(this%coo_cols, this%coo_rows, 1, this%nblks)
248 
249  ! Get COO array offsets for columns to accelerate lookups
250  this%coo_col_offsets(this%nblkcols + 1) = this%nblks + 1
251  j = 1
252  DO i = 1, this%nblkcols
253  DO WHILE ((j .LE. this%nblks))
254  IF (this%coo_cols(j) .GE. i) EXIT
255  j = j + 1
256  END DO
257  this%coo_col_offsets(i) = j
258  END DO
259 
260  ! Same for local COO
261  this%coo_col_offsets_local(this%nblkcols + 1) = this%local_blocks + 1
262  j = 1
263  DO i = 1, this%nblkcols
264  DO WHILE ((j .LE. this%local_blocks))
265  IF (this%coo_cols_local(j) .GE. i) EXIT
266  j = j + 1
267  END DO
268  this%coo_col_offsets_local(i) = j
269  END DO
270 
271  ! We could combine multiple columns to generate a single submatrix. For now, we have not found a practical use-case for this
272  ! so we only look at single columns for now.
273  this%cols_per_sm = 1
274 
275  ! Determine sizes of all submatrices. This is required in order to assess the computational effort that is required to process
276  ! the submatrices.
277  this%number_of_submatrices = this%nblkcols/this%cols_per_sm
278  ALLOCATE (this%submatrix_sizes(this%number_of_submatrices))
279  this%submatrix_sizes = 0
280  flops_total = 0.0d0
281  DO i = 1, this%number_of_submatrices
282  CALL nonzero_rows%reset
283  DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm ! all colums that determine submatrix i
284  DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
285  CALL nonzero_rows%insert(this%coo_rows(k))
286  END DO
287  END DO
288  DO j = 1, nonzero_rows%elements
289  this%submatrix_sizes(i) = this%submatrix_sizes(i) + this%col_blk_size(nonzero_rows%get(j))
290  END DO
291  flops_total = flops_total + 2.0d0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
292  END DO
293 
294  ! Create mapping from submatrices to ranks. Since submatrices can be of different sizes, we need to perform some load
295  ! balancing here. For that we assume that arithmetic operations performed on the submatrices scale cubically.
296  ALLOCATE (this%submatrix_owners(this%number_of_submatrices))
297  flops_per_sm = flops_total/this%number_of_submatrices
298  flops_per_rank = flops_total/this%numnodes
299  flops_current = 0.0d0
300  j = 0
301  DO i = 1, this%number_of_submatrices
302  this%submatrix_owners(i) = j
303  flops_current = flops_current + 2.0d0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
304  flops_remaining = flops_total - flops_current
305  flops_threshold = (this%numnodes - j - 1)*flops_per_rank
306  IF ((j .LT. (this%numnodes - 1)) &
307  .AND. ((flops_remaining .LE. flops_threshold &
308  .OR. (this%number_of_submatrices - i) .LT. (this%numnodes - j)))) THEN
309  j = j + 1
310  flops_total = flops_total - flops_current
311  flops_current = 0.0d0
312  END IF
313  END DO
314 
315  ! Prepare data structures for multithreaded loop
316  numthreads = 1
317 !$ numthreads = omp_get_max_threads()
318 
319  ALLOCATE (result_blocks_from_rank_t(0:numthreads - 1), &
320  result_blocks_for_rank_t(0:numthreads - 1), &
321  blocks_from_rank_t(0:numthreads - 1), &
322  nonzero_rows_t(0:numthreads - 1))
323 
324  ! Figure out which blocks we need to receive. Blocks are identified here as indices into our COO representation.
325  ! TODO: This currently shows limited parallel efficiency. Investigate further.
326 
327  !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
328  !$OMP NUM_THREADS(numthreads) &
329  !$OMP PRIVATE(i,j,k,l,m,l_limit_left,l_limit_right,cur_col,cur_row,mytid) &
330  !$OMP SHARED(result_blocks_from_rank_t,result_blocks_for_rank_t,blocks_from_rank_t,this,numthreads,nonzero_rows_t)
331  mytid = 0
332 !$ mytid = omp_get_thread_num()
333 
334  ALLOCATE (nonzero_rows_t(mytid)%sets(1), &
335  result_blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1), &
336  result_blocks_for_rank_t(mytid)%sets(0:this%numnodes - 1), &
337  blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1))
338 
339  !$OMP DO schedule(guided)
340  DO i = 1, this%number_of_submatrices
341  CALL nonzero_rows_t(mytid)%sets(1)%reset
342  DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm ! all colums that determine submatrix i
343  DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
344  ! This block will be required to assemble the final block matrix as it is within an inducing column for submatrix i.
345  ! Figure out who stores it and insert it into the result_blocks_* sets.
346  CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=j, processor=m)
347  IF (m .EQ. this%myrank) THEN
348  CALL result_blocks_from_rank_t(mytid)%sets(this%submatrix_owners(i))%insert(k)
349  END IF
350  IF (this%submatrix_owners(i) .EQ. this%myrank) THEN
351  CALL nonzero_rows_t(mytid)%sets(1)%insert(this%coo_rows(k))
352  CALL result_blocks_for_rank_t(mytid)%sets(m)%insert(k)
353  END IF
354  END DO
355  END DO
356 
357  IF (this%submatrix_owners(i) .NE. this%myrank) cycle
358 
359  ! In the following, we determine all blocks required to build the submatrix. We interpret nonzero_rows_t(mytid)(j) as
360  ! column and nonzero_rows_t(mytid)(k) as row.
361  DO j = 1, nonzero_rows_t(mytid)%sets(1)%elements
362  cur_col = nonzero_rows_t(mytid)%sets(1)%get(j)
363  l_limit_left = this%coo_col_offsets(cur_col)
364  l_limit_right = this%coo_col_offsets(cur_col + 1) - 1
365  DO k = 1, nonzero_rows_t(mytid)%sets(1)%elements
366  cur_row = nonzero_rows_t(mytid)%sets(1)%get(k)
367  l = l_limit_left
368  DO WHILE (l .LE. l_limit_right)
369  IF (this%coo_rows(l) .GE. cur_row) EXIT
370  l = l + 1
371  END DO
372  l_limit_left = l
373  IF (l .LE. l_limit_right) THEN
374  IF (this%coo_rows(l) .EQ. cur_row) THEN
375  ! We found a valid block. Figure out what to do with it.
376  CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(l), &
377  column=this%coo_cols(l), processor=m)
378  CALL blocks_from_rank_t(mytid)%sets(m)%insert(l)
379  END IF
380  END IF
381  END DO
382  END DO
383  END DO
384  !$OMP END DO
385  !$OMP END PARALLEL
386 
387  ! Merge partial results from threads
388  DO i = 0, this%numnodes - 1
389  DO j = 0, numthreads - 1
390  DO k = 1, result_blocks_from_rank_t(j)%sets(i)%elements
391  CALL this%result_blocks_from_rank(i)%insert(result_blocks_from_rank_t(j)%sets(i)%get(k))
392  END DO
393  CALL result_blocks_from_rank_t(j)%sets(i)%reset
394  DO k = 1, result_blocks_for_rank_t(j)%sets(i)%elements
395  CALL this%result_blocks_for_rank(i)%insert(result_blocks_for_rank_t(j)%sets(i)%get(k))
396  END DO
397  CALL result_blocks_for_rank_t(j)%sets(i)%reset
398  DO k = 1, blocks_from_rank_t(j)%sets(i)%elements
399  CALL blocks_from_rank(i)%insert(blocks_from_rank_t(j)%sets(i)%get(k))
400  END DO
401  CALL blocks_from_rank_t(j)%sets(i)%reset
402  END DO
403  END DO
404  DO i = 0, numthreads - 1
405  CALL nonzero_rows_t(i)%sets(1)%reset
406  DEALLOCATE (result_blocks_from_rank_t(i)%sets, result_blocks_for_rank_t(i)%sets, blocks_from_rank_t(i)%sets, &
407  nonzero_rows_t(i)%sets)
408  END DO
409  DEALLOCATE (result_blocks_from_rank_t, result_blocks_for_rank_t, blocks_from_rank_t, nonzero_rows_t)
410 
411  ! Make other ranks aware of our needs
412  ALLOCATE (num_blockids_send(0:this%numnodes - 1), num_blockids_recv(0:this%numnodes - 1))
413  DO i = 0, this%numnodes - 1
414  num_blockids_send(i) = blocks_from_rank(i)%elements
415  END DO
416  CALL this%group%alltoall(num_blockids_send, num_blockids_recv, 1)
417  DO i = 0, this%numnodes - 1
418  CALL blocks_for_rank(i)%alloc(num_blockids_recv(i))
419  END DO
420  DEALLOCATE (num_blockids_send, num_blockids_recv)
421 
422  IF (this%numnodes .GT. 1) THEN
423  DO i = 1, this%numnodes
424  k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
425  CALL this%group%irecv(msgout=blocks_for_rank(k)%data, source=k, request=blocks_for_rank(k)%mpi_request)
426  END DO
427  DO i = 1, this%numnodes
428  k = modulo(this%myrank + i, this%numnodes) ! rank to send to
429  CALL this%group%send(blocks_from_rank(k)%getall(), k, 0)
430  END DO
431  DO i = 0, this%numnodes - 1
432  CALL blocks_for_rank(i)%mpi_request%wait()
433  END DO
434  ELSE
435  blocks_for_rank(0)%data = blocks_from_rank(0)%getall()
436  END IF
437 
438  ! Free memory allocated in nonzero_rows
439  CALL nonzero_rows%reset
440 
441  ! Make get calls on this%result_blocks_for_rank(...) threadsafe in other routines by updating the internal sorted list
442  DO m = 0, this%numnodes - 1
443  IF ((.NOT. this%result_blocks_for_rank(m)%sorted_up_to_date) .AND. (this%result_blocks_for_rank(m)%elements .GT. 0)) THEN
444  CALL this%result_blocks_for_rank(m)%update_sorted
445  END IF
446  END DO
447 
448  ! Create and fill send buffers
449  DO i = 0, this%numnodes - 1
450  bufsize = 0
451  DO j = 1, blocks_for_rank(i)%size
452  k = blocks_for_rank(i)%data(j)
453  bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
454  END DO
455  CALL sendbufs(i)%alloc(bufsize)
456 
457  bufsize = 0
458  CALL this%result_blocks_for_rank_buf_offsets(i)%alloc(this%result_blocks_for_rank(i)%elements)
459  DO j = 1, this%result_blocks_for_rank(i)%elements
460  k = this%result_blocks_for_rank(i)%get(j)
461  this%result_blocks_for_rank_buf_offsets(i)%data(j) = bufsize
462  bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
463  END DO
464  CALL this%result_sendbufs(i)%alloc(bufsize)
465 
466  bufsize = 0
467  DO j = 1, blocks_for_rank(i)%size
468  k = blocks_for_rank(i)%data(j)
469  CALL dbcsr_get_block_p(this%dbcsr_mat, row=this%coo_rows(k), col=this%coo_cols(k), block=blockp, found=valid)
470  IF (.NOT. valid) THEN
471  cpabort("Block included in our COO and placed on our rank could not be fetched!")
472  END IF
473  bufsize_next = bufsize + SIZE(blockp)
474  sendbufs(i)%data(bufsize + 1:bufsize_next) = blockp
475  bufsize = bufsize_next
476  END DO
477  END DO
478 
479  ! Create receive buffers and mapping from blocks to memory locations
480  DO i = 0, this%numnodes - 1
481  bufsize = 0
482  DO j = 1, blocks_from_rank(i)%elements
483  k = blocks_from_rank(i)%get(j)
484  bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
485  END DO
486  CALL this%recvbufs(i)%alloc(bufsize)
487  bufsize = 0
488  DO j = 1, blocks_from_rank(i)%elements
489  k = blocks_from_rank(i)%get(j)
490  bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
491  this%coo_dptr(k)%target => this%recvbufs(i)%data(bufsize + 1:bufsize_next)
492  bufsize = bufsize_next
493  END DO
494  END DO
495 
496  DO i = 0, this%numnodes - 1
497  CALL blocks_for_rank(i)%dealloc
498  CALL blocks_from_rank(i)%reset
499  END DO
500  DEALLOCATE (blocks_for_rank, blocks_from_rank)
501 
502  IF (this%numnodes .GT. 1) THEN
503  ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
504  ! and first trying to receive from our left neighbor.
505  DO i = 1, this%numnodes
506  k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
507  CALL this%group%irecv(msgout=this%recvbufs(k)%data, source=k, request=this%recvbufs(k)%mpi_request)
508  k = modulo(this%myrank + i, this%numnodes) ! rank to send to
509  CALL this%group%isend(msgin=sendbufs(k)%data, dest=k, request=sendbufs(k)%mpi_request)
510  END DO
511  DO i = 0, this%numnodes - 1
512  CALL sendbufs(i)%mpi_request%wait()
513  CALL this%recvbufs(i)%mpi_request%wait()
514  END DO
515  ELSE
516  this%recvbufs(0)%data = sendbufs(0)%data
517  END IF
518 
519  DO i = 0, this%numnodes - 1
520  CALL sendbufs(i)%dealloc
521  END DO
522  DEALLOCATE (sendbufs)
523 
524  this%initialized = .true.
525 
526  CALL timestop(handle)
527  END SUBROUTINE submatrix_dissection_init
528 
529 ! **************************************************************************************************
530 !> \brief free all associated memory, afterwards submatrix_dissection_init needs to be called again
531 !> \param this - object of class submatrix_dissection_type
532 ! **************************************************************************************************
533  PURE SUBROUTINE submatrix_dissection_final(this)
534  CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
535  INTEGER :: i
536 
537  this%initialized = .false.
538 
539  IF (ALLOCATED(this%submatrix_sizes)) DEALLOCATE (this%submatrix_sizes)
540  IF (ALLOCATED(this%coo_cols_local)) DEALLOCATE (this%coo_cols_local)
541  IF (ALLOCATED(this%coo_rows_local)) DEALLOCATE (this%coo_rows_local)
542  IF (ALLOCATED(this%coo_col_offsets_local)) DEALLOCATE (this%coo_col_offsets_local)
543  IF (ALLOCATED(this%result_blocks_for_rank_buf_offsets)) THEN
544  DO i = 0, this%numnodes - 1
545  CALL this%result_blocks_for_rank_buf_offsets(i)%dealloc
546  END DO
547  DEALLOCATE (this%result_blocks_for_rank_buf_offsets)
548  END IF
549  IF (ALLOCATED(this%recvbufs)) THEN
550  DO i = 0, this%numnodes - 1
551  CALL this%recvbufs(i)%dealloc
552  END DO
553  DEALLOCATE (this%recvbufs)
554  END IF
555  IF (ALLOCATED(this%result_sendbufs)) THEN
556  DO i = 0, this%numnodes - 1
557  CALL this%result_sendbufs(i)%dealloc
558  END DO
559  DEALLOCATE (this%result_sendbufs)
560  END IF
561  IF (ALLOCATED(this%result_blocks_for_rank)) THEN
562  DO i = 0, this%numnodes - 1
563  CALL this%result_blocks_for_rank(i)%reset
564  END DO
565  DEALLOCATE (this%result_blocks_for_rank)
566  END IF
567  IF (ALLOCATED(this%result_blocks_from_rank)) THEN
568  DO i = 0, this%numnodes - 1
569  CALL this%result_blocks_from_rank(i)%reset
570  END DO
571  DEALLOCATE (this%result_blocks_from_rank)
572  END IF
573  IF (ALLOCATED(this%coo_cols)) DEALLOCATE (this%coo_cols)
574  IF (ALLOCATED(this%coo_rows)) DEALLOCATE (this%coo_rows)
575  IF (ALLOCATED(this%coo_col_offsets)) DEALLOCATE (this%coo_col_offsets)
576  IF (ALLOCATED(this%coo_dptr)) DEALLOCATE (this%coo_dptr)
577  IF (ALLOCATED(this%submatrix_owners)) DEALLOCATE (this%submatrix_owners)
578 
579  END SUBROUTINE submatrix_dissection_final
580 
581 ! **************************************************************************************************
582 !> \brief generate a specific submatrix
583 !> \param this - object of class submatrix_dissection_type
584 !> \param sm_id - id of the submatrix to generate
585 !> \param sm - generated submatrix
586 ! **************************************************************************************************
587  SUBROUTINE submatrix_generate_sm(this, sm_id, sm)
588  CLASS(submatrix_dissection_type), INTENT(IN) :: this
589  INTEGER, INTENT(IN) :: sm_id
590  REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(OUT) :: sm
591 
592  INTEGER :: sm_dim, i, j, k, offset_x1, offset_x2, offset_y1, &
593  offset_y2, k_limit_left, k_limit_right
594  TYPE(set_type) :: nonzero_rows
595 
596  IF (.NOT. this%initialized) THEN
597  cpabort("Submatrix dissection not initialized")
598  END IF
599 
600  IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
601  cpabort("This rank is not supposed to construct this submatrix")
602  END IF
603 
604  ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
605  CALL nonzero_rows%reset
606  DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
607  DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
608  CALL nonzero_rows%insert(this%coo_rows(j))
609  END DO
610  END DO
611  sm_dim = 0
612  DO i = 1, nonzero_rows%elements
613  sm_dim = sm_dim + this%col_blk_size(nonzero_rows%get(i))
614  END DO
615 
616  ALLOCATE (sm(sm_dim, sm_dim))
617  sm = 0
618 
619  offset_x1 = 0
620  DO j = 1, nonzero_rows%elements
621  offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
622  offset_y1 = 0
623  k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
624  k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
625  DO i = 1, nonzero_rows%elements
626  offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
627  ! Copy block nonzero_rows(i),nonzero_rows(j) to sm(i,j) (if the block actually exists)
628  k = k_limit_left
629  DO WHILE (k .LE. k_limit_right)
630  IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
631  k = k + 1
632  END DO
633  k_limit_left = k
634  IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
635  sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2) = reshape(this%coo_dptr(k)%target, &
636  (/offset_y2 - offset_y1, offset_x2 - offset_x1/))
637  END IF
638  offset_y1 = offset_y2
639  END DO
640  offset_x1 = offset_x2
641  END DO
642 
643  ! Free memory allocated in nonzero_rows
644  CALL nonzero_rows%reset
645 
646  END SUBROUTINE submatrix_generate_sm
647 
648 ! **************************************************************************************************
649 !> \brief determine submatrix ids that are handled by a specific rank
650 !> \param this - object of class submatrix_dissection_type
651 !> \param rank - rank id of interest
652 !> \param sm_ids - list of submatrix ids handled by that rank
653 ! **************************************************************************************************
654  SUBROUTINE submatrix_get_sm_ids_for_rank(this, rank, sm_ids)
655  CLASS(submatrix_dissection_type), INTENT(IN) :: this
656  INTEGER, INTENT(IN) :: rank
657  INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(OUT) :: sm_ids
658  INTEGER :: count, i
659 
660  IF (.NOT. this%initialized) THEN
661  cpabort("Submatrix dissection not initialized")
662  END IF
663 
664  count = 0
665  DO i = 1, this%number_of_submatrices
666  IF (this%submatrix_owners(i) .EQ. rank) count = count + 1
667  END DO
668 
669  ALLOCATE (sm_ids(count))
670 
671  count = 0
672  DO i = 1, this%number_of_submatrices
673  IF (this%submatrix_owners(i) .EQ. rank) THEN
674  count = count + 1
675  sm_ids(count) = i
676  END IF
677  END DO
678 
679  END SUBROUTINE submatrix_get_sm_ids_for_rank
680 
681 ! **************************************************************************************************
682 !> \brief copy result columns from a submatrix into result buffer
683 !> \param this - object of class submatrix_dissection_type
684 !> \param sm_id - id of the submatrix
685 !> \param sm - result-submatrix
686 ! **************************************************************************************************
687  SUBROUTINE submatrix_cpy_resultcol(this, sm_id, sm)
688  CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
689  INTEGER, INTENT(in) :: sm_id
690  REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(IN) :: sm
691 
692  TYPE(set_type) :: nonzero_rows
693  INTEGER :: i, j, k, m, sm_col_offset, offset_x1, offset_x2, offset_y1, &
694  offset_y2, bufsize, bufsize_next, k_limit_left, k_limit_right
695  INTEGER, DIMENSION(:), ALLOCATABLE :: buf_offset_idxs
696 
697  IF (.NOT. this%initialized) THEN
698  cpabort("Submatrix dissection is not initizalized")
699  END IF
700 
701  IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
702  cpabort("This rank is not supposed to operate on this submatrix")
703  END IF
704 
705  ALLOCATE (buf_offset_idxs(0:this%numnodes - 1))
706  buf_offset_idxs = 1
707 
708  ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
709  sm_col_offset = 0
710  DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
711  DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
712  CALL nonzero_rows%insert(this%coo_rows(j))
713  END DO
714  END DO
715 
716  sm_col_offset = 0
717  DO i = 1, nonzero_rows%elements
718  IF (nonzero_rows%get(i) .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
719  ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
720  sm_col_offset = i
721  EXIT
722  END IF
723  END DO
724  IF (sm_col_offset .EQ. 0) THEN
725  cpabort("Could not determine relevant result columns of submatrix")
726  END IF
727 
728  offset_x1 = 0
729  DO j = 1, nonzero_rows%elements
730  offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
731  ! We only want to copy the blocks from the result columns
732  IF ((j .GE. sm_col_offset) .AND. (j .LT. sm_col_offset + this%cols_per_sm)) THEN
733  offset_y1 = 0
734  k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
735  k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
736  DO i = 1, nonzero_rows%elements
737  offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
738  ! Check if sm(i,j), i.e., (nonzero_rows(i),nonzero_rows(j)) exists in the original matrix and if so, copy it into the
739  ! correct send buffer.
740  k = k_limit_left
741  DO WHILE (k .LE. k_limit_right)
742  IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
743  k = k + 1
744  END DO
745  k_limit_left = k
746  IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
747  CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=this%coo_cols(k), &
748  processor=m)
749  DO WHILE (buf_offset_idxs(m) .LE. this%result_blocks_for_rank(m)%elements)
750  IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .GE. k) EXIT
751  buf_offset_idxs(m) = buf_offset_idxs(m) + 1
752  END DO
753  IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .NE. k) THEN
754  cpabort("Could not determine buffer offset for block")
755  END IF
756  bufsize = this%result_blocks_for_rank_buf_offsets(m)%data(buf_offset_idxs(m))
757  bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
758  this%result_sendbufs(m)%data(bufsize + 1:bufsize_next) = reshape( &
759  sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2), &
760  (/bufsize_next - bufsize/))
761  END IF
762  offset_y1 = offset_y2
763  END DO
764  END IF
765  offset_x1 = offset_x2
766  END DO
767 
768  DEALLOCATE (buf_offset_idxs)
769 
770  ! Free memory in set
771  CALL nonzero_rows%reset
772 
773  END SUBROUTINE submatrix_cpy_resultcol
774 
775 ! **************************************************************************************************
776 !> \brief finalize results back into a dbcsr matrix
777 !> \param this - object of class submatrix_dissection_type
778 !> \param resultmat - result dbcsr matrix
779 !> \par History
780 !> 2020.02 created [Michael Lass]
781 !> 2020.05 add time measurements [Michael Lass]
782 ! **************************************************************************************************
783  SUBROUTINE submatrix_communicate_results(this, resultmat)
784  CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
785  TYPE(dbcsr_type), INTENT(INOUT) :: resultmat
786 
787  INTEGER :: i, j, k, cur_row, cur_col, cur_sm, cur_buf, last_buf, &
788  bufsize, bufsize_next
789  TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: result_recvbufs
790 
791  CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_communicate_results'
792  INTEGER :: handle
793 
794  CALL timeset(routinen, handle)
795 
796  ALLOCATE (result_recvbufs(0:this%numnodes - 1))
797  DO i = 0, this%numnodes - 1
798  bufsize = 0
799  DO j = 1, this%result_blocks_from_rank(i)%elements
800  k = this%result_blocks_from_rank(i)%get(j)
801  bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
802  END DO
803  CALL result_recvbufs(i)%alloc(bufsize)
804  END DO
805 
806  ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
807  ! and first trying to receive from our left neighbor.
808  IF (this%numnodes .GT. 1) THEN
809  DO i = 1, this%numnodes
810  k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
811  CALL this%group%irecv(msgout=result_recvbufs(k)%data, source=k, request=result_recvbufs(k)%mpi_request)
812  k = modulo(this%myrank + i, this%numnodes) ! rank to send to
813  CALL this%group%isend(msgin=this%result_sendbufs(k)%data, dest=k, request=this%result_sendbufs(k)%mpi_request)
814  END DO
815  DO i = 0, this%numnodes - 1
816  CALL this%result_sendbufs(i)%mpi_request%wait()
817  CALL result_recvbufs(i)%mpi_request%wait()
818  END DO
819  ELSE
820  result_recvbufs(0)%data = this%result_sendbufs(0)%data
821  END IF
822 
823  last_buf = -1
824  bufsize = 0
825  DO i = 1, this%local_blocks
826  cur_col = this%coo_cols_local(i)
827  cur_row = this%coo_rows_local(i)
828  cur_sm = (cur_col - 1)/this%cols_per_sm + 1
829  cur_buf = this%submatrix_owners(cur_sm)
830  IF (cur_buf .GT. last_buf) bufsize = 0
831  bufsize_next = bufsize + this%col_blk_size(cur_row)*this%col_blk_size(cur_col)
832  CALL dbcsr_put_block(matrix=resultmat, row=cur_row, col=cur_col, &
833  block=result_recvbufs(cur_buf)%data(bufsize + 1:bufsize_next))
834  bufsize = bufsize_next
835  last_buf = cur_buf
836  END DO
837 
838  DO i = 0, this%numnodes - 1
839  CALL result_recvbufs(i)%dealloc
840  END DO
841  DEALLOCATE (result_recvbufs)
842 
843  CALL dbcsr_finalize(resultmat)
844 
845  CALL timestop(handle)
846  END SUBROUTINE submatrix_communicate_results
847 
848 ! **************************************************************************************************
849 !> \brief sort two integer arrays using quicksort, using the second list as second-level sorting criterion
850 !> \param arr_a - first input array
851 !> \param arr_b - second input array
852 !> \param left - left boundary of region to be sorted
853 !> \param right - right boundary of region to be sorted
854 ! **************************************************************************************************
855  RECURSIVE PURE SUBROUTINE qsort_two(arr_a, arr_b, left, right)
856 
857  INTEGER, DIMENSION(:), INTENT(inout) :: arr_a, arr_b
858  INTEGER, INTENT(in) :: left, right
859 
860  INTEGER :: i, j, pivot_a, pivot_b, tmp
861 
862  IF (right - left .LT. 1) RETURN
863 
864  i = left
865  j = right - 1
866  pivot_a = arr_a(right)
867  pivot_b = arr_b(right)
868 
869  DO
870  DO WHILE ((arr_a(i) .LT. pivot_a) .OR. ((arr_a(i) .EQ. pivot_a) .AND. (arr_b(i) .LT. pivot_b)))
871  i = i + 1
872  END DO
873  DO WHILE ((j .GT. i) .AND. ((arr_a(j) .GT. pivot_a) .OR. ((arr_a(j) .EQ. pivot_a) .AND. (arr_b(j) .GE. pivot_b))))
874  j = j - 1
875  END DO
876  IF (i .LT. j) THEN
877  tmp = arr_a(i)
878  arr_a(i) = arr_a(j)
879  arr_a(j) = tmp
880  tmp = arr_b(i)
881  arr_b(i) = arr_b(j)
882  arr_b(j) = tmp
883  ELSE
884  EXIT
885  END IF
886  END DO
887 
888  IF ((arr_a(i) .GT. pivot_a) .OR. (arr_a(i) .EQ. pivot_a .AND. arr_b(i) .GT. pivot_b)) THEN
889  tmp = arr_a(i)
890  arr_a(i) = arr_a(right)
891  arr_a(right) = tmp
892  tmp = arr_b(i)
893  arr_b(i) = arr_b(right)
894  arr_b(right) = tmp
895  END IF
896 
897  IF (i - 1 .GT. left) CALL qsort_two(arr_a, arr_b, left, i - 1)
898  IF (i + 1 .LT. right) CALL qsort_two(arr_a, arr_b, i + 1, right)
899 
900  END SUBROUTINE qsort_two
901 
902 END MODULE submatrix_dissection
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
Definition: grid_common.h:117
collects all references to literature in CP2K as new algorithms / method are included from literature...
Definition: bibliography.F:28
integer, save, public lass2018
Definition: bibliography.F:43
Defines the basic variable types.
Definition: kinds.F:23
integer, parameter, public dp
Definition: kinds.F:34
Interface to the message passing library MPI.
subroutine submatrix_get_relevant_sm_columns(this, sm_id, first, last)
determine which columns of the submatrix are relevant for the result matrix