(git:ed6f26b)
Loading...
Searching...
No Matches
submatrix_dissection.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \note
10!> This module implements a modified version of the submatrix method, proposed in
11!> M. Lass, S. Mohr, H. Wiebeler, T. Kuehne, C. Plessl
12!> A Massively Parallel Algorithm for the Approximate Calculation of Inverse p-th Roots of Large Sparse Matrices
13!> Proc. Platform for Advanced Scientific Computing (PASC) Conference, ACM, 2018
14!>
15!> The method is extended to minimize the required data transfers and floating-point operations under the assumption that non-zero
16!> blocks of the matrix are close to its diagonal.
17!>
18!> Submatrices can be constructed not for single columns of the matrix but for a set of w consecutive submatrices. The underlying
19!> assumption is that columns next to each other have relatively similar occupation patterns, i.e., constructing a submatrix from
20!> columns x and x+1 should not lead to a much bigger submatrix than contructing it only from column x.
21!>
22!> The construction of the submatrices requires communication between all ranks. It is crucial to implement this communication as
23!> efficient as possible, i.e., data should only ever be transferred once between two ranks and message sizes need to be
24!> sufficiently large to utilize the communication bandwidth. To achieve this, we communicate the required blocks for all
25!> submatrices at once and copy them to large buffers before transmitting them via MPI.
26!>
27!> Note on multi-threading:
28!> Submatrices can be constructed and processed in parallel by multiple threads. However, generate_submatrix, get_sm_ids_for_rank
29!> and copy_resultcol are the only thread-safe routines in this module. All other routines involve MPI communication or operate on
30!> common, non-protected data and are hence not thread-safe.
31!>
32!> TODO:
33!> * generic types (for now only dp supported)
34!> * optimization of threaded initialization
35!> * sanity checks at the beginning of all methods
36!>
37!> \author Michael Lass
38! **************************************************************************************************
39
41
42 USE bibliography, ONLY: lass2018,&
43 cite_reference
44 USE cp_dbcsr_api, ONLY: &
49 USE kinds, ONLY: dp
51 USE submatrix_types, ONLY: buffer_type,&
54 set_type,&
56
57!$ USE omp_lib, ONLY: omp_get_max_threads, omp_get_thread_num
58
59#include "./base/base_uses.f90"
60
61 IMPLICIT NONE
62 PRIVATE
63
64 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'submatrix_dissection'
65
67 TYPE(dbcsr_type) :: dbcsr_mat
69 LOGICAL :: initialized = .false.
70 TYPE(mp_comm_type) :: group = mp_comm_type()
71 INTEGER :: numnodes = -1, myrank = -1, nblkcols = -1, &
72 nblkrows = -1, nblks = -1, local_blocks = -1, &
73 cols_per_sm = -1, number_of_submatrices = -1
74 INTEGER, DIMENSION(:), POINTER :: row_blk_size => null(), col_blk_size => null()
75 INTEGER, DIMENSION(:), ALLOCATABLE :: coo_cols, coo_rows, coo_col_offsets, coo_cols_local, coo_rows_local, &
76 coo_col_offsets_local, submatrix_owners, submatrix_sizes
77 TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: recvbufs, result_sendbufs ! Indexing starts with 0 to match rank ids!
78 TYPE(set_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank, result_blocks_from_rank
79 TYPE(bufptr_type), DIMENSION(:), ALLOCATABLE :: coo_dptr
80 TYPE(intbuffer_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank_buf_offsets
81 CONTAINS
82 PROCEDURE :: init => submatrix_dissection_init
83 PROCEDURE :: final => submatrix_dissection_final
84 PROCEDURE :: get_sm_ids_for_rank => submatrix_get_sm_ids_for_rank
85 PROCEDURE :: generate_submatrix => submatrix_generate_sm
86 PROCEDURE :: copy_resultcol => submatrix_cpy_resultcol
87 PROCEDURE :: communicate_results => submatrix_communicate_results
88 PROCEDURE :: get_relevant_sm_columns => submatrix_get_relevant_sm_columns
90
91CONTAINS
92
93! **************************************************************************************************
94!> \brief determine which columns of the submatrix are relevant for the result matrix
95!> \param this - object of class submatrix_dissection_type
96!> \param sm_id - id of the submatrix
97!> \param first - first column of submatrix that is relevant
98!> \param last - last column of submatrix that is relevant
99! **************************************************************************************************
100 SUBROUTINE submatrix_get_relevant_sm_columns(this, sm_id, first, last)
101 CLASS(submatrix_dissection_type), INTENT(IN) :: this
102 INTEGER, INTENT(IN) :: sm_id
103 INTEGER, INTENT(OUT) :: first, last
104 INTEGER :: i, j, blkid
105 TYPE(set_type) :: nonzero_rows
106
107 ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
108 DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
109 DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
110 CALL nonzero_rows%insert(this%coo_rows(j))
111 END DO
112 END DO
113
114 first = 1
115 DO i = 1, nonzero_rows%elements
116 blkid = nonzero_rows%get(i)
117 IF (blkid .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
118 ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
119 ! Now add up block sizes to determine the last one as well
120 last = first - 1
121 DO j = i, nonzero_rows%elements
122 blkid = nonzero_rows%get(j)
123 last = last + this%col_blk_size(blkid)
124 IF (blkid .EQ. (sm_id)*this%cols_per_sm) EXIT
125 END DO
126 EXIT
127 END IF
128 first = first + this%col_blk_size(blkid)
129 END DO
130
131 CALL nonzero_rows%reset
132
134
135! **************************************************************************************************
136!> \brief initialize submatrix dissection and communicate, needs to be called before constructing any submatrices.
137!> \param this - object of class submatrix_dissection_type
138!> \param matrix_p - dbcsr input matrix
139!> \par History
140!> 2020.02 created [Michael Lass]
141!> 2020.05 add time measurements [Michael Lass]
142! **************************************************************************************************
143 SUBROUTINE submatrix_dissection_init(this, matrix_p) ! Should be PURE but the iterator routines are not
144 CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
145 TYPE(dbcsr_type), INTENT(IN) :: matrix_p
146
147 INTEGER :: cur_row, cur_col, i, j, k, l, m, l_limit_left, l_limit_right, &
148 bufsize, bufsize_next
149 INTEGER, DIMENSION(:), ALLOCATABLE :: blocks_per_rank, coo_dsplcmnts, num_blockids_send, num_blockids_recv
150 TYPE(dbcsr_iterator_type) :: iter
151 TYPE(set_type) :: nonzero_rows
152 REAL(KIND=dp) :: flops_total, flops_per_rank, flops_per_sm, flops_threshold, &
153 flops_current, flops_remaining
154
155 ! Indexing for the following arrays starts with 0 to match rank ids
156 TYPE(set_type), DIMENSION(:), ALLOCATABLE :: blocks_from_rank
157 TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: sendbufs
158 TYPE(intbuffer_type), DIMENSION(:), ALLOCATABLE :: blocks_for_rank
159
160 ! Additional structures for threaded parts
161 INTEGER :: numthreads, mytid
162 ! Indexing starts at 0 to match thread ids
163 TYPE(setarray_type), DIMENSION(:), ALLOCATABLE :: nonzero_rows_t
164 TYPE(setarray_type), DIMENSION(:), ALLOCATABLE :: result_blocks_from_rank_t, result_blocks_for_rank_t, &
165 blocks_from_rank_t
166
167 LOGICAL :: valid
168 REAL(KIND=dp), DIMENSION(:, :), POINTER :: blockp
169
170 CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_dissection_init'
171 INTEGER :: handle
172
173 CALL timeset(routinen, handle)
174 CALL cite_reference(lass2018)
175
176 this%dbcsr_mat = matrix_p
177 CALL dbcsr_get_info(matrix=this%dbcsr_mat, nblkcols_total=this%nblkcols, nblkrows_total=this%nblkrows, &
178 row_blk_size=this%row_blk_size, col_blk_size=this%col_blk_size, group=this%group, distribution=this%dist)
179 CALL dbcsr_distribution_get(dist=this%dist, mynode=this%myrank, numnodes=this%numnodes)
180
181 IF (this%nblkcols .NE. this%nblkrows) THEN
182 cpabort("Number of block rows and cols need to be identical")
183 END IF
184
185 DO i = 1, this%nblkcols
186 IF (this%col_blk_size(i) .NE. this%row_blk_size(i)) THEN
187 cpabort("Block row sizes and col sizes need to be identical")
188 END IF
189 END DO
190
191 ! TODO: We could do even more sanity checks here, e.g., the matrix must not be stored symmetrically
192
193 ! For the submatrix method, we need global knwoledge about which blocks are actually used. Therefore, we create a COO
194 ! representation of the blocks (without their contents) on all ranks.
195
196 ! TODO: Right now, the COO contains all blocks. Also we transfer all blocks. We could skip half of them due to the matrix
197 ! being symmetric (however, we need to transpose the blocks). This can increase performance only by a factor of 2 and is
198 ! therefore deferred.
199
200 ! Determine number of locally stored blocks
201 this%local_blocks = 0
202 CALL dbcsr_iterator_readonly_start(iter, this%dbcsr_mat)
203 DO WHILE (dbcsr_iterator_blocks_left(iter))
204 CALL dbcsr_iterator_next_block(iter, cur_row, cur_col)
205 this%local_blocks = this%local_blocks + 1
206 END DO
207 CALL dbcsr_iterator_stop(iter)
208
209 ALLOCATE (this%coo_cols_local(this%local_blocks), this%coo_rows_local(this%local_blocks), blocks_per_rank(this%numnodes), &
210 coo_dsplcmnts(this%numnodes), this%coo_col_offsets_local(this%nblkcols + 1), &
211 blocks_for_rank(0:this%numnodes - 1), blocks_from_rank(0:this%numnodes - 1), &
212 sendbufs(0:this%numnodes - 1), this%recvbufs(0:this%numnodes - 1), this%result_sendbufs(0:this%numnodes - 1), &
213 this%result_blocks_for_rank(0:this%numnodes - 1), this%result_blocks_from_rank(0:this%numnodes - 1), &
214 this%result_blocks_for_rank_buf_offsets(0:this%numnodes - 1))
215
216 i = 1
217 CALL dbcsr_iterator_readonly_start(iter, this%dbcsr_mat)
218 DO WHILE (dbcsr_iterator_blocks_left(iter))
219 CALL dbcsr_iterator_next_block(iter, cur_row, cur_col)
220 this%coo_cols_local(i) = cur_col
221 this%coo_rows_local(i) = cur_row
222 i = i + 1
223 END DO
224 CALL dbcsr_iterator_stop(iter)
225
226 ! We only know how many blocks we own. What's with the other ranks?
227 CALL this%group%allgather(msgout=this%local_blocks, msgin=blocks_per_rank)
228 coo_dsplcmnts(1) = 0
229 DO i = 1, this%numnodes - 1
230 coo_dsplcmnts(i + 1) = coo_dsplcmnts(i) + blocks_per_rank(i)
231 END DO
232
233 ! Get a global view on the matrix
234 this%nblks = sum(blocks_per_rank)
235 ALLOCATE (this%coo_cols(this%nblks), this%coo_rows(this%nblks), this%coo_col_offsets(this%nblkcols + 1), &
236 this%coo_dptr(this%nblks))
237 CALL this%group%allgatherv(msgout=this%coo_rows_local, msgin=this%coo_rows, rcount=blocks_per_rank, &
238 rdispl=coo_dsplcmnts)
239 CALL this%group%allgatherv(msgout=this%coo_cols_local, msgin=this%coo_cols, rcount=blocks_per_rank, &
240 rdispl=coo_dsplcmnts)
241
242 DEALLOCATE (blocks_per_rank, coo_dsplcmnts)
243
244 ! Sort COO arrays according to their columns
245 CALL qsort_two(this%coo_cols_local, this%coo_rows_local, 1, this%local_blocks)
246 CALL qsort_two(this%coo_cols, this%coo_rows, 1, this%nblks)
247
248 ! Get COO array offsets for columns to accelerate lookups
249 this%coo_col_offsets(this%nblkcols + 1) = this%nblks + 1
250 j = 1
251 DO i = 1, this%nblkcols
252 DO WHILE ((j .LE. this%nblks))
253 IF (this%coo_cols(j) .GE. i) EXIT
254 j = j + 1
255 END DO
256 this%coo_col_offsets(i) = j
257 END DO
258
259 ! Same for local COO
260 this%coo_col_offsets_local(this%nblkcols + 1) = this%local_blocks + 1
261 j = 1
262 DO i = 1, this%nblkcols
263 DO WHILE ((j .LE. this%local_blocks))
264 IF (this%coo_cols_local(j) .GE. i) EXIT
265 j = j + 1
266 END DO
267 this%coo_col_offsets_local(i) = j
268 END DO
269
270 ! We could combine multiple columns to generate a single submatrix. For now, we have not found a practical use-case for this
271 ! so we only look at single columns for now.
272 this%cols_per_sm = 1
273
274 ! Determine sizes of all submatrices. This is required in order to assess the computational effort that is required to process
275 ! the submatrices.
276 this%number_of_submatrices = this%nblkcols/this%cols_per_sm
277 ALLOCATE (this%submatrix_sizes(this%number_of_submatrices))
278 this%submatrix_sizes = 0
279 flops_total = 0.0d0
280 DO i = 1, this%number_of_submatrices
281 CALL nonzero_rows%reset
282 DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm ! all colums that determine submatrix i
283 DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
284 CALL nonzero_rows%insert(this%coo_rows(k))
285 END DO
286 END DO
287 DO j = 1, nonzero_rows%elements
288 this%submatrix_sizes(i) = this%submatrix_sizes(i) + this%col_blk_size(nonzero_rows%get(j))
289 END DO
290 flops_total = flops_total + 2.0d0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
291 END DO
292
293 ! Create mapping from submatrices to ranks. Since submatrices can be of different sizes, we need to perform some load
294 ! balancing here. For that we assume that arithmetic operations performed on the submatrices scale cubically.
295 ALLOCATE (this%submatrix_owners(this%number_of_submatrices))
296 flops_per_sm = flops_total/this%number_of_submatrices
297 flops_per_rank = flops_total/this%numnodes
298 flops_current = 0.0d0
299 j = 0
300 DO i = 1, this%number_of_submatrices
301 this%submatrix_owners(i) = j
302 flops_current = flops_current + 2.0d0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
303 flops_remaining = flops_total - flops_current
304 flops_threshold = (this%numnodes - j - 1)*flops_per_rank
305 IF ((j .LT. (this%numnodes - 1)) &
306 .AND. ((flops_remaining .LE. flops_threshold &
307 .OR. (this%number_of_submatrices - i) .LT. (this%numnodes - j)))) THEN
308 j = j + 1
309 flops_total = flops_total - flops_current
310 flops_current = 0.0d0
311 END IF
312 END DO
313
314 ! Prepare data structures for multithreaded loop
315 numthreads = 1
316!$ numthreads = omp_get_max_threads()
317
318 ALLOCATE (result_blocks_from_rank_t(0:numthreads - 1), &
319 result_blocks_for_rank_t(0:numthreads - 1), &
320 blocks_from_rank_t(0:numthreads - 1), &
321 nonzero_rows_t(0:numthreads - 1))
322
323 ! Figure out which blocks we need to receive. Blocks are identified here as indices into our COO representation.
324 ! TODO: This currently shows limited parallel efficiency. Investigate further.
325
326 !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
327 !$OMP NUM_THREADS(numthreads) &
328 !$OMP PRIVATE(i,j,k,l,m,l_limit_left,l_limit_right,cur_col,cur_row,mytid) &
329 !$OMP SHARED(result_blocks_from_rank_t,result_blocks_for_rank_t,blocks_from_rank_t,this,numthreads,nonzero_rows_t)
330 mytid = 0
331!$ mytid = omp_get_thread_num()
332
333 ALLOCATE (nonzero_rows_t(mytid)%sets(1), &
334 result_blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1), &
335 result_blocks_for_rank_t(mytid)%sets(0:this%numnodes - 1), &
336 blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1))
337
338 !$OMP DO schedule(guided)
339 DO i = 1, this%number_of_submatrices
340 CALL nonzero_rows_t(mytid)%sets(1)%reset
341 DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm ! all colums that determine submatrix i
342 DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
343 ! This block will be required to assemble the final block matrix as it is within an inducing column for submatrix i.
344 ! Figure out who stores it and insert it into the result_blocks_* sets.
345 CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=j, processor=m)
346 IF (m .EQ. this%myrank) THEN
347 CALL result_blocks_from_rank_t(mytid)%sets(this%submatrix_owners(i))%insert(k)
348 END IF
349 IF (this%submatrix_owners(i) .EQ. this%myrank) THEN
350 CALL nonzero_rows_t(mytid)%sets(1)%insert(this%coo_rows(k))
351 CALL result_blocks_for_rank_t(mytid)%sets(m)%insert(k)
352 END IF
353 END DO
354 END DO
355
356 IF (this%submatrix_owners(i) .NE. this%myrank) cycle
357
358 ! In the following, we determine all blocks required to build the submatrix. We interpret nonzero_rows_t(mytid)(j) as
359 ! column and nonzero_rows_t(mytid)(k) as row.
360 DO j = 1, nonzero_rows_t(mytid)%sets(1)%elements
361 cur_col = nonzero_rows_t(mytid)%sets(1)%get(j)
362 l_limit_left = this%coo_col_offsets(cur_col)
363 l_limit_right = this%coo_col_offsets(cur_col + 1) - 1
364 DO k = 1, nonzero_rows_t(mytid)%sets(1)%elements
365 cur_row = nonzero_rows_t(mytid)%sets(1)%get(k)
366 l = l_limit_left
367 DO WHILE (l .LE. l_limit_right)
368 IF (this%coo_rows(l) .GE. cur_row) EXIT
369 l = l + 1
370 END DO
371 l_limit_left = l
372 IF (l .LE. l_limit_right) THEN
373 IF (this%coo_rows(l) .EQ. cur_row) THEN
374 ! We found a valid block. Figure out what to do with it.
375 CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(l), &
376 column=this%coo_cols(l), processor=m)
377 CALL blocks_from_rank_t(mytid)%sets(m)%insert(l)
378 END IF
379 END IF
380 END DO
381 END DO
382 END DO
383 !$OMP END DO
384 !$OMP END PARALLEL
385
386 ! Merge partial results from threads
387 DO i = 0, this%numnodes - 1
388 DO j = 0, numthreads - 1
389 DO k = 1, result_blocks_from_rank_t(j)%sets(i)%elements
390 CALL this%result_blocks_from_rank(i)%insert(result_blocks_from_rank_t(j)%sets(i)%get(k))
391 END DO
392 CALL result_blocks_from_rank_t(j)%sets(i)%reset
393 DO k = 1, result_blocks_for_rank_t(j)%sets(i)%elements
394 CALL this%result_blocks_for_rank(i)%insert(result_blocks_for_rank_t(j)%sets(i)%get(k))
395 END DO
396 CALL result_blocks_for_rank_t(j)%sets(i)%reset
397 DO k = 1, blocks_from_rank_t(j)%sets(i)%elements
398 CALL blocks_from_rank(i)%insert(blocks_from_rank_t(j)%sets(i)%get(k))
399 END DO
400 CALL blocks_from_rank_t(j)%sets(i)%reset
401 END DO
402 END DO
403 DO i = 0, numthreads - 1
404 CALL nonzero_rows_t(i)%sets(1)%reset
405 DEALLOCATE (result_blocks_from_rank_t(i)%sets, result_blocks_for_rank_t(i)%sets, blocks_from_rank_t(i)%sets, &
406 nonzero_rows_t(i)%sets)
407 END DO
408 DEALLOCATE (result_blocks_from_rank_t, result_blocks_for_rank_t, blocks_from_rank_t, nonzero_rows_t)
409
410 ! Make other ranks aware of our needs
411 ALLOCATE (num_blockids_send(0:this%numnodes - 1), num_blockids_recv(0:this%numnodes - 1))
412 DO i = 0, this%numnodes - 1
413 num_blockids_send(i) = blocks_from_rank(i)%elements
414 END DO
415 CALL this%group%alltoall(num_blockids_send, num_blockids_recv, 1)
416 DO i = 0, this%numnodes - 1
417 CALL blocks_for_rank(i)%alloc(num_blockids_recv(i))
418 END DO
419 DEALLOCATE (num_blockids_send, num_blockids_recv)
420
421 IF (this%numnodes .GT. 1) THEN
422 DO i = 1, this%numnodes
423 k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
424 CALL this%group%irecv(msgout=blocks_for_rank(k)%data, source=k, request=blocks_for_rank(k)%mpi_request)
425 END DO
426 DO i = 1, this%numnodes
427 k = modulo(this%myrank + i, this%numnodes) ! rank to send to
428 CALL this%group%send(blocks_from_rank(k)%getall(), k, 0)
429 END DO
430 DO i = 0, this%numnodes - 1
431 CALL blocks_for_rank(i)%mpi_request%wait()
432 END DO
433 ELSE
434 blocks_for_rank(0)%data = blocks_from_rank(0)%getall()
435 END IF
436
437 ! Free memory allocated in nonzero_rows
438 CALL nonzero_rows%reset
439
440 ! Make get calls on this%result_blocks_for_rank(...) threadsafe in other routines by updating the internal sorted list
441 DO m = 0, this%numnodes - 1
442 IF ((.NOT. this%result_blocks_for_rank(m)%sorted_up_to_date) .AND. (this%result_blocks_for_rank(m)%elements .GT. 0)) THEN
443 CALL this%result_blocks_for_rank(m)%update_sorted
444 END IF
445 END DO
446
447 ! Create and fill send buffers
448 DO i = 0, this%numnodes - 1
449 bufsize = 0
450 DO j = 1, blocks_for_rank(i)%size
451 k = blocks_for_rank(i)%data(j)
452 bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
453 END DO
454 CALL sendbufs(i)%alloc(bufsize)
455
456 bufsize = 0
457 CALL this%result_blocks_for_rank_buf_offsets(i)%alloc(this%result_blocks_for_rank(i)%elements)
458 DO j = 1, this%result_blocks_for_rank(i)%elements
459 k = this%result_blocks_for_rank(i)%get(j)
460 this%result_blocks_for_rank_buf_offsets(i)%data(j) = bufsize
461 bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
462 END DO
463 CALL this%result_sendbufs(i)%alloc(bufsize)
464
465 bufsize = 0
466 DO j = 1, blocks_for_rank(i)%size
467 k = blocks_for_rank(i)%data(j)
468 CALL dbcsr_get_block_p(this%dbcsr_mat, row=this%coo_rows(k), col=this%coo_cols(k), block=blockp, found=valid)
469 IF (.NOT. valid) THEN
470 cpabort("Block included in our COO and placed on our rank could not be fetched!")
471 END IF
472 bufsize_next = bufsize + SIZE(blockp)
473 sendbufs(i)%data(bufsize + 1:bufsize_next) = reshape(blockp, [SIZE(blockp)])
474 bufsize = bufsize_next
475 END DO
476 END DO
477
478 ! Create receive buffers and mapping from blocks to memory locations
479 DO i = 0, this%numnodes - 1
480 bufsize = 0
481 DO j = 1, blocks_from_rank(i)%elements
482 k = blocks_from_rank(i)%get(j)
483 bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
484 END DO
485 CALL this%recvbufs(i)%alloc(bufsize)
486 bufsize = 0
487 DO j = 1, blocks_from_rank(i)%elements
488 k = blocks_from_rank(i)%get(j)
489 bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
490 this%coo_dptr(k)%target => this%recvbufs(i)%data(bufsize + 1:bufsize_next)
491 bufsize = bufsize_next
492 END DO
493 END DO
494
495 DO i = 0, this%numnodes - 1
496 CALL blocks_for_rank(i)%dealloc
497 CALL blocks_from_rank(i)%reset
498 END DO
499 DEALLOCATE (blocks_for_rank, blocks_from_rank)
500
501 IF (this%numnodes .GT. 1) THEN
502 ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
503 ! and first trying to receive from our left neighbor.
504 DO i = 1, this%numnodes
505 k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
506 CALL this%group%irecv(msgout=this%recvbufs(k)%data, source=k, request=this%recvbufs(k)%mpi_request)
507 k = modulo(this%myrank + i, this%numnodes) ! rank to send to
508 CALL this%group%isend(msgin=sendbufs(k)%data, dest=k, request=sendbufs(k)%mpi_request)
509 END DO
510 DO i = 0, this%numnodes - 1
511 CALL sendbufs(i)%mpi_request%wait()
512 CALL this%recvbufs(i)%mpi_request%wait()
513 END DO
514 ELSE
515 this%recvbufs(0)%data = sendbufs(0)%data
516 END IF
517
518 DO i = 0, this%numnodes - 1
519 CALL sendbufs(i)%dealloc
520 END DO
521 DEALLOCATE (sendbufs)
522
523 this%initialized = .true.
524
525 CALL timestop(handle)
526 END SUBROUTINE submatrix_dissection_init
527
528! **************************************************************************************************
529!> \brief free all associated memory, afterwards submatrix_dissection_init needs to be called again
530!> \param this - object of class submatrix_dissection_type
531! **************************************************************************************************
532 PURE SUBROUTINE submatrix_dissection_final(this)
533 CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
534 INTEGER :: i
535
536 this%initialized = .false.
537
538 IF (ALLOCATED(this%submatrix_sizes)) DEALLOCATE (this%submatrix_sizes)
539 IF (ALLOCATED(this%coo_cols_local)) DEALLOCATE (this%coo_cols_local)
540 IF (ALLOCATED(this%coo_rows_local)) DEALLOCATE (this%coo_rows_local)
541 IF (ALLOCATED(this%coo_col_offsets_local)) DEALLOCATE (this%coo_col_offsets_local)
542 IF (ALLOCATED(this%result_blocks_for_rank_buf_offsets)) THEN
543 DO i = 0, this%numnodes - 1
544 CALL this%result_blocks_for_rank_buf_offsets(i)%dealloc
545 END DO
546 DEALLOCATE (this%result_blocks_for_rank_buf_offsets)
547 END IF
548 IF (ALLOCATED(this%recvbufs)) THEN
549 DO i = 0, this%numnodes - 1
550 CALL this%recvbufs(i)%dealloc
551 END DO
552 DEALLOCATE (this%recvbufs)
553 END IF
554 IF (ALLOCATED(this%result_sendbufs)) THEN
555 DO i = 0, this%numnodes - 1
556 CALL this%result_sendbufs(i)%dealloc
557 END DO
558 DEALLOCATE (this%result_sendbufs)
559 END IF
560 IF (ALLOCATED(this%result_blocks_for_rank)) THEN
561 DO i = 0, this%numnodes - 1
562 CALL this%result_blocks_for_rank(i)%reset
563 END DO
564 DEALLOCATE (this%result_blocks_for_rank)
565 END IF
566 IF (ALLOCATED(this%result_blocks_from_rank)) THEN
567 DO i = 0, this%numnodes - 1
568 CALL this%result_blocks_from_rank(i)%reset
569 END DO
570 DEALLOCATE (this%result_blocks_from_rank)
571 END IF
572 IF (ALLOCATED(this%coo_cols)) DEALLOCATE (this%coo_cols)
573 IF (ALLOCATED(this%coo_rows)) DEALLOCATE (this%coo_rows)
574 IF (ALLOCATED(this%coo_col_offsets)) DEALLOCATE (this%coo_col_offsets)
575 IF (ALLOCATED(this%coo_dptr)) DEALLOCATE (this%coo_dptr)
576 IF (ALLOCATED(this%submatrix_owners)) DEALLOCATE (this%submatrix_owners)
577
578 END SUBROUTINE submatrix_dissection_final
579
580! **************************************************************************************************
581!> \brief generate a specific submatrix
582!> \param this - object of class submatrix_dissection_type
583!> \param sm_id - id of the submatrix to generate
584!> \param sm - generated submatrix
585! **************************************************************************************************
586 SUBROUTINE submatrix_generate_sm(this, sm_id, sm)
587 CLASS(submatrix_dissection_type), INTENT(IN) :: this
588 INTEGER, INTENT(IN) :: sm_id
589 REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(OUT) :: sm
590
591 INTEGER :: sm_dim, i, j, k, offset_x1, offset_x2, offset_y1, &
592 offset_y2, k_limit_left, k_limit_right
593 TYPE(set_type) :: nonzero_rows
594
595 IF (.NOT. this%initialized) THEN
596 cpabort("Submatrix dissection not initialized")
597 END IF
598
599 IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
600 cpabort("This rank is not supposed to construct this submatrix")
601 END IF
602
603 ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
604 CALL nonzero_rows%reset
605 DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
606 DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
607 CALL nonzero_rows%insert(this%coo_rows(j))
608 END DO
609 END DO
610 sm_dim = 0
611 DO i = 1, nonzero_rows%elements
612 sm_dim = sm_dim + this%col_blk_size(nonzero_rows%get(i))
613 END DO
614
615 ALLOCATE (sm(sm_dim, sm_dim))
616 sm = 0
617
618 offset_x1 = 0
619 DO j = 1, nonzero_rows%elements
620 offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
621 offset_y1 = 0
622 k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
623 k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
624 DO i = 1, nonzero_rows%elements
625 offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
626 ! Copy block nonzero_rows(i),nonzero_rows(j) to sm(i,j) (if the block actually exists)
627 k = k_limit_left
628 DO WHILE (k .LE. k_limit_right)
629 IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
630 k = k + 1
631 END DO
632 k_limit_left = k
633 IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
634 sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2) = reshape(this%coo_dptr(k)%target, &
635 (/offset_y2 - offset_y1, offset_x2 - offset_x1/))
636 END IF
637 offset_y1 = offset_y2
638 END DO
639 offset_x1 = offset_x2
640 END DO
641
642 ! Free memory allocated in nonzero_rows
643 CALL nonzero_rows%reset
644
645 END SUBROUTINE submatrix_generate_sm
646
647! **************************************************************************************************
648!> \brief determine submatrix ids that are handled by a specific rank
649!> \param this - object of class submatrix_dissection_type
650!> \param rank - rank id of interest
651!> \param sm_ids - list of submatrix ids handled by that rank
652! **************************************************************************************************
653 SUBROUTINE submatrix_get_sm_ids_for_rank(this, rank, sm_ids)
654 CLASS(submatrix_dissection_type), INTENT(IN) :: this
655 INTEGER, INTENT(IN) :: rank
656 INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(OUT) :: sm_ids
657 INTEGER :: count, i
658
659 IF (.NOT. this%initialized) THEN
660 cpabort("Submatrix dissection not initialized")
661 END IF
662
663 count = 0
664 DO i = 1, this%number_of_submatrices
665 IF (this%submatrix_owners(i) .EQ. rank) count = count + 1
666 END DO
667
668 ALLOCATE (sm_ids(count))
669
670 count = 0
671 DO i = 1, this%number_of_submatrices
672 IF (this%submatrix_owners(i) .EQ. rank) THEN
673 count = count + 1
674 sm_ids(count) = i
675 END IF
676 END DO
677
678 END SUBROUTINE submatrix_get_sm_ids_for_rank
679
680! **************************************************************************************************
681!> \brief copy result columns from a submatrix into result buffer
682!> \param this - object of class submatrix_dissection_type
683!> \param sm_id - id of the submatrix
684!> \param sm - result-submatrix
685! **************************************************************************************************
686 SUBROUTINE submatrix_cpy_resultcol(this, sm_id, sm)
687 CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
688 INTEGER, INTENT(in) :: sm_id
689 REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(IN) :: sm
690
691 TYPE(set_type) :: nonzero_rows
692 INTEGER :: i, j, k, m, sm_col_offset, offset_x1, offset_x2, offset_y1, &
693 offset_y2, bufsize, bufsize_next, k_limit_left, k_limit_right
694 INTEGER, DIMENSION(:), ALLOCATABLE :: buf_offset_idxs
695
696 IF (.NOT. this%initialized) THEN
697 cpabort("Submatrix dissection is not initizalized")
698 END IF
699
700 IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
701 cpabort("This rank is not supposed to operate on this submatrix")
702 END IF
703
704 ALLOCATE (buf_offset_idxs(0:this%numnodes - 1))
705 buf_offset_idxs = 1
706
707 ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
708 sm_col_offset = 0
709 DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
710 DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
711 CALL nonzero_rows%insert(this%coo_rows(j))
712 END DO
713 END DO
714
715 sm_col_offset = 0
716 DO i = 1, nonzero_rows%elements
717 IF (nonzero_rows%get(i) .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
718 ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
719 sm_col_offset = i
720 EXIT
721 END IF
722 END DO
723 IF (sm_col_offset .EQ. 0) THEN
724 cpabort("Could not determine relevant result columns of submatrix")
725 END IF
726
727 offset_x1 = 0
728 DO j = 1, nonzero_rows%elements
729 offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
730 ! We only want to copy the blocks from the result columns
731 IF ((j .GE. sm_col_offset) .AND. (j .LT. sm_col_offset + this%cols_per_sm)) THEN
732 offset_y1 = 0
733 k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
734 k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
735 DO i = 1, nonzero_rows%elements
736 offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
737 ! Check if sm(i,j), i.e., (nonzero_rows(i),nonzero_rows(j)) exists in the original matrix and if so, copy it into the
738 ! correct send buffer.
739 k = k_limit_left
740 DO WHILE (k .LE. k_limit_right)
741 IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
742 k = k + 1
743 END DO
744 k_limit_left = k
745 IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
746 CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=this%coo_cols(k), &
747 processor=m)
748 DO WHILE (buf_offset_idxs(m) .LE. this%result_blocks_for_rank(m)%elements)
749 IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .GE. k) EXIT
750 buf_offset_idxs(m) = buf_offset_idxs(m) + 1
751 END DO
752 IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .NE. k) THEN
753 cpabort("Could not determine buffer offset for block")
754 END IF
755 bufsize = this%result_blocks_for_rank_buf_offsets(m)%data(buf_offset_idxs(m))
756 bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
757 this%result_sendbufs(m)%data(bufsize + 1:bufsize_next) = reshape( &
758 sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2), &
759 (/bufsize_next - bufsize/))
760 END IF
761 offset_y1 = offset_y2
762 END DO
763 END IF
764 offset_x1 = offset_x2
765 END DO
766
767 DEALLOCATE (buf_offset_idxs)
768
769 ! Free memory in set
770 CALL nonzero_rows%reset
771
772 END SUBROUTINE submatrix_cpy_resultcol
773
774! **************************************************************************************************
775!> \brief finalize results back into a dbcsr matrix
776!> \param this - object of class submatrix_dissection_type
777!> \param resultmat - result dbcsr matrix
778!> \par History
779!> 2020.02 created [Michael Lass]
780!> 2020.05 add time measurements [Michael Lass]
781! **************************************************************************************************
782 SUBROUTINE submatrix_communicate_results(this, resultmat)
783 CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
784 TYPE(dbcsr_type), INTENT(INOUT) :: resultmat
785
786 INTEGER :: i, j, k, cur_row, cur_col, cur_sm, cur_buf, last_buf, &
787 bufsize, bufsize_next, row_size, col_size
788 REAL(kind=dp), DIMENSION(:), POINTER :: vector
789 TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: result_recvbufs
790
791 CHARACTER(LEN=*), PARAMETER :: routinen = 'submatrix_communicate_results'
792 INTEGER :: handle
793
794 CALL timeset(routinen, handle)
795
796 ALLOCATE (result_recvbufs(0:this%numnodes - 1))
797 DO i = 0, this%numnodes - 1
798 bufsize = 0
799 DO j = 1, this%result_blocks_from_rank(i)%elements
800 k = this%result_blocks_from_rank(i)%get(j)
801 bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
802 END DO
803 CALL result_recvbufs(i)%alloc(bufsize)
804 END DO
805
806 ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
807 ! and first trying to receive from our left neighbor.
808 IF (this%numnodes .GT. 1) THEN
809 DO i = 1, this%numnodes
810 k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
811 CALL this%group%irecv(msgout=result_recvbufs(k)%data, source=k, request=result_recvbufs(k)%mpi_request)
812 k = modulo(this%myrank + i, this%numnodes) ! rank to send to
813 CALL this%group%isend(msgin=this%result_sendbufs(k)%data, dest=k, request=this%result_sendbufs(k)%mpi_request)
814 END DO
815 DO i = 0, this%numnodes - 1
816 CALL this%result_sendbufs(i)%mpi_request%wait()
817 CALL result_recvbufs(i)%mpi_request%wait()
818 END DO
819 ELSE
820 result_recvbufs(0)%data = this%result_sendbufs(0)%data
821 END IF
822
823 last_buf = -1
824 bufsize = 0
825 DO i = 1, this%local_blocks
826 cur_col = this%coo_cols_local(i)
827 cur_row = this%coo_rows_local(i)
828 cur_sm = (cur_col - 1)/this%cols_per_sm + 1
829 cur_buf = this%submatrix_owners(cur_sm)
830 IF (cur_buf .GT. last_buf) bufsize = 0
831 row_size = this%row_blk_size(cur_row)
832 col_size = this%col_blk_size(cur_col)
833 bufsize_next = bufsize + row_size*col_size
834 vector => result_recvbufs(cur_buf)%data(bufsize + 1:bufsize_next)
835 CALL dbcsr_put_block(matrix=resultmat, row=cur_row, col=cur_col, &
836 block=reshape(vector, [row_size, col_size]))
837 bufsize = bufsize_next
838 last_buf = cur_buf
839 END DO
840
841 DO i = 0, this%numnodes - 1
842 CALL result_recvbufs(i)%dealloc
843 END DO
844 DEALLOCATE (result_recvbufs)
845
846 CALL dbcsr_finalize(resultmat)
847
848 CALL timestop(handle)
849 END SUBROUTINE submatrix_communicate_results
850
851! **************************************************************************************************
852!> \brief sort two integer arrays using quicksort, using the second list as second-level sorting criterion
853!> \param arr_a - first input array
854!> \param arr_b - second input array
855!> \param left - left boundary of region to be sorted
856!> \param right - right boundary of region to be sorted
857! **************************************************************************************************
858 RECURSIVE PURE SUBROUTINE qsort_two(arr_a, arr_b, left, right)
859
860 INTEGER, DIMENSION(:), INTENT(inout) :: arr_a, arr_b
861 INTEGER, INTENT(in) :: left, right
862
863 INTEGER :: i, j, pivot_a, pivot_b, tmp
864
865 IF (right - left .LT. 1) RETURN
866
867 i = left
868 j = right - 1
869 pivot_a = arr_a(right)
870 pivot_b = arr_b(right)
871
872 DO
873 DO WHILE ((arr_a(i) .LT. pivot_a) .OR. ((arr_a(i) .EQ. pivot_a) .AND. (arr_b(i) .LT. pivot_b)))
874 i = i + 1
875 END DO
876 DO WHILE ((j .GT. i) .AND. ((arr_a(j) .GT. pivot_a) .OR. ((arr_a(j) .EQ. pivot_a) .AND. (arr_b(j) .GE. pivot_b))))
877 j = j - 1
878 END DO
879 IF (i .LT. j) THEN
880 tmp = arr_a(i)
881 arr_a(i) = arr_a(j)
882 arr_a(j) = tmp
883 tmp = arr_b(i)
884 arr_b(i) = arr_b(j)
885 arr_b(j) = tmp
886 ELSE
887 EXIT
888 END IF
889 END DO
890
891 IF ((arr_a(i) .GT. pivot_a) .OR. (arr_a(i) .EQ. pivot_a .AND. arr_b(i) .GT. pivot_b)) THEN
892 tmp = arr_a(i)
893 arr_a(i) = arr_a(right)
894 arr_a(right) = tmp
895 tmp = arr_b(i)
896 arr_b(i) = arr_b(right)
897 arr_b(right) = tmp
898 END IF
899
900 IF (i - 1 .GT. left) CALL qsort_two(arr_a, arr_b, left, i - 1)
901 IF (i + 1 .LT. right) CALL qsort_two(arr_a, arr_b, i + 1, right)
902
903 END SUBROUTINE qsort_two
904
905END MODULE submatrix_dissection
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
collects all references to literature in CP2K as new algorithms / method are included from literature...
integer, save, public lass2018
subroutine, public dbcsr_iterator_next_block(iterator, row, column, block, block_number_argument_has_been_removed, row_size, col_size, row_offset, col_offset)
...
logical function, public dbcsr_iterator_blocks_left(iterator)
...
subroutine, public dbcsr_iterator_stop(iterator)
...
subroutine, public dbcsr_get_block_p(matrix, row, col, block, found, row_size, col_size)
...
subroutine, public dbcsr_get_info(matrix, nblkrows_total, nblkcols_total, nfullrows_total, nfullcols_total, nblkrows_local, nblkcols_local, nfullrows_local, nfullcols_local, my_prow, my_pcol, local_rows, local_cols, proc_row_dist, proc_col_dist, row_blk_size, col_blk_size, row_blk_offset, col_blk_offset, distribution, name, matrix_type, group)
...
subroutine, public dbcsr_get_stored_coordinates(matrix, row, column, processor)
...
subroutine, public dbcsr_finalize(matrix)
...
subroutine, public dbcsr_iterator_readonly_start(iterator, matrix, shared, dynamic, dynamic_byrows)
Like dbcsr_iterator_start() but with matrix being INTENT(IN). When invoking this routine,...
subroutine, public dbcsr_put_block(matrix, row, col, block, summation)
...
subroutine, public dbcsr_distribution_get(dist, row_dist, col_dist, nrows, ncols, has_threads, group, mynode, numnodes, nprows, npcols, myprow, mypcol, pgrid, subgroups_defined, prow_group, pcol_group)
...
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public dp
Definition kinds.F:34
Interface to the message passing library MPI.
subroutine submatrix_get_relevant_sm_columns(this, sm_id, first, last)
determine which columns of the submatrix are relevant for the result matrix