(git:374b731)
Loading...
Searching...
No Matches
submatrix_dissection.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \note
10!> This module implements a modified version of the submatrix method, proposed in
11!> M. Lass, S. Mohr, H. Wiebeler, T. Kuehne, C. Plessl
12!> A Massively Parallel Algorithm for the Approximate Calculation of Inverse p-th Roots of Large Sparse Matrices
13!> Proc. Platform for Advanced Scientific Computing (PASC) Conference, ACM, 2018
14!>
15!> The method is extended to minimize the required data transfers and floating-point operations under the assumption that non-zero
16!> blocks of the matrix are close to its diagonal.
17!>
18!> Submatrices can be constructed not for single columns of the matrix but for a set of w consecutive submatrices. The underlying
19!> assumption is that columns next to each other have relatively similar occupation patterns, i.e., constructing a submatrix from
20!> columns x and x+1 should not lead to a much bigger submatrix than contructing it only from column x.
21!>
22!> The construction of the submatrices requires communication between all ranks. It is crucial to implement this communication as
23!> efficient as possible, i.e., data should only ever be transferred once between two ranks and message sizes need to be
24!> sufficiently large to utilize the communication bandwidth. To achieve this, we communicate the required blocks for all
25!> submatrices at once and copy them to large buffers before transmitting them via MPI.
26!>
27!> Note on multi-threading:
28!> Submatrices can be constructed and processed in parallel by multiple threads. However, generate_submatrix, get_sm_ids_for_rank
29!> and copy_resultcol are the only thread-safe routines in this module. All other routines involve MPI communication or operate on
30!> common, non-protected data and are hence not thread-safe.
31!>
32!> TODO:
33!> * generic types (for now only dp supported)
34!> * optimization of threaded initialization
35!> * sanity checks at the beginning of all methods
36!>
37!> \author Michael Lass
38! **************************************************************************************************
39
41
42 USE bibliography, ONLY: lass2018,&
43 cite_reference
44 USE dbcsr_api, ONLY: &
45 dbcsr_distribution_get, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
46 dbcsr_get_info, dbcsr_get_stored_coordinates, dbcsr_iterator_blocks_left, &
47 dbcsr_iterator_next_block, dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, &
48 dbcsr_put_block, dbcsr_type
49 USE kinds, ONLY: dp
51 USE submatrix_types, ONLY: buffer_type,&
54 set_type,&
56
57!$ USE omp_lib, ONLY: omp_get_max_threads, omp_get_thread_num
58
59#include "./base/base_uses.f90"
60
61 IMPLICIT NONE
62 PRIVATE
63
64 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'submatrix_dissection'
65
67 TYPE(dbcsr_type) :: dbcsr_mat
68 TYPE(dbcsr_distribution_type) :: dist
69 LOGICAL :: initialized = .false.
70 TYPE(mp_comm_type) :: group
71 INTEGER :: numnodes, myrank, nblkcols, nblkrows, nblks, local_blocks, &
72 cols_per_sm, number_of_submatrices
73 INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size
74 INTEGER, DIMENSION(:), ALLOCATABLE :: coo_cols, coo_rows, coo_col_offsets, coo_cols_local, coo_rows_local, &
75 coo_col_offsets_local, submatrix_owners, submatrix_sizes
76 TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: recvbufs, result_sendbufs ! Indexing starts with 0 to match rank ids!
77 TYPE(set_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank, result_blocks_from_rank
78 TYPE(bufptr_type), DIMENSION(:), ALLOCATABLE :: coo_dptr
79 TYPE(intbuffer_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank_buf_offsets
80 CONTAINS
81 PROCEDURE :: init => submatrix_dissection_init
82 PROCEDURE :: final => submatrix_dissection_final
83 PROCEDURE :: get_sm_ids_for_rank => submatrix_get_sm_ids_for_rank
84 PROCEDURE :: generate_submatrix => submatrix_generate_sm
85 PROCEDURE :: copy_resultcol => submatrix_cpy_resultcol
86 PROCEDURE :: communicate_results => submatrix_communicate_results
87 PROCEDURE :: get_relevant_sm_columns => submatrix_get_relevant_sm_columns
89
90CONTAINS
91
92! **************************************************************************************************
93!> \brief determine which columns of the submatrix are relevant for the result matrix
94!> \param this - object of class submatrix_dissection_type
95!> \param sm_id - id of the submatrix
96!> \param first - first column of submatrix that is relevant
97!> \param last - last column of submatrix that is relevant
98! **************************************************************************************************
99 SUBROUTINE submatrix_get_relevant_sm_columns(this, sm_id, first, last)
100 CLASS(submatrix_dissection_type), INTENT(IN) :: this
101 INTEGER, INTENT(IN) :: sm_id
102 INTEGER, INTENT(OUT) :: first, last
103 INTEGER :: i, j, blkid
104 TYPE(set_type) :: nonzero_rows
105
106 ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
107 DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
108 DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
109 CALL nonzero_rows%insert(this%coo_rows(j))
110 END DO
111 END DO
112
113 first = 1
114 DO i = 1, nonzero_rows%elements
115 blkid = nonzero_rows%get(i)
116 IF (blkid .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
117 ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
118 ! Now add up block sizes to determine the last one as well
119 last = first - 1
120 DO j = i, nonzero_rows%elements
121 blkid = nonzero_rows%get(j)
122 last = last + this%col_blk_size(blkid)
123 IF (blkid .EQ. (sm_id)*this%cols_per_sm) EXIT
124 END DO
125 EXIT
126 END IF
127 first = first + this%col_blk_size(blkid)
128 END DO
129
130 CALL nonzero_rows%reset
131
133
134! **************************************************************************************************
135!> \brief initialize submatrix dissection and communicate, needs to be called before constructing any submatrices.
136!> \param this - object of class submatrix_dissection_type
137!> \param matrix_p - dbcsr input matrix
138!> \par History
139!> 2020.02 created [Michael Lass]
140!> 2020.05 add time measurements [Michael Lass]
141! **************************************************************************************************
142 SUBROUTINE submatrix_dissection_init(this, matrix_p) ! Should be PURE but the iterator routines are not
143 CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
144 TYPE(dbcsr_type), INTENT(IN) :: matrix_p
145
146 INTEGER :: cur_row, cur_col, cur_blk, i, j, k, l, m, l_limit_left, l_limit_right, &
147 bufsize, bufsize_next
148 INTEGER, DIMENSION(:), ALLOCATABLE :: blocks_per_rank, coo_dsplcmnts, num_blockids_send, num_blockids_recv
149 TYPE(dbcsr_iterator_type) :: iter
150 TYPE(set_type) :: nonzero_rows
151 REAL(KIND=dp) :: flops_total, flops_per_rank, flops_per_sm, flops_threshold, &
152 flops_current, flops_remaining
153
154 ! Indexing for the following arrays starts with 0 to match rank ids
155 TYPE(set_type), DIMENSION(:), ALLOCATABLE :: blocks_from_rank
156 TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: sendbufs
157 TYPE(intbuffer_type), DIMENSION(:), ALLOCATABLE :: blocks_for_rank
158
159 ! Additional structures for threaded parts
160 INTEGER :: numthreads, mytid, group_handle
161 ! Indexing starts at 0 to match thread ids
162 TYPE(setarray_type), DIMENSION(:), ALLOCATABLE :: nonzero_rows_t
163 TYPE(setarray_type), DIMENSION(:), ALLOCATABLE :: result_blocks_from_rank_t, result_blocks_for_rank_t, &
164 blocks_from_rank_t
165
166 LOGICAL :: valid
167 REAL(KIND=dp), DIMENSION(:), POINTER :: blockp
168
169 CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_dissection_init'
170 INTEGER :: handle
171
172 CALL timeset(routinen, handle)
173 CALL cite_reference(lass2018)
174
175 this%dbcsr_mat = matrix_p
176 CALL dbcsr_get_info(matrix=this%dbcsr_mat, nblkcols_total=this%nblkcols, nblkrows_total=this%nblkrows, &
177 row_blk_size=this%row_blk_size, col_blk_size=this%col_blk_size, group=group_handle, distribution=this%dist)
178 CALL dbcsr_distribution_get(dist=this%dist, mynode=this%myrank, numnodes=this%numnodes)
179
180 CALL this%group%set_handle(group_handle)
181
182 IF (this%nblkcols .NE. this%nblkrows) THEN
183 cpabort("Number of block rows and cols need to be identical")
184 END IF
185
186 DO i = 1, this%nblkcols
187 IF (this%col_blk_size(i) .NE. this%row_blk_size(i)) THEN
188 cpabort("Block row sizes and col sizes need to be identical")
189 END IF
190 END DO
191
192 ! TODO: We could do even more sanity checks here, e.g., the matrix must not be stored symmetrically
193
194 ! For the submatrix method, we need global knwoledge about which blocks are actually used. Therefore, we create a COO
195 ! representation of the blocks (without their contents) on all ranks.
196
197 ! TODO: Right now, the COO contains all blocks. Also we transfer all blocks. We could skip half of them due to the matrix
198 ! being symmetric (however, we need to transpose the blocks). This can increase performance only by a factor of 2 and is
199 ! therefore deferred.
200
201 ! Determine number of locally stored blocks
202 this%local_blocks = 0
203 CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.true.)
204 DO WHILE (dbcsr_iterator_blocks_left(iter))
205 CALL dbcsr_iterator_next_block(iter, cur_row, cur_col, cur_blk)
206 this%local_blocks = this%local_blocks + 1
207 END DO
208 CALL dbcsr_iterator_stop(iter)
209
210 ALLOCATE (this%coo_cols_local(this%local_blocks), this%coo_rows_local(this%local_blocks), blocks_per_rank(this%numnodes), &
211 coo_dsplcmnts(this%numnodes), this%coo_col_offsets_local(this%nblkcols + 1), &
212 blocks_for_rank(0:this%numnodes - 1), blocks_from_rank(0:this%numnodes - 1), &
213 sendbufs(0:this%numnodes - 1), this%recvbufs(0:this%numnodes - 1), this%result_sendbufs(0:this%numnodes - 1), &
214 this%result_blocks_for_rank(0:this%numnodes - 1), this%result_blocks_from_rank(0:this%numnodes - 1), &
215 this%result_blocks_for_rank_buf_offsets(0:this%numnodes - 1))
216
217 i = 1
218 CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.true.)
219 DO WHILE (dbcsr_iterator_blocks_left(iter))
220 CALL dbcsr_iterator_next_block(iter, cur_row, cur_col, cur_blk)
221 this%coo_cols_local(i) = cur_col
222 this%coo_rows_local(i) = cur_row
223 i = i + 1
224 END DO
225 CALL dbcsr_iterator_stop(iter)
226
227 ! We only know how many blocks we own. What's with the other ranks?
228 CALL this%group%allgather(msgout=this%local_blocks, msgin=blocks_per_rank)
229 coo_dsplcmnts(1) = 0
230 DO i = 1, this%numnodes - 1
231 coo_dsplcmnts(i + 1) = coo_dsplcmnts(i) + blocks_per_rank(i)
232 END DO
233
234 ! Get a global view on the matrix
235 this%nblks = sum(blocks_per_rank)
236 ALLOCATE (this%coo_cols(this%nblks), this%coo_rows(this%nblks), this%coo_col_offsets(this%nblkcols + 1), &
237 this%coo_dptr(this%nblks))
238 CALL this%group%allgatherv(msgout=this%coo_rows_local, msgin=this%coo_rows, rcount=blocks_per_rank, &
239 rdispl=coo_dsplcmnts)
240 CALL this%group%allgatherv(msgout=this%coo_cols_local, msgin=this%coo_cols, rcount=blocks_per_rank, &
241 rdispl=coo_dsplcmnts)
242
243 DEALLOCATE (blocks_per_rank, coo_dsplcmnts)
244
245 ! Sort COO arrays according to their columns
246 CALL qsort_two(this%coo_cols_local, this%coo_rows_local, 1, this%local_blocks)
247 CALL qsort_two(this%coo_cols, this%coo_rows, 1, this%nblks)
248
249 ! Get COO array offsets for columns to accelerate lookups
250 this%coo_col_offsets(this%nblkcols + 1) = this%nblks + 1
251 j = 1
252 DO i = 1, this%nblkcols
253 DO WHILE ((j .LE. this%nblks))
254 IF (this%coo_cols(j) .GE. i) EXIT
255 j = j + 1
256 END DO
257 this%coo_col_offsets(i) = j
258 END DO
259
260 ! Same for local COO
261 this%coo_col_offsets_local(this%nblkcols + 1) = this%local_blocks + 1
262 j = 1
263 DO i = 1, this%nblkcols
264 DO WHILE ((j .LE. this%local_blocks))
265 IF (this%coo_cols_local(j) .GE. i) EXIT
266 j = j + 1
267 END DO
268 this%coo_col_offsets_local(i) = j
269 END DO
270
271 ! We could combine multiple columns to generate a single submatrix. For now, we have not found a practical use-case for this
272 ! so we only look at single columns for now.
273 this%cols_per_sm = 1
274
275 ! Determine sizes of all submatrices. This is required in order to assess the computational effort that is required to process
276 ! the submatrices.
277 this%number_of_submatrices = this%nblkcols/this%cols_per_sm
278 ALLOCATE (this%submatrix_sizes(this%number_of_submatrices))
279 this%submatrix_sizes = 0
280 flops_total = 0.0d0
281 DO i = 1, this%number_of_submatrices
282 CALL nonzero_rows%reset
283 DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm ! all colums that determine submatrix i
284 DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
285 CALL nonzero_rows%insert(this%coo_rows(k))
286 END DO
287 END DO
288 DO j = 1, nonzero_rows%elements
289 this%submatrix_sizes(i) = this%submatrix_sizes(i) + this%col_blk_size(nonzero_rows%get(j))
290 END DO
291 flops_total = flops_total + 2.0d0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
292 END DO
293
294 ! Create mapping from submatrices to ranks. Since submatrices can be of different sizes, we need to perform some load
295 ! balancing here. For that we assume that arithmetic operations performed on the submatrices scale cubically.
296 ALLOCATE (this%submatrix_owners(this%number_of_submatrices))
297 flops_per_sm = flops_total/this%number_of_submatrices
298 flops_per_rank = flops_total/this%numnodes
299 flops_current = 0.0d0
300 j = 0
301 DO i = 1, this%number_of_submatrices
302 this%submatrix_owners(i) = j
303 flops_current = flops_current + 2.0d0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
304 flops_remaining = flops_total - flops_current
305 flops_threshold = (this%numnodes - j - 1)*flops_per_rank
306 IF ((j .LT. (this%numnodes - 1)) &
307 .AND. ((flops_remaining .LE. flops_threshold &
308 .OR. (this%number_of_submatrices - i) .LT. (this%numnodes - j)))) THEN
309 j = j + 1
310 flops_total = flops_total - flops_current
311 flops_current = 0.0d0
312 END IF
313 END DO
314
315 ! Prepare data structures for multithreaded loop
316 numthreads = 1
317!$ numthreads = omp_get_max_threads()
318
319 ALLOCATE (result_blocks_from_rank_t(0:numthreads - 1), &
320 result_blocks_for_rank_t(0:numthreads - 1), &
321 blocks_from_rank_t(0:numthreads - 1), &
322 nonzero_rows_t(0:numthreads - 1))
323
324 ! Figure out which blocks we need to receive. Blocks are identified here as indices into our COO representation.
325 ! TODO: This currently shows limited parallel efficiency. Investigate further.
326
327 !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
328 !$OMP NUM_THREADS(numthreads) &
329 !$OMP PRIVATE(i,j,k,l,m,l_limit_left,l_limit_right,cur_col,cur_row,mytid) &
330 !$OMP SHARED(result_blocks_from_rank_t,result_blocks_for_rank_t,blocks_from_rank_t,this,numthreads,nonzero_rows_t)
331 mytid = 0
332!$ mytid = omp_get_thread_num()
333
334 ALLOCATE (nonzero_rows_t(mytid)%sets(1), &
335 result_blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1), &
336 result_blocks_for_rank_t(mytid)%sets(0:this%numnodes - 1), &
337 blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1))
338
339 !$OMP DO schedule(guided)
340 DO i = 1, this%number_of_submatrices
341 CALL nonzero_rows_t(mytid)%sets(1)%reset
342 DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm ! all colums that determine submatrix i
343 DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
344 ! This block will be required to assemble the final block matrix as it is within an inducing column for submatrix i.
345 ! Figure out who stores it and insert it into the result_blocks_* sets.
346 CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=j, processor=m)
347 IF (m .EQ. this%myrank) THEN
348 CALL result_blocks_from_rank_t(mytid)%sets(this%submatrix_owners(i))%insert(k)
349 END IF
350 IF (this%submatrix_owners(i) .EQ. this%myrank) THEN
351 CALL nonzero_rows_t(mytid)%sets(1)%insert(this%coo_rows(k))
352 CALL result_blocks_for_rank_t(mytid)%sets(m)%insert(k)
353 END IF
354 END DO
355 END DO
356
357 IF (this%submatrix_owners(i) .NE. this%myrank) cycle
358
359 ! In the following, we determine all blocks required to build the submatrix. We interpret nonzero_rows_t(mytid)(j) as
360 ! column and nonzero_rows_t(mytid)(k) as row.
361 DO j = 1, nonzero_rows_t(mytid)%sets(1)%elements
362 cur_col = nonzero_rows_t(mytid)%sets(1)%get(j)
363 l_limit_left = this%coo_col_offsets(cur_col)
364 l_limit_right = this%coo_col_offsets(cur_col + 1) - 1
365 DO k = 1, nonzero_rows_t(mytid)%sets(1)%elements
366 cur_row = nonzero_rows_t(mytid)%sets(1)%get(k)
367 l = l_limit_left
368 DO WHILE (l .LE. l_limit_right)
369 IF (this%coo_rows(l) .GE. cur_row) EXIT
370 l = l + 1
371 END DO
372 l_limit_left = l
373 IF (l .LE. l_limit_right) THEN
374 IF (this%coo_rows(l) .EQ. cur_row) THEN
375 ! We found a valid block. Figure out what to do with it.
376 CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(l), &
377 column=this%coo_cols(l), processor=m)
378 CALL blocks_from_rank_t(mytid)%sets(m)%insert(l)
379 END IF
380 END IF
381 END DO
382 END DO
383 END DO
384 !$OMP END DO
385 !$OMP END PARALLEL
386
387 ! Merge partial results from threads
388 DO i = 0, this%numnodes - 1
389 DO j = 0, numthreads - 1
390 DO k = 1, result_blocks_from_rank_t(j)%sets(i)%elements
391 CALL this%result_blocks_from_rank(i)%insert(result_blocks_from_rank_t(j)%sets(i)%get(k))
392 END DO
393 CALL result_blocks_from_rank_t(j)%sets(i)%reset
394 DO k = 1, result_blocks_for_rank_t(j)%sets(i)%elements
395 CALL this%result_blocks_for_rank(i)%insert(result_blocks_for_rank_t(j)%sets(i)%get(k))
396 END DO
397 CALL result_blocks_for_rank_t(j)%sets(i)%reset
398 DO k = 1, blocks_from_rank_t(j)%sets(i)%elements
399 CALL blocks_from_rank(i)%insert(blocks_from_rank_t(j)%sets(i)%get(k))
400 END DO
401 CALL blocks_from_rank_t(j)%sets(i)%reset
402 END DO
403 END DO
404 DO i = 0, numthreads - 1
405 CALL nonzero_rows_t(i)%sets(1)%reset
406 DEALLOCATE (result_blocks_from_rank_t(i)%sets, result_blocks_for_rank_t(i)%sets, blocks_from_rank_t(i)%sets, &
407 nonzero_rows_t(i)%sets)
408 END DO
409 DEALLOCATE (result_blocks_from_rank_t, result_blocks_for_rank_t, blocks_from_rank_t, nonzero_rows_t)
410
411 ! Make other ranks aware of our needs
412 ALLOCATE (num_blockids_send(0:this%numnodes - 1), num_blockids_recv(0:this%numnodes - 1))
413 DO i = 0, this%numnodes - 1
414 num_blockids_send(i) = blocks_from_rank(i)%elements
415 END DO
416 CALL this%group%alltoall(num_blockids_send, num_blockids_recv, 1)
417 DO i = 0, this%numnodes - 1
418 CALL blocks_for_rank(i)%alloc(num_blockids_recv(i))
419 END DO
420 DEALLOCATE (num_blockids_send, num_blockids_recv)
421
422 IF (this%numnodes .GT. 1) THEN
423 DO i = 1, this%numnodes
424 k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
425 CALL this%group%irecv(msgout=blocks_for_rank(k)%data, source=k, request=blocks_for_rank(k)%mpi_request)
426 END DO
427 DO i = 1, this%numnodes
428 k = modulo(this%myrank + i, this%numnodes) ! rank to send to
429 CALL this%group%send(blocks_from_rank(k)%getall(), k, 0)
430 END DO
431 DO i = 0, this%numnodes - 1
432 CALL blocks_for_rank(i)%mpi_request%wait()
433 END DO
434 ELSE
435 blocks_for_rank(0)%data = blocks_from_rank(0)%getall()
436 END IF
437
438 ! Free memory allocated in nonzero_rows
439 CALL nonzero_rows%reset
440
441 ! Make get calls on this%result_blocks_for_rank(...) threadsafe in other routines by updating the internal sorted list
442 DO m = 0, this%numnodes - 1
443 IF ((.NOT. this%result_blocks_for_rank(m)%sorted_up_to_date) .AND. (this%result_blocks_for_rank(m)%elements .GT. 0)) THEN
444 CALL this%result_blocks_for_rank(m)%update_sorted
445 END IF
446 END DO
447
448 ! Create and fill send buffers
449 DO i = 0, this%numnodes - 1
450 bufsize = 0
451 DO j = 1, blocks_for_rank(i)%size
452 k = blocks_for_rank(i)%data(j)
453 bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
454 END DO
455 CALL sendbufs(i)%alloc(bufsize)
456
457 bufsize = 0
458 CALL this%result_blocks_for_rank_buf_offsets(i)%alloc(this%result_blocks_for_rank(i)%elements)
459 DO j = 1, this%result_blocks_for_rank(i)%elements
460 k = this%result_blocks_for_rank(i)%get(j)
461 this%result_blocks_for_rank_buf_offsets(i)%data(j) = bufsize
462 bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
463 END DO
464 CALL this%result_sendbufs(i)%alloc(bufsize)
465
466 bufsize = 0
467 DO j = 1, blocks_for_rank(i)%size
468 k = blocks_for_rank(i)%data(j)
469 CALL dbcsr_get_block_p(this%dbcsr_mat, row=this%coo_rows(k), col=this%coo_cols(k), block=blockp, found=valid)
470 IF (.NOT. valid) THEN
471 cpabort("Block included in our COO and placed on our rank could not be fetched!")
472 END IF
473 bufsize_next = bufsize + SIZE(blockp)
474 sendbufs(i)%data(bufsize + 1:bufsize_next) = blockp
475 bufsize = bufsize_next
476 END DO
477 END DO
478
479 ! Create receive buffers and mapping from blocks to memory locations
480 DO i = 0, this%numnodes - 1
481 bufsize = 0
482 DO j = 1, blocks_from_rank(i)%elements
483 k = blocks_from_rank(i)%get(j)
484 bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
485 END DO
486 CALL this%recvbufs(i)%alloc(bufsize)
487 bufsize = 0
488 DO j = 1, blocks_from_rank(i)%elements
489 k = blocks_from_rank(i)%get(j)
490 bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
491 this%coo_dptr(k)%target => this%recvbufs(i)%data(bufsize + 1:bufsize_next)
492 bufsize = bufsize_next
493 END DO
494 END DO
495
496 DO i = 0, this%numnodes - 1
497 CALL blocks_for_rank(i)%dealloc
498 CALL blocks_from_rank(i)%reset
499 END DO
500 DEALLOCATE (blocks_for_rank, blocks_from_rank)
501
502 IF (this%numnodes .GT. 1) THEN
503 ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
504 ! and first trying to receive from our left neighbor.
505 DO i = 1, this%numnodes
506 k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
507 CALL this%group%irecv(msgout=this%recvbufs(k)%data, source=k, request=this%recvbufs(k)%mpi_request)
508 k = modulo(this%myrank + i, this%numnodes) ! rank to send to
509 CALL this%group%isend(msgin=sendbufs(k)%data, dest=k, request=sendbufs(k)%mpi_request)
510 END DO
511 DO i = 0, this%numnodes - 1
512 CALL sendbufs(i)%mpi_request%wait()
513 CALL this%recvbufs(i)%mpi_request%wait()
514 END DO
515 ELSE
516 this%recvbufs(0)%data = sendbufs(0)%data
517 END IF
518
519 DO i = 0, this%numnodes - 1
520 CALL sendbufs(i)%dealloc
521 END DO
522 DEALLOCATE (sendbufs)
523
524 this%initialized = .true.
525
526 CALL timestop(handle)
527 END SUBROUTINE submatrix_dissection_init
528
529! **************************************************************************************************
530!> \brief free all associated memory, afterwards submatrix_dissection_init needs to be called again
531!> \param this - object of class submatrix_dissection_type
532! **************************************************************************************************
533 PURE SUBROUTINE submatrix_dissection_final(this)
534 CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
535 INTEGER :: i
536
537 this%initialized = .false.
538
539 IF (ALLOCATED(this%submatrix_sizes)) DEALLOCATE (this%submatrix_sizes)
540 IF (ALLOCATED(this%coo_cols_local)) DEALLOCATE (this%coo_cols_local)
541 IF (ALLOCATED(this%coo_rows_local)) DEALLOCATE (this%coo_rows_local)
542 IF (ALLOCATED(this%coo_col_offsets_local)) DEALLOCATE (this%coo_col_offsets_local)
543 IF (ALLOCATED(this%result_blocks_for_rank_buf_offsets)) THEN
544 DO i = 0, this%numnodes - 1
545 CALL this%result_blocks_for_rank_buf_offsets(i)%dealloc
546 END DO
547 DEALLOCATE (this%result_blocks_for_rank_buf_offsets)
548 END IF
549 IF (ALLOCATED(this%recvbufs)) THEN
550 DO i = 0, this%numnodes - 1
551 CALL this%recvbufs(i)%dealloc
552 END DO
553 DEALLOCATE (this%recvbufs)
554 END IF
555 IF (ALLOCATED(this%result_sendbufs)) THEN
556 DO i = 0, this%numnodes - 1
557 CALL this%result_sendbufs(i)%dealloc
558 END DO
559 DEALLOCATE (this%result_sendbufs)
560 END IF
561 IF (ALLOCATED(this%result_blocks_for_rank)) THEN
562 DO i = 0, this%numnodes - 1
563 CALL this%result_blocks_for_rank(i)%reset
564 END DO
565 DEALLOCATE (this%result_blocks_for_rank)
566 END IF
567 IF (ALLOCATED(this%result_blocks_from_rank)) THEN
568 DO i = 0, this%numnodes - 1
569 CALL this%result_blocks_from_rank(i)%reset
570 END DO
571 DEALLOCATE (this%result_blocks_from_rank)
572 END IF
573 IF (ALLOCATED(this%coo_cols)) DEALLOCATE (this%coo_cols)
574 IF (ALLOCATED(this%coo_rows)) DEALLOCATE (this%coo_rows)
575 IF (ALLOCATED(this%coo_col_offsets)) DEALLOCATE (this%coo_col_offsets)
576 IF (ALLOCATED(this%coo_dptr)) DEALLOCATE (this%coo_dptr)
577 IF (ALLOCATED(this%submatrix_owners)) DEALLOCATE (this%submatrix_owners)
578
579 END SUBROUTINE submatrix_dissection_final
580
581! **************************************************************************************************
582!> \brief generate a specific submatrix
583!> \param this - object of class submatrix_dissection_type
584!> \param sm_id - id of the submatrix to generate
585!> \param sm - generated submatrix
586! **************************************************************************************************
587 SUBROUTINE submatrix_generate_sm(this, sm_id, sm)
588 CLASS(submatrix_dissection_type), INTENT(IN) :: this
589 INTEGER, INTENT(IN) :: sm_id
590 REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(OUT) :: sm
591
592 INTEGER :: sm_dim, i, j, k, offset_x1, offset_x2, offset_y1, &
593 offset_y2, k_limit_left, k_limit_right
594 TYPE(set_type) :: nonzero_rows
595
596 IF (.NOT. this%initialized) THEN
597 cpabort("Submatrix dissection not initialized")
598 END IF
599
600 IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
601 cpabort("This rank is not supposed to construct this submatrix")
602 END IF
603
604 ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
605 CALL nonzero_rows%reset
606 DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
607 DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
608 CALL nonzero_rows%insert(this%coo_rows(j))
609 END DO
610 END DO
611 sm_dim = 0
612 DO i = 1, nonzero_rows%elements
613 sm_dim = sm_dim + this%col_blk_size(nonzero_rows%get(i))
614 END DO
615
616 ALLOCATE (sm(sm_dim, sm_dim))
617 sm = 0
618
619 offset_x1 = 0
620 DO j = 1, nonzero_rows%elements
621 offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
622 offset_y1 = 0
623 k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
624 k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
625 DO i = 1, nonzero_rows%elements
626 offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
627 ! Copy block nonzero_rows(i),nonzero_rows(j) to sm(i,j) (if the block actually exists)
628 k = k_limit_left
629 DO WHILE (k .LE. k_limit_right)
630 IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
631 k = k + 1
632 END DO
633 k_limit_left = k
634 IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
635 sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2) = reshape(this%coo_dptr(k)%target, &
636 (/offset_y2 - offset_y1, offset_x2 - offset_x1/))
637 END IF
638 offset_y1 = offset_y2
639 END DO
640 offset_x1 = offset_x2
641 END DO
642
643 ! Free memory allocated in nonzero_rows
644 CALL nonzero_rows%reset
645
646 END SUBROUTINE submatrix_generate_sm
647
648! **************************************************************************************************
649!> \brief determine submatrix ids that are handled by a specific rank
650!> \param this - object of class submatrix_dissection_type
651!> \param rank - rank id of interest
652!> \param sm_ids - list of submatrix ids handled by that rank
653! **************************************************************************************************
654 SUBROUTINE submatrix_get_sm_ids_for_rank(this, rank, sm_ids)
655 CLASS(submatrix_dissection_type), INTENT(IN) :: this
656 INTEGER, INTENT(IN) :: rank
657 INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(OUT) :: sm_ids
658 INTEGER :: count, i
659
660 IF (.NOT. this%initialized) THEN
661 cpabort("Submatrix dissection not initialized")
662 END IF
663
664 count = 0
665 DO i = 1, this%number_of_submatrices
666 IF (this%submatrix_owners(i) .EQ. rank) count = count + 1
667 END DO
668
669 ALLOCATE (sm_ids(count))
670
671 count = 0
672 DO i = 1, this%number_of_submatrices
673 IF (this%submatrix_owners(i) .EQ. rank) THEN
674 count = count + 1
675 sm_ids(count) = i
676 END IF
677 END DO
678
679 END SUBROUTINE submatrix_get_sm_ids_for_rank
680
681! **************************************************************************************************
682!> \brief copy result columns from a submatrix into result buffer
683!> \param this - object of class submatrix_dissection_type
684!> \param sm_id - id of the submatrix
685!> \param sm - result-submatrix
686! **************************************************************************************************
687 SUBROUTINE submatrix_cpy_resultcol(this, sm_id, sm)
688 CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
689 INTEGER, INTENT(in) :: sm_id
690 REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(IN) :: sm
691
692 TYPE(set_type) :: nonzero_rows
693 INTEGER :: i, j, k, m, sm_col_offset, offset_x1, offset_x2, offset_y1, &
694 offset_y2, bufsize, bufsize_next, k_limit_left, k_limit_right
695 INTEGER, DIMENSION(:), ALLOCATABLE :: buf_offset_idxs
696
697 IF (.NOT. this%initialized) THEN
698 cpabort("Submatrix dissection is not initizalized")
699 END IF
700
701 IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
702 cpabort("This rank is not supposed to operate on this submatrix")
703 END IF
704
705 ALLOCATE (buf_offset_idxs(0:this%numnodes - 1))
706 buf_offset_idxs = 1
707
708 ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
709 sm_col_offset = 0
710 DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
711 DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
712 CALL nonzero_rows%insert(this%coo_rows(j))
713 END DO
714 END DO
715
716 sm_col_offset = 0
717 DO i = 1, nonzero_rows%elements
718 IF (nonzero_rows%get(i) .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
719 ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
720 sm_col_offset = i
721 EXIT
722 END IF
723 END DO
724 IF (sm_col_offset .EQ. 0) THEN
725 cpabort("Could not determine relevant result columns of submatrix")
726 END IF
727
728 offset_x1 = 0
729 DO j = 1, nonzero_rows%elements
730 offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
731 ! We only want to copy the blocks from the result columns
732 IF ((j .GE. sm_col_offset) .AND. (j .LT. sm_col_offset + this%cols_per_sm)) THEN
733 offset_y1 = 0
734 k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
735 k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
736 DO i = 1, nonzero_rows%elements
737 offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
738 ! Check if sm(i,j), i.e., (nonzero_rows(i),nonzero_rows(j)) exists in the original matrix and if so, copy it into the
739 ! correct send buffer.
740 k = k_limit_left
741 DO WHILE (k .LE. k_limit_right)
742 IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
743 k = k + 1
744 END DO
745 k_limit_left = k
746 IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
747 CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=this%coo_cols(k), &
748 processor=m)
749 DO WHILE (buf_offset_idxs(m) .LE. this%result_blocks_for_rank(m)%elements)
750 IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .GE. k) EXIT
751 buf_offset_idxs(m) = buf_offset_idxs(m) + 1
752 END DO
753 IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .NE. k) THEN
754 cpabort("Could not determine buffer offset for block")
755 END IF
756 bufsize = this%result_blocks_for_rank_buf_offsets(m)%data(buf_offset_idxs(m))
757 bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
758 this%result_sendbufs(m)%data(bufsize + 1:bufsize_next) = reshape( &
759 sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2), &
760 (/bufsize_next - bufsize/))
761 END IF
762 offset_y1 = offset_y2
763 END DO
764 END IF
765 offset_x1 = offset_x2
766 END DO
767
768 DEALLOCATE (buf_offset_idxs)
769
770 ! Free memory in set
771 CALL nonzero_rows%reset
772
773 END SUBROUTINE submatrix_cpy_resultcol
774
775! **************************************************************************************************
776!> \brief finalize results back into a dbcsr matrix
777!> \param this - object of class submatrix_dissection_type
778!> \param resultmat - result dbcsr matrix
779!> \par History
780!> 2020.02 created [Michael Lass]
781!> 2020.05 add time measurements [Michael Lass]
782! **************************************************************************************************
783 SUBROUTINE submatrix_communicate_results(this, resultmat)
784 CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
785 TYPE(dbcsr_type), INTENT(INOUT) :: resultmat
786
787 INTEGER :: i, j, k, cur_row, cur_col, cur_sm, cur_buf, last_buf, &
788 bufsize, bufsize_next
789 TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: result_recvbufs
790
791 CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_communicate_results'
792 INTEGER :: handle
793
794 CALL timeset(routinen, handle)
795
796 ALLOCATE (result_recvbufs(0:this%numnodes - 1))
797 DO i = 0, this%numnodes - 1
798 bufsize = 0
799 DO j = 1, this%result_blocks_from_rank(i)%elements
800 k = this%result_blocks_from_rank(i)%get(j)
801 bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
802 END DO
803 CALL result_recvbufs(i)%alloc(bufsize)
804 END DO
805
806 ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
807 ! and first trying to receive from our left neighbor.
808 IF (this%numnodes .GT. 1) THEN
809 DO i = 1, this%numnodes
810 k = modulo(this%myrank - i, this%numnodes) ! rank to receive from
811 CALL this%group%irecv(msgout=result_recvbufs(k)%data, source=k, request=result_recvbufs(k)%mpi_request)
812 k = modulo(this%myrank + i, this%numnodes) ! rank to send to
813 CALL this%group%isend(msgin=this%result_sendbufs(k)%data, dest=k, request=this%result_sendbufs(k)%mpi_request)
814 END DO
815 DO i = 0, this%numnodes - 1
816 CALL this%result_sendbufs(i)%mpi_request%wait()
817 CALL result_recvbufs(i)%mpi_request%wait()
818 END DO
819 ELSE
820 result_recvbufs(0)%data = this%result_sendbufs(0)%data
821 END IF
822
823 last_buf = -1
824 bufsize = 0
825 DO i = 1, this%local_blocks
826 cur_col = this%coo_cols_local(i)
827 cur_row = this%coo_rows_local(i)
828 cur_sm = (cur_col - 1)/this%cols_per_sm + 1
829 cur_buf = this%submatrix_owners(cur_sm)
830 IF (cur_buf .GT. last_buf) bufsize = 0
831 bufsize_next = bufsize + this%col_blk_size(cur_row)*this%col_blk_size(cur_col)
832 CALL dbcsr_put_block(matrix=resultmat, row=cur_row, col=cur_col, &
833 block=result_recvbufs(cur_buf)%data(bufsize + 1:bufsize_next))
834 bufsize = bufsize_next
835 last_buf = cur_buf
836 END DO
837
838 DO i = 0, this%numnodes - 1
839 CALL result_recvbufs(i)%dealloc
840 END DO
841 DEALLOCATE (result_recvbufs)
842
843 CALL dbcsr_finalize(resultmat)
844
845 CALL timestop(handle)
846 END SUBROUTINE submatrix_communicate_results
847
848! **************************************************************************************************
849!> \brief sort two integer arrays using quicksort, using the second list as second-level sorting criterion
850!> \param arr_a - first input array
851!> \param arr_b - second input array
852!> \param left - left boundary of region to be sorted
853!> \param right - right boundary of region to be sorted
854! **************************************************************************************************
855 RECURSIVE PURE SUBROUTINE qsort_two(arr_a, arr_b, left, right)
856
857 INTEGER, DIMENSION(:), INTENT(inout) :: arr_a, arr_b
858 INTEGER, INTENT(in) :: left, right
859
860 INTEGER :: i, j, pivot_a, pivot_b, tmp
861
862 IF (right - left .LT. 1) RETURN
863
864 i = left
865 j = right - 1
866 pivot_a = arr_a(right)
867 pivot_b = arr_b(right)
868
869 DO
870 DO WHILE ((arr_a(i) .LT. pivot_a) .OR. ((arr_a(i) .EQ. pivot_a) .AND. (arr_b(i) .LT. pivot_b)))
871 i = i + 1
872 END DO
873 DO WHILE ((j .GT. i) .AND. ((arr_a(j) .GT. pivot_a) .OR. ((arr_a(j) .EQ. pivot_a) .AND. (arr_b(j) .GE. pivot_b))))
874 j = j - 1
875 END DO
876 IF (i .LT. j) THEN
877 tmp = arr_a(i)
878 arr_a(i) = arr_a(j)
879 arr_a(j) = tmp
880 tmp = arr_b(i)
881 arr_b(i) = arr_b(j)
882 arr_b(j) = tmp
883 ELSE
884 EXIT
885 END IF
886 END DO
887
888 IF ((arr_a(i) .GT. pivot_a) .OR. (arr_a(i) .EQ. pivot_a .AND. arr_b(i) .GT. pivot_b)) THEN
889 tmp = arr_a(i)
890 arr_a(i) = arr_a(right)
891 arr_a(right) = tmp
892 tmp = arr_b(i)
893 arr_b(i) = arr_b(right)
894 arr_b(right) = tmp
895 END IF
896
897 IF (i - 1 .GT. left) CALL qsort_two(arr_a, arr_b, left, i - 1)
898 IF (i + 1 .LT. right) CALL qsort_two(arr_a, arr_b, i + 1, right)
899
900 END SUBROUTINE qsort_two
901
902END MODULE submatrix_dissection
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
collects all references to literature in CP2K as new algorithms / method are included from literature...
integer, save, public lass2018
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public dp
Definition kinds.F:34
Interface to the message passing library MPI.
subroutine submatrix_get_relevant_sm_columns(this, sm_id, first, last)
determine which columns of the submatrix are relevant for the result matrix