(git:3bf5ae0)
Loading...
Searching...
No Matches
dbt_tas_reshape_ops.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2026 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief communication routines to reshape / replicate / merge tall-and-skinny matrices.
10!> \author Patrick Seewald
11! **************************************************************************************************
13 USE omp_lib, ONLY: omp_destroy_lock,&
14 omp_get_max_threads,&
15 omp_get_num_threads,&
16 omp_get_thread_num,&
17 omp_init_lock,&
18 omp_lock_kind,&
19 omp_set_lock,&
20 omp_unset_lock
21 USE dbm_api, ONLY: &
27 USE dbt_tas_base, ONLY: &
38 USE dbt_tas_split, ONLY: colsplit,&
45 USE dbt_tas_util, ONLY: swap
46 USE kinds, ONLY: dp,&
47 int_8
48 USE message_passing, ONLY: mp_cart_type,&
52#include "../../base/base_uses.f90"
53
54 IMPLICIT NONE
55 PRIVATE
56
57 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbt_tas_reshape_ops'
58
59 PUBLIC :: &
63
64 TYPE dbt_buffer_type
65 INTEGER :: nblock = -1
66 INTEGER(KIND=int_8), DIMENSION(:, :), ALLOCATABLE :: indx
67 REAL(dp), DIMENSION(:), ALLOCATABLE :: msg
68 INTEGER :: endpos = -1
69 END TYPE dbt_buffer_type
70
71CONTAINS
72
73! **************************************************************************************************
74!> \brief copy data (involves reshape)
75!> \param matrix_in ...
76!> \param matrix_out ...
77!> \param summation whether matrix_out = matrix_out + matrix_in
78!> \param transposed ...
79!> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
80!> \author Patrick Seewald
81! **************************************************************************************************
82 RECURSIVE SUBROUTINE dbt_tas_reshape(matrix_in, matrix_out, summation, transposed, move_data)
83 TYPE(dbt_tas_type), INTENT(INOUT) :: matrix_in, matrix_out
84 LOGICAL, INTENT(IN), OPTIONAL :: summation, transposed, move_data
85
86 CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_tas_reshape'
87
88 INTEGER :: a, b, bcount, handle, handle2, iproc, &
89 nblk, nblk_per_thread, ndata, numnodes
90 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :) :: blks_to_allocate, index_recv
91 INTEGER(KIND=int_8), DIMENSION(2) :: blk_index
92 INTEGER(kind=omp_lock_kind), ALLOCATABLE, &
93 DIMENSION(:) :: locks
94 INTEGER, ALLOCATABLE, DIMENSION(:) :: num_blocks_recv, num_blocks_send, &
95 num_entries_recv, num_entries_send, &
96 num_rec, num_send
97 INTEGER, DIMENSION(2) :: blk_size
98 LOGICAL :: move_prv, tr_in
99 REAL(kind=dp), DIMENSION(:, :), POINTER :: block
100 TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:) :: buffer_recv, buffer_send
101 TYPE(dbt_tas_iterator) :: iter
102 TYPE(dbt_tas_split_info) :: info
103 TYPE(mp_comm_type) :: mp_comm
104 TYPE(mp_request_type), ALLOCATABLE, &
105 DIMENSION(:, :) :: req_array
106
107 CALL timeset(routinen, handle)
108
109 IF (PRESENT(summation)) THEN
110 IF (.NOT. summation) CALL dbm_clear(matrix_out%matrix)
111 ELSE
112 CALL dbm_clear(matrix_out%matrix)
113 END IF
114
115 IF (PRESENT(move_data)) THEN
116 move_prv = move_data
117 ELSE
118 move_prv = .false.
119 END IF
120
121 IF (PRESENT(transposed)) THEN
122 tr_in = transposed
123 ELSE
124 tr_in = .false.
125 END IF
126
127 IF (.NOT. matrix_out%valid) THEN
128 cpabort("can not reshape into invalid matrix")
129 END IF
130
131 info = dbt_tas_info(matrix_in)
132 mp_comm = info%mp_comm
133 numnodes = mp_comm%num_pe
134 ALLOCATE (buffer_send(0:numnodes - 1))
135 ALLOCATE (buffer_recv(0:numnodes - 1))
136 ALLOCATE (num_blocks_recv(0:numnodes - 1))
137 ALLOCATE (num_blocks_send(0:numnodes - 1))
138 ALLOCATE (num_entries_recv(0:numnodes - 1))
139 ALLOCATE (num_entries_send(0:numnodes - 1))
140 ALLOCATE (num_rec(0:2*numnodes - 1))
141 ALLOCATE (num_send(0:2*numnodes - 1), source=0)
142 ALLOCATE (req_array(1:numnodes, 4))
143 ALLOCATE (locks(0:numnodes - 1))
144 DO iproc = 0, numnodes - 1
145 CALL omp_init_lock(locks(iproc))
146 END DO
147
148 CALL timeset(routinen//"_get_coord", handle2)
149!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,tr_in,num_send,locks) &
150!$OMP PRIVATE(iter,blk_index,blk_size,iproc)
151 CALL dbt_tas_iterator_start(iter, matrix_in)
152 DO WHILE (dbt_tas_iterator_blocks_left(iter))
153 CALL dbt_tas_iterator_next_block(iter, blk_index(1), blk_index(2), &
154 row_size=blk_size(1), col_size=blk_size(2))
155 IF (tr_in) THEN
156 CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(2), blk_index(1), iproc)
157 ELSE
158 CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
159 END IF
160 CALL omp_set_lock(locks(iproc))
161 num_send(2*iproc) = num_send(2*iproc) + product(blk_size)
162 num_send(2*iproc + 1) = num_send(2*iproc + 1) + 1
163 CALL omp_unset_lock(locks(iproc))
164 END DO
165 CALL dbt_tas_iterator_stop(iter)
166!$OMP END PARALLEL
167 CALL timestop(handle2)
168
169 CALL timeset(routinen//"_alltoall", handle2)
170 CALL mp_comm%alltoall(num_send, num_rec, 2)
171 CALL timestop(handle2)
172
173 CALL timeset(routinen//"_buffer_fill", handle2)
174 DO iproc = 0, numnodes - 1
175 num_entries_recv(iproc) = num_rec(2*iproc)
176 num_blocks_recv(iproc) = num_rec(2*iproc + 1)
177 num_entries_send(iproc) = num_send(2*iproc)
178 num_blocks_send(iproc) = num_send(2*iproc + 1)
179
180 CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
181 CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
182 END DO
183
184!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,tr_in,buffer_send,locks) &
185!$OMP PRIVATE(iter,blk_index,blk_size,block,iproc)
186 CALL dbt_tas_iterator_start(iter, matrix_in)
187 DO WHILE (dbt_tas_iterator_blocks_left(iter))
188 CALL dbt_tas_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
189 row_size=blk_size(1), col_size=blk_size(2))
190 IF (tr_in) THEN
191 CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(2), blk_index(1), iproc)
192 ELSE
193 CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
194 END IF
195 CALL omp_set_lock(locks(iproc))
196 CALL dbt_buffer_add_block(buffer_send(iproc), blk_index, block, transposed=tr_in)
197 CALL omp_unset_lock(locks(iproc))
198 END DO
199 CALL dbt_tas_iterator_stop(iter)
200!$OMP END PARALLEL
201
202 IF (move_prv) CALL dbt_tas_clear(matrix_in)
203 CALL timestop(handle2)
204
205 CALL timeset(routinen//"_communicate_buffer", handle2)
206 CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
207
208 DO iproc = 0, numnodes - 1
209 CALL dbt_buffer_destroy(buffer_send(iproc))
210 CALL omp_destroy_lock(locks(iproc))
211 END DO
212 DEALLOCATE (locks)
213
214 CALL timestop(handle2)
215
216 CALL timeset(routinen//"_buffer_obtain", handle2)
217
218 ! Parallel unpack of received blocks.
219 nblk = sum(num_blocks_recv)
220 ALLOCATE (blks_to_allocate(nblk, 2))
221
222 bcount = 0
223 DO iproc = 0, numnodes - 1
224 CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
225 blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = index_recv(:, :)
226 bcount = bcount + SIZE(index_recv, 1)
227 DEALLOCATE (index_recv)
228 END DO
229
230!TODO: Parallelize creation of block list.
231!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
232 nblk_per_thread = nblk/omp_get_num_threads() + 1
233 a = omp_get_thread_num()*nblk_per_thread + 1
234 b = min(a + nblk_per_thread, nblk)
235 CALL dbt_tas_reserve_blocks(matrix_out, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
236!$OMP END PARALLEL
237 DEALLOCATE (blks_to_allocate)
238
239!$OMP PARALLEL DEFAULT(NONE) SHARED(buffer_recv,matrix_out,numnodes,summation) &
240!$OMP PRIVATE(iproc,ndata,blk_index,blk_size,block)
241!$OMP DO SCHEDULE(DYNAMIC)
242 DO iproc = 0, numnodes - 1
243 ! First, we need to get the index to create block
244 DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
245 CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index)
246 CALL dbt_tas_blk_sizes(matrix_out, blk_index(1), blk_index(2), blk_size(1), blk_size(2))
247 ALLOCATE (block(blk_size(1), blk_size(2)))
248 CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index, block)
249 CALL dbt_tas_put_block(matrix_out, blk_index(1), blk_index(2), block, summation=summation)
250 DEALLOCATE (block)
251 END DO
252 CALL dbt_buffer_destroy(buffer_recv(iproc))
253 END DO
254!$OMP END DO
255!$OMP END PARALLEL
256
257 CALL timestop(handle2)
258
259 CALL dbt_tas_finalize(matrix_out)
260
261 CALL timestop(handle)
262 END SUBROUTINE dbt_tas_reshape
263
264! **************************************************************************************************
265!> \brief Replicate matrix_in such that each submatrix of matrix_out is an exact copy of matrix_in
266!> \param matrix_in ...
267!> \param info ...
268!> \param matrix_out ...
269!> \param nodata Don't copy data but create matrix_out
270!> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
271!> \author Patrick Seewald
272! **************************************************************************************************
273 SUBROUTINE dbt_tas_replicate(matrix_in, info, matrix_out, nodata, move_data)
274 TYPE(dbm_type), INTENT(INOUT) :: matrix_in
275 TYPE(dbt_tas_split_info), INTENT(IN) :: info
276 TYPE(dbt_tas_type), INTENT(OUT) :: matrix_out
277 LOGICAL, INTENT(IN), OPTIONAL :: nodata, move_data
278
279 INTEGER :: a, b, nblk_per_thread, nblkcols, nblkrows
280 INTEGER, DIMENSION(2) :: pdims
281 INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
282 row_dist
283 TYPE(dbm_distribution_obj) :: dbm_dist
284 TYPE(dbt_tas_dist_arb), TARGET :: dir_dist
285 TYPE(dbt_tas_dist_repl), TARGET :: repl_dist
286
287 CLASS(dbt_tas_distribution), ALLOCATABLE :: col_dist_obj, row_dist_obj
288 CLASS(dbt_tas_rowcol_data), ALLOCATABLE :: row_bsize_obj, col_bsize_obj
289 TYPE(dbt_tas_blk_size_repl), TARGET :: repl_blksize
290 TYPE(dbt_tas_blk_size_arb), TARGET :: dir_blksize
291 TYPE(dbt_tas_distribution_type) :: dist
292 INTEGER :: numnodes, ngroup, max_threads, cache_idx
293 INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
294 TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:) :: buffer_recv, buffer_send
295 INTEGER, ALLOCATABLE, DIMENSION(:) :: num_blocks_recv, num_blocks_send, &
296 num_entries_recv, num_entries_send, &
297 num_rec, num_send
298 TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:, :) :: req_array
299 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: blks_to_allocate
300 INTEGER, DIMENSION(2) :: blk_size
301 INTEGER, DIMENSION(2) :: blk_index
302 INTEGER(KIND=int_8), DIMENSION(2) :: blk_index_i8
303 TYPE(dbm_iterator) :: iter
304 INTEGER :: i, iproc, bcount, nblk
305 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: iprocs
306 LOGICAL :: nodata_prv, move_prv
307 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :) :: index_recv
308 INTEGER :: ndata
309 TYPE(mp_cart_type) :: mp_comm
310
311 REAL(kind=dp), DIMENSION(:, :), POINTER :: block
312
313 CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_tas_replicate'
314
315 INTEGER :: handle, handle2
316
317 NULLIFY (col_blk_size, row_blk_size)
318
319 CALL timeset(routinen, handle)
320
321 IF (PRESENT(nodata)) THEN
322 nodata_prv = nodata
323 ELSE
324 nodata_prv = .false.
325 END IF
326
327 IF (PRESENT(move_data)) THEN
328 move_prv = move_data
329 ELSE
330 move_prv = .false.
331 END IF
332
333 row_blk_size => dbm_get_row_block_sizes(matrix_in)
334 col_blk_size => dbm_get_col_block_sizes(matrix_in)
335 nblkrows = SIZE(row_blk_size)
336 nblkcols = SIZE(col_blk_size)
337 dbm_dist = dbm_get_distribution(matrix_in)
338 row_dist => dbm_distribution_row_dist(dbm_dist)
339 col_dist => dbm_distribution_col_dist(dbm_dist)
340
341 mp_comm = info%mp_comm
342 ngroup = info%ngroup
343
344 numnodes = mp_comm%num_pe
345 pdims = mp_comm%num_pe_cart
346
347 SELECT CASE (info%split_rowcol)
348 CASE (rowsplit)
349 repl_dist = dbt_tas_dist_repl(row_dist, pdims(1), nblkrows, info%ngroup, info%pgrid_split_size)
350 dir_dist = dbt_tas_dist_arb(col_dist, pdims(2), int(nblkcols, kind=int_8))
351 repl_blksize = dbt_tas_blk_size_repl(row_blk_size, info%ngroup)
352 dir_blksize = dbt_tas_blk_size_arb(col_blk_size)
353 ALLOCATE (row_dist_obj, source=repl_dist)
354 ALLOCATE (col_dist_obj, source=dir_dist)
355 ALLOCATE (row_bsize_obj, source=repl_blksize)
356 ALLOCATE (col_bsize_obj, source=dir_blksize)
357 CASE (colsplit)
358 dir_dist = dbt_tas_dist_arb(row_dist, pdims(1), int(nblkrows, kind=int_8))
359 repl_dist = dbt_tas_dist_repl(col_dist, pdims(2), nblkcols, info%ngroup, info%pgrid_split_size)
360 dir_blksize = dbt_tas_blk_size_arb(row_blk_size)
361 repl_blksize = dbt_tas_blk_size_repl(col_blk_size, info%ngroup)
362 ALLOCATE (row_dist_obj, source=dir_dist)
363 ALLOCATE (col_dist_obj, source=repl_dist)
364 ALLOCATE (row_bsize_obj, source=dir_blksize)
365 ALLOCATE (col_bsize_obj, source=repl_blksize)
366 END SELECT
367
368 CALL dbt_tas_distribution_new(dist, mp_comm, row_dist_obj, col_dist_obj, split_info=info)
369 CALL dbt_tas_create(matrix_out, trim(dbm_get_name(matrix_in))//" replicated", &
370 dist, row_bsize_obj, col_bsize_obj, own_dist=.true.)
371
372 IF (nodata_prv) THEN
373 CALL dbt_tas_finalize(matrix_out)
374 CALL timestop(handle)
375 RETURN
376 END IF
377
378 ALLOCATE (buffer_send(0:numnodes - 1))
379 ALLOCATE (buffer_recv(0:numnodes - 1))
380 ALLOCATE (num_blocks_recv(0:numnodes - 1))
381 ALLOCATE (num_blocks_send(0:numnodes - 1))
382 ALLOCATE (num_entries_recv(0:numnodes - 1))
383 ALLOCATE (num_entries_send(0:numnodes - 1))
384 ALLOCATE (num_rec(0:2*numnodes - 1))
385 ALLOCATE (num_send(0:2*numnodes - 1), source=0)
386 ALLOCATE (req_array(1:numnodes, 4))
387 ALLOCATE (locks(0:numnodes - 1))
388 max_threads = 1
389!$ max_threads = omp_get_max_threads()
390 ALLOCATE (iprocs(ngroup, max_threads))
391 DO iproc = 0, numnodes - 1
392 CALL omp_init_lock(locks(iproc))
393 END DO
394
395!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,num_send,ngroup,iprocs,locks) &
396!$OMP PRIVATE(iter,blk_index,blk_size,cache_idx,i,iproc)
397 cache_idx = omp_get_thread_num() + 1
398 CALL dbm_iterator_start(iter, matrix_in)
399 DO WHILE (dbm_iterator_blocks_left(iter))
400 CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), &
401 row_size=blk_size(1), col_size=blk_size(2))
402 CALL dbt_repl_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), &
403 iprocs(:, cache_idx))
404 DO i = 1, ngroup
405 iproc = iprocs(i, cache_idx)
406 CALL omp_set_lock(locks(iproc))
407 num_send(2*iproc) = num_send(2*iproc) + product(blk_size)
408 num_send(2*iproc + 1) = num_send(2*iproc + 1) + 1
409 CALL omp_unset_lock(locks(iproc))
410 END DO
411 END DO
412 CALL dbm_iterator_stop(iter)
413!$OMP END PARALLEL
414
415 CALL timeset(routinen//"_alltoall", handle2)
416 CALL mp_comm%alltoall(num_send, num_rec, 2)
417 CALL timestop(handle2)
418
419 DO iproc = 0, numnodes - 1
420 num_entries_recv(iproc) = num_rec(2*iproc)
421 num_blocks_recv(iproc) = num_rec(2*iproc + 1)
422 num_entries_send(iproc) = num_send(2*iproc)
423 num_blocks_send(iproc) = num_send(2*iproc + 1)
424
425 CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
426 CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
427 END DO
428
429!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,buffer_send,locks,ngroup,iprocs) &
430!$OMP PRIVATE(iter,blk_index,blk_size,block,cache_idx,i,iproc)
431 cache_idx = omp_get_thread_num() + 1
432 CALL dbm_iterator_start(iter, matrix_in)
433 DO WHILE (dbm_iterator_blocks_left(iter))
434 CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
435 row_size=blk_size(1), col_size=blk_size(2))
436 CALL dbt_repl_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), &
437 iprocs(:, cache_idx))
438 DO i = 1, ngroup
439 iproc = iprocs(i, cache_idx)
440 CALL omp_set_lock(locks(iproc))
441 CALL dbt_buffer_add_block(buffer_send(iproc), int(blk_index, kind=int_8), block)
442 CALL omp_unset_lock(locks(iproc))
443 END DO
444 END DO
445 CALL dbm_iterator_stop(iter)
446!$OMP END PARALLEL
447
448 DEALLOCATE (iprocs)
449
450 IF (move_prv) CALL dbm_clear(matrix_in)
451
452 CALL timeset(routinen//"_communicate_buffer", handle2)
453 CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
454
455 DO iproc = 0, numnodes - 1
456 CALL dbt_buffer_destroy(buffer_send(iproc))
457 CALL omp_destroy_lock(locks(iproc))
458 END DO
459 DEALLOCATE (locks)
460
461 CALL timestop(handle2)
462
463 ! Parallel unpack of received blocks.
464 nblk = sum(num_blocks_recv)
465 ALLOCATE (blks_to_allocate(nblk, 2))
466
467 bcount = 0
468 DO iproc = 0, numnodes - 1
469 CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
470 blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = int(index_recv(:, :))
471 bcount = bcount + SIZE(index_recv, 1)
472 DEALLOCATE (index_recv)
473 END DO
474
475!TODO: Parallelize creation of block list.
476!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
477 nblk_per_thread = nblk/omp_get_num_threads() + 1
478 a = omp_get_thread_num()*nblk_per_thread + 1
479 b = min(a + nblk_per_thread, nblk)
480 CALL dbm_reserve_blocks(matrix_out%matrix, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
481!$OMP END PARALLEL
482 DEALLOCATE (blks_to_allocate)
483
484!$OMP PARALLEL DEFAULT(NONE) SHARED(buffer_recv,matrix_out,numnodes) &
485!$OMP PRIVATE(iproc,ndata,blk_index_i8,blk_size,block)
486!$OMP DO SCHEDULE(DYNAMIC)
487 DO iproc = 0, numnodes - 1
488 ! First, we need to get the index to create block
489 DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
490 CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8)
491 CALL dbt_tas_blk_sizes(matrix_out, blk_index_i8(1), blk_index_i8(2), blk_size(1), blk_size(2))
492 ALLOCATE (block(blk_size(1), blk_size(2)))
493 CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8, block)
494 CALL dbm_put_block(matrix_out%matrix, int(blk_index_i8(1)), int(blk_index_i8(2)), block)
495 DEALLOCATE (block)
496 END DO
497
498 CALL dbt_buffer_destroy(buffer_recv(iproc))
499 END DO
500!$OMP END DO
501!$OMP END PARALLEL
502
503 CALL dbt_tas_finalize(matrix_out)
504
505 CALL timestop(handle)
506
507 END SUBROUTINE dbt_tas_replicate
508
509! **************************************************************************************************
510!> \brief Merge submatrices of matrix_in to matrix_out by sum
511!> \param matrix_out ...
512!> \param matrix_in ...
513!> \param summation ...
514!> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
515!> \author Patrick Seewald
516! **************************************************************************************************
517 SUBROUTINE dbt_tas_merge(matrix_out, matrix_in, summation, move_data)
518 TYPE(dbm_type), INTENT(INOUT) :: matrix_out
519 TYPE(dbt_tas_type), INTENT(INOUT) :: matrix_in
520 LOGICAL, INTENT(IN), OPTIONAL :: summation, move_data
521
522 CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_tas_merge'
523
524 INTEGER :: a, b, bcount, handle, handle2, iproc, &
525 nblk, nblk_per_thread, ndata, numnodes
526 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :) :: index_recv
527 INTEGER(KIND=int_8), DIMENSION(2) :: blk_index_i8
528 INTEGER(kind=omp_lock_kind), ALLOCATABLE, &
529 DIMENSION(:) :: locks
530 INTEGER, ALLOCATABLE, DIMENSION(:) :: num_blocks_recv, num_blocks_send, &
531 num_entries_recv, num_entries_send, &
532 num_rec, num_send
533 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: blks_to_allocate
534 INTEGER, DIMENSION(2) :: blk_index, blk_size
535 INTEGER, DIMENSION(:), POINTER :: col_block_sizes, row_block_sizes
536 LOGICAL :: move_prv
537 REAL(dp), DIMENSION(:, :), POINTER :: block
538 TYPE(dbm_iterator) :: iter
539 TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:) :: buffer_recv, buffer_send
540 TYPE(dbt_tas_split_info) :: info
541 TYPE(mp_cart_type) :: mp_comm
542 TYPE(mp_request_type), ALLOCATABLE, &
543 DIMENSION(:, :) :: req_array
544
545 CALL timeset(routinen, handle)
546
547 IF (PRESENT(summation)) THEN
548 IF (.NOT. summation) CALL dbm_clear(matrix_out)
549 ELSE
550 CALL dbm_clear(matrix_out)
551 END IF
552
553 IF (PRESENT(move_data)) THEN
554 move_prv = move_data
555 ELSE
556 move_prv = .false.
557 END IF
558
559 info = dbt_tas_info(matrix_in)
560 CALL dbt_tas_get_split_info(info, mp_comm=mp_comm)
561 numnodes = mp_comm%num_pe
562
563 ALLOCATE (buffer_send(0:numnodes - 1))
564 ALLOCATE (buffer_recv(0:numnodes - 1))
565 ALLOCATE (num_blocks_recv(0:numnodes - 1))
566 ALLOCATE (num_blocks_send(0:numnodes - 1))
567 ALLOCATE (num_entries_recv(0:numnodes - 1))
568 ALLOCATE (num_entries_send(0:numnodes - 1))
569 ALLOCATE (num_rec(0:2*numnodes - 1))
570 ALLOCATE (num_send(0:2*numnodes - 1), source=0)
571 ALLOCATE (req_array(1:numnodes, 4))
572 ALLOCATE (locks(0:numnodes - 1))
573 DO iproc = 0, numnodes - 1
574 CALL omp_init_lock(locks(iproc))
575 END DO
576
577!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,num_send,locks) &
578!$OMP PRIVATE(iter,blk_index,blk_size,iproc)
579 CALL dbm_iterator_start(iter, matrix_in%matrix)
580 DO WHILE (dbm_iterator_blocks_left(iter))
581 CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), &
582 row_size=blk_size(1), col_size=blk_size(2))
583 CALL dbm_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
584 CALL omp_set_lock(locks(iproc))
585 num_send(2*iproc) = num_send(2*iproc) + product(blk_size)
586 num_send(2*iproc + 1) = num_send(2*iproc + 1) + 1
587 CALL omp_unset_lock(locks(iproc))
588 END DO
589 CALL dbm_iterator_stop(iter)
590!$OMP END PARALLEL
591
592 CALL timeset(routinen//"_alltoall", handle2)
593 CALL mp_comm%alltoall(num_send, num_rec, 2)
594 CALL timestop(handle2)
595
596 DO iproc = 0, numnodes - 1
597 num_entries_recv(iproc) = num_rec(2*iproc)
598 num_blocks_recv(iproc) = num_rec(2*iproc + 1)
599 num_entries_send(iproc) = num_send(2*iproc)
600 num_blocks_send(iproc) = num_send(2*iproc + 1)
601
602 CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
603 CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
604 END DO
605
606!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,buffer_send,locks) &
607!$OMP PRIVATE(iter,blk_index,blk_size,block,iproc)
608 CALL dbm_iterator_start(iter, matrix_in%matrix)
609 DO WHILE (dbm_iterator_blocks_left(iter))
610 CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
611 row_size=blk_size(1), col_size=blk_size(2))
612 CALL dbm_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
613 CALL omp_set_lock(locks(iproc))
614 CALL dbt_buffer_add_block(buffer_send(iproc), int(blk_index, kind=int_8), block)
615 CALL omp_unset_lock(locks(iproc))
616 END DO
617 CALL dbm_iterator_stop(iter)
618!$OMP END PARALLEL
619
620 IF (move_prv) CALL dbt_tas_clear(matrix_in)
621
622 CALL timeset(routinen//"_communicate_buffer", handle2)
623 CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
624
625 DO iproc = 0, numnodes - 1
626 CALL dbt_buffer_destroy(buffer_send(iproc))
627 CALL omp_destroy_lock(locks(iproc))
628 END DO
629 DEALLOCATE (locks)
630
631 CALL timestop(handle2)
632
633 ! Parallel unpack of received blocks.
634 nblk = sum(num_blocks_recv)
635 ALLOCATE (blks_to_allocate(nblk, 2))
636
637 bcount = 0
638 DO iproc = 0, numnodes - 1
639 CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
640 blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = int(index_recv(:, :))
641 bcount = bcount + SIZE(index_recv, 1)
642 DEALLOCATE (index_recv)
643 END DO
644
645!TODO: Parallelize creation of block list.
646!$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
647 nblk_per_thread = nblk/omp_get_num_threads() + 1
648 a = omp_get_thread_num()*nblk_per_thread + 1
649 b = min(a + nblk_per_thread, nblk)
650 CALL dbm_reserve_blocks(matrix_out, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
651!$OMP END PARALLEL
652 DEALLOCATE (blks_to_allocate)
653
654 row_block_sizes => dbm_get_row_block_sizes(matrix_out)
655 col_block_sizes => dbm_get_col_block_sizes(matrix_out)
656
657!$OMP PARALLEL DEFAULT(NONE) SHARED(buffer_recv,matrix_out,numnodes,row_block_sizes,col_block_sizes) &
658!$OMP PRIVATE(iproc,ndata,blk_index_i8,blk_size,block)
659!$OMP DO SCHEDULE(DYNAMIC)
660 DO iproc = 0, numnodes - 1
661 ! First, we need to get the index to create block
662 DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
663 CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8)
664 blk_size(1) = row_block_sizes(int(blk_index_i8(1)))
665 blk_size(2) = col_block_sizes(int(blk_index_i8(2)))
666 ALLOCATE (block(blk_size(1), blk_size(2)))
667 CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8, block)
668 CALL dbm_put_block(matrix_out, int(blk_index_i8(1)), int(blk_index_i8(2)), block, summation=.true.)
669 DEALLOCATE (block)
670 END DO
671 CALL dbt_buffer_destroy(buffer_recv(iproc))
672 END DO
673!$OMP END DO
674!$OMP END PARALLEL
675
676 CALL dbm_finalize(matrix_out)
677
678 CALL timestop(handle)
679 END SUBROUTINE dbt_tas_merge
680
681! **************************************************************************************************
682!> \brief get all indices from buffer
683!> \param buffer ...
684!> \param index ...
685!> \author Patrick Seewald
686! **************************************************************************************************
687 SUBROUTINE dbt_buffer_get_index(buffer, index)
688 TYPE(dbt_buffer_type), INTENT(IN) :: buffer
689 INTEGER(KIND=int_8), ALLOCATABLE, &
690 DIMENSION(:, :), INTENT(OUT) :: index
691
692 CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_buffer_get_index'
693
694 INTEGER :: handle
695 INTEGER, DIMENSION(2) :: indx_shape
696
697 CALL timeset(routinen, handle)
698
699 indx_shape = shape(buffer%indx) - [0, 1]
700 ALLOCATE (index(indx_shape(1), indx_shape(2)))
701 index(:, :) = buffer%indx(1:indx_shape(1), 1:indx_shape(2))
702 CALL timestop(handle)
703 END SUBROUTINE dbt_buffer_get_index
704
705! **************************************************************************************************
706!> \brief how many blocks left in iterator
707!> \param buffer ...
708!> \return ...
709!> \author Patrick Seewald
710! **************************************************************************************************
711 PURE FUNCTION dbt_buffer_blocks_left(buffer)
712 TYPE(dbt_buffer_type), INTENT(IN) :: buffer
713 LOGICAL :: dbt_buffer_blocks_left
714
715 dbt_buffer_blocks_left = buffer%endpos < buffer%nblock
716 END FUNCTION dbt_buffer_blocks_left
717
718! **************************************************************************************************
719!> \brief Create block buffer for MPI communication.
720!> \param buffer block buffer
721!> \param nblock number of blocks
722!> \param ndata total number of block entries
723!> \author Patrick Seewald
724! **************************************************************************************************
725 SUBROUTINE dbt_buffer_create(buffer, nblock, ndata)
726 TYPE(dbt_buffer_type), INTENT(OUT) :: buffer
727 INTEGER, INTENT(IN) :: nblock, ndata
728
729 buffer%nblock = nblock
730 buffer%endpos = 0
731 ALLOCATE (buffer%msg(ndata))
732 ALLOCATE (buffer%indx(nblock, 3))
733 END SUBROUTINE dbt_buffer_create
734
735! **************************************************************************************************
736!> \brief ...
737!> \param buffer ...
738!> \author Patrick Seewald
739! **************************************************************************************************
740 SUBROUTINE dbt_buffer_destroy(buffer)
741 TYPE(dbt_buffer_type), INTENT(INOUT) :: buffer
742
743 DEALLOCATE (buffer%msg)
744 DEALLOCATE (buffer%indx)
745 buffer%nblock = -1
746 buffer%endpos = -1
747 END SUBROUTINE dbt_buffer_destroy
748
749! **************************************************************************************************
750!> \brief insert a block into block buffer (at current iterator position)
751!> \param buffer ...
752!> \param index index of block
753!> \param block ...
754!> \param transposed ...
755!> \author Patrick Seewald
756! **************************************************************************************************
757 SUBROUTINE dbt_buffer_add_block(buffer, index, block, transposed)
758 TYPE(dbt_buffer_type), INTENT(INOUT) :: buffer
759 INTEGER(KIND=int_8), DIMENSION(2), INTENT(IN) :: index
760 REAL(dp), DIMENSION(:, :), INTENT(IN) :: block
761 LOGICAL, INTENT(IN), OPTIONAL :: transposed
762
763 INTEGER :: ndata, p, p_data
764 INTEGER(KIND=int_8), DIMENSION(2) :: index_prv
765 LOGICAL :: tr
766
767 IF (PRESENT(transposed)) THEN
768 tr = transposed
769 ELSE
770 tr = .false.
771 END IF
772
773 index_prv(:) = index(:)
774 IF (tr) THEN
775 CALL swap(index_prv)
776 END IF
777 ndata = product(shape(block))
778
779 p = buffer%endpos
780 IF (p == 0) THEN
781 p_data = 0
782 ELSE
783 p_data = int(buffer%indx(p, 3))
784 END IF
785
786 IF (tr) THEN
787 buffer%msg(p_data + 1:p_data + ndata) = reshape(transpose(block), [ndata])
788 ELSE
789 buffer%msg(p_data + 1:p_data + ndata) = reshape(block, [ndata])
790 END IF
791
792 buffer%indx(p + 1, 1:2) = index_prv(:)
793 IF (p > 0) THEN
794 buffer%indx(p + 1, 3) = buffer%indx(p, 3) + int(ndata, kind=int_8)
795 ELSE
796 buffer%indx(p + 1, 3) = int(ndata, kind=int_8)
797 END IF
798 buffer%endpos = buffer%endpos + 1
799 END SUBROUTINE dbt_buffer_add_block
800
801! **************************************************************************************************
802!> \brief get next block from buffer. Iterator is advanced only if block is retrieved or advance_iter.
803!> \param buffer ...
804!> \param ndata ...
805!> \param index ...
806!> \param block ...
807!> \param advance_iter ...
808!> \author Patrick Seewald
809! **************************************************************************************************
810 SUBROUTINE dbt_buffer_get_next_block(buffer, ndata, index, block, advance_iter)
811 TYPE(dbt_buffer_type), INTENT(INOUT) :: buffer
812 INTEGER, INTENT(OUT) :: ndata
813 INTEGER(KIND=int_8), DIMENSION(2), INTENT(OUT) :: index
814 REAL(dp), DIMENSION(:, :), INTENT(OUT), OPTIONAL :: block
815 LOGICAL, INTENT(IN), OPTIONAL :: advance_iter
816
817 INTEGER :: p, p_data
818 LOGICAL :: do_advance
819
820 do_advance = .false.
821 IF (PRESENT(advance_iter)) THEN
822 do_advance = advance_iter
823 ELSE IF (PRESENT(block)) THEN
824 do_advance = .true.
825 END IF
826
827 p = buffer%endpos
828 IF (p == 0) THEN
829 p_data = 0
830 ELSE
831 p_data = int(buffer%indx(p, 3))
832 END IF
833
834 IF (p > 0) THEN
835 ndata = int(buffer%indx(p + 1, 3) - buffer%indx(p, 3))
836 ELSE
837 ndata = int(buffer%indx(p + 1, 3))
838 END IF
839 index(:) = buffer%indx(p + 1, 1:2)
840
841 IF (PRESENT(block)) THEN
842 block(:, :) = reshape(buffer%msg(p_data + 1:p_data + ndata), shape(block))
843 END IF
844
845 IF (do_advance) buffer%endpos = buffer%endpos + 1
846 END SUBROUTINE dbt_buffer_get_next_block
847
848! **************************************************************************************************
849!> \brief communicate buffer
850!> \param mp_comm ...
851!> \param buffer_recv ...
852!> \param buffer_send ...
853!> \param req_array ...
854!> \author Patrick Seewald
855! **************************************************************************************************
856 SUBROUTINE dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
857 CLASS(mp_comm_type), INTENT(IN) :: mp_comm
858 TYPE(dbt_buffer_type), DIMENSION(0:), &
859 INTENT(INOUT) :: buffer_recv, buffer_send
860 TYPE(mp_request_type), DIMENSION(:, :), &
861 INTENT(OUT) :: req_array
862
863 CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_tas_communicate_buffer'
864
865 INTEGER :: handle, iproc, numnodes, &
866 rec_counter, send_counter
867
868 CALL timeset(routinen, handle)
869 numnodes = mp_comm%num_pe
870
871 IF (numnodes > 1) THEN
872
873 send_counter = 0
874 rec_counter = 0
875
876 DO iproc = 0, numnodes - 1
877 IF (buffer_recv(iproc)%nblock > 0) THEN
878 rec_counter = rec_counter + 1
879 CALL mp_comm%irecv(buffer_recv(iproc)%indx, iproc, req_array(rec_counter, 3), tag=4)
880 CALL mp_comm%irecv(buffer_recv(iproc)%msg, iproc, req_array(rec_counter, 4), tag=7)
881 END IF
882 END DO
883
884 DO iproc = 0, numnodes - 1
885 IF (buffer_send(iproc)%nblock > 0) THEN
886 send_counter = send_counter + 1
887 CALL mp_comm%isend(buffer_send(iproc)%indx, iproc, req_array(send_counter, 1), tag=4)
888 CALL mp_comm%isend(buffer_send(iproc)%msg, iproc, req_array(send_counter, 2), tag=7)
889 END IF
890 END DO
891
892 IF (send_counter > 0) THEN
893 CALL mp_waitall(req_array(1:send_counter, 1:2))
894 END IF
895 IF (rec_counter > 0) THEN
896 CALL mp_waitall(req_array(1:rec_counter, 3:4))
897 END IF
898
899 ELSE
900 IF (buffer_recv(0)%nblock > 0) THEN
901 buffer_recv(0)%indx(:, :) = buffer_send(0)%indx(:, :)
902 buffer_recv(0)%msg(:) = buffer_send(0)%msg(:)
903 END IF
904 END IF
905 CALL timestop(handle)
906 END SUBROUTINE dbt_tas_communicate_buffer
907
908END MODULE dbt_tas_reshape_ops
subroutine, public dbm_clear(matrix)
Remove all blocks from given matrix, but does not release the underlying memory.
Definition dbm_api.F:528
subroutine, public dbm_iterator_next_block(iterator, row, column, block, row_size, col_size)
Returns the next block from the given iterator.
Definition dbm_api.F:909
subroutine, public dbm_get_stored_coordinates(matrix, row, column, processor)
Returns the MPI rank on which the given block should be stored.
Definition dbm_api.F:1232
type(dbm_distribution_obj) function, public dbm_get_distribution(matrix)
Returns the distribution of the given matrix.
Definition dbm_api.F:1265
integer function, dimension(:), pointer, contiguous, public dbm_get_row_block_sizes(matrix)
Returns the row block sizes of the given matrix.
Definition dbm_api.F:1102
logical function, public dbm_iterator_blocks_left(iterator)
Tests whether the given iterator has any block left.
Definition dbm_api.F:883
integer function, dimension(:), pointer, contiguous, public dbm_distribution_col_dist(dist)
Returns the columns of the given distribution.
Definition dbm_api.F:1454
subroutine, public dbm_reserve_blocks(matrix, rows, cols)
Adds given list of blocks efficiently. The blocks will be filled with zeros.
Definition dbm_api.F:588
subroutine, public dbm_iterator_stop(iterator)
Releases the given iterator.
Definition dbm_api.F:947
subroutine, public dbm_put_block(matrix, row, col, block, summation)
Adds a block to given matrix. This routine is thread-safe. If block already exist then it gets overwr...
Definition dbm_api.F:490
character(len=default_string_length) function, public dbm_get_name(matrix)
Returns the name of the matrix of the given matrix.
Definition dbm_api.F:1022
subroutine, public dbm_finalize(matrix)
Needed to be called for DBCSR after blocks where inserted. For DBM it's a no-opt.
Definition dbm_api.F:338
subroutine, public dbm_iterator_start(iterator, matrix)
Creates an iterator for the blocks of the given matrix. The iteration order is not stable.
Definition dbm_api.F:836
integer function, dimension(:), pointer, contiguous, public dbm_get_col_block_sizes(matrix)
Returns the column block sizes of the given matrix.
Definition dbm_api.F:1129
integer function, dimension(:), pointer, contiguous, public dbm_distribution_row_dist(dist)
Returns the rows of the given distribution.
Definition dbm_api.F:1424
Tall-and-skinny matrices: base routines similar to DBM API, mostly wrappers around existing DBM routi...
subroutine, public dbt_tas_blk_sizes(matrix, row, col, row_size, col_size)
Get block size for a given row & column.
subroutine, public dbt_tas_get_stored_coordinates(matrix, row, column, processor)
As dbt_get_stored_coordinates.
subroutine, public dbt_tas_iterator_start(iter, matrix_in)
As dbm_iterator_start.
logical function, public dbt_tas_iterator_blocks_left(iter)
As dbm_iterator_blocks_left.
subroutine, public dbt_repl_get_stored_coordinates(matrix, row, column, processors)
Get all processors for a given row/col combination if matrix is replicated on each process subgroup.
subroutine, public dbt_tas_iterator_stop(iter)
As dbm_iterator_stop.
subroutine, public dbt_tas_finalize(matrix)
...
subroutine, public dbt_tas_distribution_new(dist, mp_comm, row_dist, col_dist, split_info, nosplit)
create new distribution. Exactly like dbm_distribution_new but with custom types for row_dist and col...
subroutine, public dbt_tas_clear(matrix)
Clear matrix (erase all data)
type(dbt_tas_split_info) function, pointer, public dbt_tas_info(matrix)
get info on mpi grid splitting
subroutine, public dbt_tas_put_block(matrix, row, col, block, summation)
As dbm_put_block.
Global data (distribution and block sizes) for tall-and-skinny matrices For very sparse matrices with...
communication routines to reshape / replicate / merge tall-and-skinny matrices.
subroutine, public dbt_tas_merge(matrix_out, matrix_in, summation, move_data)
Merge submatrices of matrix_in to matrix_out by sum.
subroutine, public dbt_tas_replicate(matrix_in, info, matrix_out, nodata, move_data)
Replicate matrix_in such that each submatrix of matrix_out is an exact copy of matrix_in.
recursive subroutine, public dbt_tas_reshape(matrix_in, matrix_out, summation, transposed, move_data)
copy data (involves reshape)
methods to split tall-and-skinny matrices along longest dimension. Basically, we are splitting proces...
integer, parameter, public rowsplit
subroutine, public dbt_tas_get_split_info(info, mp_comm, nsplit, igroup, mp_comm_group, split_rowcol, pgrid_offset)
Get info on split.
integer, parameter, public colsplit
DBT tall-and-skinny base types. Mostly wrappers around existing DBM routines.
often used utilities for tall-and-skinny matrices
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
Interface to the message passing library MPI.
type for arbitrary block sizes
type for replicated block sizes
type for arbitrary distributions
type for replicated distribution