(git:ccc2433)
dbt_tas_reshape_ops.F
Go to the documentation of this file.
1 !--------------------------------------------------------------------------------------------------!
2 ! CP2K: A general program to perform molecular dynamics simulations !
3 ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 ! !
5 ! SPDX-License-Identifier: GPL-2.0-or-later !
6 !--------------------------------------------------------------------------------------------------!
7 
8 ! **************************************************************************************************
9 !> \brief communication routines to reshape / replicate / merge tall-and-skinny matrices.
10 !> \author Patrick Seewald
11 ! **************************************************************************************************
13  USE omp_lib, ONLY: omp_get_num_threads,&
14  omp_get_thread_num,&
15  omp_init_lock,&
16  omp_lock_kind,&
17  omp_set_lock,&
18  omp_unset_lock
19  USE dbm_api, ONLY: &
25  USE dbt_tas_base, ONLY: &
28  dbt_tas_iterator_blocks_left, dbt_tas_iterator_next_block, dbt_tas_iterator_start, &
29  dbt_tas_iterator_stop, dbt_tas_put_block, dbt_tas_reserve_blocks
30  USE dbt_tas_global, ONLY: dbt_tas_blk_size_arb,&
31  dbt_tas_blk_size_repl,&
32  dbt_tas_dist_arb,&
33  dbt_tas_dist_repl,&
34  dbt_tas_distribution,&
35  dbt_tas_rowcol_data
36  USE dbt_tas_split, ONLY: colsplit,&
38  rowsplit
39  USE dbt_tas_types, ONLY: dbt_tas_distribution_type,&
40  dbt_tas_iterator,&
41  dbt_tas_split_info,&
42  dbt_tas_type
43  USE dbt_tas_util, ONLY: swap
44  USE kinds, ONLY: dp,&
45  int_8
46  USE message_passing, ONLY: mp_cart_type,&
47  mp_comm_type,&
48  mp_request_type,&
49  mp_waitall
50 #include "../../base/base_uses.f90"
51 
52  IMPLICIT NONE
53  PRIVATE
54 
55  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbt_tas_reshape_ops'
56 
57  PUBLIC :: &
58  dbt_tas_merge, &
61 
62  TYPE dbt_buffer_type
63  INTEGER :: nblock = -1
64  INTEGER(KIND=int_8), DIMENSION(:, :), ALLOCATABLE :: indx
65  REAL(dp), DIMENSION(:), ALLOCATABLE :: msg
66  INTEGER :: endpos = -1
67  END TYPE
68 
69 CONTAINS
70 
71 ! **************************************************************************************************
72 !> \brief copy data (involves reshape)
73 !> \param matrix_in ...
74 !> \param matrix_out ...
75 !> \param summation whether matrix_out = matrix_out + matrix_in
76 !> \param transposed ...
77 !> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
78 !> \author Patrick Seewald
79 ! **************************************************************************************************
80  RECURSIVE SUBROUTINE dbt_tas_reshape(matrix_in, matrix_out, summation, transposed, move_data)
81  TYPE(dbt_tas_type), INTENT(INOUT) :: matrix_in, matrix_out
82  LOGICAL, INTENT(IN), OPTIONAL :: summation, transposed, move_data
83 
84  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_tas_reshape'
85 
86  INTEGER :: a, b, bcount, handle, handle2, iproc, &
87  nblk, nblk_per_thread, ndata, numnodes
88  INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :) :: blks_to_allocate, index_recv
89  INTEGER(KIND=int_8), DIMENSION(2) :: blk_index
90  INTEGER(kind=omp_lock_kind), ALLOCATABLE, &
91  DIMENSION(:) :: locks
92  INTEGER, ALLOCATABLE, DIMENSION(:) :: num_blocks_recv, num_blocks_send, &
93  num_entries_recv, num_entries_send, &
94  num_rec, num_send
95  INTEGER, DIMENSION(2) :: blk_size
96  LOGICAL :: move_prv, tr_in
97  REAL(kind=dp), DIMENSION(:, :), POINTER :: block
98  TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:) :: buffer_recv, buffer_send
99  TYPE(dbt_tas_iterator) :: iter
100  TYPE(dbt_tas_split_info) :: info
101  TYPE(mp_comm_type) :: mp_comm
102  TYPE(mp_request_type), ALLOCATABLE, &
103  DIMENSION(:, :) :: req_array
104 
105  CALL timeset(routinen, handle)
106 
107  IF (PRESENT(summation)) THEN
108  IF (.NOT. summation) CALL dbm_clear(matrix_out%matrix)
109  ELSE
110  CALL dbm_clear(matrix_out%matrix)
111  END IF
112 
113  IF (PRESENT(move_data)) THEN
114  move_prv = move_data
115  ELSE
116  move_prv = .false.
117  END IF
118 
119  IF (PRESENT(transposed)) THEN
120  tr_in = transposed
121  ELSE
122  tr_in = .false.
123  END IF
124 
125  IF (.NOT. matrix_out%valid) THEN
126  cpabort("can not reshape into invalid matrix")
127  END IF
128 
129  info = dbt_tas_info(matrix_in)
130  mp_comm = info%mp_comm
131  numnodes = mp_comm%num_pe
132  ALLOCATE (buffer_send(0:numnodes - 1))
133  ALLOCATE (buffer_recv(0:numnodes - 1))
134  ALLOCATE (num_blocks_recv(0:numnodes - 1))
135  ALLOCATE (num_blocks_send(0:numnodes - 1))
136  ALLOCATE (num_entries_recv(0:numnodes - 1))
137  ALLOCATE (num_entries_send(0:numnodes - 1))
138  ALLOCATE (num_rec(0:2*numnodes - 1))
139  ALLOCATE (num_send(0:2*numnodes - 1))
140  num_send(:) = 0
141  ALLOCATE (req_array(1:numnodes, 4))
142  ALLOCATE (locks(0:numnodes - 1))
143  DO iproc = 0, numnodes - 1
144  CALL omp_init_lock(locks(iproc))
145  END DO
146 
147  CALL timeset(routinen//"_get_coord", handle2)
148 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,tr_in,num_send) &
149 !$OMP PRIVATE(iter,blk_index,blk_size,iproc)
150  CALL dbt_tas_iterator_start(iter, matrix_in)
151  DO WHILE (dbt_tas_iterator_blocks_left(iter))
152  CALL dbt_tas_iterator_next_block(iter, blk_index(1), blk_index(2), &
153  row_size=blk_size(1), col_size=blk_size(2))
154  IF (tr_in) THEN
155  CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(2), blk_index(1), iproc)
156  ELSE
157  CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
158  END IF
159 !$OMP ATOMIC
160  num_send(2*iproc) = num_send(2*iproc) + product(blk_size)
161 !$OMP ATOMIC
162  num_send(2*iproc + 1) = num_send(2*iproc + 1) + 1
163  END DO
164  CALL dbt_tas_iterator_stop(iter)
165 !$OMP END PARALLEL
166  CALL timestop(handle2)
167 
168  CALL timeset(routinen//"_alltoall", handle2)
169  CALL mp_comm%alltoall(num_send, num_rec, 2)
170  CALL timestop(handle2)
171 
172  CALL timeset(routinen//"_buffer_fill", handle2)
173  DO iproc = 0, numnodes - 1
174  num_entries_recv(iproc) = num_rec(2*iproc)
175  num_blocks_recv(iproc) = num_rec(2*iproc + 1)
176  num_entries_send(iproc) = num_send(2*iproc)
177  num_blocks_send(iproc) = num_send(2*iproc + 1)
178 
179  CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
180 
181  CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
182 
183  END DO
184 
185 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,tr_in,buffer_send,locks) &
186 !$OMP PRIVATE(iter,blk_index,blk_size,block,iproc)
187  CALL dbt_tas_iterator_start(iter, matrix_in)
188  DO WHILE (dbt_tas_iterator_blocks_left(iter))
189  CALL dbt_tas_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
190  row_size=blk_size(1), col_size=blk_size(2))
191  IF (tr_in) THEN
192  CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(2), blk_index(1), iproc)
193  ELSE
194  CALL dbt_tas_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
195  END IF
196  CALL omp_set_lock(locks(iproc))
197  CALL dbt_buffer_add_block(buffer_send(iproc), blk_index, block, transposed=tr_in)
198  CALL omp_unset_lock(locks(iproc))
199  END DO
200  CALL dbt_tas_iterator_stop(iter)
201 !$OMP END PARALLEL
202 
203  IF (move_prv) CALL dbt_tas_clear(matrix_in)
204 
205  CALL timestop(handle2)
206 
207  CALL timeset(routinen//"_communicate_buffer", handle2)
208  CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
209 
210  DO iproc = 0, numnodes - 1
211  CALL dbt_buffer_destroy(buffer_send(iproc))
212  END DO
213 
214  CALL timestop(handle2)
215 
216  CALL timeset(routinen//"_buffer_obtain", handle2)
217 
218  ! TODO Add OpenMP to the buffer unpacking.
219  nblk = sum(num_blocks_recv)
220  ALLOCATE (blks_to_allocate(nblk, 2))
221 
222  bcount = 0
223  DO iproc = 0, numnodes - 1
224  CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
225  blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = index_recv(:, :)
226  bcount = bcount + SIZE(index_recv, 1)
227  DEALLOCATE (index_recv)
228  END DO
229 
230 !TODO: Parallelize creation of block list.
231 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
232  nblk_per_thread = nblk/omp_get_num_threads() + 1
233  a = omp_get_thread_num()*nblk_per_thread + 1
234  b = min(a + nblk_per_thread, nblk)
235  CALL dbt_tas_reserve_blocks(matrix_out, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
236 !$OMP END PARALLEL
237  DEALLOCATE (blks_to_allocate)
238 
239  DO iproc = 0, numnodes - 1
240  ! First, we need to get the index to create block
241  DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
242  CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index)
243  CALL dbt_tas_blk_sizes(matrix_out, blk_index(1), blk_index(2), blk_size(1), blk_size(2))
244  ALLOCATE (block(blk_size(1), blk_size(2)))
245  CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index, block)
246  CALL dbt_tas_put_block(matrix_out, blk_index(1), blk_index(2), block, summation=summation)
247  DEALLOCATE (block)
248  END DO
249  CALL dbt_buffer_destroy(buffer_recv(iproc))
250  END DO
251 
252  CALL timestop(handle2)
253 
254  CALL dbt_tas_finalize(matrix_out)
255 
256  CALL timestop(handle)
257  END SUBROUTINE
258 
259 ! **************************************************************************************************
260 !> \brief Replicate matrix_in such that each submatrix of matrix_out is an exact copy of matrix_in
261 !> \param matrix_in ...
262 !> \param info ...
263 !> \param matrix_out ...
264 !> \param nodata Don't copy data but create matrix_out
265 !> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
266 !> \author Patrick Seewald
267 ! **************************************************************************************************
268  SUBROUTINE dbt_tas_replicate(matrix_in, info, matrix_out, nodata, move_data)
269  TYPE(dbm_type), INTENT(INOUT) :: matrix_in
270  TYPE(dbt_tas_split_info), INTENT(IN) :: info
271  TYPE(dbt_tas_type), INTENT(OUT) :: matrix_out
272  LOGICAL, INTENT(IN), OPTIONAL :: nodata, move_data
273 
274  INTEGER :: a, b, nblk_per_thread, nblkcols, nblkrows
275  INTEGER, DIMENSION(2) :: pdims
276  INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
277  row_dist
278  TYPE(dbm_distribution_obj) :: dbm_dist
279  TYPE(dbt_tas_dist_arb), TARGET :: dir_dist
280  TYPE(dbt_tas_dist_repl), TARGET :: repl_dist
281 
282  CLASS(dbt_tas_distribution), ALLOCATABLE :: col_dist_obj, row_dist_obj
283  CLASS(dbt_tas_rowcol_data), ALLOCATABLE :: row_bsize_obj, col_bsize_obj
284  TYPE(dbt_tas_blk_size_repl), TARGET :: repl_blksize
285  TYPE(dbt_tas_blk_size_arb), TARGET :: dir_blksize
286  TYPE(dbt_tas_distribution_type) :: dist
287  INTEGER :: numnodes, ngroup
288  INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
289  TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:) :: buffer_recv, buffer_send
290  INTEGER, ALLOCATABLE, DIMENSION(:) :: num_blocks_recv, num_blocks_send, &
291  num_entries_recv, num_entries_send, &
292  num_rec, num_send
293  TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:, :) :: req_array
294  INTEGER, ALLOCATABLE, DIMENSION(:, :) :: blks_to_allocate
295  INTEGER, DIMENSION(2) :: blk_size
296  INTEGER, DIMENSION(2) :: blk_index
297  INTEGER(KIND=int_8), DIMENSION(2) :: blk_index_i8
298  TYPE(dbm_iterator) :: iter
299  INTEGER :: i, iproc, bcount, nblk
300  INTEGER, DIMENSION(:), ALLOCATABLE :: iprocs
301  LOGICAL :: nodata_prv, move_prv
302  INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :) :: index_recv
303  INTEGER :: ndata
304  TYPE(mp_cart_type) :: mp_comm
305 
306  REAL(kind=dp), DIMENSION(:, :), POINTER :: block
307 
308  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_tas_replicate'
309 
310  INTEGER :: handle, handle2
311 
312  NULLIFY (col_blk_size, row_blk_size)
313 
314  CALL timeset(routinen, handle)
315 
316  IF (PRESENT(nodata)) THEN
317  nodata_prv = nodata
318  ELSE
319  nodata_prv = .false.
320  END IF
321 
322  IF (PRESENT(move_data)) THEN
323  move_prv = move_data
324  ELSE
325  move_prv = .false.
326  END IF
327 
328  row_blk_size => dbm_get_row_block_sizes(matrix_in)
329  col_blk_size => dbm_get_col_block_sizes(matrix_in)
330  nblkrows = SIZE(row_blk_size)
331  nblkcols = SIZE(col_blk_size)
332  dbm_dist = dbm_get_distribution(matrix_in)
333  row_dist => dbm_distribution_row_dist(dbm_dist)
334  col_dist => dbm_distribution_col_dist(dbm_dist)
335 
336  mp_comm = info%mp_comm
337  ngroup = info%ngroup
338 
339  numnodes = mp_comm%num_pe
340  pdims = mp_comm%num_pe_cart
341 
342  SELECT CASE (info%split_rowcol)
343  CASE (rowsplit)
344  repl_dist = dbt_tas_dist_repl(row_dist, pdims(1), nblkrows, info%ngroup, info%pgrid_split_size)
345  dir_dist = dbt_tas_dist_arb(col_dist, pdims(2), int(nblkcols, kind=int_8))
346  repl_blksize = dbt_tas_blk_size_repl(row_blk_size, info%ngroup)
347  dir_blksize = dbt_tas_blk_size_arb(col_blk_size)
348  ALLOCATE (row_dist_obj, source=repl_dist)
349  ALLOCATE (col_dist_obj, source=dir_dist)
350  ALLOCATE (row_bsize_obj, source=repl_blksize)
351  ALLOCATE (col_bsize_obj, source=dir_blksize)
352  CASE (colsplit)
353  dir_dist = dbt_tas_dist_arb(row_dist, pdims(1), int(nblkrows, kind=int_8))
354  repl_dist = dbt_tas_dist_repl(col_dist, pdims(2), nblkcols, info%ngroup, info%pgrid_split_size)
355  dir_blksize = dbt_tas_blk_size_arb(row_blk_size)
356  repl_blksize = dbt_tas_blk_size_repl(col_blk_size, info%ngroup)
357  ALLOCATE (row_dist_obj, source=dir_dist)
358  ALLOCATE (col_dist_obj, source=repl_dist)
359  ALLOCATE (row_bsize_obj, source=dir_blksize)
360  ALLOCATE (col_bsize_obj, source=repl_blksize)
361  END SELECT
362 
363  CALL dbt_tas_distribution_new(dist, mp_comm, row_dist_obj, col_dist_obj, split_info=info)
364  CALL dbt_tas_create(matrix_out, trim(dbm_get_name(matrix_in))//" replicated", &
365  dist, row_bsize_obj, col_bsize_obj, own_dist=.true.)
366 
367  IF (nodata_prv) THEN
368  CALL dbt_tas_finalize(matrix_out)
369  CALL timestop(handle)
370  RETURN
371  END IF
372 
373  ALLOCATE (buffer_send(0:numnodes - 1))
374  ALLOCATE (buffer_recv(0:numnodes - 1))
375  ALLOCATE (num_blocks_recv(0:numnodes - 1))
376  ALLOCATE (num_blocks_send(0:numnodes - 1))
377  ALLOCATE (num_entries_recv(0:numnodes - 1))
378  ALLOCATE (num_entries_send(0:numnodes - 1))
379  ALLOCATE (num_rec(0:2*numnodes - 1))
380  ALLOCATE (num_send(0:2*numnodes - 1))
381  num_send(:) = 0
382  ALLOCATE (req_array(1:numnodes, 4))
383  ALLOCATE (locks(0:numnodes - 1))
384  DO iproc = 0, numnodes - 1
385  CALL omp_init_lock(locks(iproc))
386  END DO
387 
388 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,num_send,ngroup) &
389 !$OMP PRIVATE(iter,blk_index,blk_size,iprocs)
390  ALLOCATE (iprocs(ngroup))
391  CALL dbm_iterator_start(iter, matrix_in)
392  DO WHILE (dbm_iterator_blocks_left(iter))
393  CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), &
394  row_size=blk_size(1), col_size=blk_size(2))
395  CALL dbt_repl_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iprocs)
396  DO i = 1, SIZE(iprocs)
397 !$OMP ATOMIC
398  num_send(2*iprocs(i)) = num_send(2*iprocs(i)) + product(blk_size)
399 !$OMP ATOMIC
400  num_send(2*iprocs(i) + 1) = num_send(2*iprocs(i) + 1) + 1
401  END DO
402  END DO
403  CALL dbm_iterator_stop(iter)
404  DEALLOCATE (iprocs)
405 !$OMP END PARALLEL
406 
407  CALL timeset(routinen//"_alltoall", handle2)
408  CALL mp_comm%alltoall(num_send, num_rec, 2)
409  CALL timestop(handle2)
410 
411  DO iproc = 0, numnodes - 1
412  num_entries_recv(iproc) = num_rec(2*iproc)
413  num_blocks_recv(iproc) = num_rec(2*iproc + 1)
414  num_entries_send(iproc) = num_send(2*iproc)
415  num_blocks_send(iproc) = num_send(2*iproc + 1)
416 
417  CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
418 
419  CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
420 
421  END DO
422 
423 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,buffer_send,locks,ngroup) &
424 !$OMP PRIVATE(iter,blk_index,blk_size,block,iprocs)
425  ALLOCATE (iprocs(ngroup))
426  CALL dbm_iterator_start(iter, matrix_in)
427  DO WHILE (dbm_iterator_blocks_left(iter))
428  CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
429  row_size=blk_size(1), col_size=blk_size(2))
430  CALL dbt_repl_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iprocs)
431  DO i = 1, SIZE(iprocs)
432  CALL omp_set_lock(locks(iprocs(i)))
433  CALL dbt_buffer_add_block(buffer_send(iprocs(i)), int(blk_index, kind=int_8), block)
434  CALL omp_unset_lock(locks(iprocs(i)))
435  END DO
436  END DO
437  CALL dbm_iterator_stop(iter)
438  DEALLOCATE (iprocs)
439 !$OMP END PARALLEL
440 
441  IF (move_prv) CALL dbm_clear(matrix_in)
442 
443  CALL timeset(routinen//"_communicate_buffer", handle2)
444  CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
445 
446  DO iproc = 0, numnodes - 1
447  CALL dbt_buffer_destroy(buffer_send(iproc))
448  END DO
449 
450  CALL timestop(handle2)
451 
452  ! TODO Add OpenMP to the buffer unpacking.
453  nblk = sum(num_blocks_recv)
454  ALLOCATE (blks_to_allocate(nblk, 2))
455 
456  bcount = 0
457  DO iproc = 0, numnodes - 1
458  CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
459  blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = int(index_recv(:, :))
460  bcount = bcount + SIZE(index_recv, 1)
461  DEALLOCATE (index_recv)
462  END DO
463 
464 !TODO: Parallelize creation of block list.
465 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
466  nblk_per_thread = nblk/omp_get_num_threads() + 1
467  a = omp_get_thread_num()*nblk_per_thread + 1
468  b = min(a + nblk_per_thread, nblk)
469  CALL dbm_reserve_blocks(matrix_out%matrix, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
470 !$OMP END PARALLEL
471  DEALLOCATE (blks_to_allocate)
472 
473  DO iproc = 0, numnodes - 1
474  ! First, we need to get the index to create block
475  DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
476  CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8)
477  CALL dbt_tas_blk_sizes(matrix_out, blk_index_i8(1), blk_index_i8(2), blk_size(1), blk_size(2))
478  ALLOCATE (block(blk_size(1), blk_size(2)))
479  CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8, block)
480  CALL dbm_put_block(matrix_out%matrix, int(blk_index_i8(1)), int(blk_index_i8(2)), block)
481  DEALLOCATE (block)
482  END DO
483 
484  CALL dbt_buffer_destroy(buffer_recv(iproc))
485  END DO
486 
487  CALL dbt_tas_finalize(matrix_out)
488 
489  CALL timestop(handle)
490 
491  END SUBROUTINE
492 
493 ! **************************************************************************************************
494 !> \brief Merge submatrices of matrix_in to matrix_out by sum
495 !> \param matrix_out ...
496 !> \param matrix_in ...
497 !> \param summation ...
498 !> \param move_data memory optimization: move data to matrix_out such that matrix_in is empty on return
499 !> \author Patrick Seewald
500 ! **************************************************************************************************
501  SUBROUTINE dbt_tas_merge(matrix_out, matrix_in, summation, move_data)
502  TYPE(dbm_type), INTENT(INOUT) :: matrix_out
503  TYPE(dbt_tas_type), INTENT(INOUT) :: matrix_in
504  LOGICAL, INTENT(IN), OPTIONAL :: summation, move_data
505 
506  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_tas_merge'
507 
508  INTEGER :: a, b, bcount, handle, handle2, iproc, &
509  nblk, nblk_per_thread, ndata, numnodes
510  INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:, :) :: index_recv
511  INTEGER(KIND=int_8), DIMENSION(2) :: blk_index_i8
512  INTEGER(kind=omp_lock_kind), ALLOCATABLE, &
513  DIMENSION(:) :: locks
514  INTEGER, ALLOCATABLE, DIMENSION(:) :: num_blocks_recv, num_blocks_send, &
515  num_entries_recv, num_entries_send, &
516  num_rec, num_send
517  INTEGER, ALLOCATABLE, DIMENSION(:, :) :: blks_to_allocate
518  INTEGER, DIMENSION(2) :: blk_index, blk_size
519  INTEGER, DIMENSION(:), POINTER :: col_block_sizes, row_block_sizes
520  LOGICAL :: move_prv
521  REAL(dp), DIMENSION(:, :), POINTER :: block
522  TYPE(dbm_iterator) :: iter
523  TYPE(dbt_buffer_type), ALLOCATABLE, DIMENSION(:) :: buffer_recv, buffer_send
524  TYPE(dbt_tas_split_info) :: info
525  TYPE(mp_cart_type) :: mp_comm
526  TYPE(mp_request_type), ALLOCATABLE, &
527  DIMENSION(:, :) :: req_array
528 
529 !!
530 
531  CALL timeset(routinen, handle)
532 
533  IF (PRESENT(summation)) THEN
534  IF (.NOT. summation) CALL dbm_clear(matrix_out)
535  ELSE
536  CALL dbm_clear(matrix_out)
537  END IF
538 
539  IF (PRESENT(move_data)) THEN
540  move_prv = move_data
541  ELSE
542  move_prv = .false.
543  END IF
544 
545  info = dbt_tas_info(matrix_in)
546  CALL dbt_tas_get_split_info(info, mp_comm=mp_comm)
547  numnodes = mp_comm%num_pe
548 
549  ALLOCATE (buffer_send(0:numnodes - 1))
550  ALLOCATE (buffer_recv(0:numnodes - 1))
551  ALLOCATE (num_blocks_recv(0:numnodes - 1))
552  ALLOCATE (num_blocks_send(0:numnodes - 1))
553  ALLOCATE (num_entries_recv(0:numnodes - 1))
554  ALLOCATE (num_entries_send(0:numnodes - 1))
555  ALLOCATE (num_rec(0:2*numnodes - 1))
556  ALLOCATE (num_send(0:2*numnodes - 1))
557  num_send(:) = 0
558  ALLOCATE (req_array(1:numnodes, 4))
559  ALLOCATE (locks(0:numnodes - 1))
560  DO iproc = 0, numnodes - 1
561  CALL omp_init_lock(locks(iproc))
562  END DO
563 
564 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,num_send) &
565 !$OMP PRIVATE(iter,blk_index,blk_size,iproc)
566  CALL dbm_iterator_start(iter, matrix_in%matrix)
567  DO WHILE (dbm_iterator_blocks_left(iter))
568  CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), &
569  row_size=blk_size(1), col_size=blk_size(2))
570  CALL dbm_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
571 !$OMP ATOMIC
572  num_send(2*iproc) = num_send(2*iproc) + product(blk_size)
573 !$OMP ATOMIC
574  num_send(2*iproc + 1) = num_send(2*iproc + 1) + 1
575  END DO
576  CALL dbm_iterator_stop(iter)
577 !$OMP END PARALLEL
578 
579  CALL timeset(routinen//"_alltoall", handle2)
580  CALL mp_comm%alltoall(num_send, num_rec, 2)
581  CALL timestop(handle2)
582 
583  DO iproc = 0, numnodes - 1
584  num_entries_recv(iproc) = num_rec(2*iproc)
585  num_blocks_recv(iproc) = num_rec(2*iproc + 1)
586  num_entries_send(iproc) = num_send(2*iproc)
587  num_blocks_send(iproc) = num_send(2*iproc + 1)
588 
589  CALL dbt_buffer_create(buffer_send(iproc), num_blocks_send(iproc), num_entries_send(iproc))
590 
591  CALL dbt_buffer_create(buffer_recv(iproc), num_blocks_recv(iproc), num_entries_recv(iproc))
592 
593  END DO
594 
595 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_in,matrix_out,buffer_send,locks) &
596 !$OMP PRIVATE(iter,blk_index,blk_size,block,iproc)
597  CALL dbm_iterator_start(iter, matrix_in%matrix)
598  DO WHILE (dbm_iterator_blocks_left(iter))
599  CALL dbm_iterator_next_block(iter, blk_index(1), blk_index(2), block, &
600  row_size=blk_size(1), col_size=blk_size(2))
601  CALL dbm_get_stored_coordinates(matrix_out, blk_index(1), blk_index(2), iproc)
602  CALL omp_set_lock(locks(iproc))
603  CALL dbt_buffer_add_block(buffer_send(iproc), int(blk_index, kind=int_8), block)
604  CALL omp_unset_lock(locks(iproc))
605  END DO
606  CALL dbm_iterator_stop(iter)
607 !$OMP END PARALLEL
608 
609  IF (move_prv) CALL dbt_tas_clear(matrix_in)
610 
611  CALL timeset(routinen//"_communicate_buffer", handle2)
612  CALL dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
613 
614  DO iproc = 0, numnodes - 1
615  CALL dbt_buffer_destroy(buffer_send(iproc))
616  END DO
617 
618  CALL timestop(handle2)
619 
620  ! TODO Add OpenMP to the buffer unpacking.
621  nblk = sum(num_blocks_recv)
622  ALLOCATE (blks_to_allocate(nblk, 2))
623 
624  bcount = 0
625  DO iproc = 0, numnodes - 1
626  CALL dbt_buffer_get_index(buffer_recv(iproc), index_recv)
627  blks_to_allocate(bcount + 1:bcount + SIZE(index_recv, 1), :) = int(index_recv(:, :))
628  bcount = bcount + SIZE(index_recv, 1)
629  DEALLOCATE (index_recv)
630  END DO
631 
632 !TODO: Parallelize creation of block list.
633 !$OMP PARALLEL DEFAULT(NONE) SHARED(matrix_out,nblk,blks_to_allocate) PRIVATE(nblk_per_thread,A,b)
634  nblk_per_thread = nblk/omp_get_num_threads() + 1
635  a = omp_get_thread_num()*nblk_per_thread + 1
636  b = min(a + nblk_per_thread, nblk)
637  CALL dbm_reserve_blocks(matrix_out, blks_to_allocate(a:b, 1), blks_to_allocate(a:b, 2))
638 !$OMP END PARALLEL
639  DEALLOCATE (blks_to_allocate)
640 
641  DO iproc = 0, numnodes - 1
642  ! First, we need to get the index to create block
643  DO WHILE (dbt_buffer_blocks_left(buffer_recv(iproc)))
644  CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8)
645  row_block_sizes => dbm_get_row_block_sizes(matrix_out)
646  col_block_sizes => dbm_get_col_block_sizes(matrix_out)
647  blk_size(1) = row_block_sizes(int(blk_index_i8(1)))
648  blk_size(2) = col_block_sizes(int(blk_index_i8(2)))
649  ALLOCATE (block(blk_size(1), blk_size(2)))
650  CALL dbt_buffer_get_next_block(buffer_recv(iproc), ndata, blk_index_i8, block)
651  CALL dbm_put_block(matrix_out, int(blk_index_i8(1)), int(blk_index_i8(2)), block, summation=.true.)
652  DEALLOCATE (block)
653  END DO
654  CALL dbt_buffer_destroy(buffer_recv(iproc))
655  END DO
656 
657  CALL dbm_finalize(matrix_out)
658 
659  CALL timestop(handle)
660  END SUBROUTINE
661 
662 ! **************************************************************************************************
663 !> \brief get all indices from buffer
664 !> \param buffer ...
665 !> \param index ...
666 !> \author Patrick Seewald
667 ! **************************************************************************************************
668  SUBROUTINE dbt_buffer_get_index(buffer, index)
669  TYPE(dbt_buffer_type), INTENT(IN) :: buffer
670  INTEGER(KIND=int_8), ALLOCATABLE, &
671  DIMENSION(:, :), INTENT(OUT) :: index
672 
673  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_buffer_get_index'
674 
675  INTEGER :: handle
676  INTEGER, DIMENSION(2) :: indx_shape
677 
678  CALL timeset(routinen, handle)
679 
680  indx_shape = shape(buffer%indx) - [0, 1]
681  ALLOCATE (index(indx_shape(1), indx_shape(2)))
682  index(:, :) = buffer%indx(1:indx_shape(1), 1:indx_shape(2))
683  CALL timestop(handle)
684  END SUBROUTINE
685 
686 ! **************************************************************************************************
687 !> \brief how many blocks left in iterator
688 !> \param buffer ...
689 !> \return ...
690 !> \author Patrick Seewald
691 ! **************************************************************************************************
692  PURE FUNCTION dbt_buffer_blocks_left(buffer)
693  TYPE(dbt_buffer_type), INTENT(IN) :: buffer
694  LOGICAL :: dbt_buffer_blocks_left
695 
696  dbt_buffer_blocks_left = buffer%endpos .LT. buffer%nblock
697  END FUNCTION
698 
699 ! **************************************************************************************************
700 !> \brief Create block buffer for MPI communication.
701 !> \param buffer block buffer
702 !> \param nblock number of blocks
703 !> \param ndata total number of block entries
704 !> \author Patrick Seewald
705 ! **************************************************************************************************
706  SUBROUTINE dbt_buffer_create(buffer, nblock, ndata)
707  TYPE(dbt_buffer_type), INTENT(OUT) :: buffer
708  INTEGER, INTENT(IN) :: nblock, ndata
709 
710  buffer%nblock = nblock
711  buffer%endpos = 0
712  ALLOCATE (buffer%msg(ndata))
713  ALLOCATE (buffer%indx(nblock, 3))
714  END SUBROUTINE
715 
716 ! **************************************************************************************************
717 !> \brief ...
718 !> \param buffer ...
719 !> \author Patrick Seewald
720 ! **************************************************************************************************
721  SUBROUTINE dbt_buffer_destroy(buffer)
722  TYPE(dbt_buffer_type), INTENT(INOUT) :: buffer
723 
724  DEALLOCATE (buffer%msg)
725  DEALLOCATE (buffer%indx)
726  buffer%nblock = -1
727  buffer%endpos = -1
728  END SUBROUTINE dbt_buffer_destroy
729 
730 ! **************************************************************************************************
731 !> \brief insert a block into block buffer (at current iterator position)
732 !> \param buffer ...
733 !> \param index index of block
734 !> \param block ...
735 !> \param transposed ...
736 !> \author Patrick Seewald
737 ! **************************************************************************************************
738  SUBROUTINE dbt_buffer_add_block(buffer, index, block, transposed)
739  TYPE(dbt_buffer_type), INTENT(INOUT) :: buffer
740  INTEGER(KIND=int_8), DIMENSION(2), INTENT(IN) :: index
741  REAL(dp), DIMENSION(:, :), INTENT(IN) :: block
742  LOGICAL, INTENT(IN), OPTIONAL :: transposed
743 
744  INTEGER :: ndata, p, p_data
745  INTEGER(KIND=int_8), DIMENSION(2) :: index_prv
746  LOGICAL :: tr
747 
748  IF (PRESENT(transposed)) THEN
749  tr = transposed
750  ELSE
751  tr = .false.
752  END IF
753 
754  index_prv(:) = index(:)
755  IF (tr) THEN
756  CALL swap(index_prv)
757  END IF
758  ndata = product(shape(block))
759 
760  p = buffer%endpos
761  IF (p .EQ. 0) THEN
762  p_data = 0
763  ELSE
764  p_data = int(buffer%indx(p, 3))
765  END IF
766 
767  IF (tr) THEN
768  buffer%msg(p_data + 1:p_data + ndata) = reshape(transpose(block), [ndata])
769  ELSE
770  buffer%msg(p_data + 1:p_data + ndata) = reshape(block, [ndata])
771  END IF
772 
773  buffer%indx(p + 1, 1:2) = index_prv(:)
774  IF (p > 0) THEN
775  buffer%indx(p + 1, 3) = buffer%indx(p, 3) + int(ndata, kind=int_8)
776  ELSE
777  buffer%indx(p + 1, 3) = int(ndata, kind=int_8)
778  END IF
779  buffer%endpos = buffer%endpos + 1
780  END SUBROUTINE
781 
782 ! **************************************************************************************************
783 !> \brief get next block from buffer. Iterator is advanced only if block is retrieved or advance_iter.
784 !> \param buffer ...
785 !> \param ndata ...
786 !> \param index ...
787 !> \param block ...
788 !> \param advance_iter ...
789 !> \author Patrick Seewald
790 ! **************************************************************************************************
791  SUBROUTINE dbt_buffer_get_next_block(buffer, ndata, index, block, advance_iter)
792  TYPE(dbt_buffer_type), INTENT(INOUT) :: buffer
793  INTEGER, INTENT(OUT) :: ndata
794  INTEGER(KIND=int_8), DIMENSION(2), INTENT(OUT) :: index
795  REAL(dp), DIMENSION(:, :), INTENT(OUT), OPTIONAL :: block
796  LOGICAL, INTENT(IN), OPTIONAL :: advance_iter
797 
798  INTEGER :: p, p_data
799  LOGICAL :: do_advance
800 
801  do_advance = .false.
802  IF (PRESENT(advance_iter)) THEN
803  do_advance = advance_iter
804  ELSE IF (PRESENT(block)) THEN
805  do_advance = .true.
806  END IF
807 
808  p = buffer%endpos
809  IF (p .EQ. 0) THEN
810  p_data = 0
811  ELSE
812  p_data = int(buffer%indx(p, 3))
813  END IF
814 
815  IF (p > 0) THEN
816  ndata = int(buffer%indx(p + 1, 3) - buffer%indx(p, 3))
817  ELSE
818  ndata = int(buffer%indx(p + 1, 3))
819  END IF
820  index(:) = buffer%indx(p + 1, 1:2)
821 
822  IF (PRESENT(block)) THEN
823  block(:, :) = reshape(buffer%msg(p_data + 1:p_data + ndata), shape(block))
824  END IF
825 
826  IF (do_advance) buffer%endpos = buffer%endpos + 1
827  END SUBROUTINE
828 
829 ! **************************************************************************************************
830 !> \brief communicate buffer
831 !> \param mp_comm ...
832 !> \param buffer_recv ...
833 !> \param buffer_send ...
834 !> \param req_array ...
835 !> \author Patrick Seewald
836 ! **************************************************************************************************
837  SUBROUTINE dbt_tas_communicate_buffer(mp_comm, buffer_recv, buffer_send, req_array)
838  CLASS(mp_comm_type), INTENT(IN) :: mp_comm
839  TYPE(dbt_buffer_type), DIMENSION(0:), &
840  INTENT(INOUT) :: buffer_recv, buffer_send
841  TYPE(mp_request_type), DIMENSION(:, :), &
842  INTENT(OUT) :: req_array
843 
844  CHARACTER(LEN=*), PARAMETER :: routinen = 'dbt_tas_communicate_buffer'
845 
846  INTEGER :: handle, iproc, numnodes, &
847  rec_counter, send_counter
848 
849  CALL timeset(routinen, handle)
850  numnodes = mp_comm%num_pe
851 
852  IF (numnodes > 1) THEN
853 
854  send_counter = 0
855  rec_counter = 0
856 
857  DO iproc = 0, numnodes - 1
858  IF (buffer_recv(iproc)%nblock > 0) THEN
859  rec_counter = rec_counter + 1
860  CALL mp_comm%irecv(buffer_recv(iproc)%indx, iproc, req_array(rec_counter, 3), tag=4)
861  CALL mp_comm%irecv(buffer_recv(iproc)%msg, iproc, req_array(rec_counter, 4), tag=7)
862  END IF
863  END DO
864 
865  DO iproc = 0, numnodes - 1
866  IF (buffer_send(iproc)%nblock > 0) THEN
867  send_counter = send_counter + 1
868  CALL mp_comm%isend(buffer_send(iproc)%indx, iproc, req_array(send_counter, 1), tag=4)
869  CALL mp_comm%isend(buffer_send(iproc)%msg, iproc, req_array(send_counter, 2), tag=7)
870  END IF
871  END DO
872 
873  IF (send_counter > 0) THEN
874  CALL mp_waitall(req_array(1:send_counter, 1:2))
875  END IF
876  IF (rec_counter > 0) THEN
877  CALL mp_waitall(req_array(1:rec_counter, 3:4))
878  END IF
879 
880  ELSE
881  IF (buffer_recv(0)%nblock > 0) THEN
882  buffer_recv(0)%indx(:, :) = buffer_send(0)%indx(:, :)
883  buffer_recv(0)%msg(:) = buffer_send(0)%msg(:)
884  END IF
885  END IF
886  CALL timestop(handle)
887  END SUBROUTINE
888 
889 END MODULE
Definition: dbm_api.F:8
subroutine, public dbm_clear(matrix)
Remove all blocks from given matrix, but does not release the underlying memory.
Definition: dbm_api.F:529
subroutine, public dbm_iterator_next_block(iterator, row, column, block, row_size, col_size)
Returns the next block from the given iterator.
Definition: dbm_api.F:910
subroutine, public dbm_get_stored_coordinates(matrix, row, column, processor)
Returns the MPI rank on which the given block should be stored.
Definition: dbm_api.F:1233
type(dbm_distribution_obj) function, public dbm_get_distribution(matrix)
Returns the distribution of the given matrix.
Definition: dbm_api.F:1266
integer function, dimension(:), pointer, contiguous, public dbm_get_row_block_sizes(matrix)
Returns the row block sizes of the given matrix.
Definition: dbm_api.F:1103
logical function, public dbm_iterator_blocks_left(iterator)
Tests whether the given iterator has any block left.
Definition: dbm_api.F:884
integer function, dimension(:), pointer, contiguous, public dbm_distribution_col_dist(dist)
Returns the columns of the given distribution.
Definition: dbm_api.F:1455
subroutine, public dbm_reserve_blocks(matrix, rows, cols)
Adds given list of blocks efficiently. The blocks will be filled with zeros.
Definition: dbm_api.F:589
subroutine, public dbm_iterator_stop(iterator)
Releases the given iterator.
Definition: dbm_api.F:948
subroutine, public dbm_put_block(matrix, row, col, block, summation)
Adds a block to given matrix. This routine is thread-safe. If block already exist then it gets overwr...
Definition: dbm_api.F:491
character(len=default_string_length) function, public dbm_get_name(matrix)
Returns the name of the matrix of the given matrix.
Definition: dbm_api.F:1023
subroutine, public dbm_finalize(matrix)
Needed to be called for DBCSR after blocks where inserted. For DBM it's a no-opt.
Definition: dbm_api.F:339
subroutine, public dbm_iterator_start(iterator, matrix)
Creates an iterator for the blocks of the given matrix. The iteration order is not stable.
Definition: dbm_api.F:837
integer function, dimension(:), pointer, contiguous, public dbm_get_col_block_sizes(matrix)
Returns the column block sizes of the given matrix.
Definition: dbm_api.F:1130
integer function, dimension(:), pointer, contiguous, public dbm_distribution_row_dist(dist)
Returns the rows of the given distribution.
Definition: dbm_api.F:1425
Tall-and-skinny matrices: base routines similar to DBM API, mostly wrappers around existing DBM routi...
Definition: dbt_tas_base.F:13
subroutine, public dbt_tas_blk_sizes(matrix, row, col, row_size, col_size)
Get block size for a given row & column.
Definition: dbt_tas_base.F:807
subroutine, public dbt_tas_get_stored_coordinates(matrix, row, column, processor)
As dbt_get_stored_coordinates.
Definition: dbt_tas_base.F:462
subroutine, public dbt_tas_iterator_start(iter, matrix_in)
As dbm_iterator_start.
Definition: dbt_tas_base.F:670
logical function, public dbt_tas_iterator_blocks_left(iter)
As dbm_iterator_blocks_left.
Definition: dbt_tas_base.F:698
subroutine, public dbt_repl_get_stored_coordinates(matrix, row, column, processors)
Get all processors for a given row/col combination if matrix is replicated on each process subgroup.
Definition: dbt_tas_base.F:488
subroutine, public dbt_tas_iterator_stop(iter)
As dbm_iterator_stop.
Definition: dbt_tas_base.F:710
subroutine, public dbt_tas_finalize(matrix)
...
Definition: dbt_tas_base.F:327
subroutine, public dbt_tas_distribution_new(dist, mp_comm, row_dist, col_dist, split_info, nosplit)
create new distribution. Exactly like dbm_distribution_new but with custom types for row_dist and col...
Definition: dbt_tas_base.F:346
type(dbt_tas_split_info) function, public dbt_tas_info(matrix)
get info on mpi grid splitting
Definition: dbt_tas_base.F:822
subroutine, public dbt_tas_clear(matrix)
Clear matrix (erase all data)
Definition: dbt_tas_base.F:974
subroutine, public dbt_tas_put_block(matrix, row, col, block, summation)
As dbm_put_block.
Global data (distribution and block sizes) for tall-and-skinny matrices For very sparse matrices with...
communication routines to reshape / replicate / merge tall-and-skinny matrices.
subroutine, public dbt_tas_merge(matrix_out, matrix_in, summation, move_data)
Merge submatrices of matrix_in to matrix_out by sum.
subroutine, public dbt_tas_replicate(matrix_in, info, matrix_out, nodata, move_data)
Replicate matrix_in such that each submatrix of matrix_out is an exact copy of matrix_in.
recursive subroutine, public dbt_tas_reshape(matrix_in, matrix_out, summation, transposed, move_data)
copy data (involves reshape)
methods to split tall-and-skinny matrices along longest dimension. Basically, we are splitting proces...
Definition: dbt_tas_split.F:13
integer, parameter, public rowsplit
Definition: dbt_tas_split.F:50
subroutine, public dbt_tas_get_split_info(info, mp_comm, nsplit, igroup, mp_comm_group, split_rowcol, pgrid_offset)
Get info on split.
integer, parameter, public colsplit
Definition: dbt_tas_split.F:50
DBT tall-and-skinny base types. Mostly wrappers around existing DBM routines.
Definition: dbt_tas_types.F:13
often used utilities for tall-and-skinny matrices
Definition: dbt_tas_util.F:12
Defines the basic variable types.
Definition: kinds.F:23
integer, parameter, public int_8
Definition: kinds.F:54
integer, parameter, public dp
Definition: kinds.F:34
Interface to the message passing library MPI.