(git:f56c6e3)
Loading...
Searching...
No Matches
task_list_methods.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief generate the tasks lists used by collocate and integrate routines
10!> \par History
11!> 01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate
12!> \author Joost VandeVondele
13! **************************************************************************************************
21 USE cell_types, ONLY: cell_type, &
22 pbc
28 USE cp_dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
34 dbcsr_type, &
38 USE kinds, ONLY: default_string_length, &
39 dp, &
40 int_8
41 USE kpoint_types, ONLY: get_kpoint_info, &
44 USE message_passing, ONLY: &
48 USE pw_env_types, ONLY: pw_env_get, &
50 USE qs_kind_types, ONLY: get_qs_kind, &
52 USE qs_ks_types, ONLY: get_ks_env, &
68 USE util, ONLY: sort
69
70!$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
71!$ omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
72#include "./base/base_uses.f90"
73
74
75 IMPLICIT NONE
76
77 LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .false.
78
79 PRIVATE
80
81 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
82
83 PUBLIC :: generate_qs_task_list, &
85 PUBLIC :: distribute_tasks, &
91
92CONTAINS
93
94! **************************************************************************************************
95!> \brief ...
96!> \param ks_env ...
97!> \param task_list ...
98!> \param basis_type ...
99!> \param reorder_rs_grid_ranks Flag that indicates if this routine should
100!> or should not overwrite the rs descriptor (see comment below)
101!> \param skip_load_balance_distributed ...
102!> \param pw_env_external ...
103!> \param sab_orb_external ...
104!> \par History
105!> 01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
106!> 04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
107!> (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
108!> 06.2015 adjusted to be used with multiple images (k-points) [JGH]
109!> \note If this routine is called several times with different task lists,
110!> the default behaviour is to re-optimize the grid ranks and overwrite
111!> the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
112!> of performing a new optimization by leaving the rank order in
113!> its current state.
114! **************************************************************************************************
115 SUBROUTINE generate_qs_task_list(ks_env, task_list, basis_type, &
116 reorder_rs_grid_ranks, skip_load_balance_distributed, &
117 pw_env_external, sab_orb_external)
118
119 TYPE(qs_ks_env_type), POINTER :: ks_env
120 TYPE(task_list_type), POINTER :: task_list
121 CHARACTER(LEN=*), INTENT(IN) :: basis_type
122 LOGICAL, INTENT(IN) :: reorder_rs_grid_ranks, &
123 skip_load_balance_distributed
124 TYPE(pw_env_type), OPTIONAL, POINTER :: pw_env_external
125 TYPE(neighbor_list_set_p_type), DIMENSION(:), &
126 OPTIONAL, POINTER :: sab_orb_external
127
128 CHARACTER(LEN=*), PARAMETER :: routinen = 'generate_qs_task_list'
129 INTEGER, PARAMETER :: max_tasks = 2000
130
131 INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
132 ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
133 jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
134 INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: blocks
135 INTEGER, DIMENSION(3) :: cellind
136 INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
137 npgfb, nsgf
138 INTEGER, DIMENSION(:, :, :), POINTER :: cell_to_index
139 LOGICAL :: dokp
140 REAL(kind=dp) :: kind_radius_a, kind_radius_b
141 REAL(kind=dp), DIMENSION(3) :: ra, rab
142 REAL(kind=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
143 REAL(kind=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
144 TYPE(cell_type), POINTER :: cell
145 TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
146 TYPE(dft_control_type), POINTER :: dft_control
147 TYPE(gridlevel_info_type), POINTER :: gridlevel_info
148 TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER :: basis_set_list
149 TYPE(gto_basis_set_type), POINTER :: basis_set_a, basis_set_b, orb_basis_set
150 TYPE(kpoint_type), POINTER :: kpoints
151 TYPE(neighbor_list_set_p_type), DIMENSION(:), &
152 POINTER :: sab_orb
153 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
154 TYPE(pw_env_type), POINTER :: pw_env
155 TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
156 TYPE(qs_kind_type), POINTER :: qs_kind
157 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
158 POINTER :: rs_descs
159 TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
160 TYPE(task_type), DIMENSION(:), POINTER :: tasks
161
162 CALL timeset(routinen, handle)
163
164 CALL get_ks_env(ks_env, &
165 qs_kind_set=qs_kind_set, &
166 cell=cell, &
167 particle_set=particle_set, &
168 dft_control=dft_control)
169
170 CALL get_ks_env(ks_env, sab_orb=sab_orb)
171 IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
172
173 CALL get_ks_env(ks_env, pw_env=pw_env)
174 IF (PRESENT(pw_env_external)) pw_env => pw_env_external
175 CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
176
177 ! *** assign from pw_env
178 gridlevel_info => pw_env%gridlevel_info
179 cube_info => pw_env%cube_info
180
181 ! find maximum numbers
182 nkind = SIZE(qs_kind_set)
183 natoms = SIZE(particle_set)
184 maxset = 0
185 maxpgf = 0
186 DO ikind = 1, nkind
187 qs_kind => qs_kind_set(ikind)
188 CALL get_qs_kind(qs_kind=qs_kind, &
189 basis_set=orb_basis_set, basis_type=basis_type)
190
191 IF (.NOT. ASSOCIATED(orb_basis_set)) cycle
192 CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
193
194 maxset = max(nseta, maxset)
195 maxpgf = max(maxval(npgfa), maxpgf)
196 END DO
197
198 ! kpoint related
199 nimages = dft_control%nimages
200 IF (nimages > 1) THEN
201 dokp = .true.
202 NULLIFY (kpoints)
203 CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
204 CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
205 ELSE
206 dokp = .false.
207 NULLIFY (cell_to_index)
208 END IF
209
210 ! free the atom_pair lists if allocated
211 IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
212 IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
213
214 ! construct a list of all tasks
215 IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
216 CALL reallocate_tasks(task_list%tasks, max_tasks)
217 END IF
218 task_list%ntasks = 0
219 curr_tasks = SIZE(task_list%tasks)
220
221 ALLOCATE (basis_set_list(nkind))
222 DO ikind = 1, nkind
223 qs_kind => qs_kind_set(ikind)
224 CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
225 basis_type=basis_type)
226 IF (ASSOCIATED(basis_set_a)) THEN
227 basis_set_list(ikind)%gto_basis_set => basis_set_a
228 ELSE
229 NULLIFY (basis_set_list(ikind)%gto_basis_set)
230 END IF
231 END DO
232!!$OMP PARALLEL DEFAULT(NONE) &
233!!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info, &
234!!$OMP curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
235!!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
236!!$OMP la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
237!!$OMP lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
238!!$OMP cindex, slot)
239 ! Loop over neighbor list
240!!$OMP DO SCHEDULE(GUIDED)
241 DO slot = 1, sab_orb(1)%nl_size
242 ikind = sab_orb(1)%nlist_task(slot)%ikind
243 jkind = sab_orb(1)%nlist_task(slot)%jkind
244 iatom = sab_orb(1)%nlist_task(slot)%iatom
245 jatom = sab_orb(1)%nlist_task(slot)%jatom
246 rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
247 cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)
248
249 basis_set_a => basis_set_list(ikind)%gto_basis_set
250 IF (.NOT. ASSOCIATED(basis_set_a)) cycle
251 basis_set_b => basis_set_list(jkind)%gto_basis_set
252 IF (.NOT. ASSOCIATED(basis_set_b)) cycle
253 ra(:) = pbc(particle_set(iatom)%r, cell)
254 ! basis ikind
255 la_max => basis_set_a%lmax
256 la_min => basis_set_a%lmin
257 npgfa => basis_set_a%npgf
258 nseta = basis_set_a%nset
259 rpgfa => basis_set_a%pgf_radius
260 set_radius_a => basis_set_a%set_radius
261 kind_radius_a = basis_set_a%kind_radius
262 zeta => basis_set_a%zet
263 ! basis jkind
264 lb_max => basis_set_b%lmax
265 lb_min => basis_set_b%lmin
266 npgfb => basis_set_b%npgf
267 nsetb = basis_set_b%nset
268 rpgfb => basis_set_b%pgf_radius
269 set_radius_b => basis_set_b%set_radius
270 kind_radius_b = basis_set_b%kind_radius
271 zetb => basis_set_b%zet
272
273 IF (dokp) THEN
274 cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
275 ELSE
276 cindex = 1
277 END IF
278
279 CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
280 rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
281 iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
282 set_radius_a, set_radius_b, ra, rab, &
283 la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
284
285 END DO
286!!$OMP END PARALLEL
287
288 ! redistribute the task list so that all tasks map on the local rs grids
289 CALL distribute_tasks( &
290 rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
291 tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
292 atom_pair_recv=task_list%atom_pair_recv, symmetric=.true., &
293 reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
294 skip_load_balance_distributed=skip_load_balance_distributed)
295
296 ! compute offsets for rs_scatter_matrix / rs_copy_matrix
297 ALLOCATE (nsgf(natoms))
298 CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
299 IF (ASSOCIATED(task_list%atom_pair_send)) THEN
300 ! only needed when there is a distributed grid
301 CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
302 nsgf=nsgf, &
303 group_size=rs_descs(1)%rs_desc%group_size, &
304 pair_offsets=task_list%pair_offsets_send, &
305 rank_offsets=task_list%rank_offsets_send, &
306 rank_sizes=task_list%rank_sizes_send, &
307 buffer_size=task_list%buffer_size_send)
308 END IF
309 CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
310 nsgf=nsgf, &
311 group_size=rs_descs(1)%rs_desc%group_size, &
312 pair_offsets=task_list%pair_offsets_recv, &
313 rank_offsets=task_list%rank_offsets_recv, &
314 rank_sizes=task_list%rank_sizes_recv, &
315 buffer_size=task_list%buffer_size_recv)
316 DEALLOCATE (basis_set_list, nsgf)
317
318 ! If the rank order has changed, reallocate any of the distributed rs_grids
319 IF (reorder_rs_grid_ranks) THEN
320 DO i = 1, gridlevel_info%ngrid_levels
321 IF (rs_descs(i)%rs_desc%distributed) THEN
322 CALL rs_grid_release(rs_grids(i))
323 CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
324 END IF
325 END DO
326 END IF
327
328 CALL create_grid_task_list(task_list=task_list, &
329 qs_kind_set=qs_kind_set, &
330 particle_set=particle_set, &
331 cell=cell, &
332 basis_type=basis_type, &
333 rs_grids=rs_grids)
334
335 ! Now we have the final list of tasks, setup the task_list with the
336 ! data needed for the loops in integrate_v/calculate_rho
337
338 IF (ASSOCIATED(task_list%taskstart)) THEN
339 DEALLOCATE (task_list%taskstart)
340 END IF
341 IF (ASSOCIATED(task_list%taskstop)) THEN
342 DEALLOCATE (task_list%taskstop)
343 END IF
344 IF (ASSOCIATED(task_list%npairs)) THEN
345 DEALLOCATE (task_list%npairs)
346 END IF
347
348 ! First, count the number of unique atom pairs per grid level
349
350 ALLOCATE (task_list%npairs(SIZE(rs_descs)))
351
352 iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
353 ipair = 0
354 task_list%npairs = 0
355
356 DO i = 1, task_list%ntasks
357 igrid_level = task_list%tasks(i)%grid_level
358 img = task_list%tasks(i)%image
359 iatom = task_list%tasks(i)%iatom
360 jatom = task_list%tasks(i)%jatom
361 iset = task_list%tasks(i)%iset
362 jset = task_list%tasks(i)%jset
363 ipgf = task_list%tasks(i)%ipgf
364 jpgf = task_list%tasks(i)%jpgf
365 IF (igrid_level /= igrid_level_old) THEN
366 IF (igrid_level_old /= -1) THEN
367 task_list%npairs(igrid_level_old) = ipair
368 END IF
369 ipair = 1
370 igrid_level_old = igrid_level
371 iatom_old = iatom
372 jatom_old = jatom
373 img_old = img
374 ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
375 ipair = ipair + 1
376 iatom_old = iatom
377 jatom_old = jatom
378 img_old = img
379 END IF
380 END DO
381 ! Take care of the last iteration
382 IF (task_list%ntasks /= 0) THEN
383 task_list%npairs(igrid_level) = ipair
384 END IF
385
386 ! Second, for each atom pair, find the indices in the task list
387 ! of the first and last task
388
389 ! Array sized for worst case
390 ALLOCATE (task_list%taskstart(maxval(task_list%npairs), SIZE(rs_descs)))
391 ALLOCATE (task_list%taskstop(maxval(task_list%npairs), SIZE(rs_descs)))
392
393 iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
394 ipair = 0
395 task_list%taskstart = 0
396 task_list%taskstop = 0
397
398 DO i = 1, task_list%ntasks
399 igrid_level = task_list%tasks(i)%grid_level
400 img = task_list%tasks(i)%image
401 iatom = task_list%tasks(i)%iatom
402 jatom = task_list%tasks(i)%jatom
403 iset = task_list%tasks(i)%iset
404 jset = task_list%tasks(i)%jset
405 ipgf = task_list%tasks(i)%ipgf
406 jpgf = task_list%tasks(i)%jpgf
407 IF (igrid_level /= igrid_level_old) THEN
408 IF (igrid_level_old /= -1) THEN
409 task_list%taskstop(ipair, igrid_level_old) = i - 1
410 END IF
411 ipair = 1
412 task_list%taskstart(ipair, igrid_level) = i
413 igrid_level_old = igrid_level
414 iatom_old = iatom
415 jatom_old = jatom
416 img_old = img
417 ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
418 ipair = ipair + 1
419 task_list%taskstart(ipair, igrid_level) = i
420 task_list%taskstop(ipair - 1, igrid_level) = i - 1
421 iatom_old = iatom
422 jatom_old = jatom
423 img_old = img
424 END IF
425 END DO
426 ! Take care of the last iteration
427 IF (task_list%ntasks /= 0) THEN
428 task_list%taskstop(ipair, igrid_level) = task_list%ntasks
429 END IF
430
431 ! Debug task destribution
432 IF (debug_this_module) THEN
433 tasks => task_list%tasks
434 WRITE (6, *)
435 WRITE (6, *) "Total number of tasks ", task_list%ntasks
436 DO igrid_level = 1, gridlevel_info%ngrid_levels
437 WRITE (6, *) "Total number of pairs(grid_level) ", &
438 igrid_level, task_list%npairs(igrid_level)
439 END DO
440 WRITE (6, *)
441
442 DO igrid_level = 1, gridlevel_info%ngrid_levels
443
444 ALLOCATE (blocks(natoms, natoms, nimages))
445 blocks = -1
446 DO ipair = 1, task_list%npairs(igrid_level)
447 itask = task_list%taskstart(ipair, igrid_level)
448 ilevel = task_list%tasks(itask)%grid_level
449 img = task_list%tasks(itask)%image
450 iatom = task_list%tasks(itask)%iatom
451 jatom = task_list%tasks(itask)%jatom
452 iset = task_list%tasks(itask)%iset
453 jset = task_list%tasks(itask)%jset
454 ipgf = task_list%tasks(itask)%ipgf
455 jpgf = task_list%tasks(itask)%jpgf
456 IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
457 blocks(iatom, jatom, img) = 1
458 blocks(jatom, iatom, img) = 1
459 ELSE
460 WRITE (6, *) "TASK LIST CONFLICT IN PAIR ", ipair
461 WRITE (6, *) "Reuse of iatom, jatom, image ", iatom, jatom, img
462 END IF
463
464 iatom_old = iatom
465 jatom_old = jatom
466 img_old = img
467 DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
468 ilevel = task_list%tasks(itask)%grid_level
469 img = task_list%tasks(itask)%image
470 iatom = task_list%tasks(itask)%iatom
471 jatom = task_list%tasks(itask)%jatom
472 iset = task_list%tasks(itask)%iset
473 jset = task_list%tasks(itask)%jset
474 ipgf = task_list%tasks(itask)%ipgf
475 jpgf = task_list%tasks(itask)%jpgf
476 IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
477 WRITE (6, *) "TASK LIST CONFLICT IN TASK ", itask
478 WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
479 WRITE (6, *) "Should be iatom, jatom, image ", iatom_old, jatom_old, img_old
480 END IF
481
482 END DO
483 END DO
484 DEALLOCATE (blocks)
485
486 END DO
487
488 END IF
489
490 CALL timestop(handle)
491
492 END SUBROUTINE generate_qs_task_list
493
494! **************************************************************************************************
495!> \brief Sends the task list data to the grid API.
496!> \author Ole Schuett
497! **************************************************************************************************
498 SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
499 TYPE(task_list_type), POINTER :: task_list
500 TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
501 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
502 TYPE(cell_type), POINTER :: cell
503 CHARACTER(LEN=*) :: basis_type
504 TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
505
506 TYPE(gto_basis_set_type), POINTER :: orb_basis_set
507 INTEGER :: nset, natoms, nkinds, ntasks, &
508 ikind, iatom, itask, nsgf
509 INTEGER, DIMENSION(:), ALLOCATABLE :: atom_kinds, level_list, iatom_list, jatom_list, &
510 iset_list, jset_list, ipgf_list, jpgf_list, &
511 border_mask_list, block_num_list
512 REAL(kind=dp), DIMENSION(:), ALLOCATABLE :: radius_list
513 REAL(kind=dp), DIMENSION(:, :), ALLOCATABLE :: rab_list, atom_positions
514 TYPE(task_type), DIMENSION(:), POINTER :: tasks
515 INTEGER, DIMENSION(:, :), POINTER :: first_sgf
516 REAL(kind=dp), DIMENSION(:, :), POINTER :: sphi, zet
517 INTEGER, DIMENSION(:), POINTER :: lmax, lmin, npgf, nsgf_set
518
519 nkinds = SIZE(qs_kind_set)
520 natoms = SIZE(particle_set)
521 ntasks = task_list%ntasks
522 tasks => task_list%tasks
523
524 IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
525 ! Basis sets do not change during simulation - only need to create them once.
526 ALLOCATE (task_list%grid_basis_sets(nkinds))
527 DO ikind = 1, nkinds
528 CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
529 CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
530 nset=nset, &
531 nsgf=nsgf, &
532 nsgf_set=nsgf_set, &
533 npgf=npgf, &
534 first_sgf=first_sgf, &
535 lmax=lmax, &
536 lmin=lmin, &
537 sphi=sphi, &
538 zet=zet)
539 CALL grid_create_basis_set(nset=nset, &
540 nsgf=nsgf, &
541 maxco=SIZE(sphi, 1), &
542 maxpgf=SIZE(zet, 1), &
543 lmin=lmin, &
544 lmax=lmax, &
545 npgf=npgf, &
546 nsgf_set=nsgf_set, &
547 first_sgf=first_sgf, &
548 sphi=sphi, &
549 zet=zet, &
550 basis_set=task_list%grid_basis_sets(ikind))
551 END DO
552 END IF
553
554 ! Pack task list infos
555 ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
556 DO iatom = 1, natoms
557 atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
558 atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
559 END DO
560
561 ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
562 ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
563 ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
564 ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))
565
566 DO itask = 1, ntasks
567 level_list(itask) = tasks(itask)%grid_level
568 iatom_list(itask) = tasks(itask)%iatom
569 jatom_list(itask) = tasks(itask)%jatom
570 iset_list(itask) = tasks(itask)%iset
571 jset_list(itask) = tasks(itask)%jset
572 ipgf_list(itask) = tasks(itask)%ipgf
573 jpgf_list(itask) = tasks(itask)%jpgf
574 IF (tasks(itask)%dist_type == 2) THEN
575 border_mask_list(itask) = iand(63, not(tasks(itask)%subpatch_pattern)) ! invert last 6 bits
576 ELSE
577 border_mask_list(itask) = 0 ! no masking
578 END IF
579 block_num_list(itask) = tasks(itask)%pair_index ! change of nomenclature pair_index -> block_num
580 radius_list(itask) = tasks(itask)%radius
581 rab_list(:, itask) = tasks(itask)%rab(:)
582 END DO
583
584 CALL grid_create_task_list(ntasks=ntasks, &
585 natoms=natoms, &
586 nkinds=nkinds, &
587 nblocks=SIZE(task_list%pair_offsets_recv), &
588 block_offsets=task_list%pair_offsets_recv, &
589 atom_positions=atom_positions, &
590 atom_kinds=atom_kinds, &
591 basis_sets=task_list%grid_basis_sets, &
592 level_list=level_list, &
593 iatom_list=iatom_list, &
594 jatom_list=jatom_list, &
595 iset_list=iset_list, &
596 jset_list=jset_list, &
597 ipgf_list=ipgf_list, &
598 jpgf_list=jpgf_list, &
599 border_mask_list=border_mask_list, &
600 block_num_list=block_num_list, &
601 radius_list=radius_list, &
602 rab_list=rab_list, &
603 rs_grids=rs_grids, &
604 task_list=task_list%grid_task_list)
605
606 CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
607 CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)
608
609 END SUBROUTINE create_grid_task_list
610
611! **************************************************************************************************
612!> \brief ...
613!> \param tasks ...
614!> \param ntasks ...
615!> \param curr_tasks ...
616!> \param rs_descs ...
617!> \param dft_control ...
618!> \param cube_info ...
619!> \param gridlevel_info ...
620!> \param cindex ...
621!> \param iatom ...
622!> \param jatom ...
623!> \param rpgfa ...
624!> \param rpgfb ...
625!> \param zeta ...
626!> \param zetb ...
627!> \param kind_radius_b ...
628!> \param set_radius_a ...
629!> \param set_radius_b ...
630!> \param ra ...
631!> \param rab ...
632!> \param la_max ...
633!> \param la_min ...
634!> \param lb_max ...
635!> \param lb_min ...
636!> \param npgfa ...
637!> \param npgfb ...
638!> \param nseta ...
639!> \param nsetb ...
640!> \par History
641!> Joost VandeVondele: 10.2008 refactored
642! **************************************************************************************************
643 SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
644 cube_info, gridlevel_info, cindex, &
645 iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
646 la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
647
648 TYPE(task_type), DIMENSION(:), POINTER :: tasks
649 INTEGER :: ntasks, curr_tasks
650 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
651 POINTER :: rs_descs
652 TYPE(dft_control_type), POINTER :: dft_control
653 TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
654 TYPE(gridlevel_info_type), POINTER :: gridlevel_info
655 INTEGER :: cindex, iatom, jatom
656 REAL(kind=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
657 REAL(kind=dp) :: kind_radius_b
658 REAL(kind=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
659 REAL(kind=dp), DIMENSION(3) :: ra, rab
660 INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
661 npgfb
662 INTEGER :: nseta, nsetb
663
664 INTEGER :: cube_center(3), igrid_level, ipgf, iset, &
665 jpgf, jset, lb_cube(3), ub_cube(3)
666 REAL(kind=dp) :: dab, rab2, radius, zetp
667
668 rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
669 dab = sqrt(rab2)
670
671 loop_iset: DO iset = 1, nseta
672
673 IF (set_radius_a(iset) + kind_radius_b < dab) cycle loop_iset
674
675 loop_jset: DO jset = 1, nsetb
676
677 IF (set_radius_a(iset) + set_radius_b(jset) < dab) cycle loop_jset
678
679 loop_ipgf: DO ipgf = 1, npgfa(iset)
680
681 IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) cycle loop_ipgf
682
683 loop_jpgf: DO jpgf = 1, npgfb(jset)
684
685 IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) cycle loop_jpgf
686
687 zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
688 igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
689
690 CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
691 rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
692 la_max(iset), zeta(ipgf, iset), la_min(iset), &
693 lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
694 ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
695
696 CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
697 rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
698 la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
699 igrid_level, gridlevel_info%ngrid_levels, cube_center, &
700 lb_cube, ub_cube, radius)
701
702 END DO loop_jpgf
703
704 END DO loop_ipgf
705
706 END DO loop_jset
707
708 END DO loop_iset
709
710 END SUBROUTINE task_list_inner_loop
711
712! **************************************************************************************************
713!> \brief combines the calculation of several basic properties of a given pgf:
714!> its center, the bounding cube, the radius, the cost,
715!> tries to predict the time needed for processing this task
716!> in this way an improved load balance might be obtained
717!> \param cube_center ...
718!> \param lb_cube ...
719!> \param ub_cube ...
720!> \param radius ...
721!> \param rs_desc ...
722!> \param cube_info ...
723!> \param la_max ...
724!> \param zeta ...
725!> \param la_min ...
726!> \param lb_max ...
727!> \param zetb ...
728!> \param lb_min ...
729!> \param ra ...
730!> \param rab ...
731!> \param rab2 ...
732!> \param eps ...
733!> \par History
734!> 10.2008 refactored [Joost VandeVondele]
735!> \note
736!> -) this requires the radius to be computed in the same way as
737!> collocate_pgf_product, we should factor that part into a subroutine
738!> -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
739!> this is more or less true for map_consistent
740!> -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
741!> the same, this could lead to a small speedup
742!> -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
743!> fitting the measured data on an opteron/g95 using the expression
744!> a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
745! **************************************************************************************************
746 SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
747 rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
748
749 INTEGER, DIMENSION(3), INTENT(OUT) :: cube_center, lb_cube, ub_cube
750 REAL(kind=dp), INTENT(OUT) :: radius
751 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
752 TYPE(cube_info_type), INTENT(IN) :: cube_info
753 INTEGER, INTENT(IN) :: la_max
754 REAL(kind=dp), INTENT(IN) :: zeta
755 INTEGER, INTENT(IN) :: la_min, lb_max
756 REAL(kind=dp), INTENT(IN) :: zetb
757 INTEGER, INTENT(IN) :: lb_min
758 REAL(kind=dp), INTENT(IN) :: ra(3), rab(3), rab2, eps
759
760 INTEGER :: extent(3)
761 INTEGER, DIMENSION(:), POINTER :: sphere_bounds
762 REAL(kind=dp) :: cutoff, f, prefactor, rb(3), zetp
763 REAL(kind=dp), DIMENSION(3) :: rp
764
765! the radius for this task
766
767 zetp = zeta + zetb
768 rp(:) = ra(:) + zetb/zetp*rab(:)
769 rb(:) = ra(:) + rab(:)
770 cutoff = 1.0_dp
771 f = zetb/zetp
772 prefactor = exp(-zeta*f*rab2)
773 radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
774 zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
775
776 CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
777 ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
778 cube_center(:) = modulo(cube_center(:), rs_desc%npts(:))
779 cube_center(:) = cube_center(:) + rs_desc%lb(:)
780
781 IF (rs_desc%orthorhombic) THEN
782 CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
783 ELSE
784 CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
785 !? unclear if extent is computed correctly.
786 extent(:) = ub_cube(:) - lb_cube(:)
787 lb_cube(:) = -extent(:)/2 - 1
788 ub_cube(:) = extent(:)/2
789 END IF
790
791 END SUBROUTINE compute_pgf_properties
792! **************************************************************************************************
793!> \brief predicts the cost of a task in kcycles for a given task
794!> the model is based on a fit of actual data, and might need updating
795!> as collocate_pgf_product changes (or CPUs/compilers change)
796!> maybe some dynamic approach, improving the cost model on the fly could
797!> work as well
798!> the cost model does not yet take into account the fraction of space
799!> that is mapped locally for a given cube and rs_grid (generalised tasks)
800!> \param lb_cube ...
801!> \param ub_cube ...
802!> \param fraction ...
803!> \param lmax ...
804!> \param is_ortho ...
805!> \return ...
806! **************************************************************************************************
807 INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
808 INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
809 REAL(kind=dp), INTENT(IN) :: fraction
810 INTEGER :: lmax
811 LOGICAL :: is_ortho
812
813 INTEGER :: cmax
814 REAL(kind=dp) :: v1, v2, v3, v4, v5
815
816 cmax = maxval(((ub_cube - lb_cube) + 1)/2)
817
818 IF (is_ortho) THEN
819 v1 = 1.504760e+00_dp
820 v2 = 3.126770e+00_dp
821 v3 = 5.074106e+00_dp
822 v4 = 1.091568e+00_dp
823 v5 = 1.070187e+00_dp
824 ELSE
825 v1 = 7.831105e+00_dp
826 v2 = 2.675174e+00_dp
827 v3 = 7.546553e+00_dp
828 v4 = 6.122446e-01_dp
829 v5 = 3.886382e+00_dp
830 END IF
831 cost_model = ceiling(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
832
833 END FUNCTION cost_model
834! **************************************************************************************************
835!> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
836!> this determines by which CPUs a given pgf gets collocated
837!> the format of the task array is as follows
838!> tasks(1,i) := destination
839!> tasks(2,i) := source
840!> tasks(3,i) := compressed type (iatom, jatom, ....)
841!> tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
842!> tasks(5,i) := cost
843!> tasks(6,i) := alternate destination code (0 if none available)
844!>
845!> \param tasks ...
846!> \param ntasks ...
847!> \param curr_tasks ...
848!> \param rab ...
849!> \param cindex ...
850!> \param iatom ...
851!> \param jatom ...
852!> \param iset ...
853!> \param jset ...
854!> \param ipgf ...
855!> \param jpgf ...
856!> \param la_max ...
857!> \param lb_max ...
858!> \param rs_desc ...
859!> \param igrid_level ...
860!> \param n_levels ...
861!> \param cube_center ...
862!> \param lb_cube ...
863!> \param ub_cube ...
864!> \par History
865!> 10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
866! **************************************************************************************************
867 SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
868 rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
869 la_max, lb_max, rs_desc, igrid_level, n_levels, &
870 cube_center, lb_cube, ub_cube, radius)
871
872 TYPE(task_type), DIMENSION(:), POINTER :: tasks
873 INTEGER, INTENT(INOUT) :: ntasks, curr_tasks
874 REAL(kind=dp), DIMENSION(3), INTENT(IN) :: rab
875 INTEGER, INTENT(IN) :: cindex, iatom, jatom, iset, jset, ipgf, &
876 jpgf, la_max, lb_max
877 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
878 INTEGER, INTENT(IN) :: igrid_level, n_levels
879 INTEGER, DIMENSION(3), INTENT(IN) :: cube_center, lb_cube, ub_cube
880 REAL(kind=dp), INTENT(IN) :: radius
881
882 INTEGER, PARAMETER :: add_tasks = 1000
883 REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
884
885 INTEGER :: added_tasks, cost, j, lmax
886 LOGICAL :: is_ortho
887 REAL(kind=dp) :: tfraction
888
889!$OMP SINGLE
890 ntasks = ntasks + 1
891 IF (ntasks > curr_tasks) THEN
892 curr_tasks = int((curr_tasks + add_tasks)*mult_tasks)
893 CALL reallocate_tasks(tasks, curr_tasks)
894 END IF
895!$OMP END SINGLE
896
897 IF (rs_desc%distributed) THEN
898
899 ! finds the node(s) that need to process this task
900 ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
901 CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
902 ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
903
904 ELSE
905 tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
906 tasks(ntasks)%dist_type = 0
907 tasks(ntasks)%subpatch_pattern = 0
908 added_tasks = 1
909 END IF
910
911 lmax = la_max + lb_max
912 is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
913 ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
914 ! this could be refined in the future
915 tfraction = 1.0_dp/added_tasks
916
917 cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
918
919 DO j = 1, added_tasks
920 tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
921 tasks(ntasks - added_tasks + j)%cost = cost
922 tasks(ntasks - added_tasks + j)%grid_level = igrid_level
923 tasks(ntasks - added_tasks + j)%image = cindex
924 tasks(ntasks - added_tasks + j)%iatom = iatom
925 tasks(ntasks - added_tasks + j)%jatom = jatom
926 tasks(ntasks - added_tasks + j)%iset = iset
927 tasks(ntasks - added_tasks + j)%jset = jset
928 tasks(ntasks - added_tasks + j)%ipgf = ipgf
929 tasks(ntasks - added_tasks + j)%jpgf = jpgf
930 tasks(ntasks - added_tasks + j)%rab = rab
931 tasks(ntasks - added_tasks + j)%radius = radius
932 END DO
933
934 END SUBROUTINE pgf_to_tasks
935
936! **************************************************************************************************
937!> \brief performs load balancing of the tasks on the distributed grids
938!> \param tasks ...
939!> \param ntasks ...
940!> \param rs_descs ...
941!> \param grid_level ...
942!> \param natoms ...
943!> \par History
944!> created 2008-10-03 [Joost VandeVondele]
945! **************************************************************************************************
946 SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
947
948 TYPE(task_type), DIMENSION(:), POINTER :: tasks
949 INTEGER :: ntasks
950 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
951 POINTER :: rs_descs
952 INTEGER :: grid_level, natoms
953
954 CHARACTER(LEN=*), PARAMETER :: routinen = 'load_balance_distributed'
955
956 INTEGER :: handle
957 INTEGER, DIMENSION(:, :, :), POINTER :: list
958
959 CALL timeset(routinen, handle)
960
961 NULLIFY (list)
962 ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
963 ! if a destination would not be in this list, it is a bug
964 CALL create_destination_list(list, rs_descs, grid_level)
965
966 ! now, walk over the tasks, filling in the loads of each destination
967 CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.true.)
968
969 ! optimize loads & fluxes
970 CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
971
972 ! now, walk over the tasks, using the list to set the destinations
973 CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.false.)
974
975 DEALLOCATE (list)
976
977 CALL timestop(handle)
978
979 END SUBROUTINE load_balance_distributed
980
981! **************************************************************************************************
982!> \brief this serial routine adjusts the fluxes in the global list
983!>
984!> \param list_global ...
985!> \par History
986!> created 2008-10-06 [Joost VandeVondele]
987! **************************************************************************************************
988 SUBROUTINE balance_global_list(list_global)
989 INTEGER, DIMENSION(:, :, 0:) :: list_global
990
991 CHARACTER(LEN=*), PARAMETER :: routinen = 'balance_global_list'
992 INTEGER, PARAMETER :: max_iter = 100
993 REAL(kind=dp), PARAMETER :: tolerance_factor = 0.005_dp
994
995 INTEGER :: dest, handle, icpu, idest, iflux, &
996 ilocal, k, maxdest, ncpu, nflux
997 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: flux_connections
998 LOGICAL :: solution_optimal
999 REAL(kind=dp) :: average, load_shift, max_load_shift, &
1000 tolerance
1001 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: load, optimized_flux, optimized_load
1002 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: flux_limits
1003
1004 CALL timeset(routinen, handle)
1005
1006 ncpu = SIZE(list_global, 3)
1007 maxdest = SIZE(list_global, 2)
1008 ALLOCATE (load(0:ncpu - 1))
1009 load = 0.0_dp
1010 ALLOCATE (optimized_load(0:ncpu - 1))
1011
1012 ! figure out the number of fluxes
1013 ! we assume that the global_list is symmetric
1014 nflux = 0
1015 DO icpu = 0, ncpu - 1
1016 DO idest = 1, maxdest
1017 dest = list_global(1, idest, icpu)
1018 IF (dest < ncpu .AND. dest > icpu) nflux = nflux + 1
1019 END DO
1020 END DO
1021 ALLOCATE (optimized_flux(nflux))
1022 ALLOCATE (flux_limits(2, nflux))
1023 ALLOCATE (flux_connections(2, nflux))
1024
1025 ! reorder data
1026 flux_limits = 0
1027 nflux = 0
1028 DO icpu = 0, ncpu - 1
1029 load(icpu) = sum(list_global(2, :, icpu))
1030 DO idest = 1, maxdest
1031 dest = list_global(1, idest, icpu)
1032 IF (dest < ncpu) THEN
1033 IF (dest /= icpu) THEN
1034 IF (dest > icpu) THEN
1035 nflux = nflux + 1
1036 flux_limits(2, nflux) = list_global(2, idest, icpu)
1037 flux_connections(1, nflux) = icpu
1038 flux_connections(2, nflux) = dest
1039 ELSE
1040 DO iflux = 1, nflux
1041 IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1042 flux_limits(1, iflux) = -list_global(2, idest, icpu)
1043 EXIT
1044 END IF
1045 END DO
1046 END IF
1047 END IF
1048 END IF
1049 END DO
1050 END DO
1051
1052 solution_optimal = .false.
1053 optimized_flux = 0.0_dp
1054
1055 ! an iterative solver, if iterated till convergence the maximum load is minimal
1056 ! we terminate before things are fully converged, since this does show up in the timings
1057 ! once the largest shift becomes less than a small fraction of the average load, we're done
1058 ! we're perfectly happy if the load balance is within 1 percent or so
1059 ! the maximum load normally converges even faster
1060 average = sum(load)/SIZE(load)
1061 tolerance = tolerance_factor*average
1062
1063 optimized_load(:) = load
1064 DO k = 1, max_iter
1065 max_load_shift = 0.0_dp
1066 DO iflux = 1, nflux
1067 load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
1068 load_shift = max(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
1069 load_shift = min(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
1070 max_load_shift = max(abs(load_shift), max_load_shift)
1071 optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
1072 optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
1073 optimized_flux(iflux) = optimized_flux(iflux) + load_shift
1074 END DO
1075 IF (max_load_shift < tolerance) THEN
1076 solution_optimal = .true.
1077 EXIT
1078 END IF
1079 END DO
1080
1081 ! now adjust the load list to reflect the optimized fluxes
1082 ! reorder data
1083 nflux = 0
1084 DO icpu = 0, ncpu - 1
1085 DO idest = 1, maxdest
1086 IF (list_global(1, idest, icpu) == icpu) ilocal = idest
1087 END DO
1088 DO idest = 1, maxdest
1089 dest = list_global(1, idest, icpu)
1090 IF (dest < ncpu) THEN
1091 IF (dest /= icpu) THEN
1092 IF (dest > icpu) THEN
1093 nflux = nflux + 1
1094 IF (optimized_flux(nflux) > 0) THEN
1095 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1096 list_global(2, idest, icpu) - nint(optimized_flux(nflux))
1097 list_global(2, idest, icpu) = nint(optimized_flux(nflux))
1098 ELSE
1099 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1100 list_global(2, idest, icpu)
1101 list_global(2, idest, icpu) = 0
1102 END IF
1103 ELSE
1104 DO iflux = 1, nflux
1105 IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1106 IF (optimized_flux(iflux) > 0) THEN
1107 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1108 list_global(2, idest, icpu)
1109 list_global(2, idest, icpu) = 0
1110 ELSE
1111 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1112 list_global(2, idest, icpu) + nint(optimized_flux(iflux))
1113 list_global(2, idest, icpu) = -nint(optimized_flux(iflux))
1114 END IF
1115 EXIT
1116 END IF
1117 END DO
1118 END IF
1119 END IF
1120 END IF
1121 END DO
1122 END DO
1123
1124 CALL timestop(handle)
1125
1126 END SUBROUTINE balance_global_list
1127
1128! **************************************************************************************************
1129!> \brief this routine gets back optimized loads for all destinations
1130!>
1131!> \param list ...
1132!> \param group ...
1133!> \param my_pos ...
1134!> \par History
1135!> created 2008-10-06 [Joost VandeVondele]
1136!> Modified 2016-01 [EPCC] Reduce memory requirements on P processes
1137!> from O(P^2) to O(P)
1138! **************************************************************************************************
1139 SUBROUTINE optimize_load_list(list, group, my_pos)
1140 INTEGER, DIMENSION(:, :, 0:) :: list
1141 TYPE(mp_comm_type), INTENT(IN) :: group
1142 INTEGER, INTENT(IN) :: my_pos
1143
1144 CHARACTER(LEN=*), PARAMETER :: routinen = 'optimize_load_list'
1145 INTEGER, PARAMETER :: rank_of_root = 0
1146
1147 INTEGER :: handle, icpu, idest, maxdest, ncpu
1148 INTEGER, ALLOCATABLE, DIMENSION(:) :: load_all
1149 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: load_partial
1150 INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: list_global
1151
1152 CALL timeset(routinen, handle)
1153
1154 ncpu = SIZE(list, 3)
1155 maxdest = SIZE(list, 2)
1156
1157 !find total workload ...
1158 ALLOCATE (load_all(maxdest*ncpu))
1159 load_all(:) = reshape(list(2, :, :), [maxdest*ncpu])
1160 CALL group%sum(load_all(:), rank_of_root)
1161
1162 ! ... and optimise the work per process
1163 ALLOCATE (list_global(2, maxdest, ncpu))
1164 IF (rank_of_root == my_pos) THEN
1165 list_global(1, :, :) = list(1, :, :)
1166 list_global(2, :, :) = reshape(load_all, [maxdest, ncpu])
1167 CALL balance_global_list(list_global)
1168 END IF
1169 CALL group%bcast(list_global, rank_of_root)
1170
1171 !figure out how much can be sent to other processes
1172 ALLOCATE (load_partial(maxdest, ncpu))
1173 ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
1174 CALL group%sum_partial(reshape(load_all, [maxdest, ncpu]), load_partial(:, :))
1175
1176 DO icpu = 1, ncpu
1177 DO idest = 1, maxdest
1178
1179 !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
1180 IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
1181 IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
1182 list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
1183 - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
1184 ELSE
1185 list(2, idest, icpu - 1) = 0
1186 END IF
1187 END IF
1188
1189 END DO
1190 END DO
1191
1192 !clean up before leaving
1193 DEALLOCATE (load_all)
1194 DEALLOCATE (list_global)
1195 DEALLOCATE (load_partial)
1196
1197 CALL timestop(handle)
1198 END SUBROUTINE optimize_load_list
1199
1200! **************************************************************************************************
1201!> \brief fill the load list with values derived from the tasks array
1202!> from the alternate locations, we select the alternate location that
1203!> can be used without increasing the number of matrix blocks needed to
1204!> distribute.
1205!> Replicated tasks are not yet considered
1206!>
1207!> \param list ...
1208!> \param rs_descs ...
1209!> \param grid_level ...
1210!> \param tasks ...
1211!> \param ntasks ...
1212!> \param natoms ...
1213!> \param create_list ...
1214!> \par History
1215!> created 2008-10-06 [Joost VandeVondele]
1216! **************************************************************************************************
1217 SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
1218 INTEGER, DIMENSION(:, :, 0:) :: list
1219 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1220 POINTER :: rs_descs
1221 INTEGER :: grid_level
1222 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1223 INTEGER :: ntasks, natoms
1224 LOGICAL :: create_list
1225
1226 CHARACTER(LEN=*), PARAMETER :: routinen = 'compute_load_list'
1227
1228 INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
1229 itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
1230 rank
1231 INTEGER(KIND=int_8) :: bit_pattern, ipair, ipair_old, natom8
1232 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: loads
1233 INTEGER, ALLOCATABLE, DIMENSION(:) :: all_dests, index
1234 INTEGER, DIMENSION(6) :: options
1235
1236 CALL timeset(routinen, handle)
1237
1238 ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
1239 CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.false.)
1240
1241 maxdest = SIZE(list, 2)
1242 ncpu = SIZE(list, 3)
1243 natom8 = natoms
1244
1245 ! first find the tasks that deal with the same atom pair
1246 itask_stop = 0
1247 ipair_old = huge(ipair_old)
1248 img_old = -1
1249 ALLOCATE (all_dests(0))
1250 ALLOCATE (index(0))
1251
1252 DO
1253
1254 ! first find the range of tasks that deal with the same atom pair
1255 itask_start = itask_stop + 1
1256 itask_stop = itask_start
1257 IF (itask_stop > ntasks) EXIT
1258 ilevel = tasks(itask_stop)%grid_level
1259 img_old = tasks(itask_stop)%image
1260 iatom = tasks(itask_stop)%iatom
1261 jatom = tasks(itask_stop)%jatom
1262 iset = tasks(itask_stop)%iset
1263 jset = tasks(itask_stop)%jset
1264 ipgf = tasks(itask_stop)%ipgf
1265 jpgf = tasks(itask_stop)%jpgf
1266
1267 ipair_old = (iatom - 1)*natom8 + (jatom - 1)
1268 DO
1269 IF (itask_stop + 1 > ntasks) EXIT
1270 ilevel = tasks(itask_stop + 1)%grid_level
1271 img = tasks(itask_stop + 1)%image
1272 iatom = tasks(itask_stop + 1)%iatom
1273 jatom = tasks(itask_stop + 1)%jatom
1274 iset = tasks(itask_stop + 1)%iset
1275 jset = tasks(itask_stop + 1)%jset
1276 ipgf = tasks(itask_stop + 1)%ipgf
1277 jpgf = tasks(itask_stop + 1)%jpgf
1278
1279 ipair = (iatom - 1)*natom8 + (jatom - 1)
1280 IF (ipair == ipair_old .AND. img == img_old) THEN
1281 itask_stop = itask_stop + 1
1282 ELSE
1283 EXIT
1284 END IF
1285 END DO
1286 ipair = ipair_old
1287 nshort = itask_stop - itask_start + 1
1288
1289 ! find the unique list of destinations on this grid level only
1290 DEALLOCATE (all_dests)
1291 ALLOCATE (all_dests(nshort))
1292 DEALLOCATE (index)
1293 ALLOCATE (index(nshort))
1294 DO i = 1, nshort
1295 ilevel = tasks(itask_start + i - 1)%grid_level
1296 img = tasks(itask_start + i - 1)%image
1297 iatom = tasks(itask_start + i - 1)%iatom
1298 jatom = tasks(itask_start + i - 1)%jatom
1299 iset = tasks(itask_start + i - 1)%iset
1300 jset = tasks(itask_start + i - 1)%jset
1301 ipgf = tasks(itask_start + i - 1)%ipgf
1302 jpgf = tasks(itask_start + i - 1)%jpgf
1303
1304 IF (ilevel == grid_level) THEN
1305 all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
1306 ELSE
1307 all_dests(i) = huge(all_dests(i))
1308 END IF
1309 END DO
1310 CALL sort(all_dests, nshort, index)
1311 ndest_pair = 1
1312 DO i = 2, nshort
1313 IF ((all_dests(ndest_pair) /= all_dests(i)) .AND. (all_dests(i) /= huge(all_dests(i)))) THEN
1314 ndest_pair = ndest_pair + 1
1315 all_dests(ndest_pair) = all_dests(i)
1316 END IF
1317 END DO
1318
1319 DO itask = itask_start, itask_stop
1320
1321 dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
1322 ilevel = tasks(itask)%grid_level
1323 img = tasks(itask)%image
1324 iatom = tasks(itask)%iatom
1325 jatom = tasks(itask)%jatom
1326 iset = tasks(itask)%iset
1327 jset = tasks(itask)%jset
1328 ipgf = tasks(itask)%ipgf
1329 jpgf = tasks(itask)%jpgf
1330
1331 ! Only proceed with tasks which are on this grid level
1332 IF (ilevel /= grid_level) cycle
1333 ipair = (iatom - 1)*natom8 + (jatom - 1)
1334 cost = int(tasks(itask)%cost)
1335
1336 SELECT CASE (tasks(itask)%dist_type)
1337 CASE (1)
1338 bit_pattern = tasks(itask)%subpatch_pattern
1339 nopt = 0
1340 IF (btest(bit_pattern, 0)) THEN
1341 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [-1, 0, 0])
1342 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1343 nopt = nopt + 1
1344 options(nopt) = rank
1345 END IF
1346 END IF
1347 IF (btest(bit_pattern, 1)) THEN
1348 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [+1, 0, 0])
1349 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1350 nopt = nopt + 1
1351 options(nopt) = rank
1352 END IF
1353 END IF
1354 IF (btest(bit_pattern, 2)) THEN
1355 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, -1, 0])
1356 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1357 nopt = nopt + 1
1358 options(nopt) = rank
1359 END IF
1360 END IF
1361 IF (btest(bit_pattern, 3)) THEN
1362 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, +1, 0])
1363 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1364 nopt = nopt + 1
1365 options(nopt) = rank
1366 END IF
1367 END IF
1368 IF (btest(bit_pattern, 4)) THEN
1369 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, -1])
1370 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1371 nopt = nopt + 1
1372 options(nopt) = rank
1373 END IF
1374 END IF
1375 IF (btest(bit_pattern, 5)) THEN
1376 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, +1])
1377 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1378 nopt = nopt + 1
1379 options(nopt) = rank
1380 END IF
1381 END IF
1382 IF (nopt > 0) THEN
1383 ! set it to the rank with the lowest load
1384 rank = options(1)
1385 DO iopt = 2, nopt
1386 IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
1387 END DO
1388 ELSE
1389 rank = dest
1390 END IF
1391 li = list_index(list, rank, dest)
1392 IF (create_list) THEN
1393 list(2, li, dest) = list(2, li, dest) + cost
1394 ELSE
1395 IF (list(1, li, dest) == dest) THEN
1396 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1397 ELSE
1398 IF (list(2, li, dest) >= cost) THEN
1399 list(2, li, dest) = list(2, li, dest) - cost
1400 tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1401 ELSE
1402 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1403 END IF
1404 END IF
1405 END IF
1406 CASE (2) ! generalised
1407 li = list_index(list, dest, dest)
1408 IF (create_list) THEN
1409 list(2, li, dest) = list(2, li, dest) + cost
1410 ELSE
1411 IF (list(1, li, dest) == dest) THEN
1412 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1413 ELSE
1414 IF (list(2, li, dest) >= cost) THEN
1415 list(2, li, dest) = list(2, li, dest) - cost
1416 tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1417 ELSE
1418 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1419 END IF
1420 END IF
1421 END IF
1422 CASE DEFAULT
1423 cpabort("")
1424 END SELECT
1425
1426 END DO
1427
1428 END DO
1429
1430 CALL timestop(handle)
1431
1432 END SUBROUTINE compute_load_list
1433! **************************************************************************************************
1434!> \brief small helper function to return the proper index in the list array
1435!>
1436!> \param list ...
1437!> \param rank ...
1438!> \param dest ...
1439!> \return ...
1440!> \par History
1441!> created 2008-10-06 [Joost VandeVondele]
1442! **************************************************************************************************
1443 INTEGER FUNCTION list_index(list, rank, dest)
1444 INTEGER, DIMENSION(:, :, 0:), INTENT(IN) :: list
1445 INTEGER, INTENT(IN) :: rank, dest
1446
1447 list_index = 1
1448 DO
1449 IF (list(1, list_index, dest) == rank) EXIT
1450 list_index = list_index + 1
1451 END DO
1452 END FUNCTION list_index
1453! **************************************************************************************************
1454!> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
1455!> note that we allocate it with an additional field to store the load of this destination
1456!>
1457!> \param list ...
1458!> \param rs_descs ...
1459!> \param grid_level ...
1460!> \par History
1461!> created 2008-10-06 [Joost VandeVondele]
1462! **************************************************************************************************
1463 SUBROUTINE create_destination_list(list, rs_descs, grid_level)
1464 INTEGER, DIMENSION(:, :, :), POINTER :: list
1465 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1466 POINTER :: rs_descs
1467 INTEGER, INTENT(IN) :: grid_level
1468
1469 CHARACTER(LEN=*), PARAMETER :: routinen = 'create_destination_list'
1470
1471 INTEGER :: handle, i, icpu, j, maxcount, ncpu, &
1472 ultimate_max
1473 INTEGER, ALLOCATABLE, DIMENSION(:) :: index, sublist
1474
1475 CALL timeset(routinen, handle)
1476
1477 cpassert(.NOT. ASSOCIATED(list))
1478 ncpu = rs_descs(grid_level)%rs_desc%group_size
1479 ultimate_max = 7
1480
1481 ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
1482
1483 ALLOCATE (index(ultimate_max))
1484 ALLOCATE (sublist(ultimate_max))
1485 sublist = huge(sublist)
1486
1487 maxcount = 1
1488 DO icpu = 0, ncpu - 1
1489 sublist(1) = icpu
1490 sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [-1, 0, 0])
1491 sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [+1, 0, 0])
1492 sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, -1, 0])
1493 sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, +1, 0])
1494 sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, -1])
1495 sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, +1])
1496 ! only retain unique values of the destination
1497 CALL sort(sublist, ultimate_max, index)
1498 j = 1
1499 DO i = 2, 7
1500 IF (sublist(i) /= sublist(j)) THEN
1501 j = j + 1
1502 sublist(j) = sublist(i)
1503 END IF
1504 END DO
1505 maxcount = max(maxcount, j)
1506 sublist(j + 1:ultimate_max) = huge(sublist)
1507 list(1, :, icpu) = sublist
1508 list(2, :, icpu) = 0
1509 END DO
1510
1511 CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
1512
1513 CALL timestop(handle)
1514
1515 END SUBROUTINE create_destination_list
1516
1517! **************************************************************************************************
1518!> \brief given a task list, compute the load of each process everywhere
1519!> giving this function the ability to loop over a (sub)set of rs_grids,
1520!> and do all the communication in one shot, would speed it up
1521!> \param loads ...
1522!> \param rs_descs ...
1523!> \param grid_level ...
1524!> \param ntasks ...
1525!> \param tasks ...
1526!> \param use_reordered_ranks ...
1527!> \par History
1528!> none
1529!> \author MattW 21/11/2007
1530! **************************************************************************************************
1531 SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
1532 INTEGER(KIND=int_8), DIMENSION(:) :: loads
1533 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1534 POINTER :: rs_descs
1535 INTEGER :: grid_level, ntasks
1536 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1537 LOGICAL, INTENT(IN) :: use_reordered_ranks
1538
1539 CHARACTER(LEN=*), PARAMETER :: routinen = 'get_current_loads'
1540
1541 INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1542 iset, jatom, jpgf, jset
1543 INTEGER(KIND=int_8) :: total_cost_local
1544 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf_i, send_buf_i
1545 TYPE(realspace_grid_desc_type), POINTER :: desc
1546
1547 CALL timeset(routinen, handle)
1548
1549 desc => rs_descs(grid_level)%rs_desc
1550
1551 ! allocate local arrays
1552 ALLOCATE (send_buf_i(desc%group_size))
1553 ALLOCATE (recv_buf_i(desc%group_size))
1554
1555 ! communication step 1 : compute the total local cost of the tasks
1556 ! each proc needs to know the amount of work he will receive
1557
1558 ! send buffer now contains for each target the cost of the tasks it will receive
1559 send_buf_i = 0
1560 DO i = 1, ntasks
1561 ilevel = tasks(i)%grid_level
1562 img = tasks(i)%image
1563 iatom = tasks(i)%iatom
1564 jatom = tasks(i)%jatom
1565 iset = tasks(i)%iset
1566 jset = tasks(i)%jset
1567 ipgf = tasks(i)%ipgf
1568 jpgf = tasks(i)%jpgf
1569 IF (ilevel /= grid_level) cycle
1570 IF (use_reordered_ranks) THEN
1571 send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) = &
1572 send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) &
1573 + tasks(i)%cost
1574 ELSE
1575 send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) = &
1576 send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) &
1577 + tasks(i)%cost
1578 END IF
1579 END DO
1580 CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)
1581
1582 ! communication step 2 : compute the global cost of the tasks
1583 total_cost_local = sum(recv_buf_i)
1584
1585 ! after this step, the recv buffer contains the local cost for each CPU
1586 CALL desc%group%allgather(total_cost_local, loads)
1587
1588 CALL timestop(handle)
1589
1590 END SUBROUTINE get_current_loads
1591! **************************************************************************************************
1592!> \brief performs load balancing shifting tasks on the replicated grids
1593!> this modifies the destination of some of the tasks on replicated
1594!> grids, and in this way balances the load
1595!> \param rs_descs ...
1596!> \param ntasks ...
1597!> \param tasks ...
1598!> \par History
1599!> none
1600!> \author MattW 21/11/2007
1601! **************************************************************************************************
1602 SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)
1603
1604 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1605 POINTER :: rs_descs
1606 INTEGER :: ntasks
1607 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1608
1609 CHARACTER(LEN=*), PARAMETER :: routinen = 'load_balance_replicated'
1610
1611 INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1612 iset, j, jatom, jpgf, jset, &
1613 no_overloaded, no_underloaded, &
1614 proc_receiving
1615 INTEGER(KIND=int_8) :: average_cost, cost_task_rep, count, &
1616 offset, total_cost_global
1617 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: load_imbalance, loads, recv_buf_i
1618 INTEGER, ALLOCATABLE, DIMENSION(:) :: index
1619 TYPE(realspace_grid_desc_type), POINTER :: desc
1620
1621 CALL timeset(routinen, handle)
1622
1623 desc => rs_descs(1)%rs_desc
1624
1625 ! allocate local arrays
1626 ALLOCATE (recv_buf_i(desc%group_size))
1627 ALLOCATE (loads(desc%group_size))
1628
1629 recv_buf_i = 0
1630 DO i = 1, SIZE(rs_descs)
1631 CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.true.)
1632 recv_buf_i(:) = recv_buf_i + loads
1633 END DO
1634
1635 total_cost_global = sum(recv_buf_i)
1636 average_cost = total_cost_global/desc%group_size
1637
1638 !
1639 ! compute how to redistribute the replicated tasks so that the average cost is reached
1640 !
1641
1642 ! load imbalance measures the load of a given CPU relative
1643 ! to the optimal load distribution (load=average)
1644 ALLOCATE (load_imbalance(desc%group_size))
1645 ALLOCATE (index(desc%group_size))
1646
1647 load_imbalance(:) = recv_buf_i - average_cost
1648 no_overloaded = 0
1649 no_underloaded = 0
1650
1651 DO i = 1, desc%group_size
1652 IF (load_imbalance(i) > 0) no_overloaded = no_overloaded + 1
1653 IF (load_imbalance(i) < 0) no_underloaded = no_underloaded + 1
1654 END DO
1655
1656 ! sort the recv_buffer on number of tasks, gives us index which provides a
1657 ! mapping between processor ranks and how overloaded the processor
1658 CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
1659
1660 ! find out the number of replicated tasks each proc has
1661 ! but only those tasks which have not yet been assigned
1662 cost_task_rep = 0
1663 DO i = 1, ntasks
1664 IF (tasks(i)%dist_type == 0 &
1665 .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
1666 cost_task_rep = cost_task_rep + tasks(i)%cost
1667 END IF
1668 END DO
1669
1670 ! now, correct the load imbalance for the overloaded CPUs
1671 ! they will send away not more than the total load of replicated tasks
1672 CALL desc%group%allgather(cost_task_rep, recv_buf_i)
1673
1674 DO i = 1, desc%group_size
1675 ! At the moment we can only offload replicated tasks
1676 IF (load_imbalance(i) > 0) &
1677 load_imbalance(i) = min(load_imbalance(i), recv_buf_i(i))
1678 END DO
1679
1680 ! simplest algorithm I can think of of is that the processor with the most
1681 ! excess tasks fills up the process needing most, then moves on to next most.
1682 ! At the moment if we've got less replicated tasks than we're overloaded then
1683 ! task balancing will be incomplete
1684
1685 ! only need to do anything if I've excess tasks
1686 IF (load_imbalance(desc%my_pos + 1) > 0) THEN
1687
1688 count = 0 ! weighted amount of tasks offloaded
1689 offset = 0 ! no of underloaded processes already filled by other more overloaded procs
1690
1691 ! calculate offset
1692 DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
1693 IF (index(i) == desc%my_pos + 1) THEN
1694 EXIT
1695 ELSE
1696 offset = offset + load_imbalance(index(i))
1697 END IF
1698 END DO
1699
1700 ! find my starting processor to send to
1701 proc_receiving = huge(proc_receiving)
1702 DO i = 1, no_underloaded
1703 offset = offset + load_imbalance(index(i))
1704 IF (offset <= 0) THEN
1705 proc_receiving = i
1706 EXIT
1707 END IF
1708 END DO
1709
1710 ! offset now contains minus the number of tasks proc_receiving requires
1711 ! we fill this up by adjusting the destination of tasks on the replicated grid,
1712 ! then move to next most underloaded proc
1713 DO j = 1, ntasks
1714 IF (tasks(j)%dist_type == 0 &
1715 .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
1716 ! just avoid sending to non existing procs due to integer truncation
1717 ! in the computation of the average
1718 IF (proc_receiving > no_underloaded) EXIT
1719 ! set new destination
1720 ilevel = tasks(j)%grid_level
1721 img = tasks(j)%image
1722 iatom = tasks(j)%iatom
1723 jatom = tasks(j)%jatom
1724 iset = tasks(j)%iset
1725 jset = tasks(j)%jset
1726 ipgf = tasks(j)%ipgf
1727 jpgf = tasks(j)%jpgf
1728 tasks(j)%destination = encode_rank(index(proc_receiving) - 1, ilevel, SIZE(rs_descs))
1729 offset = offset + tasks(j)%cost
1730 count = count + tasks(j)%cost
1731 IF (count >= load_imbalance(desc%my_pos + 1)) EXIT
1732 IF (offset > 0) THEN
1733 proc_receiving = proc_receiving + 1
1734 ! just avoid sending to non existing procs due to integer truncation
1735 ! in the computation of the average
1736 IF (proc_receiving > no_underloaded) EXIT
1737 offset = load_imbalance(index(proc_receiving))
1738 END IF
1739 END IF
1740 END DO
1741 END IF
1742
1743 DEALLOCATE (index)
1744 DEALLOCATE (load_imbalance)
1745
1746 CALL timestop(handle)
1747
1748 END SUBROUTINE load_balance_replicated
1749
1750! **************************************************************************************************
1751!> \brief given an input task list, redistribute so that all tasks can be processed locally,
1752!> i.e. dest equals rank
1753!> \param rs_descs ...
1754!> \param ntasks ...
1755!> \param tasks ...
1756!> \param ntasks_recv ...
1757!> \param tasks_recv ...
1758!> \par History
1759!> none
1760!> \author MattW 21/11/2007
1761! **************************************************************************************************
1762 SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
1763
1764 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1765 POINTER :: rs_descs
1766 INTEGER :: ntasks
1767 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1768 INTEGER :: ntasks_recv
1769 TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1770
1771 CHARACTER(LEN=*), PARAMETER :: routinen = 'create_local_tasks'
1772
1773 INTEGER :: handle, i, j, k, l, rank
1774 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf, send_buf
1775 INTEGER, ALLOCATABLE, DIMENSION(:) :: recv_disps, recv_sizes, send_disps, &
1776 send_sizes
1777 TYPE(realspace_grid_desc_type), POINTER :: desc
1778
1779 CALL timeset(routinen, handle)
1780
1781 desc => rs_descs(1)%rs_desc
1782
1783 ! allocate local arrays
1784 ALLOCATE (send_sizes(desc%group_size))
1785 ALLOCATE (recv_sizes(desc%group_size))
1786 ALLOCATE (send_disps(desc%group_size))
1787 ALLOCATE (recv_disps(desc%group_size))
1788 ALLOCATE (send_buf(desc%group_size))
1789 ALLOCATE (recv_buf(desc%group_size))
1790
1791 ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
1792 send_buf = 0
1793 DO i = 1, ntasks
1794 rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
1795 %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
1796 send_buf(rank + 1) = send_buf(rank + 1) + 1
1797 END DO
1798
1799 CALL desc%group%alltoall(send_buf, recv_buf, 1)
1800
1801 ! pack the tasks, and send them around
1802
1803 send_sizes = 0
1804 send_disps = 0
1805 recv_sizes = 0
1806 recv_disps = 0
1807
1808 send_sizes(1) = int(send_buf(1)*task_size_in_int8)
1809 recv_sizes(1) = int(recv_buf(1)*task_size_in_int8)
1810 DO i = 2, desc%group_size
1811 send_sizes(i) = int(send_buf(i)*task_size_in_int8)
1812 recv_sizes(i) = int(recv_buf(i)*task_size_in_int8)
1813 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1814 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1815 END DO
1816
1817 ! deallocate old send/recv buffers
1818 DEALLOCATE (send_buf)
1819 DEALLOCATE (recv_buf)
1820
1821 ! allocate them with new sizes
1822 ALLOCATE (send_buf(sum(send_sizes)))
1823 ALLOCATE (recv_buf(sum(recv_sizes)))
1824
1825 ! do packing
1826 send_buf = 0
1827 send_sizes = 0
1828 DO j = 1, ntasks
1829 i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
1830 %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
1831 l = send_disps(i) + send_sizes(i)
1832 CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
1833 send_sizes(i) = send_sizes(i) + task_size_in_int8
1834 END DO
1835
1836 ! do communication
1837 CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
1838
1839 DEALLOCATE (send_buf)
1840
1841 ntasks_recv = sum(recv_sizes)/task_size_in_int8
1842 ALLOCATE (tasks_recv(ntasks_recv))
1843
1844 ! do unpacking
1845 l = 0
1846 DO i = 1, desc%group_size
1847 DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
1848 l = l + 1
1849 k = recv_disps(i) + j*task_size_in_int8
1850 CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
1851 END DO
1852 END DO
1853
1854 DEALLOCATE (recv_buf)
1855 DEALLOCATE (send_sizes)
1856 DEALLOCATE (recv_sizes)
1857 DEALLOCATE (send_disps)
1858 DEALLOCATE (recv_disps)
1859
1860 CALL timestop(handle)
1861
1862 END SUBROUTINE create_local_tasks
1863
1864! **************************************************************************************************
1865!> \brief Assembles tasks to be performed on local grid
1866!> \param rs_descs the grids
1867!> \param ntasks Number of tasks for local processing
1868!> \param natoms ...
1869!> \param nimages ...
1870!> \param tasks the task set generated on this processor
1871!> \param rval ...
1872!> \param atom_pair_send ...
1873!> \param atom_pair_recv ...
1874!> \param symmetric ...
1875!> \param reorder_rs_grid_ranks ...
1876!> \param skip_load_balance_distributed ...
1877!> \par History
1878!> none
1879!> \author MattW 21/11/2007
1880! **************************************************************************************************
1881 SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
1882 tasks, atom_pair_send, atom_pair_recv, &
1883 symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
1884
1885 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1886 POINTER :: rs_descs
1887 INTEGER :: ntasks, natoms
1888 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1889 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
1890 LOGICAL, INTENT(IN) :: symmetric, reorder_rs_grid_ranks, &
1891 skip_load_balance_distributed
1892
1893 CHARACTER(LEN=*), PARAMETER :: routinen = 'distribute_tasks'
1894
1895 INTEGER :: handle, igrid_level, irank, ntasks_recv
1896 INTEGER(KIND=int_8) :: load_gap, max_load, replicated_load
1897 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: total_loads, total_loads_tmp, trial_loads
1898 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: loads
1899 INTEGER, ALLOCATABLE, DIMENSION(:) :: indices, real2virtual, total_index
1900 LOGICAL :: distributed_grids, fixed_first_grid
1901 TYPE(realspace_grid_desc_type), POINTER :: desc
1902 TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1903
1904 CALL timeset(routinen, handle)
1905
1906 cpassert(ASSOCIATED(tasks))
1907
1908 ! *** figure out if we have distributed grids
1909 distributed_grids = .false.
1910 DO igrid_level = 1, SIZE(rs_descs)
1911 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1912 distributed_grids = .true.
1913 END IF
1914 END DO
1915 desc => rs_descs(1)%rs_desc
1916
1917 IF (distributed_grids) THEN
1918
1919 ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
1920 ALLOCATE (total_loads(0:desc%group_size - 1))
1921
1922 total_loads = 0
1923
1924 ! First round of balancing on the distributed grids
1925 ! we just balance each of the distributed grids independently
1926 DO igrid_level = 1, SIZE(rs_descs)
1927 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1928
1929 IF (.NOT. skip_load_balance_distributed) &
1930 CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
1931
1932 CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1933 tasks, use_reordered_ranks=.false.)
1934
1935 total_loads(:) = total_loads + loads(:, igrid_level)
1936
1937 END IF
1938 END DO
1939
1940 ! calculate the total load of replicated tasks, so we can decide if it is worth
1941 ! reordering the distributed grid levels
1942
1943 replicated_load = 0
1944 DO igrid_level = 1, SIZE(rs_descs)
1945 IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
1946 CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1947 tasks, use_reordered_ranks=.false.)
1948 replicated_load = replicated_load + sum(loads(:, igrid_level))
1949 END IF
1950 END DO
1951
1952 !IF (desc%my_pos==0) THEN
1953 ! WRITE(*,*) "Total replicated load is ",replicated_load
1954 !END IF
1955
1956 ! Now we adjust the rank ordering based on the current loads
1957 ! we leave the first distributed level and all the replicated levels in the default order
1958 IF (reorder_rs_grid_ranks) THEN
1959 fixed_first_grid = .false.
1960 DO igrid_level = 1, SIZE(rs_descs)
1961 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1962 IF (fixed_first_grid .EQV. .false.) THEN
1963 total_loads(:) = loads(:, igrid_level)
1964 fixed_first_grid = .true.
1965 ELSE
1966 ALLOCATE (trial_loads(0:desc%group_size - 1))
1967
1968 trial_loads(:) = total_loads + loads(:, igrid_level)
1969 max_load = maxval(trial_loads)
1970 load_gap = 0
1971 DO irank = 0, desc%group_size - 1
1972 load_gap = load_gap + max_load - trial_loads(irank)
1973 END DO
1974
1975 ! If there is not enough replicated load to load balance well enough
1976 ! then we will reorder this grid level
1977 IF (load_gap > replicated_load*1.05_dp) THEN
1978
1979 ALLOCATE (indices(0:desc%group_size - 1))
1980 ALLOCATE (total_index(0:desc%group_size - 1))
1981 ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
1982 ALLOCATE (real2virtual(0:desc%group_size - 1))
1983
1984 total_loads_tmp(:) = total_loads
1985 CALL sort(total_loads_tmp, desc%group_size, total_index)
1986 CALL sort(loads(:, igrid_level), desc%group_size, indices)
1987
1988 ! Reorder so that the rank with smallest load on this grid level is paired with
1989 ! the highest load in total
1990 DO irank = 0, desc%group_size - 1
1991 total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
1992 loads(desc%group_size - irank - 1, igrid_level)
1993 real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
1994 END DO
1995
1996 CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
1997
1998 DEALLOCATE (indices)
1999 DEALLOCATE (total_index)
2000 DEALLOCATE (total_loads_tmp)
2001 DEALLOCATE (real2virtual)
2002 ELSE
2003 total_loads(:) = trial_loads
2004 END IF
2005
2006 DEALLOCATE (trial_loads)
2007
2008 END IF
2009 END IF
2010 END DO
2011 END IF
2012
2013 ! Now we use the replicated tasks to balance out the rest of the load
2014 CALL load_balance_replicated(rs_descs, ntasks, tasks)
2015
2016 !total_loads = 0
2017 !DO igrid_level=1,SIZE(rs_descs)
2018 ! CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
2019 ! tasks, use_reordered_ranks=.TRUE.)
2020 ! total_loads = total_loads + loads(:, igrid_level)
2021 !END DO
2022
2023 !IF (desc%my_pos==0) THEN
2024 ! WRITE(*,*) ""
2025 ! WRITE(*,*) "At the end of the load balancing procedure"
2026 ! WRITE(*,*) "Maximum load:",MAXVAL(total_loads)
2027 ! WRITE(*,*) "Average load:",SUM(total_loads)/SIZE(total_loads)
2028 ! WRITE(*,*) "Minimum load:",MINVAL(total_loads)
2029 !ENDIF
2030
2031 ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
2032 CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
2033
2034 !
2035 ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
2036 ! we will be sending. These lists are needed for redistribute_matrix.
2037 !
2038 CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.true., symmetric=symmetric, rs_descs=rs_descs)
2039
2040 ! natom_send=SIZE(atom_pair_send)
2041 ! CALL desc%group%sum(natom_send)
2042 ! IF (desc%my_pos==0) THEN
2043 ! WRITE(*,*) ""
2044 ! WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
2045 ! ENDIF
2046
2047 CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.false., symmetric=symmetric, rs_descs=rs_descs)
2048
2049 ! cleanup, at this point we don't need the original tasks anymore
2050 DEALLOCATE (tasks)
2051 DEALLOCATE (loads)
2052 DEALLOCATE (total_loads)
2053
2054 ELSE
2055 tasks_recv => tasks
2056 ntasks_recv = ntasks
2057 CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.false., symmetric=symmetric, rs_descs=rs_descs)
2058 ! not distributed, hence atom_pair_send not needed
2059 END IF
2060
2061 ! here we sort the task list we will process locally.
2062 ALLOCATE (indices(ntasks_recv))
2063 CALL tasks_sort(tasks_recv, ntasks_recv, indices)
2064 DEALLOCATE (indices)
2065
2066 !
2067 ! final lists are ready
2068 !
2069
2070 tasks => tasks_recv
2071 ntasks = ntasks_recv
2072
2073 CALL timestop(handle)
2074
2075 END SUBROUTINE distribute_tasks
2076
2077! **************************************************************************************************
2078!> \brief ...
2079!> \param atom_pair ...
2080!> \param my_tasks ...
2081!> \param send ...
2082!> \param symmetric ...
2083!> \param natoms ...
2084!> \param nimages ...
2085!> \param rs_descs ...
2086! **************************************************************************************************
2087 SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
2088
2089 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair
2090 TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: tasks
2091 INTEGER, INTENT(IN) :: ntasks
2092 LOGICAL, INTENT(IN) :: send, symmetric
2093 TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
2094
2095 INTEGER :: i, ilevel, iatom, jatom, npairs, virt_rank
2096 INTEGER, DIMENSION(:), ALLOCATABLE :: indices
2097 TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE :: atom_pair_tmp
2098
2099 cpassert(.NOT. ASSOCIATED(atom_pair))
2100 IF (ntasks == 0) THEN
2101 ALLOCATE (atom_pair(0))
2102 RETURN
2103 END IF
2104
2105 ! calculate list of atom pairs
2106 ! fill pair list taking into account symmetry
2107 ALLOCATE (atom_pair_tmp(ntasks))
2108 DO i = 1, ntasks
2109 atom_pair_tmp(i)%image = tasks(i)%image
2110 iatom = tasks(i)%iatom
2111 jatom = tasks(i)%jatom
2112 IF (symmetric .AND. iatom > jatom) THEN
2113 ! iatom / jatom swapped
2114 atom_pair_tmp(i)%row = jatom
2115 atom_pair_tmp(i)%col = iatom
2116 ELSE
2117 atom_pair_tmp(i)%row = iatom
2118 atom_pair_tmp(i)%col = jatom
2119 END IF
2120
2121 IF (send) THEN
2122 ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
2123 ! actually has the correct part of the rs_grid to do the mapping
2124 ilevel = tasks(i)%grid_level
2125 virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
2126 atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
2127 ELSE
2128 ! If we are receiving, then no conversion is needed as the rank is that of the process with the
2129 ! required matrix block, and the ordering of the rs grid is irrelevant
2130 atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
2131 END IF
2132 END DO
2133
2134 ! find unique atom pairs that I'm sending/receiving
2135 ALLOCATE (indices(ntasks))
2136 CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
2137 npairs = 1
2138 tasks(indices(1))%pair_index = 1
2139 DO i = 2, ntasks
2140 IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
2141 npairs = npairs + 1
2142 atom_pair_tmp(npairs) = atom_pair_tmp(i)
2143 END IF
2144 tasks(indices(i))%pair_index = npairs
2145 END DO
2146 DEALLOCATE (indices)
2147
2148 ! Copy unique pairs to final location.
2149 ALLOCATE (atom_pair(npairs))
2150 atom_pair(:) = atom_pair_tmp(:npairs)
2151 DEALLOCATE (atom_pair_tmp)
2152
2153 END SUBROUTINE get_atom_pair
2154
2155! **************************************************************************************************
2156!> \brief redistributes the matrix so that it can be used in realspace operations
2157!> i.e. according to the task lists for collocate and integrate.
2158!> This routine can become a bottleneck in large calculations.
2159!> \param rs_descs ...
2160!> \param pmats ...
2161!> \param atom_pair_send ...
2162!> \param atom_pair_recv ...
2163!> \param natoms ...
2164!> \param nimages ...
2165!> \param scatter ...
2166!> \param hmats ...
2167! **************************************************************************************************
2168 SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
2169 nimages, scatter, hmats)
2170
2171 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
2172 POINTER :: rs_descs
2173 TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: pmats
2174 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
2175 INTEGER :: nimages
2176 LOGICAL :: scatter
2177 TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
2178 POINTER :: hmats
2179
2180 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_distribute_matrix'
2181
2182 INTEGER :: acol, arow, handle, i, img, j, k, l, me, &
2183 nblkcols_total, nblkrows_total, ncol, &
2184 nrow, nthread, nthread_left
2185 INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
2186 recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
2187 send_pair_disps, send_sizes
2188 INTEGER, DIMENSION(:), POINTER :: col_blk_size, row_blk_size
2189 LOGICAL :: found
2190 REAL(kind=dp), ALLOCATABLE, DIMENSION(:), TARGET :: recv_buf_r, send_buf_r
2191 REAL(kind=dp), DIMENSION(:, :), POINTER :: h_block, p_block
2192 TYPE(dbcsr_type), POINTER :: hmat, pmat
2193 TYPE(realspace_grid_desc_type), POINTER :: desc
2194 REAL(kind=dp), DIMENSION(:), POINTER :: vector
2195
2196!$ INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2197
2198 CALL timeset(routinen, handle)
2199
2200 IF (.NOT. scatter) THEN
2201 cpassert(PRESENT(hmats))
2202 END IF
2203
2204 desc => rs_descs(1)%rs_desc
2205 me = desc%my_pos + 1
2206
2207 ! allocate local arrays
2208 ALLOCATE (send_sizes(desc%group_size))
2209 ALLOCATE (recv_sizes(desc%group_size))
2210 ALLOCATE (send_disps(desc%group_size))
2211 ALLOCATE (recv_disps(desc%group_size))
2212 ALLOCATE (send_pair_count(desc%group_size))
2213 ALLOCATE (recv_pair_count(desc%group_size))
2214 ALLOCATE (send_pair_disps(desc%group_size))
2215 ALLOCATE (recv_pair_disps(desc%group_size))
2216
2217 pmat => pmats(1)%matrix
2218 CALL dbcsr_get_info(pmat, &
2219 row_blk_size=row_blk_size, &
2220 col_blk_size=col_blk_size, &
2221 nblkrows_total=nblkrows_total, &
2222 nblkcols_total=nblkcols_total)
2223 ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
2224 first_col(nblkcols_total), last_col(nblkcols_total))
2225 CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
2226 CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
2227
2228 ! set up send buffer sizes
2229 send_sizes = 0
2230 send_pair_count = 0
2231 DO i = 1, SIZE(atom_pair_send)
2232 k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
2233 arow = atom_pair_send(i)%row
2234 acol = atom_pair_send(i)%col
2235 nrow = last_row(arow) - first_row(arow) + 1
2236 ncol = last_col(acol) - first_col(acol) + 1
2237 send_sizes(k) = send_sizes(k) + nrow*ncol
2238 send_pair_count(k) = send_pair_count(k) + 1
2239 END DO
2240
2241 send_disps = 0
2242 send_pair_disps = 0
2243 DO i = 2, desc%group_size
2244 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
2245 send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
2246 END DO
2247
2248 ALLOCATE (send_buf_r(sum(send_sizes)))
2249
2250 ! set up recv buffer
2251
2252 recv_sizes = 0
2253 recv_pair_count = 0
2254 DO i = 1, SIZE(atom_pair_recv)
2255 k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
2256 arow = atom_pair_recv(i)%row
2257 acol = atom_pair_recv(i)%col
2258 nrow = last_row(arow) - first_row(arow) + 1
2259 ncol = last_col(acol) - first_col(acol) + 1
2260 recv_sizes(k) = recv_sizes(k) + nrow*ncol
2261 recv_pair_count(k) = recv_pair_count(k) + 1
2262 END DO
2263
2264 recv_disps = 0
2265 recv_pair_disps = 0
2266 DO i = 2, desc%group_size
2267 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
2268 recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
2269 END DO
2270 ALLOCATE (recv_buf_r(sum(recv_sizes)))
2271
2272!$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
2273!$OMP SHARED(desc,send_pair_count,send_pair_disps,nimages),&
2274!$OMP SHARED(last_row,first_row,last_col,first_col),&
2275!$OMP SHARED(pmats,send_buf_r,send_disps,send_sizes),&
2276!$OMP SHARED(atom_pair_send,me,hmats,nblkrows_total),&
2277!$OMP SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
2278!$OMP SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
2279!$OMP PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
2280!$OMP PRIVATE(nthread,h_block,nthread_left,hmat,pmat,vector)
2281
2282 nthread = 1
2283!$ nthread = omp_get_num_threads()
2284 nthread_left = 1
2285!$ nthread_left = MAX(1, nthread - 1)
2286
2287 ! do packing
2288!$OMP DO schedule(guided)
2289 DO l = 1, desc%group_size
2290 IF (l == me) cycle
2291 send_sizes(l) = 0
2292 DO i = 1, send_pair_count(l)
2293 arow = atom_pair_send(send_pair_disps(l) + i)%row
2294 acol = atom_pair_send(send_pair_disps(l) + i)%col
2295 img = atom_pair_send(send_pair_disps(l) + i)%image
2296 nrow = last_row(arow) - first_row(arow) + 1
2297 ncol = last_col(acol) - first_col(acol) + 1
2298 pmat => pmats(img)%matrix
2299 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2300 cpassert(found)
2301
2302 DO k = 1, ncol
2303 DO j = 1, nrow
2304 send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
2305 END DO
2306 END DO
2307 send_sizes(l) = send_sizes(l) + nrow*ncol
2308 END DO
2309 END DO
2310!$OMP END DO
2311
2312 IF (.NOT. scatter) THEN
2313 ! We need locks to protect concurrent summation into H
2314!$OMP SINGLE
2315!$ ALLOCATE (locks(nthread*10))
2316!$OMP END SINGLE
2317
2318!$OMP DO
2319!$ do i = 1, nthread*10
2320!$ call omp_init_lock(locks(i))
2321!$ end do
2322!$OMP END DO
2323 END IF
2324
2325!$OMP MASTER
2326 ! do communication
2327 CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
2328 recv_buf_r, recv_sizes, recv_disps)
2329!$OMP END MASTER
2330
2331 ! If this is a scatter, then no need to copy local blocks,
2332 ! If not, we sum them directly into H (bypassing the alltoall)
2333 IF (.NOT. scatter) THEN
2334
2335 ! Distribute work over remaining threads assuming one is still in the alltoall
2336!$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
2337 DO i = 1, send_pair_count(me)
2338 arow = atom_pair_send(send_pair_disps(me) + i)%row
2339 acol = atom_pair_send(send_pair_disps(me) + i)%col
2340 img = atom_pair_send(send_pair_disps(me) + i)%image
2341 nrow = last_row(arow) - first_row(arow) + 1
2342 ncol = last_col(acol) - first_col(acol) + 1
2343 hmat => hmats(img)%matrix
2344 pmat => pmats(img)%matrix
2345 CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, block=h_block, found=found)
2346 cpassert(found)
2347 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2348 cpassert(found)
2349
2350!$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2351 DO k = 1, ncol
2352 DO j = 1, nrow
2353 h_block(j, k) = h_block(j, k) + p_block(j, k)
2354 END DO
2355 END DO
2356!$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2357 END DO
2358!$OMP END DO
2359 ELSE
2360 ! We will insert new blocks into P, so create mutable work matrices
2361 DO img = 1, nimages
2362 pmat => pmats(img)%matrix
2363 CALL dbcsr_work_create(pmat, work_mutable=.true., &
2364 nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
2365 n=nthread)
2366 END DO
2367 END IF
2368
2369! wait for comm and setup to finish
2370!$OMP BARRIER
2371
2372 !do unpacking
2373!$OMP DO schedule(guided)
2374 DO l = 1, desc%group_size
2375 IF (l == me) cycle
2376 recv_sizes(l) = 0
2377 DO i = 1, recv_pair_count(l)
2378 arow = atom_pair_recv(recv_pair_disps(l) + i)%row
2379 acol = atom_pair_recv(recv_pair_disps(l) + i)%col
2380 img = atom_pair_recv(recv_pair_disps(l) + i)%image
2381 nrow = last_row(arow) - first_row(arow) + 1
2382 ncol = last_col(acol) - first_col(acol) + 1
2383 pmat => pmats(img)%matrix
2384 NULLIFY (p_block)
2385 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2386
2387 IF (PRESENT(hmats)) THEN
2388 hmat => hmats(img)%matrix
2389 CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, block=h_block, found=found)
2390 cpassert(found)
2391 END IF
2392
2393 IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
2394 vector => recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol)
2395 CALL dbcsr_put_block(pmat, arow, acol, block=reshape(vector, [nrow, ncol]))
2396 END IF
2397 IF (.NOT. scatter) THEN
2398!$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2399 DO k = 1, ncol
2400 DO j = 1, nrow
2401 h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
2402 END DO
2403 END DO
2404!$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2405 END IF
2406 recv_sizes(l) = recv_sizes(l) + nrow*ncol
2407 END DO
2408 END DO
2409!$OMP END DO
2410
2411!$ IF (.not. scatter) THEN
2412!$OMP DO
2413!$ do i = 1, nthread*10
2414!$ call omp_destroy_lock(locks(i))
2415!$ end do
2416!$OMP END DO
2417!$ END IF
2418
2419!$OMP SINGLE
2420!$ IF (.not. scatter) THEN
2421!$ DEALLOCATE (locks)
2422!$ END IF
2423!$OMP END SINGLE NOWAIT
2424
2425 IF (scatter) THEN
2426 ! Blocks were added to P
2427 DO img = 1, nimages
2428 pmat => pmats(img)%matrix
2429 CALL dbcsr_finalize(pmat)
2430 END DO
2431 END IF
2432!$OMP END PARALLEL
2433
2434 DEALLOCATE (send_buf_r)
2435 DEALLOCATE (recv_buf_r)
2436
2437 DEALLOCATE (send_sizes)
2438 DEALLOCATE (recv_sizes)
2439 DEALLOCATE (send_disps)
2440 DEALLOCATE (recv_disps)
2441 DEALLOCATE (send_pair_count)
2442 DEALLOCATE (recv_pair_count)
2443 DEALLOCATE (send_pair_disps)
2444 DEALLOCATE (recv_pair_disps)
2445
2446 DEALLOCATE (first_row, last_row, first_col, last_col)
2447
2448 CALL timestop(handle)
2449
2450 END SUBROUTINE rs_distribute_matrix
2451
2452! **************************************************************************************************
2453!> \brief Calculates offsets and sizes for rs_scatter_matrix and rs_copy_matrix.
2454!> \author Ole Schuett
2455! **************************************************************************************************
2456 SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
2457 pair_offsets, rank_offsets, rank_sizes, buffer_size)
2458 TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: pairs
2459 INTEGER, DIMENSION(:), INTENT(IN) :: nsgf
2460 INTEGER, INTENT(IN) :: group_size
2461 INTEGER, DIMENSION(:), POINTER :: pair_offsets, rank_offsets, rank_sizes
2462 INTEGER, INTENT(INOUT) :: buffer_size
2463
2464 INTEGER :: acol, arow, i, block_size, total_size, k, prev_k
2465
2466 IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
2467 IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
2468 IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
2469
2470 ! calculate buffer_size and pair_offsets
2471 ALLOCATE (pair_offsets(SIZE(pairs)))
2472 total_size = 0
2473 DO i = 1, SIZE(pairs)
2474 pair_offsets(i) = total_size
2475 arow = pairs(i)%row
2476 acol = pairs(i)%col
2477 block_size = nsgf(arow)*nsgf(acol)
2478 total_size = total_size + block_size
2479 END DO
2480 buffer_size = total_size
2481
2482 ! calculate rank_offsets and rank_sizes
2483 ALLOCATE (rank_offsets(group_size))
2484 ALLOCATE (rank_sizes(group_size))
2485 rank_offsets = 0
2486 rank_sizes = 0
2487 IF (SIZE(pairs) > 0) THEN
2488 prev_k = pairs(1)%rank + 1
2489 DO i = 1, SIZE(pairs)
2490 k = pairs(i)%rank + 1
2491 cpassert(k >= prev_k) ! expecting the pairs to be ordered by rank
2492 IF (k > prev_k) THEN
2493 rank_offsets(k) = pair_offsets(i)
2494 rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
2495 prev_k = k
2496 END IF
2497 END DO
2498 rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
2499 END IF
2500
2501 END SUBROUTINE rs_calc_offsets
2502
2503! **************************************************************************************************
2504!> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
2505!> \author Ole Schuett
2506! **************************************************************************************************
2507 SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
2508 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2509 TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2510 TYPE(task_list_type), INTENT(IN) :: task_list
2511 TYPE(mp_comm_type), INTENT(IN) :: group
2512
2513 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_scatter_matrices'
2514
2515 INTEGER :: handle
2516 REAL(kind=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2517
2518 CALL timeset(routinen, handle)
2519 ALLOCATE (buffer_send(task_list%buffer_size_send))
2520
2521 ! pack dbcsr blocks into send buffer
2522 cpassert(ASSOCIATED(task_list%atom_pair_send))
2523 CALL rs_pack_buffer(src_matrices=src_matrices, &
2524 dest_buffer=buffer_send, &
2525 atom_pair=task_list%atom_pair_send, &
2526 pair_offsets=task_list%pair_offsets_send)
2527
2528 ! mpi all-to-all communication, receiving directly into blocks_recv%buffer.
2529 CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
2530 dest_buffer%host_buffer, &
2531 task_list%rank_sizes_recv, task_list%rank_offsets_recv)
2532
2533 DEALLOCATE (buffer_send)
2534 CALL timestop(handle)
2535
2536 END SUBROUTINE rs_scatter_matrices
2537
2538! **************************************************************************************************
2539!> \brief Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
2540!> \author Ole Schuett
2541! **************************************************************************************************
2542 SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
2543 TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2544 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2545 TYPE(task_list_type), INTENT(IN) :: task_list
2546 TYPE(mp_comm_type), INTENT(IN) :: group
2547
2548 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_gather_matrices'
2549
2550 INTEGER :: handle
2551 REAL(kind=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2552
2553 CALL timeset(routinen, handle)
2554
2555 ! Caution: The meaning of send and recv are reversed in this routine.
2556 ALLOCATE (buffer_send(task_list%buffer_size_send)) ! e.g. this is actually used for receiving
2557
2558 ! mpi all-to-all communication
2559 CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
2560 buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
2561
2562 ! unpack dbcsr blocks from send buffer
2563 cpassert(ASSOCIATED(task_list%atom_pair_send))
2564 CALL rs_unpack_buffer(src_buffer=buffer_send, &
2565 dest_matrices=dest_matrices, &
2566 atom_pair=task_list%atom_pair_send, &
2567 pair_offsets=task_list%pair_offsets_send)
2568
2569 DEALLOCATE (buffer_send)
2570 CALL timestop(handle)
2571
2572 END SUBROUTINE rs_gather_matrices
2573
2574! **************************************************************************************************
2575!> \brief Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
2576!> \author Ole Schuett
2577! **************************************************************************************************
2578 SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
2579 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2580 TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2581 TYPE(task_list_type), INTENT(IN) :: task_list
2582
2583 CALL rs_pack_buffer(src_matrices=src_matrices, &
2584 dest_buffer=dest_buffer%host_buffer, &
2585 atom_pair=task_list%atom_pair_recv, &
2586 pair_offsets=task_list%pair_offsets_recv)
2587
2588 END SUBROUTINE rs_copy_to_buffer
2589
2590! **************************************************************************************************
2591!> \brief Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
2592!> \author Ole Schuett
2593! **************************************************************************************************
2594 SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
2595 TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2596 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2597 TYPE(task_list_type), INTENT(IN) :: task_list
2598
2599 CALL rs_unpack_buffer(src_buffer=src_buffer%host_buffer, &
2600 dest_matrices=dest_matrices, &
2601 atom_pair=task_list%atom_pair_recv, &
2602 pair_offsets=task_list%pair_offsets_recv)
2603
2604 END SUBROUTINE rs_copy_to_matrices
2605
2606! **************************************************************************************************
2607!> \brief Helper routine for rs_scatter_matrix and rs_copy_to_buffer.
2608!> \author Ole Schuett
2609! **************************************************************************************************
2610 SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
2611 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2612 REAL(kind=dp), DIMENSION(:), INTENT(INOUT) :: dest_buffer
2613 TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2614 INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2615
2616 INTEGER :: acol, arow, img, i, offset, block_size
2617 LOGICAL :: found
2618 REAL(kind=dp), DIMENSION(:, :), POINTER :: block
2619
2620!$OMP PARALLEL DEFAULT(NONE), &
2621!$OMP SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
2622!$OMP PRIVATE(acol,arow,img,i,offset,block_size,found,block)
2623!$OMP DO schedule(guided)
2624 DO i = 1, SIZE(atom_pair)
2625 arow = atom_pair(i)%row
2626 acol = atom_pair(i)%col
2627 img = atom_pair(i)%image
2628 CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=arow, col=acol, &
2629 block=block, found=found)
2630 cpassert(found)
2631 block_size = SIZE(block)
2632 offset = pair_offsets(i)
2633 dest_buffer(offset + 1:offset + block_size) = reshape(block, shape=[block_size])
2634 END DO
2635!$OMP END DO
2636!$OMP END PARALLEL
2637
2638 END SUBROUTINE rs_pack_buffer
2639
2640! **************************************************************************************************
2641!> \brief Helper routine for rs_gather_matrix and rs_copy_to_matrices.
2642!> \author Ole Schuett
2643! **************************************************************************************************
2644 SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
2645 REAL(kind=dp), DIMENSION(:), INTENT(IN) :: src_buffer
2646 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2647 TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2648 INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2649
2650 INTEGER :: acol, arow, img, i, offset, &
2651 nrows, ncols, lock_num
2652 LOGICAL :: found
2653 REAL(kind=dp), DIMENSION(:, :), POINTER :: block
2654 INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2655
2656 ! initialize locks
2657 ALLOCATE (locks(10*omp_get_max_threads()))
2658 DO i = 1, SIZE(locks)
2659 CALL omp_init_lock(locks(i))
2660 END DO
2661
2662!$OMP PARALLEL DEFAULT(NONE), &
2663!$OMP SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
2664!$OMP PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
2665!$OMP DO schedule(guided)
2666 DO i = 1, SIZE(atom_pair)
2667 arow = atom_pair(i)%row
2668 acol = atom_pair(i)%col
2669 img = atom_pair(i)%image
2670 CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
2671 block=block, found=found)
2672 cpassert(found)
2673 nrows = SIZE(block, 1)
2674 ncols = SIZE(block, 2)
2675 offset = pair_offsets(i)
2676 lock_num = modulo(arow, SIZE(locks)) + 1 ! map matrix rows round-robin to available locks
2677
2678 CALL omp_set_lock(locks(lock_num))
2679 block = block + reshape(src_buffer(offset + 1:offset + nrows*ncols), shape=[nrows, ncols])
2680 CALL omp_unset_lock(locks(lock_num))
2681 END DO
2682!$OMP END DO
2683!$OMP END PARALLEL
2684
2685 ! destroy locks
2686 DO i = 1, SIZE(locks)
2687 CALL omp_destroy_lock(locks(i))
2688 END DO
2689 DEALLOCATE (locks)
2690
2691 END SUBROUTINE rs_unpack_buffer
2692
2693! **************************************************************************************************
2694!> \brief determines the rank of the processor who's real rs grid contains point
2695!> \param rs_desc ...
2696!> \param igrid_level ...
2697!> \param n_levels ...
2698!> \param cube_center ...
2699!> \param ntasks ...
2700!> \param tasks ...
2701!> \param lb_cube ...
2702!> \param ub_cube ...
2703!> \param added_tasks ...
2704!> \par History
2705!> 11.2007 created [MattW]
2706!> 10.2008 rewritten [Joost VandeVondele]
2707!> \author MattW
2708! **************************************************************************************************
2709 SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
2710 lb_cube, ub_cube, added_tasks)
2711
2712 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2713 INTEGER, INTENT(IN) :: igrid_level, n_levels
2714 INTEGER, DIMENSION(3), INTENT(IN) :: cube_center
2715 INTEGER, INTENT(INOUT) :: ntasks
2716 TYPE(task_type), DIMENSION(:), POINTER :: tasks
2717 INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
2718 INTEGER, INTENT(OUT) :: added_tasks
2719
2720 INTEGER, PARAMETER :: add_tasks = 1000
2721 REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
2722
2723 INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
2724 lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
2725 INTEGER :: bit_pattern
2726 LOGICAL :: dir_periodic(3)
2727
2728 coord(1) = rs_desc%x2coord(cube_center(1))
2729 coord(2) = rs_desc%y2coord(cube_center(2))
2730 coord(3) = rs_desc%z2coord(cube_center(3))
2731 dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
2732
2733 ! the real cube coordinates
2734 lbc = lb_cube + cube_center
2735 ubc = ub_cube + cube_center
2736
2737 IF (all((rs_desc%lb_global(:, dest) - rs_desc%border) <= lbc) .AND. &
2738 all((rs_desc%ub_global(:, dest) + rs_desc%border) >= ubc)) THEN
2739 !standard distributed collocation/integration
2740 tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
2741 tasks(ntasks)%dist_type = 1
2742 tasks(ntasks)%subpatch_pattern = 0
2743 added_tasks = 1
2744
2745 ! here we figure out if there is an alternate location for this task
2746 ! i.e. even though the cube_center is not in the real local domain,
2747 ! it might fully fit in the halo of the neighbor
2748 ! if its radius is smaller than the maximum radius
2749 ! the list of possible neighbors is stored here as a bit pattern
2750 ! to reconstruct what the rank of the neigbor is,
2751 ! we can use (note this requires the correct rs_grid)
2752 ! IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[-1,0,0])
2753 ! IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[+1,0,0])
2754 ! IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,-1,0])
2755 ! IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,+1,0])
2756 ! IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,-1])
2757 ! IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,+1])
2758 bit_index = 0
2759 bit_pattern = 0
2760 DO i = 1, 3
2761 IF (rs_desc%perd(i) == 1) THEN
2762 bit_pattern = ibclr(bit_pattern, bit_index)
2763 bit_index = bit_index + 1
2764 bit_pattern = ibclr(bit_pattern, bit_index)
2765 bit_index = bit_index + 1
2766 ELSE
2767 ! fits the left neighbor ?
2768 IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
2769 bit_pattern = ibset(bit_pattern, bit_index)
2770 bit_index = bit_index + 1
2771 ELSE
2772 bit_pattern = ibclr(bit_pattern, bit_index)
2773 bit_index = bit_index + 1
2774 END IF
2775 ! fits the right neighbor ?
2776 IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
2777 bit_pattern = ibset(bit_pattern, bit_index)
2778 bit_index = bit_index + 1
2779 ELSE
2780 bit_pattern = ibclr(bit_pattern, bit_index)
2781 bit_index = bit_index + 1
2782 END IF
2783 END IF
2784 END DO
2785 tasks(ntasks)%subpatch_pattern = bit_pattern
2786
2787 ELSE
2788 ! generalised collocation/integration needed
2789 ! first we figure out how many neighbors we have to add to include the lbc/ubc
2790 ! in the available domains (inclusive of halo points)
2791 ! first we 'ignore' periodic boundary conditions
2792 ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
2793 lb_coord = coord
2794 ub_coord = coord
2795 lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
2796 ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
2797 DO i = 1, 3
2798 ! only if the grid is not periodic in this direction we need to take care of adding neighbors
2799 IF (rs_desc%perd(i) == 0) THEN
2800 ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
2801 DO
2802 IF (lb_domain(i) > lbc(i)) THEN
2803 lb_coord(i) = lb_coord(i) - 1
2804 icoord = modulo(lb_coord, rs_desc%group_dim)
2805 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2806 lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2807 ELSE
2808 EXIT
2809 END IF
2810 END DO
2811 ! same for the upper bound
2812 DO
2813 IF (ub_domain(i) < ubc(i)) THEN
2814 ub_coord(i) = ub_coord(i) + 1
2815 icoord = modulo(ub_coord, rs_desc%group_dim)
2816 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2817 ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2818 ELSE
2819 EXIT
2820 END IF
2821 END DO
2822 END IF
2823 END DO
2824
2825 ! some care is needed for the periodic boundaries
2826 DO i = 1, 3
2827 IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
2828 dir_periodic(i) = .true.
2829 lb_coord(i) = 0
2830 ub_coord(i) = rs_desc%group_dim(i) - 1
2831 ELSE
2832 dir_periodic(i) = .false.
2833 END IF
2834 END DO
2835
2836 added_tasks = product(ub_coord - lb_coord + 1)
2837 itask = ntasks
2838 ntasks = ntasks + added_tasks - 1
2839 IF (ntasks > SIZE(tasks)) THEN
2840 curr_tasks = int((SIZE(tasks) + add_tasks)*mult_tasks)
2841 CALL reallocate_tasks(tasks, curr_tasks)
2842 END IF
2843 DO iz = lb_coord(3), ub_coord(3)
2844 DO iy = lb_coord(2), ub_coord(2)
2845 DO ix = lb_coord(1), ub_coord(1)
2846 icoord = modulo([ix, iy, iz], rs_desc%group_dim)
2847 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2848 tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
2849 tasks(itask)%dist_type = 2
2850 tasks(itask)%subpatch_pattern = 0
2851 ! encode the domain size for this task
2852 ! if the bit is set, we need to add the border in that direction
2853 IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
2854 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 0)
2855 IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
2856 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 1)
2857 IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
2858 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 2)
2859 IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
2860 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 3)
2861 IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
2862 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 4)
2863 IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
2864 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 5)
2865 itask = itask + 1
2866 END DO
2867 END DO
2868 END DO
2869 END IF
2870
2871 END SUBROUTINE rs_find_node
2872
2873! **************************************************************************************************
2874!> \brief utility functions for encoding the grid level with a rank, allowing
2875!> different grid levels to maintain a different rank ordering without
2876!> losing information. These encoded_ints are stored in the tasks(1:2,:) array
2877!> \param rank ...
2878!> \param grid_level ...
2879!> \param n_levels ...
2880!> \return ...
2881!> \par History
2882!> 4.2009 created [Iain Bethune]
2883!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2884! **************************************************************************************************
2885 FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
2886
2887 INTEGER, INTENT(IN) :: rank, grid_level, n_levels
2888 INTEGER :: encoded_int
2889
2890! ordered so can still sort by rank
2891
2892 encoded_int = rank*n_levels + grid_level - 1
2893
2894 END FUNCTION encode_rank
2895
2896! **************************************************************************************************
2897!> \brief ...
2898!> \param encoded_int ...
2899!> \param n_levels ...
2900!> \return ...
2901! **************************************************************************************************
2902 FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
2903
2904 INTEGER, INTENT(IN) :: encoded_int
2905 INTEGER, INTENT(IN) :: n_levels
2906 INTEGER :: rank
2907
2908 rank = int(encoded_int/n_levels)
2909
2910 END FUNCTION decode_rank
2911
2912! **************************************************************************************************
2913!> \brief ...
2914!> \param encoded_int ...
2915!> \param n_levels ...
2916!> \return ...
2917! **************************************************************************************************
2918 FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
2919
2920 INTEGER, INTENT(IN) :: encoded_int
2921 INTEGER, INTENT(IN) :: n_levels
2922 INTEGER :: grid_level
2923
2924 grid_level = int(modulo(encoded_int, n_levels)) + 1
2925
2926 END FUNCTION decode_level
2927
2928! **************************************************************************************************
2929!> \brief Sort pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) for which all atom pairs are
2930!> grouped, and for each atom pair all set pairs are grouped and for each set pair,
2931!> all pgfs are grouped.
2932!> This yields the right order of the tasks for collocation after the sort
2933!> in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
2934!> The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
2935!> that a given density matrix block will be decontracted several times,
2936!> but cache effects on the grid make up for this.
2937!> \param a ...
2938!> \param b ...
2939!> \return ...
2940!> \author Ole Schuett
2941! **************************************************************************************************
2942 PURE FUNCTION tasks_less_than(a, b) RESULT(res)
2943 TYPE(task_type), INTENT(IN) :: a, b
2944 LOGICAL :: res
2945
2946 IF (a%grid_level /= b%grid_level) THEN
2947 res = a%grid_level < b%grid_level
2948
2949 ELSE IF (a%image /= b%image) THEN
2950 res = a%image < b%image
2951
2952 ELSE IF (a%iatom /= b%iatom) THEN
2953 res = a%iatom < b%iatom
2954
2955 ELSE IF (a%jatom /= b%jatom) THEN
2956 res = a%jatom < b%jatom
2957
2958 ELSE IF (a%iset /= b%iset) THEN
2959 res = a%iset < b%iset
2960
2961 ELSE IF (a%jset /= b%jset) THEN
2962 res = a%jset < b%jset
2963
2964 ELSE IF (a%ipgf /= b%ipgf) THEN
2965 res = a%ipgf < b%ipgf
2966
2967 ELSE
2968 res = a%jpgf < b%jpgf
2969
2970 END IF
2971 END FUNCTION tasks_less_than
2972
2973
2974! **************************************************************************************************
2975!> \brief Sorts an array inplace using a combination of merge- and bubble-sort.
2976!> It also returns the indices, which the elements had before the sort.
2977!> \param arr the array to sort
2978!> \param n length of array
2979!> \param indices returns elements-indices before the sort
2980!> \par History
2981!> 12.2012 created [ole]
2982!> \author Ole Schuett
2983! **************************************************************************************************
2984 SUBROUTINE tasks_sort(arr, n, indices)
2985 INTEGER, INTENT(IN) :: n
2986 TYPE(task_type), DIMENSION(1:n), INTENT(INOUT) :: arr
2987 integer, DIMENSION(1:n), INTENT(INOUT) :: indices
2988
2989 INTEGER :: i
2990 TYPE(task_type), ALLOCATABLE :: tmp_arr(:)
2991 INTEGER, ALLOCATABLE :: tmp_idx(:)
2992
2993 IF (n > 1) THEN
2994 ! scratch space used during the merge step
2995 ALLOCATE (tmp_arr((n + 1)/2), tmp_idx((n + 1)/2))
2996
2997 indices = [(i, i=1, n)]
2998
2999 CALL tasks_sort_low(arr(1:n), indices, tmp_arr, tmp_idx)
3000
3001 DEALLOCATE (tmp_arr, tmp_idx)
3002 ELSE IF (n > 0) THEN ! can be a frequent case in cp2k
3003 indices(1) = 1
3004 END IF
3005
3006 END SUBROUTINE tasks_sort
3007
3008! **************************************************************************************************
3009!> \brief The actual sort routing. Only tasks_sort and itself should call this.
3010!> \param arr the array to sort
3011!> \param indices elements-indices before the sort
3012!> \param tmp_arr scratch space
3013!> \param tmp_idx scratch space
3014!> \par History
3015!> 12.2012 created [ole]
3016!> \author Ole Schuett
3017! **************************************************************************************************
3018 RECURSIVE SUBROUTINE tasks_sort_low(arr, indices, tmp_arr, tmp_idx)
3019 TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: arr
3020 INTEGER, DIMENSION(size(arr)), INTENT(INOUT) :: indices
3021 TYPE(task_type), DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_arr
3022 INTEGER, DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_idx
3023 TYPE(task_type) :: a
3024 INTEGER :: t, m, i, j, k
3025 LOGICAL :: swapped
3026 ! a,t: used during swaping of elements in arr and indices
3027
3028 swapped = .true.
3029
3030 ! If only a few elements are left we switch to bubble-sort for efficiency.
3031 IF (size(arr) <= 7) THEN ! 7 seems to be a good choice for the moment
3032 DO j = size(arr) - 1, 1, -1
3033 swapped = .false.
3034 DO i = 1, j
3035 IF (tasks_less_than(arr(i + 1), arr(i))) THEN
3036 ! swap arr(i) with arr(i+1)
3037 a = arr(i)
3038 arr(i) = arr(i + 1)
3039 arr(i + 1) = a
3040 ! swap indices(i) with indices(i+1)
3041 t = indices(i)
3042 indices(i) = indices(i + 1)
3043 indices(i + 1) = t
3044 swapped = .true.
3045 END IF
3046 END DO
3047 IF (.NOT. swapped) EXIT
3048 END DO
3049 RETURN
3050 END IF
3051
3052 ! split list in half and recursively sort both sublists
3053 m = (size(arr) + 1)/2 ! index where we're going to divide the list in two
3054 CALL tasks_sort_low(arr(1:m), indices(1:m), tmp_arr, tmp_idx)
3055 CALL tasks_sort_low(arr(m + 1:), indices(m + 1:), tmp_arr, tmp_idx)
3056
3057 ! Check for a special case: Can we just concate the two sorted sublists?
3058 ! This leads to O(n) scaling if the input is already sorted.
3059 IF (tasks_less_than(arr(m + 1), arr(m))) THEN
3060 ! ...no - let's merge the two sorted sublists arr(:m) and arr(m+1:)
3061 ! Merge will be performed directly in arr. Need backup of first sublist.
3062 tmp_arr(1:m) = arr(1:m)
3063 tmp_idx(1:m) = indices(1:m)
3064 i = 1; ! number of elemens consumed from 1st sublist
3065 j = 1; ! number of elemens consumed from 2nd sublist
3066 k = 1; ! number of elemens already merged
3067
3068 DO WHILE (i <= m .and. j <= size(arr) - m)
3069 IF (tasks_less_than(arr(m + j), tmp_arr(i))) THEN
3070 arr(k) = arr(m + j)
3071 indices(k) = indices(m + j)
3072 j = j + 1
3073 ELSE
3074 arr(k) = tmp_arr(i)
3075 indices(k) = tmp_idx(i)
3076 i = i + 1
3077 END IF
3078 k = k + 1
3079 END DO
3080
3081 ! One of the two sublist is now empty.
3082 ! Copy possibly remaining tail of 1st sublist
3083 DO WHILE (i <= m)
3084 arr(k) = tmp_arr(i)
3085 indices(k) = tmp_idx(i)
3086 i = i + 1
3087 k = k + 1
3088 END DO
3089 ! The possibly remaining tail of 2nd sublist is already at the right spot.
3090 END IF
3091
3092 END SUBROUTINE tasks_sort_low
3093
3094
3095! **************************************************************************************************
3096!> \brief Order atom pairs to find duplicates.
3097!> \param a ...
3098!> \param b ...
3099!> \return ...
3100!> \author Ole Schuett
3101! **************************************************************************************************
3102 PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
3103 TYPE(atom_pair_type), INTENT(IN) :: a, b
3104 LOGICAL :: res
3105
3106 IF (a%rank /= b%rank) THEN
3107 res = a%rank < b%rank
3108
3109 ELSE IF (a%row /= b%row) THEN
3110 res = a%row < b%row
3111
3112 ELSE IF (a%col /= b%col) THEN
3113 res = a%col < b%col
3114
3115 ELSE
3116 res = a%image < b%image
3117
3118 END IF
3119 END FUNCTION atom_pair_less_than
3120
3121
3122! **************************************************************************************************
3123!> \brief Sorts an array inplace using a combination of merge- and bubble-sort.
3124!> It also returns the indices, which the elements had before the sort.
3125!> \param arr the array to sort
3126!> \param n length of array
3127!> \param indices returns elements-indices before the sort
3128!> \par History
3129!> 12.2012 created [ole]
3130!> \author Ole Schuett
3131! **************************************************************************************************
3132 SUBROUTINE atom_pair_sort(arr, n, indices)
3133 INTEGER, INTENT(IN) :: n
3134 TYPE(atom_pair_type), DIMENSION(1:n), INTENT(INOUT) :: arr
3135 integer, DIMENSION(1:n), INTENT(INOUT) :: indices
3136
3137 INTEGER :: i
3138 TYPE(atom_pair_type), ALLOCATABLE :: tmp_arr(:)
3139 INTEGER, ALLOCATABLE :: tmp_idx(:)
3140
3141 IF (n > 1) THEN
3142 ! scratch space used during the merge step
3143 ALLOCATE (tmp_arr((n + 1)/2), tmp_idx((n + 1)/2))
3144
3145 indices = [(i, i=1, n)]
3146
3147 CALL atom_pair_sort_low(arr(1:n), indices, tmp_arr, tmp_idx)
3148
3149 DEALLOCATE (tmp_arr, tmp_idx)
3150 ELSE IF (n > 0) THEN ! can be a frequent case in cp2k
3151 indices(1) = 1
3152 END IF
3153
3154 END SUBROUTINE atom_pair_sort
3155
3156! **************************************************************************************************
3157!> \brief The actual sort routing. Only atom_pair_sort and itself should call this.
3158!> \param arr the array to sort
3159!> \param indices elements-indices before the sort
3160!> \param tmp_arr scratch space
3161!> \param tmp_idx scratch space
3162!> \par History
3163!> 12.2012 created [ole]
3164!> \author Ole Schuett
3165! **************************************************************************************************
3166 RECURSIVE SUBROUTINE atom_pair_sort_low(arr, indices, tmp_arr, tmp_idx)
3167 TYPE(atom_pair_type), DIMENSION(:), INTENT(INOUT) :: arr
3168 INTEGER, DIMENSION(size(arr)), INTENT(INOUT) :: indices
3169 TYPE(atom_pair_type), DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_arr
3170 INTEGER, DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_idx
3171 TYPE(atom_pair_type) :: a
3172 INTEGER :: t, m, i, j, k
3173 LOGICAL :: swapped
3174 ! a,t: used during swaping of elements in arr and indices
3175
3176 swapped = .true.
3177
3178 ! If only a few elements are left we switch to bubble-sort for efficiency.
3179 IF (size(arr) <= 7) THEN ! 7 seems to be a good choice for the moment
3180 DO j = size(arr) - 1, 1, -1
3181 swapped = .false.
3182 DO i = 1, j
3183 IF (atom_pair_less_than(arr(i + 1), arr(i))) THEN
3184 ! swap arr(i) with arr(i+1)
3185 a = arr(i)
3186 arr(i) = arr(i + 1)
3187 arr(i + 1) = a
3188 ! swap indices(i) with indices(i+1)
3189 t = indices(i)
3190 indices(i) = indices(i + 1)
3191 indices(i + 1) = t
3192 swapped = .true.
3193 END IF
3194 END DO
3195 IF (.NOT. swapped) EXIT
3196 END DO
3197 RETURN
3198 END IF
3199
3200 ! split list in half and recursively sort both sublists
3201 m = (size(arr) + 1)/2 ! index where we're going to divide the list in two
3202 CALL atom_pair_sort_low(arr(1:m), indices(1:m), tmp_arr, tmp_idx)
3203 CALL atom_pair_sort_low(arr(m + 1:), indices(m + 1:), tmp_arr, tmp_idx)
3204
3205 ! Check for a special case: Can we just concate the two sorted sublists?
3206 ! This leads to O(n) scaling if the input is already sorted.
3207 IF (atom_pair_less_than(arr(m + 1), arr(m))) THEN
3208 ! ...no - let's merge the two sorted sublists arr(:m) and arr(m+1:)
3209 ! Merge will be performed directly in arr. Need backup of first sublist.
3210 tmp_arr(1:m) = arr(1:m)
3211 tmp_idx(1:m) = indices(1:m)
3212 i = 1; ! number of elemens consumed from 1st sublist
3213 j = 1; ! number of elemens consumed from 2nd sublist
3214 k = 1; ! number of elemens already merged
3215
3216 DO WHILE (i <= m .and. j <= size(arr) - m)
3217 IF (atom_pair_less_than(arr(m + j), tmp_arr(i))) THEN
3218 arr(k) = arr(m + j)
3219 indices(k) = indices(m + j)
3220 j = j + 1
3221 ELSE
3222 arr(k) = tmp_arr(i)
3223 indices(k) = tmp_idx(i)
3224 i = i + 1
3225 END IF
3226 k = k + 1
3227 END DO
3228
3229 ! One of the two sublist is now empty.
3230 ! Copy possibly remaining tail of 1st sublist
3231 DO WHILE (i <= m)
3232 arr(k) = tmp_arr(i)
3233 indices(k) = tmp_idx(i)
3234 i = i + 1
3235 k = k + 1
3236 END DO
3237 ! The possibly remaining tail of 2nd sublist is already at the right spot.
3238 END IF
3239
3240 END SUBROUTINE atom_pair_sort_low
3241
3242
3243END MODULE task_list_methods
void grid_create_basis_set(const int nset, const int nsgf, const int maxco, const int maxpgf, const int lmin[nset], const int lmax[nset], const int npgf[nset], const int nsgf_set[nset], const int first_sgf[nset], const double sphi[nsgf][maxco], const double zet[nset][maxpgf], grid_basis_set **basis_set_out)
Allocates a basis set which can be passed to grid_create_task_list. See grid_task_list....
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
All kind of helpful little routines.
Definition ao_util.F:14
real(kind=dp) function, public exp_radius_very_extended(la_min, la_max, lb_min, lb_max, pab, o1, o2, ra, rb, rp, zetp, eps, prefactor, cutoff, epsabs)
computes the radius of the Gaussian outside of which it is smaller than eps
Definition ao_util.F:209
subroutine, public get_gto_basis_set(gto_basis_set, name, aliases, norm_type, kind_radius, ncgf, nset, nsgf, cgf_symbol, sgf_symbol, norm_cgf, set_radius, lmax, lmin, lx, ly, lz, m, ncgf_set, npgf, nsgf_set, nshell, cphi, pgf_radius, sphi, scon, zet, first_cgf, first_sgf, l, last_cgf, last_sgf, n, gcc, maxco, maxl, maxpgf, maxsgf_set, maxshell, maxso, nco_sum, npgf_sum, nshell_sum, maxder, short_kind_radius, npgf_seg_sum)
...
Handles all functions related to the CELL.
Definition cell_types.F:15
Defines control structures, which contain the parameters and the settings for the DFT-based calculati...
subroutine, public dbcsr_get_block_p(matrix, row, col, block, found, row_size, col_size)
...
subroutine, public dbcsr_get_info(matrix, nblkrows_total, nblkcols_total, nfullrows_total, nfullcols_total, nblkrows_local, nblkcols_local, nfullrows_local, nfullcols_local, my_prow, my_pcol, local_rows, local_cols, proc_row_dist, proc_col_dist, row_blk_size, col_blk_size, row_blk_offset, col_blk_offset, distribution, name, matrix_type, group)
...
subroutine, public dbcsr_work_create(matrix, nblks_guess, sizedata_guess, n, work_mutable)
...
subroutine, public dbcsr_finalize(matrix)
...
subroutine, public dbcsr_put_block(matrix, row, col, block, summation)
...
for a given dr()/dh(r) this will provide the bounds to be used if one wants to go over a sphere-subre...
Definition cube_utils.F:18
subroutine, public compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
unifies the computation of the cube center, so that differences in implementation,...
Definition cube_utils.F:68
subroutine, public return_cube(info, radius, lb_cube, ub_cube, sphere_bounds)
...
Definition cube_utils.F:140
subroutine, public return_cube_nonortho(info, radius, lb, ub, rp)
...
Definition cube_utils.F:91
integer function, public gaussian_gridlevel(gridlevel_info, exponent)
...
Fortran API for the grid package, which is written in C.
Definition grid_api.F:12
subroutine, public grid_create_task_list(ntasks, natoms, nkinds, nblocks, block_offsets, atom_positions, atom_kinds, basis_sets, level_list, iatom_list, jatom_list, iset_list, jset_list, ipgf_list, jpgf_list, border_mask_list, block_num_list, radius_list, rab_list, rs_grids, task_list)
Allocates a task list which can be passed to grid_collocate_task_list.
Definition grid_api.F:724
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
integer, parameter, public default_string_length
Definition kinds.F:57
Types and basic routines needed for a kpoint calculation.
subroutine, public get_kpoint_info(kpoint, kp_scheme, nkp_grid, kp_shift, symmetry, verbose, full_grid, use_real_wfn, eps_geo, parallel_group_size, kp_range, nkp, xkp, wkp, para_env, blacs_env_all, para_env_kp, para_env_inter_kp, blacs_env, kp_env, kp_aux_env, mpools, iogrp, nkp_groups, kp_dist, cell_to_index, index_to_cell, sab_nl, sab_nl_nosym)
Retrieve information from a kpoint environment.
An array-based list which grows on demand. When the internal array is full, a new array of twice the ...
Definition list.F:24
Utility routines for the memory handling.
Interface to the message passing library MPI.
Fortran API for the offload package, which is written in C.
Definition offload_api.F:12
subroutine, public offload_create_buffer(length, buffer)
Allocates a buffer of given length, ie. number of elements.
Define methods related to particle_type.
subroutine, public get_particle_set(particle_set, qs_kind_set, first_sgf, last_sgf, nsgf, nmao, basis)
Get the components of a particle set.
Define the data structure for the particle information.
container for various plainwaves related things
subroutine, public pw_env_get(pw_env, pw_pools, cube_info, gridlevel_info, auxbas_pw_pool, auxbas_grid, auxbas_rs_desc, auxbas_rs_grid, rs_descs, rs_grids, xc_pw_pool, vdw_pw_pool, poisson_env, interp_section)
returns the various attributes of the pw env
Define the quickstep kind type and their sub types.
subroutine, public get_qs_kind(qs_kind, basis_set, basis_type, ncgf, nsgf, all_potential, tnadd_potential, gth_potential, sgp_potential, upf_potential, cneo_potential, se_parameter, dftb_parameter, xtb_parameter, dftb3_param, zatom, zeff, elec_conf, mao, lmax_dftb, alpha_core_charge, ccore_charge, core_charge, core_charge_radius, paw_proj_set, paw_atom, hard_radius, hard0_radius, max_rad_local, covalent_radius, vdw_radius, gpw_type_forced, harmonics, max_iso_not0, max_s_harm, grid_atom, ngrid_ang, ngrid_rad, lmax_rho0, dft_plus_u_atom, l_of_dft_plus_u, n_of_dft_plus_u, u_minus_j, u_of_dft_plus_u, j_of_dft_plus_u, alpha_of_dft_plus_u, beta_of_dft_plus_u, j0_of_dft_plus_u, occupation_of_dft_plus_u, dispersion, bs_occupation, magnetization, no_optimize, addel, laddel, naddel, orbitals, max_scf, eps_scf, smear, u_ramping, u_minus_j_target, eps_u_ramping, init_u_ramping_each_scf, reltmat, ghost, monovalent, floating, name, element_symbol, pao_basis_size, pao_model_file, pao_potentials, pao_descriptors, nelec)
Get attributes of an atomic kind.
subroutine, public get_ks_env(ks_env, v_hartree_rspace, s_mstruct_changed, rho_changed, potential_changed, forces_up_to_date, complex_ks, matrix_h, matrix_h_im, matrix_ks, matrix_ks_im, matrix_vxc, kinetic, matrix_s, matrix_s_ri_aux, matrix_w, matrix_p_mp2, matrix_p_mp2_admm, matrix_h_kp, matrix_h_im_kp, matrix_ks_kp, matrix_vxc_kp, kinetic_kp, matrix_s_kp, matrix_w_kp, matrix_s_ri_aux_kp, matrix_ks_im_kp, rho, rho_xc, vppl, rho_core, rho_nlcc, rho_nlcc_g, vee, neighbor_list_id, sab_orb, sab_all, sac_ae, sac_ppl, sac_lri, sap_ppnl, sap_oce, sab_lrc, sab_se, sab_xtbe, sab_tbe, sab_core, sab_xb, sab_xtb_pp, sab_xtb_nonbond, sab_vdw, sab_scp, sab_almo, sab_kp, sab_kp_nosym, sab_cneo, task_list, task_list_soft, kpoints, do_kpoints, atomic_kind_set, qs_kind_set, cell, cell_ref, use_ref_cell, particle_set, energy, force, local_particles, local_molecules, molecule_kind_set, molecule_set, subsys, cp_subsys, virial, results, atprop, nkind, natom, dft_control, dbcsr_dist, distribution_2d, pw_env, para_env, blacs_env, nelectron_total, nelectron_spin)
...
Define the neighbor list data types and the corresponding functionality.
subroutine, public rs_grid_create(rs, desc)
...
pure integer function, public rs_grid_locate_rank(rs_desc, rank_in, shift)
returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in only possible if...
pure subroutine, public rs_grid_reorder_ranks(desc, real2virtual)
Defines a new ordering of ranks on this realspace grid, recalculating the data bounds and reallocatin...
subroutine, public rs_grid_release(rs_grid)
releases the given rs grid (see doc/ReferenceCounting.html)
generate the tasks lists used by collocate and integrate routines
subroutine, public rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
subroutine, public rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
subroutine, public rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, nimages, scatter, hmats)
redistributes the matrix so that it can be used in realspace operations i.e. according to the task li...
subroutine, public distribute_tasks(rs_descs, ntasks, natoms, tasks, atom_pair_send, atom_pair_recv, symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
Assembles tasks to be performed on local grid.
subroutine, public generate_qs_task_list(ks_env, task_list, basis_type, reorder_rs_grid_ranks, skip_load_balance_distributed, pw_env_external, sab_orb_external)
...
subroutine, public rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
subroutine, public task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, cube_info, gridlevel_info, cindex, iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
...
subroutine, public rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
types for task lists
subroutine, public serialize_task(task, serialized_task)
Serialize a task into an integer array. Used for MPI communication.
subroutine, public deserialize_task(task, serialized_task)
De-serialize a task from an integer array. Used for MPI communication.
subroutine, public reallocate_tasks(tasks, new_size)
Grow an array of tasks while preserving the existing entries.
integer, parameter, public task_size_in_int8
All kind of helpful little routines.
Definition util.F:14
Type defining parameters related to the simulation cell.
Definition cell_types.F:55
Contains information about kpoints.
contained for different pw related things
Provides all information about a quickstep kind.
calculation environment to calculate the ks matrix, holds all the needed vars. assumes that the core ...