(git:03cdb6f)
Loading...
Searching...
No Matches
task_list_methods.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2026 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief generate the tasks lists used by collocate and integrate routines
10!> \par History
11!> 01.2008 [Joost VandeVondele] refactored out of qs_collocate / qs_integrate
12!> \author Joost VandeVondele
13! **************************************************************************************************
21 USE cell_types, ONLY: cell_type, &
22 pbc
28 USE cp_dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
34 dbcsr_type, &
38 USE kinds, ONLY: default_string_length, &
39 dp, &
40 int_8
41 USE kpoint_types, ONLY: get_kpoint_info, &
44 USE message_passing, ONLY: &
48 USE pw_env_types, ONLY: pw_env_get, &
50 USE qs_kind_types, ONLY: get_qs_kind, &
52 USE qs_ks_types, ONLY: get_ks_env, &
68 USE util, ONLY: sort
69
70!$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
71!$ omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
72#include "./base/base_uses.f90"
73
74
75 IMPLICIT NONE
76
77 LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .false.
78
79 PRIVATE
80
81 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
82
83 PUBLIC :: generate_qs_task_list, &
85 PUBLIC :: distribute_tasks, &
91
92CONTAINS
93
94! **************************************************************************************************
95!> \brief ...
96!> \param ks_env ...
97!> \param task_list ...
98!> \param basis_type ...
99!> \param reorder_rs_grid_ranks Flag that indicates if this routine should
100!> or should not overwrite the rs descriptor (see comment below)
101!> \param skip_load_balance_distributed ...
102!> \param pw_env_external ...
103!> \param sab_orb_external ...
104!> \param ext_kpoints ...
105!> \par History
106!> 01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
107!> 04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
108!> (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
109!> 06.2015 adjusted to be used with multiple images (k-points) [JGH]
110!> \note If this routine is called several times with different task lists,
111!> the default behaviour is to re-optimize the grid ranks and overwrite
112!> the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
113!> from performing a new optimization by leaving the rank order in
114!> its current state.
115! **************************************************************************************************
   SUBROUTINE generate_qs_task_list(ks_env, task_list, basis_type, &
                                    reorder_rs_grid_ranks, skip_load_balance_distributed, &
                                    pw_env_external, sab_orb_external, ext_kpoints)

      TYPE(qs_ks_env_type), POINTER                      :: ks_env
      TYPE(task_list_type), POINTER                      :: task_list
      CHARACTER(LEN=*), INTENT(IN)                       :: basis_type
      LOGICAL, INTENT(IN)                                :: reorder_rs_grid_ranks, &
                                                            skip_load_balance_distributed
      TYPE(pw_env_type), OPTIONAL, POINTER               :: pw_env_external
      TYPE(neighbor_list_set_p_type), DIMENSION(:), &
         OPTIONAL, POINTER                               :: sab_orb_external
      TYPE(kpoint_type), OPTIONAL, POINTER               :: ext_kpoints

      CHARACTER(LEN=*), PARAMETER :: routinen = 'generate_qs_task_list'
      ! initial size of the task array; grown on demand by reallocate_tasks
      INTEGER, PARAMETER :: max_tasks = 2000

      INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
                 ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
                 jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: blocks
      INTEGER, DIMENSION(3)                              :: cellind
      INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
                                                            npgfb, nsgf
      INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
      LOGICAL                                            :: dokp
      REAL(kind=dp)                                      :: kind_radius_a, kind_radius_b
      REAL(kind=dp), DIMENSION(3)                        :: ra, rab
      REAL(kind=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
      REAL(kind=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
      TYPE(cell_type), POINTER                           :: cell
      TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
      TYPE(dft_control_type), POINTER                    :: dft_control
      TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
      TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER  :: basis_set_list
      TYPE(gto_basis_set_type), POINTER                  :: basis_set_a, basis_set_b, orb_basis_set
      TYPE(kpoint_type), POINTER                         :: kpoints
      TYPE(neighbor_list_set_p_type), DIMENSION(:), &
         POINTER                                         :: sab_orb
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(pw_env_type), POINTER                         :: pw_env
      TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
      TYPE(qs_kind_type), POINTER                        :: qs_kind
      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
         POINTER                                         :: rs_descs
      TYPE(realspace_grid_type), DIMENSION(:), POINTER   :: rs_grids
      TYPE(task_type), DIMENSION(:), POINTER             :: tasks

      CALL timeset(routinen, handle)

      CALL get_ks_env(ks_env, &
                      qs_kind_set=qs_kind_set, &
                      cell=cell, &
                      particle_set=particle_set, &
                      dft_control=dft_control)

      ! neighbor list and pw environment default to the ones stored in ks_env,
      ! but the caller may override either via the optional arguments
      CALL get_ks_env(ks_env, sab_orb=sab_orb)
      IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external

      CALL get_ks_env(ks_env, pw_env=pw_env)
      IF (PRESENT(pw_env_external)) pw_env => pw_env_external
      CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)

      ! *** assign from pw_env
      gridlevel_info => pw_env%gridlevel_info
      cube_info => pw_env%cube_info

      ! find maximum numbers of sets and primitives over all kinds
      ! (kinds without a basis of the requested type are skipped)
      nkind = SIZE(qs_kind_set)
      natoms = SIZE(particle_set)
      maxset = 0
      maxpgf = 0
      DO ikind = 1, nkind
         qs_kind => qs_kind_set(ikind)
         CALL get_qs_kind(qs_kind=qs_kind, &
                          basis_set=orb_basis_set, basis_type=basis_type)

         IF (.NOT. ASSOCIATED(orb_basis_set)) cycle
         CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)

         maxset = max(nseta, maxset)
         maxpgf = max(maxval(npgfa), maxpgf)
      END DO

      ! kpoint related: dokp is set when more than one cell image is in play;
      ! external kpoints (if given) take precedence over the ones in ks_env
      nimages = dft_control%nimages
      dokp = .false.
      NULLIFY (cell_to_index)
      NULLIFY (kpoints)
      IF (PRESENT(ext_kpoints)) THEN
         IF (ASSOCIATED(ext_kpoints)) THEN
            dokp = .true.
            CALL get_kpoint_info(kpoint=ext_kpoints, cell_to_index=cell_to_index)
            nimages = SIZE(ext_kpoints%index_to_cell, 2)
         END IF
      END IF
      IF (.NOT. dokp .AND. nimages > 1) THEN
         dokp = .true.
         CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
         CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
      END IF

      ! free the atom_pair lists if allocated
      IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
      IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)

      ! construct a list of all tasks
      IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
         CALL reallocate_tasks(task_list%tasks, max_tasks)
      END IF
      task_list%ntasks = 0
      curr_tasks = SIZE(task_list%tasks)

      ! cache one basis-set pointer per kind for fast lookup inside the pair loop
      ALLOCATE (basis_set_list(nkind))
      DO ikind = 1, nkind
         qs_kind => qs_kind_set(ikind)
         CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
                          basis_type=basis_type)
         IF (ASSOCIATED(basis_set_a)) THEN
            basis_set_list(ikind)%gto_basis_set => basis_set_a
         ELSE
            NULLIFY (basis_set_list(ikind)%gto_basis_set)
         END IF
      END DO
      ! NOTE: the OMP parallelization of this loop is currently disabled
      ! (directives are commented out with a second "!")
!!$OMP PARALLEL DEFAULT(NONE) &
!!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info, &
!!$OMP curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
!!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
!!$OMP la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
!!$OMP lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
!!$OMP cindex, slot)
      ! Loop over neighbor list
!!$OMP DO SCHEDULE(GUIDED)
      DO slot = 1, sab_orb(1)%nl_size
         ikind = sab_orb(1)%nlist_task(slot)%ikind
         jkind = sab_orb(1)%nlist_task(slot)%jkind
         iatom = sab_orb(1)%nlist_task(slot)%iatom
         jatom = sab_orb(1)%nlist_task(slot)%jatom
         rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
         cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)

         basis_set_a => basis_set_list(ikind)%gto_basis_set
         IF (.NOT. ASSOCIATED(basis_set_a)) cycle
         basis_set_b => basis_set_list(jkind)%gto_basis_set
         IF (.NOT. ASSOCIATED(basis_set_b)) cycle
         ra(:) = pbc(particle_set(iatom)%r, cell)
         ! basis ikind
         la_max => basis_set_a%lmax
         la_min => basis_set_a%lmin
         npgfa => basis_set_a%npgf
         nseta = basis_set_a%nset
         rpgfa => basis_set_a%pgf_radius
         set_radius_a => basis_set_a%set_radius
         kind_radius_a = basis_set_a%kind_radius
         zeta => basis_set_a%zet
         ! basis jkind
         lb_max => basis_set_b%lmax
         lb_min => basis_set_b%lmin
         npgfb => basis_set_b%npgf
         nsetb = basis_set_b%nset
         rpgfb => basis_set_b%pgf_radius
         set_radius_b => basis_set_b%set_radius
         kind_radius_b = basis_set_b%kind_radius
         zetb => basis_set_b%zet

         ! map the cell shift of this pair to a single image index
         IF (dokp) THEN
            cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
         ELSE
            cindex = 1
         END IF

         ! generate all tasks (set/pgf pairs surviving the screening) of this atom pair
         CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
                                   rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
                                   iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
                                   set_radius_a, set_radius_b, ra, rab, &
                                   la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)

      END DO
!!$OMP END PARALLEL

      ! redistribute the task list so that all tasks map on the local rs grids
      CALL distribute_tasks( &
         rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
         tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
         atom_pair_recv=task_list%atom_pair_recv, symmetric=.true., &
         reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
         skip_load_balance_distributed=skip_load_balance_distributed)

      ! compute offsets for rs_scatter_matrix / rs_copy_matrix
      ALLOCATE (nsgf(natoms))
      CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
      IF (ASSOCIATED(task_list%atom_pair_send)) THEN
         ! only needed when there is a distributed grid
         CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
                              nsgf=nsgf, &
                              group_size=rs_descs(1)%rs_desc%group_size, &
                              pair_offsets=task_list%pair_offsets_send, &
                              rank_offsets=task_list%rank_offsets_send, &
                              rank_sizes=task_list%rank_sizes_send, &
                              buffer_size=task_list%buffer_size_send)
      END IF
      CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
                           nsgf=nsgf, &
                           group_size=rs_descs(1)%rs_desc%group_size, &
                           pair_offsets=task_list%pair_offsets_recv, &
                           rank_offsets=task_list%rank_offsets_recv, &
                           rank_sizes=task_list%rank_sizes_recv, &
                           buffer_size=task_list%buffer_size_recv)
      DEALLOCATE (basis_set_list, nsgf)

      ! If the rank order has changed, reallocate any of the distributed rs_grids
      IF (reorder_rs_grid_ranks) THEN
         DO i = 1, gridlevel_info%ngrid_levels
            IF (rs_descs(i)%rs_desc%distributed) THEN
               CALL rs_grid_release(rs_grids(i))
               CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
            END IF
         END DO
      END IF

      ! hand the finalized task list over to the grid API backend
      CALL create_grid_task_list(task_list=task_list, &
                                 qs_kind_set=qs_kind_set, &
                                 particle_set=particle_set, &
                                 cell=cell, &
                                 basis_type=basis_type, &
                                 rs_grids=rs_grids)

      ! Now we have the final list of tasks, setup the task_list with the
      ! data needed for the loops in integrate_v/calculate_rho

      IF (ASSOCIATED(task_list%taskstart)) THEN
         DEALLOCATE (task_list%taskstart)
      END IF
      IF (ASSOCIATED(task_list%taskstop)) THEN
         DEALLOCATE (task_list%taskstop)
      END IF
      IF (ASSOCIATED(task_list%npairs)) THEN
         DEALLOCATE (task_list%npairs)
      END IF

      ! First, count the number of unique atom pairs per grid level
      ! (relies on the task list being sorted by grid level, then pair, then image)

      ALLOCATE (task_list%npairs(SIZE(rs_descs)))

      iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
      ipair = 0
      task_list%npairs = 0

      DO i = 1, task_list%ntasks
         igrid_level = task_list%tasks(i)%grid_level
         img = task_list%tasks(i)%image
         iatom = task_list%tasks(i)%iatom
         jatom = task_list%tasks(i)%jatom
         iset = task_list%tasks(i)%iset
         jset = task_list%tasks(i)%jset
         ipgf = task_list%tasks(i)%ipgf
         jpgf = task_list%tasks(i)%jpgf
         IF (igrid_level /= igrid_level_old) THEN
            ! entering a new grid level: close out the previous level's count
            IF (igrid_level_old /= -1) THEN
               task_list%npairs(igrid_level_old) = ipair
            END IF
            ipair = 1
            igrid_level_old = igrid_level
            iatom_old = iatom
            jatom_old = jatom
            img_old = img
         ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
            ipair = ipair + 1
            iatom_old = iatom
            jatom_old = jatom
            img_old = img
         END IF
      END DO
      ! Take care of the last iteration
      IF (task_list%ntasks /= 0) THEN
         task_list%npairs(igrid_level) = ipair
      END IF

      ! Second, for each atom pair, find the indices in the task list
      ! of the first and last task

      ! Array sized for worst case
      ALLOCATE (task_list%taskstart(maxval(task_list%npairs), SIZE(rs_descs)))
      ALLOCATE (task_list%taskstop(maxval(task_list%npairs), SIZE(rs_descs)))

      iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
      ipair = 0
      task_list%taskstart = 0
      task_list%taskstop = 0

      DO i = 1, task_list%ntasks
         igrid_level = task_list%tasks(i)%grid_level
         img = task_list%tasks(i)%image
         iatom = task_list%tasks(i)%iatom
         jatom = task_list%tasks(i)%jatom
         iset = task_list%tasks(i)%iset
         jset = task_list%tasks(i)%jset
         ipgf = task_list%tasks(i)%ipgf
         jpgf = task_list%tasks(i)%jpgf
         IF (igrid_level /= igrid_level_old) THEN
            ! new grid level: close the last pair of the previous level,
            ! open the first pair of the new one
            IF (igrid_level_old /= -1) THEN
               task_list%taskstop(ipair, igrid_level_old) = i - 1
            END IF
            ipair = 1
            task_list%taskstart(ipair, igrid_level) = i
            igrid_level_old = igrid_level
            iatom_old = iatom
            jatom_old = jatom
            img_old = img
         ELSE IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
            ipair = ipair + 1
            task_list%taskstart(ipair, igrid_level) = i
            task_list%taskstop(ipair - 1, igrid_level) = i - 1
            iatom_old = iatom
            jatom_old = jatom
            img_old = img
         END IF
      END DO
      ! Take care of the last iteration
      IF (task_list%ntasks /= 0) THEN
         task_list%taskstop(ipair, igrid_level) = task_list%ntasks
      END IF

      ! Debug task distribution: verify that each (iatom, jatom, image) block
      ! appears in exactly one pair and that all tasks of a pair agree on it
      IF (debug_this_module) THEN
         tasks => task_list%tasks
         WRITE (6, *)
         WRITE (6, *) "Total number of tasks ", task_list%ntasks
         DO igrid_level = 1, gridlevel_info%ngrid_levels
            WRITE (6, *) "Total number of pairs(grid_level) ", &
               igrid_level, task_list%npairs(igrid_level)
         END DO
         WRITE (6, *)

         DO igrid_level = 1, gridlevel_info%ngrid_levels

            ALLOCATE (blocks(natoms, natoms, nimages))
            blocks = -1
            DO ipair = 1, task_list%npairs(igrid_level)
               itask = task_list%taskstart(ipair, igrid_level)
               ilevel = task_list%tasks(itask)%grid_level
               img = task_list%tasks(itask)%image
               iatom = task_list%tasks(itask)%iatom
               jatom = task_list%tasks(itask)%jatom
               iset = task_list%tasks(itask)%iset
               jset = task_list%tasks(itask)%jset
               ipgf = task_list%tasks(itask)%ipgf
               jpgf = task_list%tasks(itask)%jpgf
               IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
                  blocks(iatom, jatom, img) = 1
                  blocks(jatom, iatom, img) = 1
               ELSE
                  WRITE (6, *) "TASK LIST CONFLICT IN PAIR ", ipair
                  WRITE (6, *) "Reuse of iatom, jatom, image ", iatom, jatom, img
               END IF

               iatom_old = iatom
               jatom_old = jatom
               img_old = img
               DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
                  ilevel = task_list%tasks(itask)%grid_level
                  img = task_list%tasks(itask)%image
                  iatom = task_list%tasks(itask)%iatom
                  jatom = task_list%tasks(itask)%jatom
                  iset = task_list%tasks(itask)%iset
                  jset = task_list%tasks(itask)%jset
                  ipgf = task_list%tasks(itask)%ipgf
                  jpgf = task_list%tasks(itask)%jpgf
                  IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
                     WRITE (6, *) "TASK LIST CONFLICT IN TASK ", itask
                     WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
                     WRITE (6, *) "Should be iatom, jatom, image ", iatom_old, jatom_old, img_old
                  END IF

               END DO
            END DO
            DEALLOCATE (blocks)

         END DO

      END IF

      CALL timestop(handle)

   END SUBROUTINE generate_qs_task_list
501
502! **************************************************************************************************
503!> \brief Sends the task list data to the grid API.
504!> \author Ole Schuett
505! **************************************************************************************************
   SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
      TYPE(task_list_type), POINTER                      :: task_list
      TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(cell_type), POINTER                           :: cell
      ! NOTE(review): basis_type has no INTENT declared; it is only read here
      CHARACTER(LEN=*)                                   :: basis_type
      TYPE(realspace_grid_type), DIMENSION(:), POINTER   :: rs_grids

      TYPE(gto_basis_set_type), POINTER                  :: orb_basis_set
      INTEGER                                            :: nset, natoms, nkinds, ntasks, &
                                                            ikind, iatom, itask, nsgf
      INTEGER, DIMENSION(:), ALLOCATABLE :: atom_kinds, level_list, iatom_list, jatom_list, &
                                            iset_list, jset_list, ipgf_list, jpgf_list, &
                                            border_mask_list, block_num_list
      REAL(kind=dp), DIMENSION(:), ALLOCATABLE           :: radius_list
      REAL(kind=dp), DIMENSION(:, :), ALLOCATABLE        :: rab_list, atom_positions
      TYPE(task_type), DIMENSION(:), POINTER             :: tasks
      INTEGER, DIMENSION(:, :), POINTER                  :: first_sgf
      REAL(kind=dp), DIMENSION(:, :), POINTER            :: sphi, zet
      INTEGER, DIMENSION(:), POINTER                     :: lmax, lmin, npgf, nsgf_set

      nkinds = SIZE(qs_kind_set)
      natoms = SIZE(particle_set)
      ntasks = task_list%ntasks
      tasks => task_list%tasks

      IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
         ! Basis sets do not change during simulation - only need to create them once.
         ALLOCATE (task_list%grid_basis_sets(nkinds))
         DO ikind = 1, nkinds
            CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
            CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
                                   nset=nset, &
                                   nsgf=nsgf, &
                                   nsgf_set=nsgf_set, &
                                   npgf=npgf, &
                                   first_sgf=first_sgf, &
                                   lmax=lmax, &
                                   lmin=lmin, &
                                   sphi=sphi, &
                                   zet=zet)
            ! mirror the GTO basis into the grid API's own basis-set representation
            CALL grid_create_basis_set(nset=nset, &
                                       nsgf=nsgf, &
                                       maxco=SIZE(sphi, 1), &
                                       maxpgf=SIZE(zet, 1), &
                                       lmin=lmin, &
                                       lmax=lmax, &
                                       npgf=npgf, &
                                       nsgf_set=nsgf_set, &
                                       first_sgf=first_sgf, &
                                       sphi=sphi, &
                                       zet=zet, &
                                       basis_set=task_list%grid_basis_sets(ikind))
         END DO
      END IF

      ! Pack task list infos: flatten per-atom data into plain arrays
      ! (atom positions folded into the unit cell via pbc)
      ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
      DO iatom = 1, natoms
         atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
         atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
      END DO

      ! flatten per-task data into one plain array per field
      ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
      ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
      ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
      ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))

      DO itask = 1, ntasks
         level_list(itask) = tasks(itask)%grid_level
         iatom_list(itask) = tasks(itask)%iatom
         jatom_list(itask) = tasks(itask)%jatom
         iset_list(itask) = tasks(itask)%iset
         jset_list(itask) = tasks(itask)%jset
         ipgf_list(itask) = tasks(itask)%ipgf
         jpgf_list(itask) = tasks(itask)%jpgf
         ! dist_type 2 marks generalised (distributed) tasks; only those carry a border mask
         IF (tasks(itask)%dist_type == 2) THEN
            border_mask_list(itask) = iand(63, not(tasks(itask)%subpatch_pattern)) ! invert last 6 bits
         ELSE
            border_mask_list(itask) = 0 ! no masking
         END IF
         block_num_list(itask) = tasks(itask)%pair_index ! change of nomenclature pair_index -> block_num
         radius_list(itask) = tasks(itask)%radius
         rab_list(:, itask) = tasks(itask)%rab(:)
      END DO

      ! hand everything over to the grid API backend
      CALL grid_create_task_list(ntasks=ntasks, &
                                 natoms=natoms, &
                                 nkinds=nkinds, &
                                 nblocks=SIZE(task_list%pair_offsets_recv), &
                                 block_offsets=task_list%pair_offsets_recv, &
                                 atom_positions=atom_positions, &
                                 atom_kinds=atom_kinds, &
                                 basis_sets=task_list%grid_basis_sets, &
                                 level_list=level_list, &
                                 iatom_list=iatom_list, &
                                 jatom_list=jatom_list, &
                                 iset_list=iset_list, &
                                 jset_list=jset_list, &
                                 ipgf_list=ipgf_list, &
                                 jpgf_list=jpgf_list, &
                                 border_mask_list=border_mask_list, &
                                 block_num_list=block_num_list, &
                                 radius_list=radius_list, &
                                 rab_list=rab_list, &
                                 rs_grids=rs_grids, &
                                 task_list=task_list%grid_task_list)

      ! buffers sized for the receive side, used for pab (density) and hab (hamiltonian) blocks
      CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
      CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)

   END SUBROUTINE create_grid_task_list
618
619! **************************************************************************************************
620!> \brief ...
621!> \param tasks ...
622!> \param ntasks ...
623!> \param curr_tasks ...
624!> \param rs_descs ...
625!> \param dft_control ...
626!> \param cube_info ...
627!> \param gridlevel_info ...
628!> \param cindex ...
629!> \param iatom ...
630!> \param jatom ...
631!> \param rpgfa ...
632!> \param rpgfb ...
633!> \param zeta ...
634!> \param zetb ...
635!> \param kind_radius_b ...
636!> \param set_radius_a ...
637!> \param set_radius_b ...
638!> \param ra ...
639!> \param rab ...
640!> \param la_max ...
641!> \param la_min ...
642!> \param lb_max ...
643!> \param lb_min ...
644!> \param npgfa ...
645!> \param npgfb ...
646!> \param nseta ...
647!> \param nsetb ...
648!> \par History
649!> Joost VandeVondele: 10.2008 refactored
650! **************************************************************************************************
651 SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
652 cube_info, gridlevel_info, cindex, &
653 iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
654 la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
655
656 TYPE(task_type), DIMENSION(:), POINTER :: tasks
657 INTEGER :: ntasks, curr_tasks
658 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
659 POINTER :: rs_descs
660 TYPE(dft_control_type), POINTER :: dft_control
661 TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
662 TYPE(gridlevel_info_type), POINTER :: gridlevel_info
663 INTEGER :: cindex, iatom, jatom
664 REAL(kind=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
665 REAL(kind=dp) :: kind_radius_b
666 REAL(kind=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
667 REAL(kind=dp), DIMENSION(3) :: ra, rab
668 INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
669 npgfb
670 INTEGER :: nseta, nsetb
671
672 INTEGER :: cube_center(3), igrid_level, ipgf, iset, &
673 jpgf, jset, lb_cube(3), ub_cube(3)
674 REAL(kind=dp) :: dab, rab2, radius, zetp
675
676 rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
677 dab = sqrt(rab2)
678
679 loop_iset: DO iset = 1, nseta
680
681 IF (set_radius_a(iset) + kind_radius_b < dab) cycle loop_iset
682
683 loop_jset: DO jset = 1, nsetb
684
685 IF (set_radius_a(iset) + set_radius_b(jset) < dab) cycle loop_jset
686
687 loop_ipgf: DO ipgf = 1, npgfa(iset)
688
689 IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) cycle loop_ipgf
690
691 loop_jpgf: DO jpgf = 1, npgfb(jset)
692
693 IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) cycle loop_jpgf
694
695 zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
696 igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
697
698 CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
699 rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
700 la_max(iset), zeta(ipgf, iset), la_min(iset), &
701 lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
702 ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
703
704 CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
705 rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
706 la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
707 igrid_level, gridlevel_info%ngrid_levels, cube_center, &
708 lb_cube, ub_cube, radius)
709
710 END DO loop_jpgf
711
712 END DO loop_ipgf
713
714 END DO loop_jset
715
716 END DO loop_iset
717
718 END SUBROUTINE task_list_inner_loop
719
720! **************************************************************************************************
721!> \brief combines the calculation of several basic properties of a given pgf:
722!> its center, the bounding cube, the radius, the cost,
723!> tries to predict the time needed for processing this task
724!> in this way an improved load balance might be obtained
725!> \param cube_center ...
726!> \param lb_cube ...
727!> \param ub_cube ...
728!> \param radius ...
729!> \param rs_desc ...
730!> \param cube_info ...
731!> \param la_max ...
732!> \param zeta ...
733!> \param la_min ...
734!> \param lb_max ...
735!> \param zetb ...
736!> \param lb_min ...
737!> \param ra ...
738!> \param rab ...
739!> \param rab2 ...
740!> \param eps ...
741!> \par History
742!> 10.2008 refactored [Joost VandeVondele]
743!> \note
744!> -) this requires the radius to be computed in the same way as
745!> collocate_pgf_product, we should factor that part into a subroutine
746!> -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
747!> this is more or less true for map_consistent
748!> -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
749!> the same, this could lead to a small speedup
750!> -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
751!> fitting the measured data on an opteron/g95 using the expression
752!> a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
753! **************************************************************************************************
754 SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
755 rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
756
757 INTEGER, DIMENSION(3), INTENT(OUT) :: cube_center, lb_cube, ub_cube
758 REAL(kind=dp), INTENT(OUT) :: radius
759 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
760 TYPE(cube_info_type), INTENT(IN) :: cube_info
761 INTEGER, INTENT(IN) :: la_max
762 REAL(kind=dp), INTENT(IN) :: zeta
763 INTEGER, INTENT(IN) :: la_min, lb_max
764 REAL(kind=dp), INTENT(IN) :: zetb
765 INTEGER, INTENT(IN) :: lb_min
766 REAL(kind=dp), INTENT(IN) :: ra(3), rab(3), rab2, eps
767
768 INTEGER :: extent(3)
769 INTEGER, DIMENSION(:), POINTER :: sphere_bounds
770 REAL(kind=dp) :: cutoff, f, prefactor, rb(3), zetp
771 REAL(kind=dp), DIMENSION(3) :: rp
772
773! the radius for this task
774
775 zetp = zeta + zetb
776 rp(:) = ra(:) + zetb/zetp*rab(:)
777 rb(:) = ra(:) + rab(:)
778 cutoff = 1.0_dp
779 f = zetb/zetp
780 prefactor = exp(-zeta*f*rab2)
781 radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
782 zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
783
784 CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
785 ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
786 cube_center(:) = modulo(cube_center(:), rs_desc%npts(:))
787 cube_center(:) = cube_center(:) + rs_desc%lb(:)
788
789 IF (rs_desc%orthorhombic) THEN
790 CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
791 ELSE
792 CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
793 !? unclear if extent is computed correctly.
794 extent(:) = ub_cube(:) - lb_cube(:)
795 lb_cube(:) = -extent(:)/2 - 1
796 ub_cube(:) = extent(:)/2
797 END IF
798
799 END SUBROUTINE compute_pgf_properties
! **************************************************************************************************
!> \brief predicts the cost of a task in kcycles for a given task
!>        the model is based on a fit of actual data, and might need updating
!>        as collocate_pgf_product changes (or CPUs/compilers change)
!>        maybe some dynamic approach, improving the cost model on the fly could
!>        work as well
!>        the cost model does not yet take into account the fraction of space
!>        that is mapped locally for a given cube and rs_grid (generalised tasks)
!> \param lb_cube lower corner of the bounding cube (grid points)
!> \param ub_cube upper corner of the bounding cube (grid points)
!> \param fraction fraction of the work assigned to this process (1.0 for a task
!>        handled by a single process)
!> \param lmax combined angular momentum (la_max + lb_max) of the pgf product
!> \param is_ortho .TRUE. if the task maps onto an orthorhombic grid
!> \return estimated cost in kcycles (fitted form a*(l+b)*(r+c)**3*fraction + d + e*l**7)
! **************************************************************************************************
   INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
      INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
      REAL(kind=dp), INTENT(IN)                          :: fraction
      INTEGER, INTENT(IN)                                :: lmax
      LOGICAL, INTENT(IN)                                :: is_ortho

      INTEGER                                            :: cmax
      REAL(kind=dp)                                      :: v1, v2, v3, v4, v5

      ! half-extent of the largest cube dimension, in grid points
      cmax = maxval(((ub_cube - lb_cube) + 1)/2)

      ! fitted coefficients; separate fits for orthorhombic and general grids
      IF (is_ortho) THEN
         v1 = 1.504760e+00_dp
         v2 = 3.126770e+00_dp
         v3 = 5.074106e+00_dp
         v4 = 1.091568e+00_dp
         v5 = 1.070187e+00_dp
      ELSE
         v1 = 7.831105e+00_dp
         v2 = 2.675174e+00_dp
         v3 = 7.546553e+00_dp
         v4 = 6.122446e-01_dp
         v5 = 3.886382e+00_dp
      END IF
      cost_model = ceiling(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)

   END FUNCTION cost_model
842! **************************************************************************************************
843!> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
844!> this determines by which CPUs a given pgf gets collocated
845!> the format of the task array is as follows
846!> tasks(1,i) := destination
847!> tasks(2,i) := source
848!> tasks(3,i) := compressed type (iatom, jatom, ....)
849!> tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
850!> tasks(5,i) := cost
851!> tasks(6,i) := alternate destination code (0 if none available)
852!>
853!> \param tasks ...
854!> \param ntasks ...
855!> \param curr_tasks ...
856!> \param rab ...
857!> \param cindex ...
858!> \param iatom ...
859!> \param jatom ...
860!> \param iset ...
861!> \param jset ...
862!> \param ipgf ...
863!> \param jpgf ...
864!> \param la_max ...
865!> \param lb_max ...
866!> \param rs_desc ...
867!> \param igrid_level ...
868!> \param n_levels ...
869!> \param cube_center ...
870!> \param lb_cube ...
871!> \param ub_cube ...
872!> \par History
873!> 10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
874! **************************************************************************************************
875 SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
876 rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
877 la_max, lb_max, rs_desc, igrid_level, n_levels, &
878 cube_center, lb_cube, ub_cube, radius)
879
880 TYPE(task_type), DIMENSION(:), POINTER :: tasks
881 INTEGER, INTENT(INOUT) :: ntasks, curr_tasks
882 REAL(kind=dp), DIMENSION(3), INTENT(IN) :: rab
883 INTEGER, INTENT(IN) :: cindex, iatom, jatom, iset, jset, ipgf, &
884 jpgf, la_max, lb_max
885 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
886 INTEGER, INTENT(IN) :: igrid_level, n_levels
887 INTEGER, DIMENSION(3), INTENT(IN) :: cube_center, lb_cube, ub_cube
888 REAL(kind=dp), INTENT(IN) :: radius
889
890 INTEGER, PARAMETER :: add_tasks = 1000
891 REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
892
893 INTEGER :: added_tasks, cost, j, lmax
894 LOGICAL :: is_ortho
895 REAL(kind=dp) :: tfraction
896
897!$OMP SINGLE
898 ntasks = ntasks + 1
899 IF (ntasks > curr_tasks) THEN
900 curr_tasks = int((curr_tasks + add_tasks)*mult_tasks)
901 CALL reallocate_tasks(tasks, curr_tasks)
902 END IF
903!$OMP END SINGLE
904
905 IF (rs_desc%distributed) THEN
906
907 ! finds the node(s) that need to process this task
908 ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
909 CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
910 ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
911
912 ELSE
913 tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
914 tasks(ntasks)%dist_type = 0
915 tasks(ntasks)%subpatch_pattern = 0
916 added_tasks = 1
917 END IF
918
919 lmax = la_max + lb_max
920 is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
921 ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
922 ! this could be refined in the future
923 tfraction = 1.0_dp/added_tasks
924
925 cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
926
927 DO j = 1, added_tasks
928 tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
929 tasks(ntasks - added_tasks + j)%cost = cost
930 tasks(ntasks - added_tasks + j)%grid_level = igrid_level
931 tasks(ntasks - added_tasks + j)%image = cindex
932 tasks(ntasks - added_tasks + j)%iatom = iatom
933 tasks(ntasks - added_tasks + j)%jatom = jatom
934 tasks(ntasks - added_tasks + j)%iset = iset
935 tasks(ntasks - added_tasks + j)%jset = jset
936 tasks(ntasks - added_tasks + j)%ipgf = ipgf
937 tasks(ntasks - added_tasks + j)%jpgf = jpgf
938 tasks(ntasks - added_tasks + j)%rab = rab
939 tasks(ntasks - added_tasks + j)%radius = radius
940 END DO
941
942 END SUBROUTINE pgf_to_tasks
943
944! **************************************************************************************************
945!> \brief performs load balancing of the tasks on the distributed grids
946!> \param tasks ...
947!> \param ntasks ...
948!> \param rs_descs ...
949!> \param grid_level ...
950!> \param natoms ...
951!> \par History
952!> created 2008-10-03 [Joost VandeVondele]
953! **************************************************************************************************
954 SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
955
956 TYPE(task_type), DIMENSION(:), POINTER :: tasks
957 INTEGER :: ntasks
958 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
959 POINTER :: rs_descs
960 INTEGER :: grid_level, natoms
961
962 CHARACTER(LEN=*), PARAMETER :: routinen = 'load_balance_distributed'
963
964 INTEGER :: handle
965 INTEGER, DIMENSION(:, :, :), POINTER :: list
966
967 CALL timeset(routinen, handle)
968
969 NULLIFY (list)
970 ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
971 ! if a destination would not be in this list, it is a bug
972 CALL create_destination_list(list, rs_descs, grid_level)
973
974 ! now, walk over the tasks, filling in the loads of each destination
975 CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.true.)
976
977 ! optimize loads & fluxes
978 CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
979
980 ! now, walk over the tasks, using the list to set the destinations
981 CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.false.)
982
983 DEALLOCATE (list)
984
985 CALL timestop(handle)
986
987 END SUBROUTINE load_balance_distributed
988
989! **************************************************************************************************
990!> \brief this serial routine adjusts the fluxes in the global list
991!>
992!> \param list_global ...
993!> \par History
994!> created 2008-10-06 [Joost VandeVondele]
995! **************************************************************************************************
   SUBROUTINE balance_global_list(list_global)
      ! list_global(1,idest,icpu) holds destination ranks, list_global(2,idest,icpu)
      ! the load assigned to that destination; entries >= ncpu act as padding
      INTEGER, DIMENSION(:, :, 0:) :: list_global

      CHARACTER(LEN=*), PARAMETER :: routinen = 'balance_global_list'
      ! iteration cap for the relaxation solver below
      INTEGER, PARAMETER :: max_iter = 100
      ! convergence threshold as a fraction of the average load
      REAL(kind=dp), PARAMETER :: tolerance_factor = 0.005_dp

      INTEGER :: dest, handle, icpu, idest, iflux, &
                 ilocal, k, maxdest, ncpu, nflux
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: flux_connections
      LOGICAL :: solution_optimal
      REAL(kind=dp) :: average, load_shift, max_load_shift, &
                       tolerance
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: load, optimized_flux, optimized_load
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: flux_limits

      CALL timeset(routinen, handle)

      ncpu = SIZE(list_global, 3)
      maxdest = SIZE(list_global, 2)
      ALLOCATE (load(0:ncpu - 1))
      load = 0.0_dp
      ALLOCATE (optimized_load(0:ncpu - 1))

      ! figure out the number of fluxes
      ! we assume that the global_list is symmetric
      ! each (icpu, dest>icpu) pair defines one flux edge
      nflux = 0
      DO icpu = 0, ncpu - 1
         DO idest = 1, maxdest
            dest = list_global(1, idest, icpu)
            IF (dest < ncpu .AND. dest > icpu) nflux = nflux + 1
         END DO
      END DO
      ALLOCATE (optimized_flux(nflux))
      ALLOCATE (flux_limits(2, nflux))
      ALLOCATE (flux_connections(2, nflux))

      ! reorder data
      ! flux_limits(1,:) / flux_limits(2,:) are the most negative / positive
      ! allowed flux over each edge; flux_connections holds the two endpoints
      flux_limits = 0
      nflux = 0
      DO icpu = 0, ncpu - 1
         ! total load currently assigned to this cpu (all destinations)
         load(icpu) = sum(list_global(2, :, icpu))
         DO idest = 1, maxdest
            dest = list_global(1, idest, icpu)
            IF (dest < ncpu) THEN
               IF (dest /= icpu) THEN
                  IF (dest > icpu) THEN
                     ! forward edge: upper limit is what icpu could send to dest
                     nflux = nflux + 1
                     flux_limits(2, nflux) = list_global(2, idest, icpu)
                     flux_connections(1, nflux) = icpu
                     flux_connections(2, nflux) = dest
                  ELSE
                     ! backward direction: locate the matching edge and set the
                     ! lower (negative) limit from what dest could send to icpu
                     DO iflux = 1, nflux
                        IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
                           flux_limits(1, iflux) = -list_global(2, idest, icpu)
                           EXIT
                        END IF
                     END DO
                  END IF
               END IF
            END IF
         END DO
      END DO

      solution_optimal = .false.
      optimized_flux = 0.0_dp

      ! an iterative solver, if iterated till convergence the maximum load is minimal
      ! we terminate before things are fully converged, since this does show up in the timings
      ! once the largest shift becomes less than a small fraction of the average load, we're done
      ! we're perfectly happy if the load balance is within 1 percent or so
      ! the maximum load normally converges even faster
      average = sum(load)/SIZE(load)
      tolerance = tolerance_factor*average

      optimized_load(:) = load
      DO k = 1, max_iter
         max_load_shift = 0.0_dp
         DO iflux = 1, nflux
            ! move half the load difference along the edge, clipped to the limits
            load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
            load_shift = max(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
            load_shift = min(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
            max_load_shift = max(abs(load_shift), max_load_shift)
            optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
            optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
            optimized_flux(iflux) = optimized_flux(iflux) + load_shift
         END DO
         IF (max_load_shift < tolerance) THEN
            solution_optimal = .true.
            EXIT
         END IF
      END DO

      ! now adjust the load list to reflect the optimized fluxes
      ! reorder data
      ! nflux must be recounted in exactly the same order as above so that
      ! optimized_flux(nflux) corresponds to the edge being processed
      nflux = 0
      DO icpu = 0, ncpu - 1
         ! ilocal is the slot of icpu's own entry in its destination list
         DO idest = 1, maxdest
            IF (list_global(1, idest, icpu) == icpu) ilocal = idest
         END DO
         DO idest = 1, maxdest
            dest = list_global(1, idest, icpu)
            IF (dest < ncpu) THEN
               IF (dest /= icpu) THEN
                  IF (dest > icpu) THEN
                     ! forward edge: keep only the optimized outflow, return the
                     ! remainder to the local slot
                     nflux = nflux + 1
                     IF (optimized_flux(nflux) > 0) THEN
                        list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
                                                       list_global(2, idest, icpu) - nint(optimized_flux(nflux))
                        list_global(2, idest, icpu) = nint(optimized_flux(nflux))
                     ELSE
                        list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
                                                       list_global(2, idest, icpu)
                        list_global(2, idest, icpu) = 0
                     END IF
                  ELSE
                     ! backward edge: a negative optimized flux means icpu sends
                     ! work to dest; a positive one means it keeps everything
                     DO iflux = 1, nflux
                        IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
                           IF (optimized_flux(iflux) > 0) THEN
                              list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
                                                             list_global(2, idest, icpu)
                              list_global(2, idest, icpu) = 0
                           ELSE
                              list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
                                                             list_global(2, idest, icpu) + nint(optimized_flux(iflux))
                              list_global(2, idest, icpu) = -nint(optimized_flux(iflux))
                           END IF
                           EXIT
                        END IF
                     END DO
                  END IF
               END IF
            END IF
         END DO
      END DO

      CALL timestop(handle)

   END SUBROUTINE balance_global_list
1135
1136! **************************************************************************************************
1137!> \brief this routine gets back optimized loads for all destinations
1138!>
1139!> \param list ...
1140!> \param group ...
1141!> \param my_pos ...
1142!> \par History
1143!> created 2008-10-06 [Joost VandeVondele]
1144!> Modified 2016-01 [EPCC] Reduce memory requirements on P processes
1145!> from O(P^2) to O(P)
1146! **************************************************************************************************
   SUBROUTINE optimize_load_list(list, group, my_pos)
      ! list(1,:,0:) holds candidate destination ranks per cpu,
      ! list(2,:,0:) the local load assigned to each; updated in place
      INTEGER, DIMENSION(:, :, 0:) :: list
      TYPE(mp_comm_type), INTENT(IN) :: group
      INTEGER, INTENT(IN) :: my_pos

      CHARACTER(LEN=*), PARAMETER :: routinen = 'optimize_load_list'
      ! the global optimization is performed serially on this rank
      INTEGER, PARAMETER :: rank_of_root = 0

      INTEGER :: handle, icpu, idest, maxdest, ncpu
      INTEGER, ALLOCATABLE, DIMENSION(:) :: load_all
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: load_partial
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: list_global

      CALL timeset(routinen, handle)

      ncpu = SIZE(list, 3)
      maxdest = SIZE(list, 2)

      !find total workload ...
      ! reduce all per-rank load contributions onto the root only (keeps the
      ! memory footprint O(P) per process instead of O(P^2); see history note)
      ALLOCATE (load_all(maxdest*ncpu))
      load_all(:) = reshape(list(2, :, :), [maxdest*ncpu])
      CALL group%sum(load_all(:), rank_of_root)

      ! ... and optimise the work per process
      ! only the root has the full summed loads; it runs the serial balancer
      ! and the result is broadcast back to everyone
      ALLOCATE (list_global(2, maxdest, ncpu))
      IF (rank_of_root == my_pos) THEN
         list_global(1, :, :) = list(1, :, :)
         list_global(2, :, :) = reshape(load_all, [maxdest, ncpu])
         CALL balance_global_list(list_global)
      END IF
      CALL group%bcast(list_global, rank_of_root)

      !figure out how much can be sent to other processes
      ! load_partial(idest,icpu) = prefix sum over ranks (inclusive of this
      ! rank) of the load contributed to slot (idest,icpu)
      ALLOCATE (load_partial(maxdest, ncpu))
      ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
      CALL group%sum_partial(reshape(load_all, [maxdest, ncpu]), load_partial(:, :))

      ! scale down this rank's contribution wherever the running total exceeds
      ! the globally optimized target for that destination slot
      DO icpu = 1, ncpu
         DO idest = 1, maxdest

            !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
            IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
               IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
                  ! keep only the part of our contribution that fits the target
                  list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
                                             - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
               ELSE
                  ! earlier ranks already filled the target; contribute nothing
                  list(2, idest, icpu - 1) = 0
               END IF
            END IF

         END DO
      END DO

      !clean up before leaving
      DEALLOCATE (load_all)
      DEALLOCATE (list_global)
      DEALLOCATE (load_partial)

      CALL timestop(handle)
   END SUBROUTINE optimize_load_list
1207
1208! **************************************************************************************************
1209!> \brief fill the load list with values derived from the tasks array
1210!> from the alternate locations, we select the alternate location that
1211!> can be used without increasing the number of matrix blocks needed to
1212!> distribute.
1213!> Replicated tasks are not yet considered
1214!>
1215!> \param list ...
1216!> \param rs_descs ...
1217!> \param grid_level ...
1218!> \param tasks ...
1219!> \param ntasks ...
1220!> \param natoms ...
1221!> \param create_list ...
1222!> \par History
1223!> created 2008-10-06 [Joost VandeVondele]
1224! **************************************************************************************************
1225 SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
1226 INTEGER, DIMENSION(:, :, 0:) :: list
1227 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1228 POINTER :: rs_descs
1229 INTEGER :: grid_level
1230 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1231 INTEGER :: ntasks, natoms
1232 LOGICAL :: create_list
1233
1234 CHARACTER(LEN=*), PARAMETER :: routinen = 'compute_load_list'
1235
1236 INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
1237 itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
1238 rank
1239 INTEGER(KIND=int_8) :: bit_pattern, ipair, ipair_old, natom8
1240 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: loads
1241 INTEGER, ALLOCATABLE, DIMENSION(:) :: all_dests, index
1242 INTEGER, DIMENSION(6) :: options
1243
1244 CALL timeset(routinen, handle)
1245
1246 ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
1247 CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.false.)
1248
1249 maxdest = SIZE(list, 2)
1250 ncpu = SIZE(list, 3)
1251 natom8 = natoms
1252
1253 ! first find the tasks that deal with the same atom pair
1254 itask_stop = 0
1255 ipair_old = huge(ipair_old)
1256 img_old = -1
1257 ALLOCATE (all_dests(0))
1258 ALLOCATE (index(0))
1259
1260 DO
1261
1262 ! first find the range of tasks that deal with the same atom pair
1263 itask_start = itask_stop + 1
1264 itask_stop = itask_start
1265 IF (itask_stop > ntasks) EXIT
1266 ilevel = tasks(itask_stop)%grid_level
1267 img_old = tasks(itask_stop)%image
1268 iatom = tasks(itask_stop)%iatom
1269 jatom = tasks(itask_stop)%jatom
1270 iset = tasks(itask_stop)%iset
1271 jset = tasks(itask_stop)%jset
1272 ipgf = tasks(itask_stop)%ipgf
1273 jpgf = tasks(itask_stop)%jpgf
1274
1275 ipair_old = (iatom - 1)*natom8 + (jatom - 1)
1276 DO
1277 IF (itask_stop + 1 > ntasks) EXIT
1278 ilevel = tasks(itask_stop + 1)%grid_level
1279 img = tasks(itask_stop + 1)%image
1280 iatom = tasks(itask_stop + 1)%iatom
1281 jatom = tasks(itask_stop + 1)%jatom
1282 iset = tasks(itask_stop + 1)%iset
1283 jset = tasks(itask_stop + 1)%jset
1284 ipgf = tasks(itask_stop + 1)%ipgf
1285 jpgf = tasks(itask_stop + 1)%jpgf
1286
1287 ipair = (iatom - 1)*natom8 + (jatom - 1)
1288 IF (ipair == ipair_old .AND. img == img_old) THEN
1289 itask_stop = itask_stop + 1
1290 ELSE
1291 EXIT
1292 END IF
1293 END DO
1294 ipair = ipair_old
1295 nshort = itask_stop - itask_start + 1
1296
1297 ! find the unique list of destinations on this grid level only
1298 DEALLOCATE (all_dests)
1299 ALLOCATE (all_dests(nshort))
1300 DEALLOCATE (index)
1301 ALLOCATE (index(nshort))
1302 DO i = 1, nshort
1303 ilevel = tasks(itask_start + i - 1)%grid_level
1304 img = tasks(itask_start + i - 1)%image
1305 iatom = tasks(itask_start + i - 1)%iatom
1306 jatom = tasks(itask_start + i - 1)%jatom
1307 iset = tasks(itask_start + i - 1)%iset
1308 jset = tasks(itask_start + i - 1)%jset
1309 ipgf = tasks(itask_start + i - 1)%ipgf
1310 jpgf = tasks(itask_start + i - 1)%jpgf
1311
1312 IF (ilevel == grid_level) THEN
1313 all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
1314 ELSE
1315 all_dests(i) = huge(all_dests(i))
1316 END IF
1317 END DO
1318 CALL sort(all_dests, nshort, index)
1319 ndest_pair = 1
1320 DO i = 2, nshort
1321 IF ((all_dests(ndest_pair) /= all_dests(i)) .AND. (all_dests(i) /= huge(all_dests(i)))) THEN
1322 ndest_pair = ndest_pair + 1
1323 all_dests(ndest_pair) = all_dests(i)
1324 END IF
1325 END DO
1326
1327 DO itask = itask_start, itask_stop
1328
1329 dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
1330 ilevel = tasks(itask)%grid_level
1331 img = tasks(itask)%image
1332 iatom = tasks(itask)%iatom
1333 jatom = tasks(itask)%jatom
1334 iset = tasks(itask)%iset
1335 jset = tasks(itask)%jset
1336 ipgf = tasks(itask)%ipgf
1337 jpgf = tasks(itask)%jpgf
1338
1339 ! Only proceed with tasks which are on this grid level
1340 IF (ilevel /= grid_level) cycle
1341 ipair = (iatom - 1)*natom8 + (jatom - 1)
1342 cost = int(tasks(itask)%cost)
1343
1344 SELECT CASE (tasks(itask)%dist_type)
1345 CASE (1)
1346 bit_pattern = tasks(itask)%subpatch_pattern
1347 nopt = 0
1348 IF (btest(bit_pattern, 0)) THEN
1349 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [-1, 0, 0])
1350 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1351 nopt = nopt + 1
1352 options(nopt) = rank
1353 END IF
1354 END IF
1355 IF (btest(bit_pattern, 1)) THEN
1356 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [+1, 0, 0])
1357 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1358 nopt = nopt + 1
1359 options(nopt) = rank
1360 END IF
1361 END IF
1362 IF (btest(bit_pattern, 2)) THEN
1363 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, -1, 0])
1364 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1365 nopt = nopt + 1
1366 options(nopt) = rank
1367 END IF
1368 END IF
1369 IF (btest(bit_pattern, 3)) THEN
1370 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, +1, 0])
1371 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1372 nopt = nopt + 1
1373 options(nopt) = rank
1374 END IF
1375 END IF
1376 IF (btest(bit_pattern, 4)) THEN
1377 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, -1])
1378 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1379 nopt = nopt + 1
1380 options(nopt) = rank
1381 END IF
1382 END IF
1383 IF (btest(bit_pattern, 5)) THEN
1384 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, [0, 0, +1])
1385 IF (any(all_dests(1:ndest_pair) == rank)) THEN
1386 nopt = nopt + 1
1387 options(nopt) = rank
1388 END IF
1389 END IF
1390 IF (nopt > 0) THEN
1391 ! set it to the rank with the lowest load
1392 rank = options(1)
1393 DO iopt = 2, nopt
1394 IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
1395 END DO
1396 ELSE
1397 rank = dest
1398 END IF
1399 li = list_index(list, rank, dest)
1400 IF (create_list) THEN
1401 list(2, li, dest) = list(2, li, dest) + cost
1402 ELSE
1403 IF (list(1, li, dest) == dest) THEN
1404 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1405 ELSE
1406 IF (list(2, li, dest) >= cost) THEN
1407 list(2, li, dest) = list(2, li, dest) - cost
1408 tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1409 ELSE
1410 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1411 END IF
1412 END IF
1413 END IF
1414 CASE (2) ! generalised
1415 li = list_index(list, dest, dest)
1416 IF (create_list) THEN
1417 list(2, li, dest) = list(2, li, dest) + cost
1418 ELSE
1419 IF (list(1, li, dest) == dest) THEN
1420 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1421 ELSE
1422 IF (list(2, li, dest) >= cost) THEN
1423 list(2, li, dest) = list(2, li, dest) - cost
1424 tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1425 ELSE
1426 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1427 END IF
1428 END IF
1429 END IF
1430 CASE DEFAULT
1431 cpabort("")
1432 END SELECT
1433
1434 END DO
1435
1436 END DO
1437
1438 CALL timestop(handle)
1439
1440 END SUBROUTINE compute_load_list
1441! **************************************************************************************************
1442!> \brief small helper function to return the proper index in the list array
1443!>
1444!> \param list ...
1445!> \param rank ...
1446!> \param dest ...
1447!> \return ...
1448!> \par History
1449!> created 2008-10-06 [Joost VandeVondele]
1450! **************************************************************************************************
! Return the slot of `rank' within the destination list of cpu `dest'.
! The original open-ended DO walked past SIZE(list,2) (out-of-bounds read)
! if rank was absent; the loop is now bounded and a missing rank aborts,
! which by the caller's contract indicates a bug upstream.
   INTEGER FUNCTION list_index(list, rank, dest)
      INTEGER, DIMENSION(:, :, 0:), INTENT(IN) :: list
      INTEGER, INTENT(IN) :: rank, dest

      INTEGER :: pos

      DO pos = 1, SIZE(list, 2)
         IF (list(1, pos, dest) == rank) THEN
            list_index = pos
            RETURN
         END IF
      END DO
      list_index = 0 ! not reached; silences uninitialized-result warnings
      ERROR STOP "list_index: rank not found in destination list"
   END FUNCTION list_index
1461! **************************************************************************************************
1462!> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
1463!> note that we allocate it with an additional field to store the load of this destination
1464!>
1465!> \param list ...
1466!> \param rs_descs ...
1467!> \param grid_level ...
1468!> \par History
1469!> created 2008-10-06 [Joost VandeVondele]
1470! **************************************************************************************************
1471 SUBROUTINE create_destination_list(list, rs_descs, grid_level)
1472 INTEGER, DIMENSION(:, :, :), POINTER :: list
1473 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1474 POINTER :: rs_descs
1475 INTEGER, INTENT(IN) :: grid_level
1476
1477 CHARACTER(LEN=*), PARAMETER :: routinen = 'create_destination_list'
1478
1479 INTEGER :: handle, i, icpu, j, maxcount, ncpu, &
1480 ultimate_max
1481 INTEGER, ALLOCATABLE, DIMENSION(:) :: index, sublist
1482
1483 CALL timeset(routinen, handle)
1484
1485 cpassert(.NOT. ASSOCIATED(list))
1486 ncpu = rs_descs(grid_level)%rs_desc%group_size
1487 ultimate_max = 7
1488
1489 ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
1490
1491 ALLOCATE (index(ultimate_max))
1492 ALLOCATE (sublist(ultimate_max))
1493 sublist = huge(sublist)
1494
1495 maxcount = 1
1496 DO icpu = 0, ncpu - 1
1497 sublist(1) = icpu
1498 sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [-1, 0, 0])
1499 sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [+1, 0, 0])
1500 sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, -1, 0])
1501 sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, +1, 0])
1502 sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, -1])
1503 sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, [0, 0, +1])
1504 ! only retain unique values of the destination
1505 CALL sort(sublist, ultimate_max, index)
1506 j = 1
1507 DO i = 2, 7
1508 IF (sublist(i) /= sublist(j)) THEN
1509 j = j + 1
1510 sublist(j) = sublist(i)
1511 END IF
1512 END DO
1513 maxcount = max(maxcount, j)
1514 sublist(j + 1:ultimate_max) = huge(sublist)
1515 list(1, :, icpu) = sublist
1516 list(2, :, icpu) = 0
1517 END DO
1518
1519 CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
1520
1521 CALL timestop(handle)
1522
1523 END SUBROUTINE create_destination_list
1524
1525! **************************************************************************************************
1526!> \brief given a task list, compute the load of each process everywhere
1527!> giving this function the ability to loop over a (sub)set of rs_grids,
1528!> and do all the communication in one shot, would speed it up
1529!> \param loads ...
1530!> \param rs_descs ...
1531!> \param grid_level ...
1532!> \param ntasks ...
1533!> \param tasks ...
1534!> \param use_reordered_ranks ...
1535!> \par History
1536!> none
1537!> \author MattW 21/11/2007
1538! **************************************************************************************************
! Compute, for every rank of the group, the total cost of the tasks it will
! receive on the given grid level; the result is available on all ranks.
! Cleanup vs. original: eight per-task field reads that were never used
! (iatom, jatom, iset, jset, ipgf, jpgf, img, ilevel as locals) are gone,
! and the destination rank is decoded once per task instead of twice.
   SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
      ! on exit, loads(i) is the summed cost of all tasks destined for rank i-1
      INTEGER(KIND=int_8), DIMENSION(:) :: loads
      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
         POINTER :: rs_descs
      INTEGER :: grid_level, ntasks
      TYPE(task_type), DIMENSION(:), POINTER :: tasks
      ! when .true., destinations are mapped through virtual2real first
      LOGICAL, INTENT(IN) :: use_reordered_ranks

      CHARACTER(LEN=*), PARAMETER :: routinen = 'get_current_loads'

      INTEGER :: handle, i, rank
      INTEGER(KIND=int_8) :: total_cost_local
      INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf_i, send_buf_i
      TYPE(realspace_grid_desc_type), POINTER :: desc

      CALL timeset(routinen, handle)

      desc => rs_descs(grid_level)%rs_desc

      ! allocate local arrays
      ALLOCATE (send_buf_i(desc%group_size))
      ALLOCATE (recv_buf_i(desc%group_size))

      ! communication step 1 : compute the total local cost of the tasks
      ! each proc needs to know the amount of work he will receive

      ! send buffer now contains for each target the cost of the tasks it will receive
      send_buf_i = 0
      DO i = 1, ntasks
         IF (tasks(i)%grid_level /= grid_level) CYCLE
         rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
         IF (use_reordered_ranks) rank = desc%virtual2real(rank)
         send_buf_i(rank + 1) = send_buf_i(rank + 1) + tasks(i)%cost
      END DO
      CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)

      ! communication step 2 : compute the global cost of the tasks
      total_cost_local = sum(recv_buf_i)

      ! after this step, the recv buffer contains the local cost for each CPU
      CALL desc%group%allgather(total_cost_local, loads)

      CALL timestop(handle)

   END SUBROUTINE get_current_loads
1599! **************************************************************************************************
1600!> \brief performs load balancing shifting tasks on the replicated grids
1601!> this modifies the destination of some of the tasks on replicated
1602!> grids, and in this way balances the load
1603!> \param rs_descs ...
1604!> \param ntasks ...
1605!> \param tasks ...
1606!> \par History
1607!> none
1608!> \author MattW 21/11/2007
1609! **************************************************************************************************
   SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)

      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
         POINTER :: rs_descs
      INTEGER :: ntasks
      TYPE(task_type), DIMENSION(:), POINTER :: tasks

      CHARACTER(LEN=*), PARAMETER :: routinen = 'load_balance_replicated'

      INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
                 iset, j, jatom, jpgf, jset, &
                 no_overloaded, no_underloaded, &
                 proc_receiving
      INTEGER(KIND=int_8) :: average_cost, cost_task_rep, count, &
                             offset, total_cost_global
      INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: load_imbalance, loads, recv_buf_i
      INTEGER, ALLOCATABLE, DIMENSION(:) :: index
      TYPE(realspace_grid_desc_type), POINTER :: desc

      CALL timeset(routinen, handle)

      desc => rs_descs(1)%rs_desc

      ! allocate local arrays
      ALLOCATE (recv_buf_i(desc%group_size))
      ALLOCATE (loads(desc%group_size))

      ! total per-rank load summed over all grid levels
      recv_buf_i = 0
      DO i = 1, SIZE(rs_descs)
         CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.true.)
         recv_buf_i(:) = recv_buf_i + loads
      END DO

      total_cost_global = sum(recv_buf_i)
      ! NOTE: integer division truncates; the code below guards against the
      ! resulting slight mismatch between sum(imbalances) and zero
      average_cost = total_cost_global/desc%group_size

      !
      ! compute how to redistribute the replicated tasks so that the average cost is reached
      !

      ! load imbalance measures the load of a given CPU relative
      ! to the optimal load distribution (load=average)
      ALLOCATE (load_imbalance(desc%group_size))
      ALLOCATE (index(desc%group_size))

      load_imbalance(:) = recv_buf_i - average_cost
      no_overloaded = 0
      no_underloaded = 0

      DO i = 1, desc%group_size
         IF (load_imbalance(i) > 0) no_overloaded = no_overloaded + 1
         IF (load_imbalance(i) < 0) no_underloaded = no_underloaded + 1
      END DO

      ! sort the recv_buffer on number of tasks, gives us index which provides a
      ! mapping between processor ranks and how overloaded the processor
      CALL sort(recv_buf_i, SIZE(recv_buf_i), index)

      ! find out the number of replicated tasks each proc has
      ! but only those tasks which have not yet been assigned
      ! (destination == source marks a still-unassigned replicated task)
      cost_task_rep = 0
      DO i = 1, ntasks
         IF (tasks(i)%dist_type == 0 &
             .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
            cost_task_rep = cost_task_rep + tasks(i)%cost
         END IF
      END DO

      ! now, correct the load imbalance for the overloaded CPUs
      ! they will send away not more than the total load of replicated tasks
      CALL desc%group%allgather(cost_task_rep, recv_buf_i)

      DO i = 1, desc%group_size
         ! At the moment we can only offload replicated tasks
         IF (load_imbalance(i) > 0) &
            load_imbalance(i) = min(load_imbalance(i), recv_buf_i(i))
      END DO

      ! simplest algorithm I can think of of is that the processor with the most
      ! excess tasks fills up the process needing most, then moves on to next most.
      ! At the moment if we've got less replicated tasks than we're overloaded then
      ! task balancing will be incomplete

      ! only need to do anything if I've excess tasks
      IF (load_imbalance(desc%my_pos + 1) > 0) THEN

         count = 0 ! weighted amount of tasks offloaded
         offset = 0 ! no of underloaded processes already filled by other more overloaded procs

         ! calculate offset
         ! walk down from the most overloaded rank until we find ourselves,
         ! accumulating what the more-overloaded ranks will ship first
         DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
            IF (index(i) == desc%my_pos + 1) THEN
               EXIT
            ELSE
               offset = offset + load_imbalance(index(i))
            END IF
         END DO

         ! find my starting processor to send to
         ! i.e. the first underloaded rank whose deficit is not already
         ! covered by the more-overloaded ranks ahead of us
         proc_receiving = huge(proc_receiving)
         DO i = 1, no_underloaded
            offset = offset + load_imbalance(index(i))
            IF (offset <= 0) THEN
               proc_receiving = i
               EXIT
            END IF
         END DO

         ! offset now contains minus the number of tasks proc_receiving requires
         ! we fill this up by adjusting the destination of tasks on the replicated grid,
         ! then move to next most underloaded proc
         DO j = 1, ntasks
            IF (tasks(j)%dist_type == 0 &
                .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
               ! just avoid sending to non existing procs due to integer truncation
               ! in the computation of the average
               IF (proc_receiving > no_underloaded) EXIT
               ! set new destination
               ! NOTE(review): only ilevel is used below; the remaining field
               ! reads (img, iatom, ..., jpgf) appear to be dead code
               ilevel = tasks(j)%grid_level
               img = tasks(j)%image
               iatom = tasks(j)%iatom
               jatom = tasks(j)%jatom
               iset = tasks(j)%iset
               jset = tasks(j)%jset
               ipgf = tasks(j)%ipgf
               jpgf = tasks(j)%jpgf
               tasks(j)%destination = encode_rank(index(proc_receiving) - 1, ilevel, SIZE(rs_descs))
               offset = offset + tasks(j)%cost
               count = count + tasks(j)%cost
               ! stop once we have shipped away our whole excess
               IF (count >= load_imbalance(desc%my_pos + 1)) EXIT
               IF (offset > 0) THEN
                  ! this receiver is full; move on to the next underloaded rank
                  proc_receiving = proc_receiving + 1
                  ! just avoid sending to non existing procs due to integer truncation
                  ! in the computation of the average
                  IF (proc_receiving > no_underloaded) EXIT
                  offset = load_imbalance(index(proc_receiving))
               END IF
            END IF
         END DO
      END IF

      DEALLOCATE (index)
      DEALLOCATE (load_imbalance)

      CALL timestop(handle)

   END SUBROUTINE load_balance_replicated
1757
1758! **************************************************************************************************
1759!> \brief given an input task list, redistribute so that all tasks can be processed locally,
1760!> i.e. dest equals rank
!> \param rs_descs descriptors of the realspace grids (level 1 provides the process group)
!> \param ntasks number of tasks in the input list
!> \param tasks the task list with final destinations already assigned
!> \param ntasks_recv on return, the number of tasks received for local processing
!> \param tasks_recv on return, the tasks this process will execute locally
1766!> \par History
1767!> none
1768!> \author MattW 21/11/2007
1769! **************************************************************************************************
1770 SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
1771
1772 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1773 POINTER :: rs_descs
1774 INTEGER :: ntasks
1775 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1776 INTEGER :: ntasks_recv
1777 TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1778
1779 CHARACTER(LEN=*), PARAMETER :: routinen = 'create_local_tasks'
1780
1781 INTEGER :: handle, i, j, k, l, rank
1782 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf, send_buf
1783 INTEGER, ALLOCATABLE, DIMENSION(:) :: recv_disps, recv_sizes, send_disps, &
1784 send_sizes
1785 TYPE(realspace_grid_desc_type), POINTER :: desc
1786
1787 CALL timeset(routinen, handle)
1788
1789 desc => rs_descs(1)%rs_desc
1790
1791 ! allocate local arrays
1792 ALLOCATE (send_sizes(desc%group_size))
1793 ALLOCATE (recv_sizes(desc%group_size))
1794 ALLOCATE (send_disps(desc%group_size))
1795 ALLOCATE (recv_disps(desc%group_size))
1796 ALLOCATE (send_buf(desc%group_size))
1797 ALLOCATE (recv_buf(desc%group_size))
1798
1799 ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
1800 send_buf = 0
1801 DO i = 1, ntasks
1802 rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
1803 %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
1804 send_buf(rank + 1) = send_buf(rank + 1) + 1
1805 END DO
1806
1807 CALL desc%group%alltoall(send_buf, recv_buf, 1)
1808
1809 ! pack the tasks, and send them around
1810
1811 send_sizes = 0
1812 send_disps = 0
1813 recv_sizes = 0
1814 recv_disps = 0
1815
1816 send_sizes(1) = int(send_buf(1)*task_size_in_int8)
1817 recv_sizes(1) = int(recv_buf(1)*task_size_in_int8)
1818 DO i = 2, desc%group_size
1819 send_sizes(i) = int(send_buf(i)*task_size_in_int8)
1820 recv_sizes(i) = int(recv_buf(i)*task_size_in_int8)
1821 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1822 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1823 END DO
1824
1825 ! deallocate old send/recv buffers
1826 DEALLOCATE (send_buf)
1827 DEALLOCATE (recv_buf)
1828
1829 ! allocate them with new sizes
1830 ALLOCATE (send_buf(sum(send_sizes)))
1831 ALLOCATE (recv_buf(sum(recv_sizes)))
1832
1833 ! do packing
1834 send_buf = 0
1835 send_sizes = 0
1836 DO j = 1, ntasks
1837 i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
1838 %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
1839 l = send_disps(i) + send_sizes(i)
1840 CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
1841 send_sizes(i) = send_sizes(i) + task_size_in_int8
1842 END DO
1843
1844 ! do communication
1845 CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
1846
1847 DEALLOCATE (send_buf)
1848
1849 ntasks_recv = sum(recv_sizes)/task_size_in_int8
1850 ALLOCATE (tasks_recv(ntasks_recv))
1851
1852 ! do unpacking
1853 l = 0
1854 DO i = 1, desc%group_size
1855 DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
1856 l = l + 1
1857 k = recv_disps(i) + j*task_size_in_int8
1858 CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
1859 END DO
1860 END DO
1861
1862 DEALLOCATE (recv_buf)
1863 DEALLOCATE (send_sizes)
1864 DEALLOCATE (recv_sizes)
1865 DEALLOCATE (send_disps)
1866 DEALLOCATE (recv_disps)
1867
1868 CALL timestop(handle)
1869
1870 END SUBROUTINE create_local_tasks
1871
1872! **************************************************************************************************
1873!> \brief Assembles tasks to be performed on local grid
1874!> \param rs_descs the grids
1875!> \param ntasks Number of tasks for local processing
!> \param natoms number of atoms
!> \param tasks the task set generated on this processor
!> \param atom_pair_send atomic blocks to be sent to other processes (distributed grids only)
!> \param atom_pair_recv atomic blocks to be received from other processes
!> \param symmetric whether the matrices are symmetric (upper-triangle storage)
!> \param reorder_rs_grid_ranks whether the ranks of distributed rs grids may be reordered
!> \param skip_load_balance_distributed skip load balancing on distributed grids
1885!> \par History
1886!> none
1887!> \author MattW 21/11/2007
1888! **************************************************************************************************
1889 SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
1890 tasks, atom_pair_send, atom_pair_recv, &
1891 symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
1892
1893 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1894 POINTER :: rs_descs
1895 INTEGER :: ntasks, natoms
1896 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1897 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
1898 LOGICAL, INTENT(IN) :: symmetric, reorder_rs_grid_ranks, &
1899 skip_load_balance_distributed
1900
1901 CHARACTER(LEN=*), PARAMETER :: routinen = 'distribute_tasks'
1902
1903 INTEGER :: handle, igrid_level, irank, ntasks_recv
1904 INTEGER(KIND=int_8) :: load_gap, max_load, replicated_load
1905 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: total_loads, total_loads_tmp, trial_loads
1906 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: loads
1907 INTEGER, ALLOCATABLE, DIMENSION(:) :: indices, real2virtual, total_index
1908 LOGICAL :: distributed_grids, fixed_first_grid
1909 TYPE(realspace_grid_desc_type), POINTER :: desc
1910 TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1911
1912 CALL timeset(routinen, handle)
1913
1914 cpassert(ASSOCIATED(tasks))
1915
1916 ! *** figure out if we have distributed grids
1917 distributed_grids = .false.
1918 DO igrid_level = 1, SIZE(rs_descs)
1919 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1920 distributed_grids = .true.
1921 END IF
1922 END DO
1923 desc => rs_descs(1)%rs_desc
1924
1925 IF (distributed_grids) THEN
1926
1927 ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
1928 ALLOCATE (total_loads(0:desc%group_size - 1))
1929
1930 total_loads = 0
1931
1932 ! First round of balancing on the distributed grids
1933 ! we just balance each of the distributed grids independently
1934 DO igrid_level = 1, SIZE(rs_descs)
1935 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1936
1937 IF (.NOT. skip_load_balance_distributed) &
1938 CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
1939
1940 CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1941 tasks, use_reordered_ranks=.false.)
1942
1943 total_loads(:) = total_loads + loads(:, igrid_level)
1944
1945 END IF
1946 END DO
1947
1948 ! calculate the total load of replicated tasks, so we can decide if it is worth
1949 ! reordering the distributed grid levels
1950
1951 replicated_load = 0
1952 DO igrid_level = 1, SIZE(rs_descs)
1953 IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
1954 CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1955 tasks, use_reordered_ranks=.false.)
1956 replicated_load = replicated_load + sum(loads(:, igrid_level))
1957 END IF
1958 END DO
1959
1960 !IF (desc%my_pos==0) THEN
1961 ! WRITE(*,*) "Total replicated load is ",replicated_load
1962 !END IF
1963
1964 ! Now we adjust the rank ordering based on the current loads
1965 ! we leave the first distributed level and all the replicated levels in the default order
1966 IF (reorder_rs_grid_ranks) THEN
1967 fixed_first_grid = .false.
1968 DO igrid_level = 1, SIZE(rs_descs)
1969 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1970 IF (fixed_first_grid .EQV. .false.) THEN
1971 total_loads(:) = loads(:, igrid_level)
1972 fixed_first_grid = .true.
1973 ELSE
1974 ALLOCATE (trial_loads(0:desc%group_size - 1))
1975
1976 trial_loads(:) = total_loads + loads(:, igrid_level)
1977 max_load = maxval(trial_loads)
1978 load_gap = 0
1979 DO irank = 0, desc%group_size - 1
1980 load_gap = load_gap + max_load - trial_loads(irank)
1981 END DO
1982
1983 ! If there is not enough replicated load to load balance well enough
1984 ! then we will reorder this grid level
1985 IF (load_gap > replicated_load*1.05_dp) THEN
1986
1987 ALLOCATE (indices(0:desc%group_size - 1))
1988 ALLOCATE (total_index(0:desc%group_size - 1))
1989 ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
1990 ALLOCATE (real2virtual(0:desc%group_size - 1))
1991
1992 total_loads_tmp(:) = total_loads
1993 CALL sort(total_loads_tmp, desc%group_size, total_index)
1994 CALL sort(loads(:, igrid_level), desc%group_size, indices)
1995
1996 ! Reorder so that the rank with smallest load on this grid level is paired with
1997 ! the highest load in total
1998 DO irank = 0, desc%group_size - 1
1999 total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
2000 loads(desc%group_size - irank - 1, igrid_level)
2001 real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
2002 END DO
2003
2004 CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
2005
2006 DEALLOCATE (indices)
2007 DEALLOCATE (total_index)
2008 DEALLOCATE (total_loads_tmp)
2009 DEALLOCATE (real2virtual)
2010 ELSE
2011 total_loads(:) = trial_loads
2012 END IF
2013
2014 DEALLOCATE (trial_loads)
2015
2016 END IF
2017 END IF
2018 END DO
2019 END IF
2020
2021 ! Now we use the replicated tasks to balance out the rest of the load
2022 CALL load_balance_replicated(rs_descs, ntasks, tasks)
2023
2024 !total_loads = 0
2025 !DO igrid_level=1,SIZE(rs_descs)
2026 ! CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
2027 ! tasks, use_reordered_ranks=.TRUE.)
2028 ! total_loads = total_loads + loads(:, igrid_level)
2029 !END DO
2030
2031 !IF (desc%my_pos==0) THEN
2032 ! WRITE(*,*) ""
2033 ! WRITE(*,*) "At the end of the load balancing procedure"
2034 ! WRITE(*,*) "Maximum load:",MAXVAL(total_loads)
2035 ! WRITE(*,*) "Average load:",SUM(total_loads)/SIZE(total_loads)
2036 ! WRITE(*,*) "Minimum load:",MINVAL(total_loads)
2037 !ENDIF
2038
2039 ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
2040 CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
2041
2042 !
2043 ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
2044 ! we will be sending. These lists are needed for redistribute_matrix.
2045 !
2046 CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.true., symmetric=symmetric, rs_descs=rs_descs)
2047
2048 ! natom_send=SIZE(atom_pair_send)
2049 ! CALL desc%group%sum(natom_send)
2050 ! IF (desc%my_pos==0) THEN
2051 ! WRITE(*,*) ""
2052 ! WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
2053 ! ENDIF
2054
2055 CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.false., symmetric=symmetric, rs_descs=rs_descs)
2056
2057 ! cleanup, at this point we don't need the original tasks anymore
2058 DEALLOCATE (tasks)
2059 DEALLOCATE (loads)
2060 DEALLOCATE (total_loads)
2061
2062 ELSE
2063 tasks_recv => tasks
2064 ntasks_recv = ntasks
2065 CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.false., symmetric=symmetric, rs_descs=rs_descs)
2066 ! not distributed, hence atom_pair_send not needed
2067 END IF
2068
2069 ! here we sort the task list we will process locally.
2070 ALLOCATE (indices(ntasks_recv))
2071 CALL tasks_sort(tasks_recv, ntasks_recv, indices)
2072 DEALLOCATE (indices)
2073
2074 !
2075 ! final lists are ready
2076 !
2077
2078 tasks => tasks_recv
2079 ntasks = ntasks_recv
2080
2081 CALL timestop(handle)
2082
2083 END SUBROUTINE distribute_tasks
2084
2085! **************************************************************************************************
!> \brief computes the sorted list of unique atom pairs (matrix blocks) referenced by a task list,
!>        and tags each task with the index of its pair
!> \param atom_pair on return, the unique atom pairs to be sent/received
!> \param tasks the task list to scan; the pair_index of each task is set
!> \param ntasks number of valid entries in tasks
!> \param send .TRUE. if the pairs describe blocks this process sends, .FALSE. for receives
!> \param symmetric whether iatom/jatom should be mapped to the upper triangle
!> \param rs_descs descriptors of the realspace grids (used for rank decoding)
2094! **************************************************************************************************
2095 SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
2096
2097 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair
2098 TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: tasks
2099 INTEGER, INTENT(IN) :: ntasks
2100 LOGICAL, INTENT(IN) :: send, symmetric
2101 TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
2102
2103 INTEGER :: i, ilevel, iatom, jatom, npairs, virt_rank
2104 INTEGER, DIMENSION(:), ALLOCATABLE :: indices
2105 TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE :: atom_pair_tmp
2106
2107 cpassert(.NOT. ASSOCIATED(atom_pair))
2108 IF (ntasks == 0) THEN
2109 ALLOCATE (atom_pair(0))
2110 RETURN
2111 END IF
2112
2113 ! calculate list of atom pairs
2114 ! fill pair list taking into account symmetry
2115 ALLOCATE (atom_pair_tmp(ntasks))
2116 DO i = 1, ntasks
2117 atom_pair_tmp(i)%image = tasks(i)%image
2118 iatom = tasks(i)%iatom
2119 jatom = tasks(i)%jatom
2120 IF (symmetric .AND. iatom > jatom) THEN
2121 ! iatom / jatom swapped
2122 atom_pair_tmp(i)%row = jatom
2123 atom_pair_tmp(i)%col = iatom
2124 ELSE
2125 atom_pair_tmp(i)%row = iatom
2126 atom_pair_tmp(i)%col = jatom
2127 END IF
2128
2129 IF (send) THEN
2130 ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
2131 ! actually has the correct part of the rs_grid to do the mapping
2132 ilevel = tasks(i)%grid_level
2133 virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
2134 atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
2135 ELSE
2136 ! If we are receiving, then no conversion is needed as the rank is that of the process with the
2137 ! required matrix block, and the ordering of the rs grid is irrelevant
2138 atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
2139 END IF
2140 END DO
2141
2142 ! find unique atom pairs that I'm sending/receiving
2143 ALLOCATE (indices(ntasks))
2144 CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
2145 npairs = 1
2146 tasks(indices(1))%pair_index = 1
2147 DO i = 2, ntasks
2148 IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
2149 npairs = npairs + 1
2150 atom_pair_tmp(npairs) = atom_pair_tmp(i)
2151 END IF
2152 tasks(indices(i))%pair_index = npairs
2153 END DO
2154 DEALLOCATE (indices)
2155
2156 ! Copy unique pairs to final location.
2157 ALLOCATE (atom_pair(npairs))
2158 atom_pair(:) = atom_pair_tmp(:npairs)
2159 DEALLOCATE (atom_pair_tmp)
2160
2161 END SUBROUTINE get_atom_pair
2162
2163! **************************************************************************************************
2164!> \brief redistributes the matrix so that it can be used in realspace operations
2165!> i.e. according to the task lists for collocate and integrate.
2166!> This routine can become a bottleneck in large calculations.
!> \param rs_descs the realspace grid descriptors (level 1 provides the process group)
!> \param pmats the density matrices, one per image
!> \param atom_pair_send atomic blocks this process sends (from get_atom_pair with send=.TRUE.)
!> \param atom_pair_recv atomic blocks this process receives
!> \param nimages number of matrix images
!> \param scatter if .TRUE. scatter P blocks to the grid owners, otherwise gather into hmats
!> \param hmats the KS matrices to accumulate into (required when scatter is .FALSE.)
2175! **************************************************************************************************
2176 SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
2177 nimages, scatter, hmats)
2178
2179 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
2180 POINTER :: rs_descs
2181 TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: pmats
2182 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
2183 INTEGER :: nimages
2184 LOGICAL :: scatter
2185 TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
2186 POINTER :: hmats
2187
2188 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_distribute_matrix'
2189
2190 INTEGER :: acol, arow, handle, i, img, j, k, l, me, &
2191 nblkcols_total, nblkrows_total, ncol, &
2192 nrow, nthread, nthread_left
2193 INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
2194 recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
2195 send_pair_disps, send_sizes
2196 INTEGER, DIMENSION(:), POINTER :: col_blk_size, row_blk_size
2197 LOGICAL :: found
2198 REAL(kind=dp), ALLOCATABLE, DIMENSION(:), TARGET :: recv_buf_r, send_buf_r
2199 REAL(kind=dp), DIMENSION(:, :), POINTER :: h_block, p_block
2200 TYPE(dbcsr_type), POINTER :: hmat, pmat
2201 TYPE(realspace_grid_desc_type), POINTER :: desc
2202 REAL(kind=dp), DIMENSION(:), POINTER :: vector
2203
2204!$ INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2205
2206 CALL timeset(routinen, handle)
2207
2208 IF (.NOT. scatter) THEN
2209 cpassert(PRESENT(hmats))
2210 END IF
2211
2212 desc => rs_descs(1)%rs_desc
2213 me = desc%my_pos + 1
2214
2215 ! allocate local arrays
2216 ALLOCATE (send_sizes(desc%group_size))
2217 ALLOCATE (recv_sizes(desc%group_size))
2218 ALLOCATE (send_disps(desc%group_size))
2219 ALLOCATE (recv_disps(desc%group_size))
2220 ALLOCATE (send_pair_count(desc%group_size))
2221 ALLOCATE (recv_pair_count(desc%group_size))
2222 ALLOCATE (send_pair_disps(desc%group_size))
2223 ALLOCATE (recv_pair_disps(desc%group_size))
2224
2225 pmat => pmats(1)%matrix
2226 CALL dbcsr_get_info(pmat, &
2227 row_blk_size=row_blk_size, &
2228 col_blk_size=col_blk_size, &
2229 nblkrows_total=nblkrows_total, &
2230 nblkcols_total=nblkcols_total)
2231 ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
2232 first_col(nblkcols_total), last_col(nblkcols_total))
2233 CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
2234 CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
2235
2236 ! set up send buffer sizes
2237 send_sizes = 0
2238 send_pair_count = 0
2239 DO i = 1, SIZE(atom_pair_send)
2240 k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
2241 arow = atom_pair_send(i)%row
2242 acol = atom_pair_send(i)%col
2243 nrow = last_row(arow) - first_row(arow) + 1
2244 ncol = last_col(acol) - first_col(acol) + 1
2245 send_sizes(k) = send_sizes(k) + nrow*ncol
2246 send_pair_count(k) = send_pair_count(k) + 1
2247 END DO
2248
2249 send_disps = 0
2250 send_pair_disps = 0
2251 DO i = 2, desc%group_size
2252 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
2253 send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
2254 END DO
2255
2256 ALLOCATE (send_buf_r(sum(send_sizes)))
2257
2258 ! set up recv buffer
2259
2260 recv_sizes = 0
2261 recv_pair_count = 0
2262 DO i = 1, SIZE(atom_pair_recv)
2263 k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
2264 arow = atom_pair_recv(i)%row
2265 acol = atom_pair_recv(i)%col
2266 nrow = last_row(arow) - first_row(arow) + 1
2267 ncol = last_col(acol) - first_col(acol) + 1
2268 recv_sizes(k) = recv_sizes(k) + nrow*ncol
2269 recv_pair_count(k) = recv_pair_count(k) + 1
2270 END DO
2271
2272 recv_disps = 0
2273 recv_pair_disps = 0
2274 DO i = 2, desc%group_size
2275 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
2276 recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
2277 END DO
2278 ALLOCATE (recv_buf_r(sum(recv_sizes)))
2279
2280!$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
2281!$OMP SHARED(desc,send_pair_count,send_pair_disps,nimages),&
2282!$OMP SHARED(last_row,first_row,last_col,first_col),&
2283!$OMP SHARED(pmats,send_buf_r,send_disps,send_sizes),&
2284!$OMP SHARED(atom_pair_send,me,hmats,nblkrows_total),&
2285!$OMP SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
2286!$OMP SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
2287!$OMP PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
2288!$OMP PRIVATE(nthread,h_block,nthread_left,hmat,pmat,vector)
2289
2290 nthread = 1
2291!$ nthread = omp_get_num_threads()
2292 nthread_left = 1
2293!$ nthread_left = MAX(1, nthread - 1)
2294
2295 ! do packing
2296!$OMP DO schedule(guided)
2297 DO l = 1, desc%group_size
2298 IF (l == me) cycle
2299 send_sizes(l) = 0
2300 DO i = 1, send_pair_count(l)
2301 arow = atom_pair_send(send_pair_disps(l) + i)%row
2302 acol = atom_pair_send(send_pair_disps(l) + i)%col
2303 img = atom_pair_send(send_pair_disps(l) + i)%image
2304 nrow = last_row(arow) - first_row(arow) + 1
2305 ncol = last_col(acol) - first_col(acol) + 1
2306 pmat => pmats(img)%matrix
2307 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2308 cpassert(found)
2309
2310 DO k = 1, ncol
2311 DO j = 1, nrow
2312 send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
2313 END DO
2314 END DO
2315 send_sizes(l) = send_sizes(l) + nrow*ncol
2316 END DO
2317 END DO
2318!$OMP END DO
2319
2320 IF (.NOT. scatter) THEN
2321 ! We need locks to protect concurrent summation into H
2322!$OMP SINGLE
2323!$ ALLOCATE (locks(nthread*10))
2324!$OMP END SINGLE
2325
2326!$OMP DO
2327!$ do i = 1, nthread*10
2328!$ call omp_init_lock(locks(i))
2329!$ end do
2330!$OMP END DO
2331 END IF
2332
2333!$OMP MASTER
2334 ! do communication
2335 CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
2336 recv_buf_r, recv_sizes, recv_disps)
2337!$OMP END MASTER
2338
2339 ! If this is a scatter, then no need to copy local blocks,
2340 ! If not, we sum them directly into H (bypassing the alltoall)
2341 IF (.NOT. scatter) THEN
2342
2343 ! Distribute work over remaining threads assuming one is still in the alltoall
2344!$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
2345 DO i = 1, send_pair_count(me)
2346 arow = atom_pair_send(send_pair_disps(me) + i)%row
2347 acol = atom_pair_send(send_pair_disps(me) + i)%col
2348 img = atom_pair_send(send_pair_disps(me) + i)%image
2349 nrow = last_row(arow) - first_row(arow) + 1
2350 ncol = last_col(acol) - first_col(acol) + 1
2351 hmat => hmats(img)%matrix
2352 pmat => pmats(img)%matrix
2353 CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, block=h_block, found=found)
2354 cpassert(found)
2355 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2356 cpassert(found)
2357
2358!$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2359 DO k = 1, ncol
2360 DO j = 1, nrow
2361 h_block(j, k) = h_block(j, k) + p_block(j, k)
2362 END DO
2363 END DO
2364!$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2365 END DO
2366!$OMP END DO
2367 ELSE
2368 ! We will insert new blocks into P, so create mutable work matrices
2369 DO img = 1, nimages
2370 pmat => pmats(img)%matrix
2371 CALL dbcsr_work_create(pmat, work_mutable=.true., &
2372 nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
2373 n=nthread)
2374 END DO
2375 END IF
2376
2377! wait for comm and setup to finish
2378!$OMP BARRIER
2379
2380 !do unpacking
2381!$OMP DO schedule(guided)
2382 DO l = 1, desc%group_size
2383 IF (l == me) cycle
2384 recv_sizes(l) = 0
2385 DO i = 1, recv_pair_count(l)
2386 arow = atom_pair_recv(recv_pair_disps(l) + i)%row
2387 acol = atom_pair_recv(recv_pair_disps(l) + i)%col
2388 img = atom_pair_recv(recv_pair_disps(l) + i)%image
2389 nrow = last_row(arow) - first_row(arow) + 1
2390 ncol = last_col(acol) - first_col(acol) + 1
2391 pmat => pmats(img)%matrix
2392 NULLIFY (p_block)
2393 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2394
2395 IF (PRESENT(hmats)) THEN
2396 hmat => hmats(img)%matrix
2397 CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, block=h_block, found=found)
2398 cpassert(found)
2399 END IF
2400
2401 IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
2402 vector => recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol)
2403 CALL dbcsr_put_block(pmat, arow, acol, block=reshape(vector, [nrow, ncol]))
2404 END IF
2405 IF (.NOT. scatter) THEN
2406!$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2407 DO k = 1, ncol
2408 DO j = 1, nrow
2409 h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
2410 END DO
2411 END DO
2412!$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2413 END IF
2414 recv_sizes(l) = recv_sizes(l) + nrow*ncol
2415 END DO
2416 END DO
2417!$OMP END DO
2418
2419!$ IF (.not. scatter) THEN
2420!$OMP DO
2421!$ do i = 1, nthread*10
2422!$ call omp_destroy_lock(locks(i))
2423!$ end do
2424!$OMP END DO
2425!$ END IF
2426
2427!$OMP SINGLE
2428!$ IF (.not. scatter) THEN
2429!$ DEALLOCATE (locks)
2430!$ END IF
2431!$OMP END SINGLE NOWAIT
2432
2433 IF (scatter) THEN
2434 ! Blocks were added to P
2435 DO img = 1, nimages
2436 pmat => pmats(img)%matrix
2437 CALL dbcsr_finalize(pmat)
2438 END DO
2439 END IF
2440!$OMP END PARALLEL
2441
2442 DEALLOCATE (send_buf_r)
2443 DEALLOCATE (recv_buf_r)
2444
2445 DEALLOCATE (send_sizes)
2446 DEALLOCATE (recv_sizes)
2447 DEALLOCATE (send_disps)
2448 DEALLOCATE (recv_disps)
2449 DEALLOCATE (send_pair_count)
2450 DEALLOCATE (recv_pair_count)
2451 DEALLOCATE (send_pair_disps)
2452 DEALLOCATE (recv_pair_disps)
2453
2454 DEALLOCATE (first_row, last_row, first_col, last_col)
2455
2456 CALL timestop(handle)
2457
2458 END SUBROUTINE rs_distribute_matrix
2459
2460! **************************************************************************************************
2461!> \brief Calculates offsets and sizes for rs_scatter_matrix and rs_copy_matrix.
2462!> \author Ole Schuett
2463! **************************************************************************************************
2464 SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
2465 pair_offsets, rank_offsets, rank_sizes, buffer_size)
2466 TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: pairs
2467 INTEGER, DIMENSION(:), INTENT(IN) :: nsgf
2468 INTEGER, INTENT(IN) :: group_size
2469 INTEGER, DIMENSION(:), POINTER :: pair_offsets, rank_offsets, rank_sizes
2470 INTEGER, INTENT(INOUT) :: buffer_size
2471
2472 INTEGER :: acol, arow, i, block_size, total_size, k, prev_k
2473
2474 IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
2475 IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
2476 IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
2477
2478 ! calculate buffer_size and pair_offsets
2479 ALLOCATE (pair_offsets(SIZE(pairs)))
2480 total_size = 0
2481 DO i = 1, SIZE(pairs)
2482 pair_offsets(i) = total_size
2483 arow = pairs(i)%row
2484 acol = pairs(i)%col
2485 block_size = nsgf(arow)*nsgf(acol)
2486 total_size = total_size + block_size
2487 END DO
2488 buffer_size = total_size
2489
2490 ! calculate rank_offsets and rank_sizes
2491 ALLOCATE (rank_offsets(group_size))
2492 ALLOCATE (rank_sizes(group_size))
2493 rank_offsets = 0
2494 rank_sizes = 0
2495 IF (SIZE(pairs) > 0) THEN
2496 prev_k = pairs(1)%rank + 1
2497 DO i = 1, SIZE(pairs)
2498 k = pairs(i)%rank + 1
2499 cpassert(k >= prev_k) ! expecting the pairs to be ordered by rank
2500 IF (k > prev_k) THEN
2501 rank_offsets(k) = pair_offsets(i)
2502 rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
2503 prev_k = k
2504 END IF
2505 END DO
2506 rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
2507 END IF
2508
2509 END SUBROUTINE rs_calc_offsets
2510
2511! **************************************************************************************************
2512!> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
2513!> \author Ole Schuett
2514! **************************************************************************************************
2515 SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
2516 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2517 TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2518 TYPE(task_list_type), INTENT(IN) :: task_list
2519 TYPE(mp_comm_type), INTENT(IN) :: group
2520
2521 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_scatter_matrices'
2522
2523 INTEGER :: handle
2524 REAL(kind=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2525
2526 CALL timeset(routinen, handle)
2527 ALLOCATE (buffer_send(task_list%buffer_size_send))
2528
2529 ! pack dbcsr blocks into send buffer
2530 cpassert(ASSOCIATED(task_list%atom_pair_send))
2531 CALL rs_pack_buffer(src_matrices=src_matrices, &
2532 dest_buffer=buffer_send, &
2533 atom_pair=task_list%atom_pair_send, &
2534 pair_offsets=task_list%pair_offsets_send)
2535
2536 ! mpi all-to-all communication, receiving directly into blocks_recv%buffer.
2537 CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
2538 dest_buffer%host_buffer, &
2539 task_list%rank_sizes_recv, task_list%rank_offsets_recv)
2540
2541 DEALLOCATE (buffer_send)
2542 CALL timestop(handle)
2543
2544 END SUBROUTINE rs_scatter_matrices
2545
2546! **************************************************************************************************
2547!> \brief Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
2548!> \author Ole Schuett
2549! **************************************************************************************************
2550 SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
2551 TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2552 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2553 TYPE(task_list_type), INTENT(IN) :: task_list
2554 TYPE(mp_comm_type), INTENT(IN) :: group
2555
2556 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_gather_matrices'
2557
2558 INTEGER :: handle
2559 REAL(kind=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2560
2561 CALL timeset(routinen, handle)
2562
2563 ! Caution: The meaning of send and recv are reversed in this routine.
2564 ALLOCATE (buffer_send(task_list%buffer_size_send)) ! e.g. this is actually used for receiving
2565
2566 ! mpi all-to-all communication
2567 CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
2568 buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
2569
2570 ! unpack dbcsr blocks from send buffer
2571 cpassert(ASSOCIATED(task_list%atom_pair_send))
2572 CALL rs_unpack_buffer(src_buffer=buffer_send, &
2573 dest_matrices=dest_matrices, &
2574 atom_pair=task_list%atom_pair_send, &
2575 pair_offsets=task_list%pair_offsets_send)
2576
2577 DEALLOCATE (buffer_send)
2578 CALL timestop(handle)
2579
2580 END SUBROUTINE rs_gather_matrices
2581
2582! **************************************************************************************************
2583!> \brief Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
2584!> \author Ole Schuett
2585! **************************************************************************************************
2586 SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
2587 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2588 TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2589 TYPE(task_list_type), INTENT(IN) :: task_list
2590
2591 CALL rs_pack_buffer(src_matrices=src_matrices, &
2592 dest_buffer=dest_buffer%host_buffer, &
2593 atom_pair=task_list%atom_pair_recv, &
2594 pair_offsets=task_list%pair_offsets_recv)
2595
2596 END SUBROUTINE rs_copy_to_buffer
2597
! **************************************************************************************************
!> \brief Copies from buffer into DBCSR matrices, replaces rs_gather_matrix for
!>        non-distributed grids.
!> \param src_buffer    offload buffer whose host part holds the packed blocks
!> \param dest_matrices matrices whose blocks are updated from the buffer
!> \param task_list     provides the atom pair list and the per-pair buffer offsets
!> \author Ole Schuett
! **************************************************************************************************
   SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
      TYPE(offload_buffer_type), INTENT(IN)              :: src_buffer
      TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
      TYPE(task_list_type), INTENT(IN)                   :: task_list

      ! Delegate to the shared unpacking helper, using the receive-side pair bookkeeping.
      CALL rs_unpack_buffer(dest_matrices=dest_matrices, &
                            src_buffer=src_buffer%host_buffer, &
                            pair_offsets=task_list%pair_offsets_recv, &
                            atom_pair=task_list%atom_pair_recv)

   END SUBROUTINE rs_copy_to_matrices
2613
! **************************************************************************************************
!> \brief Helper routine for rs_scatter_matrix and rs_copy_to_buffer.
!>        Flattens each listed DBCSR block into its reserved slot of a contiguous buffer.
!> \param src_matrices matrices (one per image) whose blocks are read
!> \param dest_buffer  flat buffer receiving the packed blocks
!> \param atom_pair    list of (row, col, image) triples, one per block to pack
!> \param pair_offsets start offset of each pair's slot inside dest_buffer
!> \author Ole Schuett
! **************************************************************************************************
   SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
      TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
      REAL(kind=dp), DIMENSION(:), INTENT(INOUT)         :: dest_buffer
      TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: atom_pair
      INTEGER, DIMENSION(:), INTENT(IN)                  :: pair_offsets

      INTEGER                                            :: blk_len, col, img, ipair, pos, row
      LOGICAL                                            :: found
      REAL(kind=dp), DIMENSION(:, :), POINTER            :: blk

!$OMP PARALLEL DEFAULT(NONE), &
!$OMP SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
!$OMP PRIVATE(blk_len,col,img,ipair,pos,row,found,blk)
!$OMP DO schedule(guided)
      DO ipair = 1, SIZE(atom_pair)
         row = atom_pair(ipair)%row
         col = atom_pair(ipair)%col
         img = atom_pair(ipair)%image
         ! fetch a pointer to the (row, col) block of the image's matrix
         CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=row, col=col, &
                                block=blk, found=found)
         cpassert(found)
         pos = pair_offsets(ipair)
         blk_len = SIZE(blk)
         ! flatten the 2d block into the contiguous slot reserved for this pair
         dest_buffer(pos + 1:pos + blk_len) = RESHAPE(blk, shape=[blk_len])
      END DO
!$OMP END DO
!$OMP END PARALLEL

   END SUBROUTINE rs_pack_buffer
2647
! **************************************************************************************************
!> \brief Helper routine for rs_gather_matrix and rs_copy_to_matrices.
!>        Adds each packed block of src_buffer onto the matching DBCSR block.
!>        Several entries of atom_pair may target the same block row, hence
!>        updates are serialized via a small pool of row-hashed OpenMP locks.
!> \param src_buffer    flat buffer holding the packed blocks
!> \param dest_matrices matrices (one per image) whose blocks are incremented
!> \param atom_pair     list of (row, col, image) triples, one per packed block
!> \param pair_offsets  start offset of each pair's data inside src_buffer
!> \author Ole Schuett
! **************************************************************************************************
   SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
      REAL(kind=dp), DIMENSION(:), INTENT(IN)            :: src_buffer
      TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
      TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: atom_pair
      INTEGER, DIMENSION(:), INTENT(IN)                  :: pair_offsets

      INTEGER                                            :: acol, arow, img, i, offset, &
                                                            nrows, ncols
      LOGICAL                                            :: found
      REAL(kind=dp), DIMENSION(:, :), POINTER            :: block
!$    INTEGER                                            :: lock_num
!$    INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks

      ! OMP_LIB is only imported under the "!$" sentinel (see module header), so all
      ! lock handling must be conditionally compiled as well - otherwise the module
      ! does not compile when OpenMP is disabled.
      ! initialize locks
!$    ALLOCATE (locks(10*omp_get_max_threads()))
!$    DO i = 1, SIZE(locks)
!$       CALL omp_init_lock(locks(i))
!$    END DO

!$OMP PARALLEL DEFAULT(NONE), &
!$OMP SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
!$OMP PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
!$OMP DO schedule(guided)
      DO i = 1, SIZE(atom_pair)
         arow = atom_pair(i)%row
         acol = atom_pair(i)%col
         img = atom_pair(i)%image
         CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
                                block=block, found=found)
         cpassert(found)
         nrows = SIZE(block, 1)
         ncols = SIZE(block, 2)
         offset = pair_offsets(i)
!$       lock_num = modulo(arow, SIZE(locks)) + 1 ! map matrix rows round-robin to available locks

         ! accumulate under the row's lock: different pairs may hit the same block
!$       CALL omp_set_lock(locks(lock_num))
         block = block + reshape(src_buffer(offset + 1:offset + nrows*ncols), shape=[nrows, ncols])
!$       CALL omp_unset_lock(locks(lock_num))
      END DO
!$OMP END DO
!$OMP END PARALLEL

      ! destroy locks
!$    DO i = 1, SIZE(locks)
!$       CALL omp_destroy_lock(locks(i))
!$    END DO
!$    DEALLOCATE (locks)

   END SUBROUTINE rs_unpack_buffer
2700
! **************************************************************************************************
!> \brief determines the rank of the processor whose real rs grid contains point
!> \param rs_desc descriptor of the (distributed) realspace grid on this level
!> \param igrid_level grid level this task belongs to
!> \param n_levels total number of grid levels (needed to encode rank and level together)
!> \param cube_center global grid coordinates of the cube center
!> \param ntasks on entry the index of the task to fill in, updated when extra tasks are appended
!> \param tasks task array, possibly reallocated when generalised tasks are appended
!> \param lb_cube lower bounds of the cube relative to its center
!> \param ub_cube upper bounds of the cube relative to its center
!> \param added_tasks number of tasks generated for this cube (>= 1)
!> \par History
!>        11.2007 created [MattW]
!>        10.2008 rewritten [Joost VandeVondele]
!> \author MattW
! **************************************************************************************************
   SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
                           lb_cube, ub_cube, added_tasks)

      TYPE(realspace_grid_desc_type), POINTER :: rs_desc
      INTEGER, INTENT(IN) :: igrid_level, n_levels
      INTEGER, DIMENSION(3), INTENT(IN) :: cube_center
      INTEGER, INTENT(INOUT) :: ntasks
      TYPE(task_type), DIMENSION(:), POINTER :: tasks
      INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
      INTEGER, INTENT(OUT) :: added_tasks

      ! growth parameters used when the tasks array must be reallocated
      INTEGER, PARAMETER :: add_tasks = 1000
      REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp

      INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
                 lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
      INTEGER :: bit_pattern
      LOGICAL :: dir_periodic(3)

      ! map the cube center onto the process grid and look up the owning rank
      coord(1) = rs_desc%x2coord(cube_center(1))
      coord(2) = rs_desc%y2coord(cube_center(2))
      coord(3) = rs_desc%z2coord(cube_center(3))
      dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))

      ! the real cube coordinates
      lbc = lb_cube + cube_center
      ubc = ub_cube + cube_center

      ! does the full cube fit inside the owner's local domain extended by its halo?
      IF (all((rs_desc%lb_global(:, dest) - rs_desc%border) <= lbc) .AND. &
          all((rs_desc%ub_global(:, dest) + rs_desc%border) >= ubc)) THEN
         !standard distributed collocation/integration
         tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
         tasks(ntasks)%dist_type = 1
         tasks(ntasks)%subpatch_pattern = 0
         added_tasks = 1

         ! here we figure out if there is an alternate location for this task
         ! i.e. even though the cube_center is not in the real local domain,
         ! it might fully fit in the halo of the neighbor
         ! if its radius is smaller than the maximum radius
         ! the list of possible neighbors is stored here as a bit pattern
         ! to reconstruct what the rank of the neighbor is,
         ! we can use (note this requires the correct rs_grid)
         ! IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[-1,0,0])
         ! IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[+1,0,0])
         ! IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,-1,0])
         ! IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,+1,0])
         ! IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,-1])
         ! IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,[0,0,+1])
         bit_index = 0
         bit_pattern = 0
         ! two bits per direction: bit 2*i = fits left neighbor, bit 2*i+1 = fits right neighbor
         DO i = 1, 3
            IF (rs_desc%perd(i) == 1) THEN
               ! periodic direction: no alternate neighbor placement possible
               bit_pattern = ibclr(bit_pattern, bit_index)
               bit_index = bit_index + 1
               bit_pattern = ibclr(bit_pattern, bit_index)
               bit_index = bit_index + 1
            ELSE
               ! fits the left neighbor ?
               IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
                  bit_pattern = ibset(bit_pattern, bit_index)
                  bit_index = bit_index + 1
               ELSE
                  bit_pattern = ibclr(bit_pattern, bit_index)
                  bit_index = bit_index + 1
               END IF
               ! fits the right neighbor ?
               IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
                  bit_pattern = ibset(bit_pattern, bit_index)
                  bit_index = bit_index + 1
               ELSE
                  bit_pattern = ibclr(bit_pattern, bit_index)
                  bit_index = bit_index + 1
               END IF
            END IF
         END DO
         tasks(ntasks)%subpatch_pattern = bit_pattern

      ELSE
         ! generalised collocation/integration needed
         ! first we figure out how many neighbors we have to add to include the lbc/ubc
         ! in the available domains (inclusive of halo points)
         ! first we 'ignore' periodic boundary conditions
         ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
         lb_coord = coord
         ub_coord = coord
         lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
         ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
         DO i = 1, 3
            ! only if the grid is not periodic in this direction we need to take care of adding neighbors
            IF (rs_desc%perd(i) == 0) THEN
               ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
               DO
                  IF (lb_domain(i) > lbc(i)) THEN
                     lb_coord(i) = lb_coord(i) - 1
                     icoord = modulo(lb_coord, rs_desc%group_dim)
                     idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
                     lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
                  ELSE
                     EXIT
                  END IF
               END DO
               ! same for the upper bound
               DO
                  IF (ub_domain(i) < ubc(i)) THEN
                     ub_coord(i) = ub_coord(i) + 1
                     icoord = modulo(ub_coord, rs_desc%group_dim)
                     idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
                     ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
                  ELSE
                     EXIT
                  END IF
               END DO
            END IF
         END DO

         ! some care is needed for the periodic boundaries
         ! if the accumulated domains wrap around the full grid, treat the direction as periodic
         DO i = 1, 3
            IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
               dir_periodic(i) = .true.
               lb_coord(i) = 0
               ub_coord(i) = rs_desc%group_dim(i) - 1
            ELSE
               dir_periodic(i) = .false.
            END IF
         END DO

         ! one task per rank in the covered coordinate box; grow the task array if needed
         added_tasks = product(ub_coord - lb_coord + 1)
         itask = ntasks
         ntasks = ntasks + added_tasks - 1
         IF (ntasks > SIZE(tasks)) THEN
            curr_tasks = int((SIZE(tasks) + add_tasks)*mult_tasks)
            CALL reallocate_tasks(tasks, curr_tasks)
         END IF
         DO iz = lb_coord(3), ub_coord(3)
            DO iy = lb_coord(2), ub_coord(2)
               DO ix = lb_coord(1), ub_coord(1)
                  icoord = modulo([ix, iy, iz], rs_desc%group_dim)
                  idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
                  tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
                  tasks(itask)%dist_type = 2
                  tasks(itask)%subpatch_pattern = 0
                  ! encode the domain size for this task
                  ! if the bit is set, we need to add the border in that direction
                  IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
                     tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 0)
                  IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
                     tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 1)
                  IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
                     tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 2)
                  IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
                     tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 3)
                  IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
                     tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 4)
                  IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
                     tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 5)
                  itask = itask + 1
               END DO
            END DO
         END DO
      END IF

   END SUBROUTINE rs_find_node
2880
! **************************************************************************************************
!> \brief utility functions for encoding the grid level with a rank, allowing
!>        different grid levels to maintain a different rank ordering without
!>        losing information. These encoded_ints are stored in the tasks(1:2,:) array
!> \param rank MPI rank to encode
!> \param grid_level grid level (1-based) to encode
!> \param n_levels total number of grid levels
!> \return combined integer, sortable by rank first
!> \par History
!>      4.2009 created [Iain Bethune]
!>             (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
! **************************************************************************************************
   FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)

      INTEGER, INTENT(IN) :: rank, grid_level, n_levels
      INTEGER :: encoded_int

      ! interleave (rank, grid_level) so that sorting encoded values still orders by rank
      encoded_int = grid_level - 1 + n_levels*rank

   END FUNCTION encode_rank
2903
! **************************************************************************************************
!> \brief Extracts the rank from an integer produced by encode_rank.
!> \param encoded_int combined (rank, grid_level) value
!> \param n_levels total number of grid levels used for the encoding
!> \return the decoded rank
! **************************************************************************************************
   FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)

      INTEGER, INTENT(IN) :: encoded_int, n_levels
      INTEGER :: rank

      ! integer division strips off the grid-level part of the encoding
      rank = encoded_int/n_levels

   END FUNCTION decode_rank
2919
! **************************************************************************************************
!> \brief Extracts the (1-based) grid level from an integer produced by encode_rank.
!> \param encoded_int combined (rank, grid_level) value
!> \param n_levels total number of grid levels used for the encoding
!> \return the decoded grid level
! **************************************************************************************************
   FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)

      INTEGER, INTENT(IN) :: encoded_int, n_levels
      INTEGER :: grid_level

      ! the remainder holds grid_level - 1, hence the shift back to 1-based
      grid_level = modulo(encoded_int, n_levels) + 1

   END FUNCTION decode_level
2935
! **************************************************************************************************
!> \brief Sort pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) for which all atom pairs are
!>        grouped, and for each atom pair all set pairs are grouped and for each set pair,
!>        all pgfs are grouped.
!>        This yields the right order of the tasks for collocation after the sort
!>        in distribute_tasks. E.g. for an atom pair, all sets and pgfs are computed in one go.
!>        The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
!>        that a given density matrix block will be decontracted several times,
!>        but cache effects on the grid make up for this.
!> \param a first task
!> \param b second task
!> \return .TRUE. iff a orders strictly before b
!> \author Ole Schuett
! **************************************************************************************************
   PURE FUNCTION tasks_less_than(a, b) RESULT(res)
      TYPE(task_type), INTENT(IN) :: a, b
      LOGICAL :: res

      INTEGER, DIMENSION(8) :: key_a, key_b
      INTEGER :: n

      ! lexicographic comparison on (grid_level, image, iatom, jatom, iset, jset, ipgf, jpgf)
      key_a = [a%grid_level, a%image, a%iatom, a%jatom, a%iset, a%jset, a%ipgf, a%jpgf]
      key_b = [b%grid_level, b%image, b%iatom, b%jatom, b%iset, b%jset, b%ipgf, b%jpgf]

      res = .FALSE.
      DO n = 1, 8
         IF (key_a(n) /= key_b(n)) THEN
            res = key_a(n) < key_b(n)
            RETURN
         END IF
      END DO
   END FUNCTION tasks_less_than
2980
2981
! **************************************************************************************************
!> \brief Sorts an array inplace using a combination of merge- and bubble-sort.
!>        It also returns the indices, which the elements had before the sort.
!> \param arr the array to sort
!> \param n length of array
!> \param indices returns elements-indices before the sort
!> \par History
!>      12.2012 created [ole]
!> \author Ole Schuett
! **************************************************************************************************
   SUBROUTINE tasks_sort(arr, n, indices)
      INTEGER, INTENT(IN) :: n
      TYPE(task_type), DIMENSION(1:n), INTENT(INOUT) :: arr
      INTEGER, DIMENSION(1:n), INTENT(INOUT) :: indices

      INTEGER :: k
      TYPE(task_type), ALLOCATABLE, DIMENSION(:) :: scratch
      INTEGER, ALLOCATABLE, DIMENSION(:) :: scratch_idx

      ! nothing to do for an empty array
      IF (n <= 0) RETURN

      ! record the original positions before sorting
      indices(1:n) = [(k, k=1, n)]

      ! a single element is already sorted (a frequent case in cp2k)
      IF (n == 1) RETURN

      ! the merge step needs scratch space of half the array length
      ALLOCATE (scratch((n + 1)/2), scratch_idx((n + 1)/2))
      CALL tasks_sort_low(arr(1:n), indices(1:n), scratch, scratch_idx)
      DEALLOCATE (scratch, scratch_idx)

   END SUBROUTINE tasks_sort
3015
! **************************************************************************************************
!> \brief The actual sort routine. Only tasks_sort and itself should call this.
!>        Recursive merge sort that falls back to bubble sort for short sublists.
!> \param arr the array to sort
!> \param indices elements-indices before the sort
!> \param tmp_arr scratch space, holds (size(arr)+1)/2 elements
!> \param tmp_idx scratch space, holds (size(arr)+1)/2 elements
!> \par History
!>      12.2012 created [ole]
!> \author Ole Schuett
! **************************************************************************************************
   RECURSIVE SUBROUTINE tasks_sort_low(arr, indices, tmp_arr, tmp_idx)
      TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: arr
      INTEGER, DIMENSION(size(arr)), INTENT(INOUT) :: indices
      TYPE(task_type), DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_arr
      INTEGER, DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_idx
      TYPE(task_type) :: a
      INTEGER :: t, m, i, j, k
      LOGICAL :: swapped
      ! a,t: used during swapping of elements in arr and indices

      swapped = .true.

      ! If only a few elements are left we switch to bubble-sort for efficiency.
      IF (size(arr) <= 7) THEN ! 7 seems to be a good choice for the moment
         DO j = size(arr) - 1, 1, -1
            swapped = .false.
            DO i = 1, j
               IF (tasks_less_than(arr(i + 1), arr(i))) THEN
                  ! swap arr(i) with arr(i+1)
                  a = arr(i)
                  arr(i) = arr(i + 1)
                  arr(i + 1) = a
                  ! swap indices(i) with indices(i+1)
                  t = indices(i)
                  indices(i) = indices(i + 1)
                  indices(i + 1) = t
                  swapped = .true.
               END IF
            END DO
            ! early exit: a pass without swaps means the sublist is already sorted
            IF (.NOT. swapped) EXIT
         END DO
         RETURN
      END IF

      ! split list in half and recursively sort both sublists
      m = (size(arr) + 1)/2 ! index where we're going to divide the list in two
      CALL tasks_sort_low(arr(1:m), indices(1:m), tmp_arr, tmp_idx)
      CALL tasks_sort_low(arr(m + 1:), indices(m + 1:), tmp_arr, tmp_idx)

      ! Check for a special case: Can we just concatenate the two sorted sublists?
      ! This leads to O(n) scaling if the input is already sorted.
      IF (tasks_less_than(arr(m + 1), arr(m))) THEN
         ! ...no - let's merge the two sorted sublists arr(:m) and arr(m+1:)
         ! Merge will be performed directly in arr. Need backup of first sublist.
         tmp_arr(1:m) = arr(1:m)
         tmp_idx(1:m) = indices(1:m)
         i = 1; ! number of elements consumed from 1st sublist
         j = 1; ! number of elements consumed from 2nd sublist
         k = 1; ! number of elements already merged

         DO WHILE (i <= m .and. j <= size(arr) - m)
            IF (tasks_less_than(arr(m + j), tmp_arr(i))) THEN
               arr(k) = arr(m + j)
               indices(k) = indices(m + j)
               j = j + 1
            ELSE
               arr(k) = tmp_arr(i)
               indices(k) = tmp_idx(i)
               i = i + 1
            END IF
            k = k + 1
         END DO

         ! One of the two sublists is now empty.
         ! Copy possibly remaining tail of 1st sublist
         DO WHILE (i <= m)
            arr(k) = tmp_arr(i)
            indices(k) = tmp_idx(i)
            i = i + 1
            k = k + 1
         END DO
         ! The possibly remaining tail of 2nd sublist is already at the right spot.
      END IF

   END SUBROUTINE tasks_sort_low
3101
3102
! **************************************************************************************************
!> \brief Order atom pairs to find duplicates.
!> \param a first atom pair
!> \param b second atom pair
!> \return .TRUE. iff a orders strictly before b
!> \author Ole Schuett
! **************************************************************************************************
   PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
      TYPE(atom_pair_type), INTENT(IN) :: a, b
      LOGICAL :: res

      INTEGER, DIMENSION(4) :: key_a, key_b
      INTEGER :: n

      ! lexicographic comparison on (rank, row, col, image)
      key_a = [a%rank, a%row, a%col, a%image]
      key_b = [b%rank, b%row, b%col, b%image]

      res = .FALSE.
      DO n = 1, 4
         IF (key_a(n) /= key_b(n)) THEN
            res = key_a(n) < key_b(n)
            RETURN
         END IF
      END DO
   END FUNCTION atom_pair_less_than
3128
3129
! **************************************************************************************************
!> \brief Sorts an array inplace using a combination of merge- and bubble-sort.
!>        It also returns the indices, which the elements had before the sort.
!> \param arr the array to sort
!> \param n length of array
!> \param indices returns elements-indices before the sort
!> \par History
!>      12.2012 created [ole]
!> \author Ole Schuett
! **************************************************************************************************
   SUBROUTINE atom_pair_sort(arr, n, indices)
      INTEGER, INTENT(IN) :: n
      TYPE(atom_pair_type), DIMENSION(1:n), INTENT(INOUT) :: arr
      INTEGER, DIMENSION(1:n), INTENT(INOUT) :: indices

      INTEGER :: k
      TYPE(atom_pair_type), ALLOCATABLE, DIMENSION(:) :: scratch
      INTEGER, ALLOCATABLE, DIMENSION(:) :: scratch_idx

      ! nothing to do for an empty array
      IF (n <= 0) RETURN

      ! record the original positions before sorting
      indices(1:n) = [(k, k=1, n)]

      ! a single element is already sorted (a frequent case in cp2k)
      IF (n == 1) RETURN

      ! the merge step needs scratch space of half the array length
      ALLOCATE (scratch((n + 1)/2), scratch_idx((n + 1)/2))
      CALL atom_pair_sort_low(arr(1:n), indices(1:n), scratch, scratch_idx)
      DEALLOCATE (scratch, scratch_idx)

   END SUBROUTINE atom_pair_sort
3163
! **************************************************************************************************
!> \brief The actual sort routine. Only atom_pair_sort and itself should call this.
!>        Recursive merge sort that falls back to bubble sort for short sublists.
!> \param arr the array to sort
!> \param indices elements-indices before the sort
!> \param tmp_arr scratch space, holds (size(arr)+1)/2 elements
!> \param tmp_idx scratch space, holds (size(arr)+1)/2 elements
!> \par History
!>      12.2012 created [ole]
!> \author Ole Schuett
! **************************************************************************************************
   RECURSIVE SUBROUTINE atom_pair_sort_low(arr, indices, tmp_arr, tmp_idx)
      TYPE(atom_pair_type), DIMENSION(:), INTENT(INOUT) :: arr
      INTEGER, DIMENSION(size(arr)), INTENT(INOUT) :: indices
      TYPE(atom_pair_type), DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_arr
      INTEGER, DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_idx
      TYPE(atom_pair_type) :: a
      INTEGER :: t, m, i, j, k
      LOGICAL :: swapped
      ! a,t: used during swapping of elements in arr and indices

      swapped = .true.

      ! If only a few elements are left we switch to bubble-sort for efficiency.
      IF (size(arr) <= 7) THEN ! 7 seems to be a good choice for the moment
         DO j = size(arr) - 1, 1, -1
            swapped = .false.
            DO i = 1, j
               IF (atom_pair_less_than(arr(i + 1), arr(i))) THEN
                  ! swap arr(i) with arr(i+1)
                  a = arr(i)
                  arr(i) = arr(i + 1)
                  arr(i + 1) = a
                  ! swap indices(i) with indices(i+1)
                  t = indices(i)
                  indices(i) = indices(i + 1)
                  indices(i + 1) = t
                  swapped = .true.
               END IF
            END DO
            ! early exit: a pass without swaps means the sublist is already sorted
            IF (.NOT. swapped) EXIT
         END DO
         RETURN
      END IF

      ! split list in half and recursively sort both sublists
      m = (size(arr) + 1)/2 ! index where we're going to divide the list in two
      CALL atom_pair_sort_low(arr(1:m), indices(1:m), tmp_arr, tmp_idx)
      CALL atom_pair_sort_low(arr(m + 1:), indices(m + 1:), tmp_arr, tmp_idx)

      ! Check for a special case: Can we just concatenate the two sorted sublists?
      ! This leads to O(n) scaling if the input is already sorted.
      IF (atom_pair_less_than(arr(m + 1), arr(m))) THEN
         ! ...no - let's merge the two sorted sublists arr(:m) and arr(m+1:)
         ! Merge will be performed directly in arr. Need backup of first sublist.
         tmp_arr(1:m) = arr(1:m)
         tmp_idx(1:m) = indices(1:m)
         i = 1; ! number of elements consumed from 1st sublist
         j = 1; ! number of elements consumed from 2nd sublist
         k = 1; ! number of elements already merged

         DO WHILE (i <= m .and. j <= size(arr) - m)
            IF (atom_pair_less_than(arr(m + j), tmp_arr(i))) THEN
               arr(k) = arr(m + j)
               indices(k) = indices(m + j)
               j = j + 1
            ELSE
               arr(k) = tmp_arr(i)
               indices(k) = tmp_idx(i)
               i = i + 1
            END IF
            k = k + 1
         END DO

         ! One of the two sublists is now empty.
         ! Copy possibly remaining tail of 1st sublist
         DO WHILE (i <= m)
            arr(k) = tmp_arr(i)
            indices(k) = tmp_idx(i)
            i = i + 1
            k = k + 1
         END DO
         ! The possibly remaining tail of 2nd sublist is already at the right spot.
      END IF

   END SUBROUTINE atom_pair_sort_low
3249
3250
3251END MODULE task_list_methods
void grid_create_basis_set(const int nset, const int nsgf, const int maxco, const int maxpgf, const int lmin[nset], const int lmax[nset], const int npgf[nset], const int nsgf_set[nset], const int first_sgf[nset], const double sphi[nsgf][maxco], const double zet[nset][maxpgf], grid_basis_set **basis_set_out)
Allocates a basis set which can be passed to grid_create_task_list. See grid_task_list....
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
All kind of helpful little routines.
Definition ao_util.F:14
real(kind=dp) function, public exp_radius_very_extended(la_min, la_max, lb_min, lb_max, pab, o1, o2, ra, rb, rp, zetp, eps, prefactor, cutoff, epsabs)
computes the radius of the Gaussian outside of which it is smaller than eps
Definition ao_util.F:209
subroutine, public get_gto_basis_set(gto_basis_set, name, aliases, norm_type, kind_radius, ncgf, nset, nsgf, cgf_symbol, sgf_symbol, norm_cgf, set_radius, lmax, lmin, lx, ly, lz, m, ncgf_set, npgf, nsgf_set, nshell, cphi, pgf_radius, sphi, scon, zet, first_cgf, first_sgf, l, last_cgf, last_sgf, n, gcc, maxco, maxl, maxpgf, maxsgf_set, maxshell, maxso, nco_sum, npgf_sum, nshell_sum, maxder, short_kind_radius, npgf_seg_sum)
...
Handles all functions related to the CELL.
Definition cell_types.F:15
Defines control structures, which contain the parameters and the settings for the DFT-based calculati...
subroutine, public dbcsr_get_block_p(matrix, row, col, block, found, row_size, col_size)
...
subroutine, public dbcsr_get_info(matrix, nblkrows_total, nblkcols_total, nfullrows_total, nfullcols_total, nblkrows_local, nblkcols_local, nfullrows_local, nfullcols_local, my_prow, my_pcol, local_rows, local_cols, proc_row_dist, proc_col_dist, row_blk_size, col_blk_size, row_blk_offset, col_blk_offset, distribution, name, matrix_type, group)
...
subroutine, public dbcsr_work_create(matrix, nblks_guess, sizedata_guess, n, work_mutable)
...
subroutine, public dbcsr_finalize(matrix)
...
subroutine, public dbcsr_put_block(matrix, row, col, block, summation)
...
for a given dr()/dh(r) this will provide the bounds to be used if one wants to go over a sphere-subre...
Definition cube_utils.F:18
subroutine, public compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
unifies the computation of the cube center, so that differences in implementation,...
Definition cube_utils.F:68
subroutine, public return_cube(info, radius, lb_cube, ub_cube, sphere_bounds)
...
Definition cube_utils.F:140
subroutine, public return_cube_nonortho(info, radius, lb, ub, rp)
...
Definition cube_utils.F:91
integer function, public gaussian_gridlevel(gridlevel_info, exponent)
...
Fortran API for the grid package, which is written in C.
Definition grid_api.F:12
subroutine, public grid_create_task_list(ntasks, natoms, nkinds, nblocks, block_offsets, atom_positions, atom_kinds, basis_sets, level_list, iatom_list, jatom_list, iset_list, jset_list, ipgf_list, jpgf_list, border_mask_list, block_num_list, radius_list, rab_list, rs_grids, task_list)
Allocates a task list which can be passed to grid_collocate_task_list.
Definition grid_api.F:724
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
integer, parameter, public default_string_length
Definition kinds.F:57
Types and basic routines needed for a kpoint calculation.
subroutine, public get_kpoint_info(kpoint, kp_scheme, nkp_grid, kp_shift, symmetry, verbose, full_grid, use_real_wfn, eps_geo, parallel_group_size, kp_range, nkp, xkp, wkp, para_env, blacs_env_all, para_env_kp, para_env_inter_kp, blacs_env, kp_env, kp_aux_env, mpools, iogrp, nkp_groups, kp_dist, cell_to_index, index_to_cell, sab_nl, sab_nl_nosym)
Retrieve information from a kpoint environment.
An array-based list which grows on demand. When the internal array is full, a new array of twice the ...
Definition list.F:24
Utility routines for the memory handling.
Interface to the message passing library MPI.
Fortran API for the offload package, which is written in C.
Definition offload_api.F:12
subroutine, public offload_create_buffer(length, buffer)
Allocates a buffer of given length, ie. number of elements.
Define methods related to particle_type.
subroutine, public get_particle_set(particle_set, qs_kind_set, first_sgf, last_sgf, nsgf, nmao, basis)
Get the components of a particle set.
Define the data structure for the particle information.
container for various plainwaves related things
subroutine, public pw_env_get(pw_env, pw_pools, cube_info, gridlevel_info, auxbas_pw_pool, auxbas_grid, auxbas_rs_desc, auxbas_rs_grid, rs_descs, rs_grids, xc_pw_pool, vdw_pw_pool, poisson_env, interp_section)
returns the various attributes of the pw env
Define the quickstep kind type and their sub types.
subroutine, public get_qs_kind(qs_kind, basis_set, basis_type, ncgf, nsgf, all_potential, tnadd_potential, gth_potential, sgp_potential, upf_potential, cneo_potential, se_parameter, dftb_parameter, xtb_parameter, dftb3_param, zatom, zeff, elec_conf, mao, lmax_dftb, alpha_core_charge, ccore_charge, core_charge, core_charge_radius, paw_proj_set, paw_atom, hard_radius, hard0_radius, max_rad_local, covalent_radius, vdw_radius, gpw_type_forced, harmonics, max_iso_not0, max_s_harm, grid_atom, ngrid_ang, ngrid_rad, lmax_rho0, dft_plus_u_atom, l_of_dft_plus_u, n_of_dft_plus_u, u_minus_j, u_of_dft_plus_u, j_of_dft_plus_u, alpha_of_dft_plus_u, beta_of_dft_plus_u, j0_of_dft_plus_u, occupation_of_dft_plus_u, dispersion, bs_occupation, magnetization, no_optimize, addel, laddel, naddel, orbitals, max_scf, eps_scf, smear, u_ramping, u_minus_j_target, eps_u_ramping, init_u_ramping_each_scf, reltmat, ghost, monovalent, floating, name, element_symbol, pao_basis_size, pao_model_file, pao_potentials, pao_descriptors, nelec)
Get attributes of an atomic kind.
subroutine, public get_ks_env(ks_env, v_hartree_rspace, s_mstruct_changed, rho_changed, exc_accint, potential_changed, forces_up_to_date, complex_ks, matrix_h, matrix_h_im, matrix_ks, matrix_ks_im, matrix_vxc, kinetic, matrix_s, matrix_s_ri_aux, matrix_w, matrix_p_mp2, matrix_p_mp2_admm, matrix_h_kp, matrix_h_im_kp, matrix_ks_kp, matrix_vxc_kp, kinetic_kp, matrix_s_kp, matrix_w_kp, matrix_s_ri_aux_kp, matrix_ks_im_kp, rho, rho_xc, vppl, xcint_weights, rho_core, rho_nlcc, rho_nlcc_g, vee, neighbor_list_id, sab_orb, sab_all, sac_ae, sac_ppl, sac_lri, sap_ppnl, sap_oce, sab_lrc, sab_se, sab_xtbe, sab_tbe, sab_core, sab_xb, sab_xtb_pp, sab_xtb_nonbond, sab_vdw, sab_scp, sab_almo, sab_kp, sab_kp_nosym, sab_cneo, task_list, task_list_soft, kpoints, do_kpoints, atomic_kind_set, qs_kind_set, cell, cell_ref, use_ref_cell, particle_set, energy, force, local_particles, local_molecules, molecule_kind_set, molecule_set, subsys, cp_subsys, virial, results, atprop, nkind, natom, dft_control, dbcsr_dist, distribution_2d, pw_env, para_env, blacs_env, nelectron_total, nelectron_spin)
...
Define the neighbor list data types and the corresponding functionality.
subroutine, public rs_grid_create(rs, desc)
...
pure integer function, public rs_grid_locate_rank(rs_desc, rank_in, shift)
returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in only possible if...
pure subroutine, public rs_grid_reorder_ranks(desc, real2virtual)
Defines a new ordering of ranks on this realspace grid, recalculating the data bounds and reallocatin...
subroutine, public rs_grid_release(rs_grid)
releases the given rs grid (see doc/ReferenceCounting.html)
generate the tasks lists used by collocate and integrate routines
subroutine, public rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
subroutine, public rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
subroutine, public rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, nimages, scatter, hmats)
redistributes the matrix so that it can be used in realspace operations i.e. according to the task li...
subroutine, public distribute_tasks(rs_descs, ntasks, natoms, tasks, atom_pair_send, atom_pair_recv, symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
Assembles tasks to be performed on local grid.
subroutine, public rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
subroutine, public task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, cube_info, gridlevel_info, cindex, iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
...
subroutine, public generate_qs_task_list(ks_env, task_list, basis_type, reorder_rs_grid_ranks, skip_load_balance_distributed, pw_env_external, sab_orb_external, ext_kpoints)
...
subroutine, public rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
types for task lists
subroutine, public serialize_task(task, serialized_task)
Serialize a task into an integer array. Used for MPI communication.
subroutine, public deserialize_task(task, serialized_task)
De-serialize a task from an integer array. Used for MPI communication.
subroutine, public reallocate_tasks(tasks, new_size)
Grow an array of tasks while preserving the existing entries.
integer, parameter, public task_size_in_int8
All kind of helpful little routines.
Definition util.F:14
Type defining parameters related to the simulation cell.
Definition cell_types.F:60
Contains information about kpoints.
contained for different pw related things
Provides all information about a quickstep kind.
calculation environment to calculate the ks matrix, holds all the needed vars. assumes that the core ...