(git:374b731)
Loading...
Searching...
No Matches
task_list_methods.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief generate the tasks lists used by collocate and integrate routines
10!> \par History
11!> 01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate
12!> \author Joost VandeVondele
13! **************************************************************************************************
21 USE cell_types, ONLY: cell_type, &
22 pbc
28 USE dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
29 dbcsr_finalize, &
30 dbcsr_get_block_p, &
31 dbcsr_get_info, &
32 dbcsr_p_type, &
33 dbcsr_put_block, &
34 dbcsr_type, &
35 dbcsr_work_create
38 USE kinds, ONLY: default_string_length, &
39 dp, &
40 int_8
41 USE kpoint_types, ONLY: get_kpoint_info, &
44 USE message_passing, ONLY: &
48 USE pw_env_types, ONLY: pw_env_get, &
50 USE qs_kind_types, ONLY: get_qs_kind, &
52 USE qs_ks_types, ONLY: get_ks_env, &
68 USE util, ONLY: sort
69
70!$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
71!$ omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
72#include "./base/base_uses.f90"
73
74
75 IMPLICIT NONE
76
77 LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .false.
78
79 PRIVATE
80
81 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
82
83 PUBLIC :: generate_qs_task_list, &
85 PUBLIC :: distribute_tasks, &
91
92CONTAINS
93
94! **************************************************************************************************
95!> \brief ...
96!> \param ks_env ...
97!> \param task_list ...
98!> \param reorder_rs_grid_ranks Flag that indicates if this routine should
99!> or should not overwrite the rs descriptor (see comment below)
100!> \param skip_load_balance_distributed ...
101!> \param soft_valid ...
102!> \param basis_type ...
103!> \param pw_env_external ...
104!> \param sab_orb_external ...
105!> \par History
106!> 01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
107!> 04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
108!> (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
109!> 06.2015 adjusted to be used with multiple images (k-points) [JGH]
110!> \note If this routine is called several times with different task lists,
111!> the default behaviour is to re-optimize the grid ranks and overwrite
112!> the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
113!> of performing a new optimization by leaving the rank order in
114!> its current state.
115! **************************************************************************************************
116 SUBROUTINE generate_qs_task_list(ks_env, task_list, &
117 reorder_rs_grid_ranks, skip_load_balance_distributed, &
118 soft_valid, basis_type, pw_env_external, sab_orb_external)
119
120 TYPE(qs_ks_env_type), POINTER :: ks_env
121 TYPE(task_list_type), POINTER :: task_list
122 LOGICAL, INTENT(IN) :: reorder_rs_grid_ranks, &
123 skip_load_balance_distributed
124 LOGICAL, INTENT(IN), OPTIONAL :: soft_valid
125 CHARACTER(LEN=*), INTENT(IN), OPTIONAL :: basis_type
126 TYPE(pw_env_type), OPTIONAL, POINTER :: pw_env_external
127 TYPE(neighbor_list_set_p_type), DIMENSION(:), &
128 OPTIONAL, POINTER :: sab_orb_external
129
130 CHARACTER(LEN=*), PARAMETER :: routinen = 'generate_qs_task_list'
131 INTEGER, PARAMETER :: max_tasks = 2000
132
133 CHARACTER(LEN=default_string_length) :: my_basis_type
134 INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
135 ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
136 jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
137 INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: blocks
138 INTEGER, DIMENSION(3) :: cellind
139 INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
140 npgfb, nsgf
141 INTEGER, DIMENSION(:, :, :), POINTER :: cell_to_index
142 LOGICAL :: dokp, my_soft
143 REAL(kind=dp) :: kind_radius_a, kind_radius_b
144 REAL(kind=dp), DIMENSION(3) :: ra, rab
145 REAL(kind=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
146 REAL(kind=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
147 TYPE(cell_type), POINTER :: cell
148 TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
149 TYPE(dft_control_type), POINTER :: dft_control
150 TYPE(gridlevel_info_type), POINTER :: gridlevel_info
151 TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER :: basis_set_list
152 TYPE(gto_basis_set_type), POINTER :: basis_set_a, basis_set_b, orb_basis_set
153 TYPE(kpoint_type), POINTER :: kpoints
154 TYPE(neighbor_list_set_p_type), DIMENSION(:), &
155 POINTER :: sab_orb
156 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
157 TYPE(pw_env_type), POINTER :: pw_env
158 TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
159 TYPE(qs_kind_type), POINTER :: qs_kind
160 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
161 POINTER :: rs_descs
162 TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
163 TYPE(task_type), DIMENSION(:), POINTER :: tasks
164
165 CALL timeset(routinen, handle)
166
167 CALL get_ks_env(ks_env, &
168 qs_kind_set=qs_kind_set, &
169 cell=cell, &
170 particle_set=particle_set, &
171 dft_control=dft_control)
172
173 ! OPTION 1) basis is set through input
174 ! OPTION 2) soft orb basis is requested
175 ! OPTION 3) by default, the full orb basis density is calculated
176 my_soft = .false.
177 IF (PRESENT(soft_valid)) my_soft = soft_valid
178 IF (PRESENT(basis_type)) THEN
179 cpassert(.NOT. my_soft)
180 my_basis_type = basis_type
181 ELSEIF (my_soft) THEN
182 my_basis_type = "ORB_SOFT"
183 ELSE
184 my_basis_type = "ORB"
185 END IF
186
187 CALL get_ks_env(ks_env, sab_orb=sab_orb)
188 IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
189
190 CALL get_ks_env(ks_env, pw_env=pw_env)
191 IF (PRESENT(pw_env_external)) pw_env => pw_env_external
192 CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
193
194 ! *** assign from pw_env
195 gridlevel_info => pw_env%gridlevel_info
196 cube_info => pw_env%cube_info
197
198 ! find maximum numbers
199 nkind = SIZE(qs_kind_set)
200 natoms = SIZE(particle_set)
201 maxset = 0
202 maxpgf = 0
203 DO ikind = 1, nkind
204 qs_kind => qs_kind_set(ikind)
205 CALL get_qs_kind(qs_kind=qs_kind, &
206 basis_set=orb_basis_set, basis_type=my_basis_type)
207
208 IF (.NOT. ASSOCIATED(orb_basis_set)) cycle
209 CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
210
211 maxset = max(nseta, maxset)
212 maxpgf = max(maxval(npgfa), maxpgf)
213 END DO
214
215 ! kpoint related
216 nimages = dft_control%nimages
217 IF (nimages > 1) THEN
218 dokp = .true.
219 NULLIFY (kpoints)
220 CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
221 CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
222 ELSE
223 dokp = .false.
224 NULLIFY (cell_to_index)
225 END IF
226
227 ! free the atom_pair lists if allocated
228 IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
229 IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
230
231 ! construct a list of all tasks
232 IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
233 CALL reallocate_tasks(task_list%tasks, max_tasks)
234 END IF
235 task_list%ntasks = 0
236 curr_tasks = SIZE(task_list%tasks)
237
238 ALLOCATE (basis_set_list(nkind))
239 DO ikind = 1, nkind
240 qs_kind => qs_kind_set(ikind)
241 CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
242 basis_type=my_basis_type)
243 IF (ASSOCIATED(basis_set_a)) THEN
244 basis_set_list(ikind)%gto_basis_set => basis_set_a
245 ELSE
246 NULLIFY (basis_set_list(ikind)%gto_basis_set)
247 END IF
248 END DO
249!!$OMP PARALLEL DEFAULT(NONE) &
250!!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info, &
251!!$OMP curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
252!!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
253!!$OMP la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
254!!$OMP lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
255!!$OMP cindex, slot)
256 ! Loop over neighbor list
257!!$OMP DO SCHEDULE(GUIDED)
258 DO slot = 1, sab_orb(1)%nl_size
259 ikind = sab_orb(1)%nlist_task(slot)%ikind
260 jkind = sab_orb(1)%nlist_task(slot)%jkind
261 iatom = sab_orb(1)%nlist_task(slot)%iatom
262 jatom = sab_orb(1)%nlist_task(slot)%jatom
263 rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
264 cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)
265
266 basis_set_a => basis_set_list(ikind)%gto_basis_set
267 IF (.NOT. ASSOCIATED(basis_set_a)) cycle
268 basis_set_b => basis_set_list(jkind)%gto_basis_set
269 IF (.NOT. ASSOCIATED(basis_set_b)) cycle
270 ra(:) = pbc(particle_set(iatom)%r, cell)
271 ! basis ikind
272 la_max => basis_set_a%lmax
273 la_min => basis_set_a%lmin
274 npgfa => basis_set_a%npgf
275 nseta = basis_set_a%nset
276 rpgfa => basis_set_a%pgf_radius
277 set_radius_a => basis_set_a%set_radius
278 kind_radius_a = basis_set_a%kind_radius
279 zeta => basis_set_a%zet
280 ! basis jkind
281 lb_max => basis_set_b%lmax
282 lb_min => basis_set_b%lmin
283 npgfb => basis_set_b%npgf
284 nsetb = basis_set_b%nset
285 rpgfb => basis_set_b%pgf_radius
286 set_radius_b => basis_set_b%set_radius
287 kind_radius_b = basis_set_b%kind_radius
288 zetb => basis_set_b%zet
289
290 IF (dokp) THEN
291 cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
292 ELSE
293 cindex = 1
294 END IF
295
296 CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
297 rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
298 iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
299 set_radius_a, set_radius_b, ra, rab, &
300 la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
301
302 END DO
303!!$OMP END PARALLEL
304
305 ! redistribute the task list so that all tasks map on the local rs grids
306 CALL distribute_tasks( &
307 rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
308 tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
309 atom_pair_recv=task_list%atom_pair_recv, symmetric=.true., &
310 reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
311 skip_load_balance_distributed=skip_load_balance_distributed)
312
313 ! compute offsets for rs_scatter_matrix / rs_copy_matrix
314 ALLOCATE (nsgf(natoms))
315 CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
316 IF (ASSOCIATED(task_list%atom_pair_send)) THEN
317 ! only needed when there is a distributed grid
318 CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
319 nsgf=nsgf, &
320 group_size=rs_descs(1)%rs_desc%group_size, &
321 pair_offsets=task_list%pair_offsets_send, &
322 rank_offsets=task_list%rank_offsets_send, &
323 rank_sizes=task_list%rank_sizes_send, &
324 buffer_size=task_list%buffer_size_send)
325 END IF
326 CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
327 nsgf=nsgf, &
328 group_size=rs_descs(1)%rs_desc%group_size, &
329 pair_offsets=task_list%pair_offsets_recv, &
330 rank_offsets=task_list%rank_offsets_recv, &
331 rank_sizes=task_list%rank_sizes_recv, &
332 buffer_size=task_list%buffer_size_recv)
333 DEALLOCATE (basis_set_list, nsgf)
334
335 ! If the rank order has changed, reallocate any of the distributed rs_grids
336 IF (reorder_rs_grid_ranks) THEN
337 DO i = 1, gridlevel_info%ngrid_levels
338 IF (rs_descs(i)%rs_desc%distributed) THEN
339 CALL rs_grid_release(rs_grids(i))
340 CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
341 END IF
342 END DO
343 END IF
344
345 CALL create_grid_task_list(task_list=task_list, &
346 qs_kind_set=qs_kind_set, &
347 particle_set=particle_set, &
348 cell=cell, &
349 basis_type=my_basis_type, &
350 rs_grids=rs_grids)
351
352 ! Now we have the final list of tasks, setup the task_list with the
353 ! data needed for the loops in integrate_v/calculate_rho
354
355 IF (ASSOCIATED(task_list%taskstart)) THEN
356 DEALLOCATE (task_list%taskstart)
357 END IF
358 IF (ASSOCIATED(task_list%taskstop)) THEN
359 DEALLOCATE (task_list%taskstop)
360 END IF
361 IF (ASSOCIATED(task_list%npairs)) THEN
362 DEALLOCATE (task_list%npairs)
363 END IF
364
365 ! First, count the number of unique atom pairs per grid level
366
367 ALLOCATE (task_list%npairs(SIZE(rs_descs)))
368
369 iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
370 ipair = 0
371 task_list%npairs = 0
372
373 DO i = 1, task_list%ntasks
374 igrid_level = task_list%tasks(i)%grid_level
375 img = task_list%tasks(i)%image
376 iatom = task_list%tasks(i)%iatom
377 jatom = task_list%tasks(i)%jatom
378 iset = task_list%tasks(i)%iset
379 jset = task_list%tasks(i)%jset
380 ipgf = task_list%tasks(i)%ipgf
381 jpgf = task_list%tasks(i)%jpgf
382 IF (igrid_level .NE. igrid_level_old) THEN
383 IF (igrid_level_old .NE. -1) THEN
384 task_list%npairs(igrid_level_old) = ipair
385 END IF
386 ipair = 1
387 igrid_level_old = igrid_level
388 iatom_old = iatom
389 jatom_old = jatom
390 img_old = img
391 ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN
392 ipair = ipair + 1
393 iatom_old = iatom
394 jatom_old = jatom
395 img_old = img
396 END IF
397 END DO
398 ! Take care of the last iteration
399 IF (task_list%ntasks /= 0) THEN
400 task_list%npairs(igrid_level) = ipair
401 END IF
402
403 ! Second, for each atom pair, find the indices in the task list
404 ! of the first and last task
405
406 ! Array sized for worst case
407 ALLOCATE (task_list%taskstart(maxval(task_list%npairs), SIZE(rs_descs)))
408 ALLOCATE (task_list%taskstop(maxval(task_list%npairs), SIZE(rs_descs)))
409
410 iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
411 ipair = 0
412 task_list%taskstart = 0
413 task_list%taskstop = 0
414
415 DO i = 1, task_list%ntasks
416 igrid_level = task_list%tasks(i)%grid_level
417 img = task_list%tasks(i)%image
418 iatom = task_list%tasks(i)%iatom
419 jatom = task_list%tasks(i)%jatom
420 iset = task_list%tasks(i)%iset
421 jset = task_list%tasks(i)%jset
422 ipgf = task_list%tasks(i)%ipgf
423 jpgf = task_list%tasks(i)%jpgf
424 IF (igrid_level .NE. igrid_level_old) THEN
425 IF (igrid_level_old .NE. -1) THEN
426 task_list%taskstop(ipair, igrid_level_old) = i - 1
427 END IF
428 ipair = 1
429 task_list%taskstart(ipair, igrid_level) = i
430 igrid_level_old = igrid_level
431 iatom_old = iatom
432 jatom_old = jatom
433 img_old = img
434 ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN
435 ipair = ipair + 1
436 task_list%taskstart(ipair, igrid_level) = i
437 task_list%taskstop(ipair - 1, igrid_level) = i - 1
438 iatom_old = iatom
439 jatom_old = jatom
440 img_old = img
441 END IF
442 END DO
443 ! Take care of the last iteration
444 IF (task_list%ntasks /= 0) THEN
445 task_list%taskstop(ipair, igrid_level) = task_list%ntasks
446 END IF
447
448 ! Debug task destribution
449 IF (debug_this_module) THEN
450 tasks => task_list%tasks
451 WRITE (6, *)
452 WRITE (6, *) "Total number of tasks ", task_list%ntasks
453 DO igrid_level = 1, gridlevel_info%ngrid_levels
454 WRITE (6, *) "Total number of pairs(grid_level) ", &
455 igrid_level, task_list%npairs(igrid_level)
456 END DO
457 WRITE (6, *)
458
459 DO igrid_level = 1, gridlevel_info%ngrid_levels
460
461 ALLOCATE (blocks(natoms, natoms, nimages))
462 blocks = -1
463 DO ipair = 1, task_list%npairs(igrid_level)
464 itask = task_list%taskstart(ipair, igrid_level)
465 ilevel = task_list%tasks(itask)%grid_level
466 img = task_list%tasks(itask)%image
467 iatom = task_list%tasks(itask)%iatom
468 jatom = task_list%tasks(itask)%jatom
469 iset = task_list%tasks(itask)%iset
470 jset = task_list%tasks(itask)%jset
471 ipgf = task_list%tasks(itask)%ipgf
472 jpgf = task_list%tasks(itask)%jpgf
473 IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
474 blocks(iatom, jatom, img) = 1
475 blocks(jatom, iatom, img) = 1
476 ELSE
477 WRITE (6, *) "TASK LIST CONFLICT IN PAIR ", ipair
478 WRITE (6, *) "Reuse of iatom, jatom, image ", iatom, jatom, img
479 END IF
480
481 iatom_old = iatom
482 jatom_old = jatom
483 img_old = img
484 DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
485 ilevel = task_list%tasks(itask)%grid_level
486 img = task_list%tasks(itask)%image
487 iatom = task_list%tasks(itask)%iatom
488 jatom = task_list%tasks(itask)%jatom
489 iset = task_list%tasks(itask)%iset
490 jset = task_list%tasks(itask)%jset
491 ipgf = task_list%tasks(itask)%ipgf
492 jpgf = task_list%tasks(itask)%jpgf
493 IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
494 WRITE (6, *) "TASK LIST CONFLICT IN TASK ", itask
495 WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
496 WRITE (6, *) "Should be iatom, jatom, image ", iatom_old, jatom_old, img_old
497 END IF
498
499 END DO
500 END DO
501 DEALLOCATE (blocks)
502
503 END DO
504
505 END IF
506
507 CALL timestop(handle)
508
509 END SUBROUTINE generate_qs_task_list
510
511! **************************************************************************************************
512!> \brief Sends the task list data to the grid API.
513!> \author Ole Schuett
514! **************************************************************************************************
515 SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
516 TYPE(task_list_type), POINTER :: task_list
517 TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
518 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
519 TYPE(cell_type), POINTER :: cell
520 CHARACTER(LEN=default_string_length) :: basis_type
521 TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
522
523 TYPE(gto_basis_set_type), POINTER :: orb_basis_set
524 INTEGER :: nset, natoms, nkinds, ntasks, &
525 ikind, iatom, itask, nsgf
526 INTEGER, DIMENSION(:), ALLOCATABLE :: atom_kinds, level_list, iatom_list, jatom_list, &
527 iset_list, jset_list, ipgf_list, jpgf_list, &
528 border_mask_list, block_num_list
529 REAL(kind=dp), DIMENSION(:), ALLOCATABLE :: radius_list
530 REAL(kind=dp), DIMENSION(:, :), ALLOCATABLE :: rab_list, atom_positions
531 TYPE(task_type), DIMENSION(:), POINTER :: tasks
532 INTEGER, DIMENSION(:, :), POINTER :: first_sgf
533 REAL(kind=dp), DIMENSION(:, :), POINTER :: sphi, zet
534 INTEGER, DIMENSION(:), POINTER :: lmax, lmin, npgf, nsgf_set
535
536 nkinds = SIZE(qs_kind_set)
537 natoms = SIZE(particle_set)
538 ntasks = task_list%ntasks
539 tasks => task_list%tasks
540
541 IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
542 ! Basis sets do not change during simulation - only need to create them once.
543 ALLOCATE (task_list%grid_basis_sets(nkinds))
544 DO ikind = 1, nkinds
545 CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
546 CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
547 nset=nset, &
548 nsgf=nsgf, &
549 nsgf_set=nsgf_set, &
550 npgf=npgf, &
551 first_sgf=first_sgf, &
552 lmax=lmax, &
553 lmin=lmin, &
554 sphi=sphi, &
555 zet=zet)
556 CALL grid_create_basis_set(nset=nset, &
557 nsgf=nsgf, &
558 maxco=SIZE(sphi, 1), &
559 maxpgf=SIZE(zet, 1), &
560 lmin=lmin, &
561 lmax=lmax, &
562 npgf=npgf, &
563 nsgf_set=nsgf_set, &
564 first_sgf=first_sgf, &
565 sphi=sphi, &
566 zet=zet, &
567 basis_set=task_list%grid_basis_sets(ikind))
568 END DO
569 END IF
570
571 ! Pack task list infos
572 ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
573 DO iatom = 1, natoms
574 atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
575 atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
576 END DO
577
578 ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
579 ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
580 ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
581 ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))
582
583 DO itask = 1, ntasks
584 level_list(itask) = tasks(itask)%grid_level
585 iatom_list(itask) = tasks(itask)%iatom
586 jatom_list(itask) = tasks(itask)%jatom
587 iset_list(itask) = tasks(itask)%iset
588 jset_list(itask) = tasks(itask)%jset
589 ipgf_list(itask) = tasks(itask)%ipgf
590 jpgf_list(itask) = tasks(itask)%jpgf
591 IF (tasks(itask)%dist_type == 2) THEN
592 border_mask_list(itask) = iand(63, not(tasks(itask)%subpatch_pattern)) ! invert last 6 bits
593 ELSE
594 border_mask_list(itask) = 0 ! no masking
595 END IF
596 block_num_list(itask) = tasks(itask)%pair_index ! change of nomenclature pair_index -> block_num
597 radius_list(itask) = tasks(itask)%radius
598 rab_list(:, itask) = tasks(itask)%rab(:)
599 END DO
600
601 CALL grid_create_task_list(ntasks=ntasks, &
602 natoms=natoms, &
603 nkinds=nkinds, &
604 nblocks=SIZE(task_list%pair_offsets_recv), &
605 block_offsets=task_list%pair_offsets_recv, &
606 atom_positions=atom_positions, &
607 atom_kinds=atom_kinds, &
608 basis_sets=task_list%grid_basis_sets, &
609 level_list=level_list, &
610 iatom_list=iatom_list, &
611 jatom_list=jatom_list, &
612 iset_list=iset_list, &
613 jset_list=jset_list, &
614 ipgf_list=ipgf_list, &
615 jpgf_list=jpgf_list, &
616 border_mask_list=border_mask_list, &
617 block_num_list=block_num_list, &
618 radius_list=radius_list, &
619 rab_list=rab_list, &
620 rs_grids=rs_grids, &
621 task_list=task_list%grid_task_list)
622
623 CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
624 CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)
625
626 END SUBROUTINE create_grid_task_list
627
628! **************************************************************************************************
629!> \brief ...
630!> \param tasks ...
631!> \param ntasks ...
632!> \param curr_tasks ...
633!> \param rs_descs ...
634!> \param dft_control ...
635!> \param cube_info ...
636!> \param gridlevel_info ...
637!> \param cindex ...
638!> \param iatom ...
639!> \param jatom ...
640!> \param rpgfa ...
641!> \param rpgfb ...
642!> \param zeta ...
643!> \param zetb ...
644!> \param kind_radius_b ...
645!> \param set_radius_a ...
646!> \param set_radius_b ...
647!> \param ra ...
648!> \param rab ...
649!> \param la_max ...
650!> \param la_min ...
651!> \param lb_max ...
652!> \param lb_min ...
653!> \param npgfa ...
654!> \param npgfb ...
655!> \param nseta ...
656!> \param nsetb ...
657!> \par History
658!> Joost VandeVondele: 10.2008 refactored
659! **************************************************************************************************
660 SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
661 cube_info, gridlevel_info, cindex, &
662 iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
663 la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
664
665 TYPE(task_type), DIMENSION(:), POINTER :: tasks
666 INTEGER :: ntasks, curr_tasks
667 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
668 POINTER :: rs_descs
669 TYPE(dft_control_type), POINTER :: dft_control
670 TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
671 TYPE(gridlevel_info_type), POINTER :: gridlevel_info
672 INTEGER :: cindex, iatom, jatom
673 REAL(kind=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
674 REAL(kind=dp) :: kind_radius_b
675 REAL(kind=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
676 REAL(kind=dp), DIMENSION(3) :: ra, rab
677 INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
678 npgfb
679 INTEGER :: nseta, nsetb
680
681 INTEGER :: cube_center(3), igrid_level, ipgf, iset, &
682 jpgf, jset, lb_cube(3), ub_cube(3)
683 REAL(kind=dp) :: dab, rab2, radius, zetp
684
685 rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
686 dab = sqrt(rab2)
687
688 loop_iset: DO iset = 1, nseta
689
690 IF (set_radius_a(iset) + kind_radius_b < dab) cycle
691
692 loop_jset: DO jset = 1, nsetb
693
694 IF (set_radius_a(iset) + set_radius_b(jset) < dab) cycle
695
696 loop_ipgf: DO ipgf = 1, npgfa(iset)
697
698 IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) cycle
699
700 loop_jpgf: DO jpgf = 1, npgfb(jset)
701
702 IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) cycle
703
704 zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
705 igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
706
707 CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
708 rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
709 la_max(iset), zeta(ipgf, iset), la_min(iset), &
710 lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
711 ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
712
713 CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
714 rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
715 la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
716 igrid_level, gridlevel_info%ngrid_levels, cube_center, &
717 lb_cube, ub_cube, radius)
718
719 END DO loop_jpgf
720
721 END DO loop_ipgf
722
723 END DO loop_jset
724
725 END DO loop_iset
726
727 END SUBROUTINE task_list_inner_loop
728
729! **************************************************************************************************
730!> \brief combines the calculation of several basic properties of a given pgf:
731!> its center, the bounding cube, the radius, the cost,
732!> tries to predict the time needed for processing this task
733!> in this way an improved load balance might be obtained
734!> \param cube_center ...
735!> \param lb_cube ...
736!> \param ub_cube ...
737!> \param radius ...
738!> \param rs_desc ...
739!> \param cube_info ...
740!> \param la_max ...
741!> \param zeta ...
742!> \param la_min ...
743!> \param lb_max ...
744!> \param zetb ...
745!> \param lb_min ...
746!> \param ra ...
747!> \param rab ...
748!> \param rab2 ...
749!> \param eps ...
750!> \par History
751!> 10.2008 refactored [Joost VandeVondele]
752!> \note
753!> -) this requires the radius to be computed in the same way as
754!> collocate_pgf_product, we should factor that part into a subroutine
755!> -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
756!> this is more or less true for map_consistent
757!> -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
758!> the same, this could lead to a small speedup
759!> -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
760!> fitting the measured data on an opteron/g95 using the expression
761!> a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
762! **************************************************************************************************
763 SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
764 rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
765
766 INTEGER, DIMENSION(3), INTENT(OUT) :: cube_center, lb_cube, ub_cube
767 REAL(kind=dp), INTENT(OUT) :: radius
768 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
769 TYPE(cube_info_type), INTENT(IN) :: cube_info
770 INTEGER, INTENT(IN) :: la_max
771 REAL(kind=dp), INTENT(IN) :: zeta
772 INTEGER, INTENT(IN) :: la_min, lb_max
773 REAL(kind=dp), INTENT(IN) :: zetb
774 INTEGER, INTENT(IN) :: lb_min
775 REAL(kind=dp), INTENT(IN) :: ra(3), rab(3), rab2, eps
776
777 INTEGER :: extent(3)
778 INTEGER, DIMENSION(:), POINTER :: sphere_bounds
779 REAL(kind=dp) :: cutoff, f, prefactor, rb(3), zetp
780 REAL(kind=dp), DIMENSION(3) :: rp
781
782! the radius for this task
783
784 zetp = zeta + zetb
785 rp(:) = ra(:) + zetb/zetp*rab(:)
786 rb(:) = ra(:) + rab(:)
787 cutoff = 1.0_dp
788 f = zetb/zetp
789 prefactor = exp(-zeta*f*rab2)
790 radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
791 zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
792
793 CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
794 ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
795 cube_center(:) = modulo(cube_center(:), rs_desc%npts(:))
796 cube_center(:) = cube_center(:) + rs_desc%lb(:)
797
798 IF (rs_desc%orthorhombic) THEN
799 CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
800 ELSE
801 CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
802 !? unclear if extent is computed correctly.
803 extent(:) = ub_cube(:) - lb_cube(:)
804 lb_cube(:) = -extent(:)/2 - 1
805 ub_cube(:) = extent(:)/2
806 END IF
807
808 END SUBROUTINE compute_pgf_properties
809! **************************************************************************************************
810!> \brief predicts the cost of a task in kcycles for a given task
811!> the model is based on a fit of actual data, and might need updating
812!> as collocate_pgf_product changes (or CPUs/compilers change)
813!> maybe some dynamic approach, improving the cost model on the fly could
814!> work as well
815!> the cost model does not yet take into account the fraction of space
816!> that is mapped locally for a given cube and rs_grid (generalised tasks)
817!> \param lb_cube ...
818!> \param ub_cube ...
819!> \param fraction ...
820!> \param lmax ...
821!> \param is_ortho ...
822!> \return ...
823! **************************************************************************************************
824 INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
825 INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
826 REAL(kind=dp), INTENT(IN) :: fraction
827 INTEGER :: lmax
828 LOGICAL :: is_ortho
829
830 INTEGER :: cmax
831 REAL(kind=dp) :: v1, v2, v3, v4, v5
832
833 cmax = maxval(((ub_cube - lb_cube) + 1)/2)
834
835 IF (is_ortho) THEN
836 v1 = 1.504760e+00_dp
837 v2 = 3.126770e+00_dp
838 v3 = 5.074106e+00_dp
839 v4 = 1.091568e+00_dp
840 v5 = 1.070187e+00_dp
841 ELSE
842 v1 = 7.831105e+00_dp
843 v2 = 2.675174e+00_dp
844 v3 = 7.546553e+00_dp
845 v4 = 6.122446e-01_dp
846 v5 = 3.886382e+00_dp
847 END IF
848 cost_model = ceiling(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
849
850 END FUNCTION cost_model
851! **************************************************************************************************
852!> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
853!> this determines by which CPUs a given pgf gets collocated
854!> the format of the task array is as follows
855!> tasks(1,i) := destination
856!> tasks(2,i) := source
857!> tasks(3,i) := compressed type (iatom, jatom, ....)
858!> tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
859!> tasks(5,i) := cost
860!> tasks(6,i) := alternate destination code (0 if none available)
861!>
862!> \param tasks ...
863!> \param ntasks ...
864!> \param curr_tasks ...
865!> \param rab ...
866!> \param cindex ...
867!> \param iatom ...
868!> \param jatom ...
869!> \param iset ...
870!> \param jset ...
871!> \param ipgf ...
872!> \param jpgf ...
873!> \param la_max ...
874!> \param lb_max ...
875!> \param rs_desc ...
876!> \param igrid_level ...
877!> \param n_levels ...
878!> \param cube_center ...
879!> \param lb_cube ...
880!> \param ub_cube ...
881!> \par History
882!> 10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
883! **************************************************************************************************
884 SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
885 rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
886 la_max, lb_max, rs_desc, igrid_level, n_levels, &
887 cube_center, lb_cube, ub_cube, radius)
888
889 TYPE(task_type), DIMENSION(:), POINTER :: tasks
890 INTEGER, INTENT(INOUT) :: ntasks, curr_tasks
891 REAL(kind=dp), DIMENSION(3), INTENT(IN) :: rab
892 INTEGER, INTENT(IN) :: cindex, iatom, jatom, iset, jset, ipgf, &
893 jpgf, la_max, lb_max
894 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
895 INTEGER, INTENT(IN) :: igrid_level, n_levels
896 INTEGER, DIMENSION(3), INTENT(IN) :: cube_center, lb_cube, ub_cube
897 REAL(kind=dp), INTENT(IN) :: radius
898
899 INTEGER, PARAMETER :: add_tasks = 1000
900 REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
901
902 INTEGER :: added_tasks, cost, j, lmax
903 LOGICAL :: is_ortho
904 REAL(kind=dp) :: tfraction
905
906!$OMP SINGLE
907 ntasks = ntasks + 1
908 IF (ntasks > curr_tasks) THEN
909 curr_tasks = int((curr_tasks + add_tasks)*mult_tasks)
910 CALL reallocate_tasks(tasks, curr_tasks)
911 END IF
912!$OMP END SINGLE
913
914 IF (rs_desc%distributed) THEN
915
916 ! finds the node(s) that need to process this task
917 ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
918 CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
919 ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
920
921 ELSE
922 tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
923 tasks(ntasks)%dist_type = 0
924 tasks(ntasks)%subpatch_pattern = 0
925 added_tasks = 1
926 END IF
927
928 lmax = la_max + lb_max
929 is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
930 ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
931 ! this could be refined in the future
932 tfraction = 1.0_dp/added_tasks
933
934 cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
935
936 DO j = 1, added_tasks
937 tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
938 tasks(ntasks - added_tasks + j)%cost = cost
939 tasks(ntasks - added_tasks + j)%grid_level = igrid_level
940 tasks(ntasks - added_tasks + j)%image = cindex
941 tasks(ntasks - added_tasks + j)%iatom = iatom
942 tasks(ntasks - added_tasks + j)%jatom = jatom
943 tasks(ntasks - added_tasks + j)%iset = iset
944 tasks(ntasks - added_tasks + j)%jset = jset
945 tasks(ntasks - added_tasks + j)%ipgf = ipgf
946 tasks(ntasks - added_tasks + j)%jpgf = jpgf
947 tasks(ntasks - added_tasks + j)%rab = rab
948 tasks(ntasks - added_tasks + j)%radius = radius
949 END DO
950
951 END SUBROUTINE pgf_to_tasks
952
953! **************************************************************************************************
954!> \brief performs load balancing of the tasks on the distributed grids
955!> \param tasks ...
956!> \param ntasks ...
957!> \param rs_descs ...
958!> \param grid_level ...
959!> \param natoms ...
960!> \par History
961!> created 2008-10-03 [Joost VandeVondele]
962! **************************************************************************************************
963 SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
964
965 TYPE(task_type), DIMENSION(:), POINTER :: tasks
966 INTEGER :: ntasks
967 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
968 POINTER :: rs_descs
969 INTEGER :: grid_level, natoms
970
971 CHARACTER(LEN=*), PARAMETER :: routinen = 'load_balance_distributed'
972
973 INTEGER :: handle
974 INTEGER, DIMENSION(:, :, :), POINTER :: list
975
976 CALL timeset(routinen, handle)
977
978 NULLIFY (list)
979 ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
980 ! if a destination would not be in this list, it is a bug
981 CALL create_destination_list(list, rs_descs, grid_level)
982
983 ! now, walk over the tasks, filling in the loads of each destination
984 CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.true.)
985
986 ! optimize loads & fluxes
987 CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
988
989 ! now, walk over the tasks, using the list to set the destinations
990 CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.false.)
991
992 DEALLOCATE (list)
993
994 CALL timestop(handle)
995
996 END SUBROUTINE load_balance_distributed
997
998! **************************************************************************************************
999!> \brief this serial routine adjusts the fluxes in the global list
1000!>
1001!> \param list_global ...
1002!> \par History
1003!> created 2008-10-06 [Joost VandeVondele]
1004! **************************************************************************************************
1005 SUBROUTINE balance_global_list(list_global)
1006 INTEGER, DIMENSION(:, :, 0:) :: list_global
1007
1008 CHARACTER(LEN=*), PARAMETER :: routinen = 'balance_global_list'
1009 INTEGER, PARAMETER :: max_iter = 100
1010 REAL(kind=dp), PARAMETER :: tolerance_factor = 0.005_dp
1011
1012 INTEGER :: dest, handle, icpu, idest, iflux, &
1013 ilocal, k, maxdest, ncpu, nflux
1014 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: flux_connections
1015 LOGICAL :: solution_optimal
1016 REAL(kind=dp) :: average, load_shift, max_load_shift, &
1017 tolerance
1018 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: load, optimized_flux, optimized_load
1019 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: flux_limits
1020
1021 CALL timeset(routinen, handle)
1022
1023 ncpu = SIZE(list_global, 3)
1024 maxdest = SIZE(list_global, 2)
1025 ALLOCATE (load(0:ncpu - 1))
1026 load = 0.0_dp
1027 ALLOCATE (optimized_load(0:ncpu - 1))
1028
1029 ! figure out the number of fluxes
1030 ! we assume that the global_list is symmetric
1031 nflux = 0
1032 DO icpu = 0, ncpu - 1
1033 DO idest = 1, maxdest
1034 dest = list_global(1, idest, icpu)
1035 IF (dest < ncpu .AND. dest > icpu) nflux = nflux + 1
1036 END DO
1037 END DO
1038 ALLOCATE (optimized_flux(nflux))
1039 ALLOCATE (flux_limits(2, nflux))
1040 ALLOCATE (flux_connections(2, nflux))
1041
1042 ! reorder data
1043 flux_limits = 0
1044 nflux = 0
1045 DO icpu = 0, ncpu - 1
1046 load(icpu) = sum(list_global(2, :, icpu))
1047 DO idest = 1, maxdest
1048 dest = list_global(1, idest, icpu)
1049 IF (dest < ncpu) THEN
1050 IF (dest .NE. icpu) THEN
1051 IF (dest > icpu) THEN
1052 nflux = nflux + 1
1053 flux_limits(2, nflux) = list_global(2, idest, icpu)
1054 flux_connections(1, nflux) = icpu
1055 flux_connections(2, nflux) = dest
1056 ELSE
1057 DO iflux = 1, nflux
1058 IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1059 flux_limits(1, iflux) = -list_global(2, idest, icpu)
1060 EXIT
1061 END IF
1062 END DO
1063 END IF
1064 END IF
1065 END IF
1066 END DO
1067 END DO
1068
1069 solution_optimal = .false.
1070 optimized_flux = 0.0_dp
1071
1072 ! an iterative solver, if iterated till convergence the maximum load is minimal
1073 ! we terminate before things are fully converged, since this does show up in the timings
1074 ! once the largest shift becomes less than a small fraction of the average load, we're done
1075 ! we're perfectly happy if the load balance is within 1 percent or so
1076 ! the maximum load normally converges even faster
1077 average = sum(load)/SIZE(load)
1078 tolerance = tolerance_factor*average
1079
1080 optimized_load(:) = load
1081 DO k = 1, max_iter
1082 max_load_shift = 0.0_dp
1083 DO iflux = 1, nflux
1084 load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
1085 load_shift = max(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
1086 load_shift = min(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
1087 max_load_shift = max(abs(load_shift), max_load_shift)
1088 optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
1089 optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
1090 optimized_flux(iflux) = optimized_flux(iflux) + load_shift
1091 END DO
1092 IF (max_load_shift < tolerance) THEN
1093 solution_optimal = .true.
1094 EXIT
1095 END IF
1096 END DO
1097
1098 ! now adjust the load list to reflect the optimized fluxes
1099 ! reorder data
1100 nflux = 0
1101 DO icpu = 0, ncpu - 1
1102 DO idest = 1, maxdest
1103 IF (list_global(1, idest, icpu) == icpu) ilocal = idest
1104 END DO
1105 DO idest = 1, maxdest
1106 dest = list_global(1, idest, icpu)
1107 IF (dest < ncpu) THEN
1108 IF (dest .NE. icpu) THEN
1109 IF (dest > icpu) THEN
1110 nflux = nflux + 1
1111 IF (optimized_flux(nflux) > 0) THEN
1112 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1113 list_global(2, idest, icpu) - nint(optimized_flux(nflux))
1114 list_global(2, idest, icpu) = nint(optimized_flux(nflux))
1115 ELSE
1116 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1117 list_global(2, idest, icpu)
1118 list_global(2, idest, icpu) = 0
1119 END IF
1120 ELSE
1121 DO iflux = 1, nflux
1122 IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1123 IF (optimized_flux(iflux) > 0) THEN
1124 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1125 list_global(2, idest, icpu)
1126 list_global(2, idest, icpu) = 0
1127 ELSE
1128 list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1129 list_global(2, idest, icpu) + nint(optimized_flux(iflux))
1130 list_global(2, idest, icpu) = -nint(optimized_flux(iflux))
1131 END IF
1132 EXIT
1133 END IF
1134 END DO
1135 END IF
1136 END IF
1137 END IF
1138 END DO
1139 END DO
1140
1141 CALL timestop(handle)
1142
1143 END SUBROUTINE balance_global_list
1144
1145! **************************************************************************************************
1146!> \brief this routine gets back optimized loads for all destinations
1147!>
1148!> \param list ...
1149!> \param group ...
1150!> \param my_pos ...
1151!> \par History
1152!> created 2008-10-06 [Joost VandeVondele]
1153!> Modified 2016-01 [EPCC] Reduce memory requirements on P processes
1154!> from O(P^2) to O(P)
1155! **************************************************************************************************
1156 SUBROUTINE optimize_load_list(list, group, my_pos)
1157 INTEGER, DIMENSION(:, :, 0:) :: list
1158 TYPE(mp_comm_type), INTENT(IN) :: group
1159 INTEGER, INTENT(IN) :: my_pos
1160
1161 CHARACTER(LEN=*), PARAMETER :: routinen = 'optimize_load_list'
1162 INTEGER, PARAMETER :: rank_of_root = 0
1163
1164 INTEGER :: handle, icpu, idest, maxdest, ncpu
1165 INTEGER, ALLOCATABLE, DIMENSION(:) :: load_all
1166 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: load_partial
1167 INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: list_global
1168
1169 CALL timeset(routinen, handle)
1170
1171 ncpu = SIZE(list, 3)
1172 maxdest = SIZE(list, 2)
1173
1174 !find total workload ...
1175 ALLOCATE (load_all(maxdest*ncpu))
1176 load_all(:) = reshape(list(2, :, :), (/maxdest*ncpu/))
1177 CALL group%sum(load_all(:), rank_of_root)
1178
1179 ! ... and optimise the work per process
1180 ALLOCATE (list_global(2, maxdest, ncpu))
1181 IF (rank_of_root .EQ. my_pos) THEN
1182 list_global(1, :, :) = list(1, :, :)
1183 list_global(2, :, :) = reshape(load_all, (/maxdest, ncpu/))
1184 CALL balance_global_list(list_global)
1185 END IF
1186 CALL group%bcast(list_global, rank_of_root)
1187
1188 !figure out how much can be sent to other processes
1189 ALLOCATE (load_partial(maxdest, ncpu))
1190 ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
1191 CALL group%sum_partial(reshape(load_all, (/maxdest, ncpu/)), load_partial(:, :))
1192
1193 DO icpu = 1, ncpu
1194 DO idest = 1, maxdest
1195
1196 !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
1197 IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
1198 IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
1199 list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
1200 - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
1201 ELSE
1202 list(2, idest, icpu - 1) = 0
1203 END IF
1204 END IF
1205
1206 END DO
1207 END DO
1208
1209 !clean up before leaving
1210 DEALLOCATE (load_all)
1211 DEALLOCATE (list_global)
1212 DEALLOCATE (load_partial)
1213
1214 CALL timestop(handle)
1215 END SUBROUTINE optimize_load_list
1216
1217! **************************************************************************************************
1218!> \brief fill the load list with values derived from the tasks array
1219!> from the alternate locations, we select the alternate location that
1220!> can be used without increasing the number of matrix blocks needed to
1221!> distribute.
1222!> Replicated tasks are not yet considered
1223!>
1224!> \param list ...
1225!> \param rs_descs ...
1226!> \param grid_level ...
1227!> \param tasks ...
1228!> \param ntasks ...
1229!> \param natoms ...
1230!> \param create_list ...
1231!> \par History
1232!> created 2008-10-06 [Joost VandeVondele]
1233! **************************************************************************************************
1234 SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
1235 INTEGER, DIMENSION(:, :, 0:) :: list
1236 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1237 POINTER :: rs_descs
1238 INTEGER :: grid_level
1239 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1240 INTEGER :: ntasks, natoms
1241 LOGICAL :: create_list
1242
1243 CHARACTER(LEN=*), PARAMETER :: routinen = 'compute_load_list'
1244
1245 INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
1246 itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
1247 rank
1248 INTEGER(KIND=int_8) :: bit_pattern, ipair, ipair_old, natom8
1249 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: loads
1250 INTEGER, ALLOCATABLE, DIMENSION(:) :: all_dests, index
1251 INTEGER, DIMENSION(6) :: options
1252
1253 CALL timeset(routinen, handle)
1254
1255 ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
1256 CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.false.)
1257
1258 maxdest = SIZE(list, 2)
1259 ncpu = SIZE(list, 3)
1260 natom8 = natoms
1261
1262 ! first find the tasks that deal with the same atom pair
1263 itask_stop = 0
1264 ipair_old = huge(ipair_old)
1265 img_old = -1
1266 ALLOCATE (all_dests(0))
1267 ALLOCATE (index(0))
1268
1269 DO
1270
1271 ! first find the range of tasks that deal with the same atom pair
1272 itask_start = itask_stop + 1
1273 itask_stop = itask_start
1274 IF (itask_stop > ntasks) EXIT
1275 ilevel = tasks(itask_stop)%grid_level
1276 img_old = tasks(itask_stop)%image
1277 iatom = tasks(itask_stop)%iatom
1278 jatom = tasks(itask_stop)%jatom
1279 iset = tasks(itask_stop)%iset
1280 jset = tasks(itask_stop)%jset
1281 ipgf = tasks(itask_stop)%ipgf
1282 jpgf = tasks(itask_stop)%jpgf
1283
1284 ipair_old = (iatom - 1)*natom8 + (jatom - 1)
1285 DO
1286 IF (itask_stop + 1 > ntasks) EXIT
1287 ilevel = tasks(itask_stop + 1)%grid_level
1288 img = tasks(itask_stop + 1)%image
1289 iatom = tasks(itask_stop + 1)%iatom
1290 jatom = tasks(itask_stop + 1)%jatom
1291 iset = tasks(itask_stop + 1)%iset
1292 jset = tasks(itask_stop + 1)%jset
1293 ipgf = tasks(itask_stop + 1)%ipgf
1294 jpgf = tasks(itask_stop + 1)%jpgf
1295
1296 ipair = (iatom - 1)*natom8 + (jatom - 1)
1297 IF (ipair == ipair_old .AND. img == img_old) THEN
1298 itask_stop = itask_stop + 1
1299 ELSE
1300 EXIT
1301 END IF
1302 END DO
1303 ipair = ipair_old
1304 nshort = itask_stop - itask_start + 1
1305
1306 ! find the unique list of destinations on this grid level only
1307 DEALLOCATE (all_dests)
1308 ALLOCATE (all_dests(nshort))
1309 DEALLOCATE (index)
1310 ALLOCATE (index(nshort))
1311 DO i = 1, nshort
1312 ilevel = tasks(itask_start + i - 1)%grid_level
1313 img = tasks(itask_start + i - 1)%image
1314 iatom = tasks(itask_start + i - 1)%iatom
1315 jatom = tasks(itask_start + i - 1)%jatom
1316 iset = tasks(itask_start + i - 1)%iset
1317 jset = tasks(itask_start + i - 1)%jset
1318 ipgf = tasks(itask_start + i - 1)%ipgf
1319 jpgf = tasks(itask_start + i - 1)%jpgf
1320
1321 IF (ilevel .EQ. grid_level) THEN
1322 all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
1323 ELSE
1324 all_dests(i) = huge(all_dests(i))
1325 END IF
1326 END DO
1327 CALL sort(all_dests, nshort, index)
1328 ndest_pair = 1
1329 DO i = 2, nshort
1330 IF ((all_dests(ndest_pair) .NE. all_dests(i)) .AND. (all_dests(i) .NE. huge(all_dests(i)))) THEN
1331 ndest_pair = ndest_pair + 1
1332 all_dests(ndest_pair) = all_dests(i)
1333 END IF
1334 END DO
1335
1336 DO itask = itask_start, itask_stop
1337
1338 dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
1339 ilevel = tasks(itask)%grid_level
1340 img = tasks(itask)%image
1341 iatom = tasks(itask)%iatom
1342 jatom = tasks(itask)%jatom
1343 iset = tasks(itask)%iset
1344 jset = tasks(itask)%jset
1345 ipgf = tasks(itask)%ipgf
1346 jpgf = tasks(itask)%jpgf
1347
1348 ! Only proceed with tasks which are on this grid level
1349 IF (ilevel .NE. grid_level) cycle
1350 ipair = (iatom - 1)*natom8 + (jatom - 1)
1351 cost = int(tasks(itask)%cost)
1352
1353 SELECT CASE (tasks(itask)%dist_type)
1354 CASE (1)
1355 bit_pattern = tasks(itask)%subpatch_pattern
1356 nopt = 0
1357 IF (btest(bit_pattern, 0)) THEN
1358 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/-1, 0, 0/))
1359 IF (any(all_dests(1:ndest_pair) .EQ. rank)) THEN
1360 nopt = nopt + 1
1361 options(nopt) = rank
1362 END IF
1363 END IF
1364 IF (btest(bit_pattern, 1)) THEN
1365 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/+1, 0, 0/))
1366 IF (any(all_dests(1:ndest_pair) .EQ. rank)) THEN
1367 nopt = nopt + 1
1368 options(nopt) = rank
1369 END IF
1370 END IF
1371 IF (btest(bit_pattern, 2)) THEN
1372 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, -1, 0/))
1373 IF (any(all_dests(1:ndest_pair) .EQ. rank)) THEN
1374 nopt = nopt + 1
1375 options(nopt) = rank
1376 END IF
1377 END IF
1378 IF (btest(bit_pattern, 3)) THEN
1379 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, +1, 0/))
1380 IF (any(all_dests(1:ndest_pair) .EQ. rank)) THEN
1381 nopt = nopt + 1
1382 options(nopt) = rank
1383 END IF
1384 END IF
1385 IF (btest(bit_pattern, 4)) THEN
1386 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, -1/))
1387 IF (any(all_dests(1:ndest_pair) .EQ. rank)) THEN
1388 nopt = nopt + 1
1389 options(nopt) = rank
1390 END IF
1391 END IF
1392 IF (btest(bit_pattern, 5)) THEN
1393 rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, +1/))
1394 IF (any(all_dests(1:ndest_pair) .EQ. rank)) THEN
1395 nopt = nopt + 1
1396 options(nopt) = rank
1397 END IF
1398 END IF
1399 IF (nopt > 0) THEN
1400 ! set it to the rank with the lowest load
1401 rank = options(1)
1402 DO iopt = 2, nopt
1403 IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
1404 END DO
1405 ELSE
1406 rank = dest
1407 END IF
1408 li = list_index(list, rank, dest)
1409 IF (create_list) THEN
1410 list(2, li, dest) = list(2, li, dest) + cost
1411 ELSE
1412 IF (list(1, li, dest) == dest) THEN
1413 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1414 ELSE
1415 IF (list(2, li, dest) >= cost) THEN
1416 list(2, li, dest) = list(2, li, dest) - cost
1417 tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1418 ELSE
1419 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1420 END IF
1421 END IF
1422 END IF
1423 CASE (2) ! generalised
1424 li = list_index(list, dest, dest)
1425 IF (create_list) THEN
1426 list(2, li, dest) = list(2, li, dest) + cost
1427 ELSE
1428 IF (list(1, li, dest) == dest) THEN
1429 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1430 ELSE
1431 IF (list(2, li, dest) >= cost) THEN
1432 list(2, li, dest) = list(2, li, dest) - cost
1433 tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1434 ELSE
1435 tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1436 END IF
1437 END IF
1438 END IF
1439 CASE DEFAULT
1440 cpabort("")
1441 END SELECT
1442
1443 END DO
1444
1445 END DO
1446
1447 CALL timestop(handle)
1448
1449 END SUBROUTINE compute_load_list
1450! **************************************************************************************************
1451!> \brief small helper function to return the proper index in the list array
1452!>
1453!> \param list ...
1454!> \param rank ...
1455!> \param dest ...
1456!> \return ...
1457!> \par History
1458!> created 2008-10-06 [Joost VandeVondele]
1459! **************************************************************************************************
1460 INTEGER FUNCTION list_index(list, rank, dest)
1461 INTEGER, DIMENSION(:, :, 0:), INTENT(IN) :: list
1462 INTEGER, INTENT(IN) :: rank, dest
1463
1464 list_index = 1
1465 DO
1466 IF (list(1, list_index, dest) == rank) EXIT
1467 list_index = list_index + 1
1468 END DO
1469 END FUNCTION list_index
1470! **************************************************************************************************
1471!> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
1472!> note that we allocate it with an additional field to store the load of this destination
1473!>
1474!> \param list ...
1475!> \param rs_descs ...
1476!> \param grid_level ...
1477!> \par History
1478!> created 2008-10-06 [Joost VandeVondele]
1479! **************************************************************************************************
1480 SUBROUTINE create_destination_list(list, rs_descs, grid_level)
1481 INTEGER, DIMENSION(:, :, :), POINTER :: list
1482 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1483 POINTER :: rs_descs
1484 INTEGER, INTENT(IN) :: grid_level
1485
1486 CHARACTER(LEN=*), PARAMETER :: routinen = 'create_destination_list'
1487
1488 INTEGER :: handle, i, icpu, j, maxcount, ncpu, &
1489 ultimate_max
1490 INTEGER, ALLOCATABLE, DIMENSION(:) :: index, sublist
1491
1492 CALL timeset(routinen, handle)
1493
1494 cpassert(.NOT. ASSOCIATED(list))
1495 ncpu = rs_descs(grid_level)%rs_desc%group_size
1496 ultimate_max = 7
1497
1498 ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
1499
1500 ALLOCATE (index(ultimate_max))
1501 ALLOCATE (sublist(ultimate_max))
1502 sublist = huge(sublist)
1503
1504 maxcount = 1
1505 DO icpu = 0, ncpu - 1
1506 sublist(1) = icpu
1507 sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/-1, 0, 0/))
1508 sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/+1, 0, 0/))
1509 sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, -1, 0/))
1510 sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, +1, 0/))
1511 sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, -1/))
1512 sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, +1/))
1513 ! only retain unique values of the destination
1514 CALL sort(sublist, ultimate_max, index)
1515 j = 1
1516 DO i = 2, 7
1517 IF (sublist(i) .NE. sublist(j)) THEN
1518 j = j + 1
1519 sublist(j) = sublist(i)
1520 END IF
1521 END DO
1522 maxcount = max(maxcount, j)
1523 sublist(j + 1:ultimate_max) = huge(sublist)
1524 list(1, :, icpu) = sublist
1525 list(2, :, icpu) = 0
1526 END DO
1527
1528 CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
1529
1530 CALL timestop(handle)
1531
1532 END SUBROUTINE create_destination_list
1533
1534! **************************************************************************************************
1535!> \brief given a task list, compute the load of each process everywhere
1536!> giving this function the ability to loop over a (sub)set of rs_grids,
1537!> and do all the communication in one shot, would speed it up
1538!> \param loads ...
1539!> \param rs_descs ...
1540!> \param grid_level ...
1541!> \param ntasks ...
1542!> \param tasks ...
1543!> \param use_reordered_ranks ...
1544!> \par History
1545!> none
1546!> \author MattW 21/11/2007
1547! **************************************************************************************************
1548 SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
1549 INTEGER(KIND=int_8), DIMENSION(:) :: loads
1550 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1551 POINTER :: rs_descs
1552 INTEGER :: grid_level, ntasks
1553 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1554 LOGICAL, INTENT(IN) :: use_reordered_ranks
1555
1556 CHARACTER(LEN=*), PARAMETER :: routinen = 'get_current_loads'
1557
1558 INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1559 iset, jatom, jpgf, jset
1560 INTEGER(KIND=int_8) :: total_cost_local
1561 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf_i, send_buf_i
1562 TYPE(realspace_grid_desc_type), POINTER :: desc
1563
1564 CALL timeset(routinen, handle)
1565
1566 desc => rs_descs(grid_level)%rs_desc
1567
1568 ! allocate local arrays
1569 ALLOCATE (send_buf_i(desc%group_size))
1570 ALLOCATE (recv_buf_i(desc%group_size))
1571
1572 ! communication step 1 : compute the total local cost of the tasks
1573 ! each proc needs to know the amount of work he will receive
1574
1575 ! send buffer now contains for each target the cost of the tasks it will receive
1576 send_buf_i = 0
1577 DO i = 1, ntasks
1578 ilevel = tasks(i)%grid_level
1579 img = tasks(i)%image
1580 iatom = tasks(i)%iatom
1581 jatom = tasks(i)%jatom
1582 iset = tasks(i)%iset
1583 jset = tasks(i)%jset
1584 ipgf = tasks(i)%ipgf
1585 jpgf = tasks(i)%jpgf
1586 IF (ilevel .NE. grid_level) cycle
1587 IF (use_reordered_ranks) THEN
1588 send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) = &
1589 send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) &
1590 + tasks(i)%cost
1591 ELSE
1592 send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) = &
1593 send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) &
1594 + tasks(i)%cost
1595 END IF
1596 END DO
1597 CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)
1598
1599 ! communication step 2 : compute the global cost of the tasks
1600 total_cost_local = sum(recv_buf_i)
1601
1602 ! after this step, the recv buffer contains the local cost for each CPU
1603 CALL desc%group%allgather(total_cost_local, loads)
1604
1605 CALL timestop(handle)
1606
1607 END SUBROUTINE get_current_loads
1608! **************************************************************************************************
1609!> \brief performs load balancing shifting tasks on the replicated grids
1610!> this modifies the destination of some of the tasks on replicated
1611!> grids, and in this way balances the load
1612!> \param rs_descs ...
1613!> \param ntasks ...
1614!> \param tasks ...
1615!> \par History
1616!> none
1617!> \author MattW 21/11/2007
1618! **************************************************************************************************
1619 SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)
1620
1621 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1622 POINTER :: rs_descs
1623 INTEGER :: ntasks
1624 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1625
1626 CHARACTER(LEN=*), PARAMETER :: routinen = 'load_balance_replicated'
1627
1628 INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1629 iset, j, jatom, jpgf, jset, &
1630 no_overloaded, no_underloaded, &
1631 proc_receiving
1632 INTEGER(KIND=int_8) :: average_cost, cost_task_rep, count, &
1633 offset, total_cost_global
1634 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: load_imbalance, loads, recv_buf_i
1635 INTEGER, ALLOCATABLE, DIMENSION(:) :: index
1636 TYPE(realspace_grid_desc_type), POINTER :: desc
1637
1638 CALL timeset(routinen, handle)
1639
1640 desc => rs_descs(1)%rs_desc
1641
1642 ! allocate local arrays
1643 ALLOCATE (recv_buf_i(desc%group_size))
1644 ALLOCATE (loads(desc%group_size))
1645
1646 recv_buf_i = 0
1647 DO i = 1, SIZE(rs_descs)
1648 CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.true.)
1649 recv_buf_i(:) = recv_buf_i + loads
1650 END DO
1651
1652 total_cost_global = sum(recv_buf_i)
1653 average_cost = total_cost_global/desc%group_size
1654
1655 !
1656 ! compute how to redistribute the replicated tasks so that the average cost is reached
1657 !
1658
1659 ! load imbalance measures the load of a given CPU relative
1660 ! to the optimal load distribution (load=average)
1661 ALLOCATE (load_imbalance(desc%group_size))
1662 ALLOCATE (index(desc%group_size))
1663
1664 load_imbalance(:) = recv_buf_i - average_cost
1665 no_overloaded = 0
1666 no_underloaded = 0
1667
1668 DO i = 1, desc%group_size
1669 IF (load_imbalance(i) .GT. 0) no_overloaded = no_overloaded + 1
1670 IF (load_imbalance(i) .LT. 0) no_underloaded = no_underloaded + 1
1671 END DO
1672
1673 ! sort the recv_buffer on number of tasks, gives us index which provides a
1674 ! mapping between processor ranks and how overloaded the processor
1675 CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
1676
1677 ! find out the number of replicated tasks each proc has
1678 ! but only those tasks which have not yet been assigned
1679 cost_task_rep = 0
1680 DO i = 1, ntasks
1681 IF (tasks(i)%dist_type .EQ. 0 &
1682 .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
1683 cost_task_rep = cost_task_rep + tasks(i)%cost
1684 END IF
1685 END DO
1686
1687 ! now, correct the load imbalance for the overloaded CPUs
1688 ! they will send away not more than the total load of replicated tasks
1689 CALL desc%group%allgather(cost_task_rep, recv_buf_i)
1690
1691 DO i = 1, desc%group_size
1692 ! At the moment we can only offload replicated tasks
1693 IF (load_imbalance(i) .GT. 0) &
1694 load_imbalance(i) = min(load_imbalance(i), recv_buf_i(i))
1695 END DO
1696
1697 ! simplest algorithm I can think of of is that the processor with the most
1698 ! excess tasks fills up the process needing most, then moves on to next most.
1699 ! At the moment if we've got less replicated tasks than we're overloaded then
1700 ! task balancing will be incomplete
1701
1702 ! only need to do anything if I've excess tasks
1703 IF (load_imbalance(desc%my_pos + 1) .GT. 0) THEN
1704
1705 count = 0 ! weighted amount of tasks offloaded
1706 offset = 0 ! no of underloaded processes already filled by other more overloaded procs
1707
1708 ! calculate offset
1709 DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
1710 IF (index(i) .EQ. desc%my_pos + 1) THEN
1711 EXIT
1712 ELSE
1713 offset = offset + load_imbalance(index(i))
1714 END IF
1715 END DO
1716
1717 ! find my starting processor to send to
1718 proc_receiving = huge(proc_receiving)
1719 DO i = 1, no_underloaded
1720 offset = offset + load_imbalance(index(i))
1721 IF (offset .LE. 0) THEN
1722 proc_receiving = i
1723 EXIT
1724 END IF
1725 END DO
1726
1727 ! offset now contains minus the number of tasks proc_receiving requires
1728 ! we fill this up by adjusting the destination of tasks on the replicated grid,
1729 ! then move to next most underloaded proc
1730 DO j = 1, ntasks
1731 IF (tasks(j)%dist_type .EQ. 0 &
1732 .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
1733 ! just avoid sending to non existing procs due to integer truncation
1734 ! in the computation of the average
1735 IF (proc_receiving .GT. no_underloaded) EXIT
1736 ! set new destination
1737 ilevel = tasks(j)%grid_level
1738 img = tasks(j)%image
1739 iatom = tasks(j)%iatom
1740 jatom = tasks(j)%jatom
1741 iset = tasks(j)%iset
1742 jset = tasks(j)%jset
1743 ipgf = tasks(j)%ipgf
1744 jpgf = tasks(j)%jpgf
1745 tasks(j)%destination = encode_rank(index(proc_receiving) - 1, ilevel, SIZE(rs_descs))
1746 offset = offset + tasks(j)%cost
1747 count = count + tasks(j)%cost
1748 IF (count .GE. load_imbalance(desc%my_pos + 1)) EXIT
1749 IF (offset .GT. 0) THEN
1750 proc_receiving = proc_receiving + 1
1751 ! just avoid sending to non existing procs due to integer truncation
1752 ! in the computation of the average
1753 IF (proc_receiving .GT. no_underloaded) EXIT
1754 offset = load_imbalance(index(proc_receiving))
1755 END IF
1756 END IF
1757 END DO
1758 END IF
1759
1760 DEALLOCATE (index)
1761 DEALLOCATE (load_imbalance)
1762
1763 CALL timestop(handle)
1764
1765 END SUBROUTINE load_balance_replicated
1766
1767! **************************************************************************************************
1768!> \brief given an input task list, redistribute so that all tasks can be processed locally,
1769!> i.e. dest equals rank
1770!> \param rs_descs ...
1771!> \param ntasks ...
1772!> \param tasks ...
1773!> \param ntasks_recv ...
1774!> \param tasks_recv ...
1775!> \par History
1776!> none
1777!> \author MattW 21/11/2007
1778! **************************************************************************************************
1779 SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
1780
1781 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1782 POINTER :: rs_descs
1783 INTEGER :: ntasks
1784 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1785 INTEGER :: ntasks_recv
1786 TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1787
1788 CHARACTER(LEN=*), PARAMETER :: routinen = 'create_local_tasks'
1789
1790 INTEGER :: handle, i, j, k, l, rank
1791 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf, send_buf
1792 INTEGER, ALLOCATABLE, DIMENSION(:) :: recv_disps, recv_sizes, send_disps, &
1793 send_sizes
1794 TYPE(realspace_grid_desc_type), POINTER :: desc
1795
1796 CALL timeset(routinen, handle)
1797
1798 desc => rs_descs(1)%rs_desc
1799
1800 ! allocate local arrays
1801 ALLOCATE (send_sizes(desc%group_size))
1802 ALLOCATE (recv_sizes(desc%group_size))
1803 ALLOCATE (send_disps(desc%group_size))
1804 ALLOCATE (recv_disps(desc%group_size))
1805 ALLOCATE (send_buf(desc%group_size))
1806 ALLOCATE (recv_buf(desc%group_size))
1807
1808 ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
1809 send_buf = 0
1810 DO i = 1, ntasks
1811 rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
1812 %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
1813 send_buf(rank + 1) = send_buf(rank + 1) + 1
1814 END DO
1815
1816 CALL desc%group%alltoall(send_buf, recv_buf, 1)
1817
1818 ! pack the tasks, and send them around
1819
1820 send_sizes = 0
1821 send_disps = 0
1822 recv_sizes = 0
1823 recv_disps = 0
1824
1825 send_sizes(1) = int(send_buf(1)*task_size_in_int8)
1826 recv_sizes(1) = int(recv_buf(1)*task_size_in_int8)
1827 DO i = 2, desc%group_size
1828 send_sizes(i) = int(send_buf(i)*task_size_in_int8)
1829 recv_sizes(i) = int(recv_buf(i)*task_size_in_int8)
1830 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1831 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1832 END DO
1833
1834 ! deallocate old send/recv buffers
1835 DEALLOCATE (send_buf)
1836 DEALLOCATE (recv_buf)
1837
1838 ! allocate them with new sizes
1839 ALLOCATE (send_buf(sum(send_sizes)))
1840 ALLOCATE (recv_buf(sum(recv_sizes)))
1841
1842 ! do packing
1843 send_buf = 0
1844 send_sizes = 0
1845 DO j = 1, ntasks
1846 i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
1847 %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
1848 l = send_disps(i) + send_sizes(i)
1849 CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
1850 send_sizes(i) = send_sizes(i) + task_size_in_int8
1851 END DO
1852
1853 ! do communication
1854 CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
1855
1856 DEALLOCATE (send_buf)
1857
1858 ntasks_recv = sum(recv_sizes)/task_size_in_int8
1859 ALLOCATE (tasks_recv(ntasks_recv))
1860
1861 ! do unpacking
1862 l = 0
1863 DO i = 1, desc%group_size
1864 DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
1865 l = l + 1
1866 k = recv_disps(i) + j*task_size_in_int8
1867 CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
1868 END DO
1869 END DO
1870
1871 DEALLOCATE (recv_buf)
1872 DEALLOCATE (send_sizes)
1873 DEALLOCATE (recv_sizes)
1874 DEALLOCATE (send_disps)
1875 DEALLOCATE (recv_disps)
1876
1877 CALL timestop(handle)
1878
1879 END SUBROUTINE create_local_tasks
1880
1881! **************************************************************************************************
1882!> \brief Assembles tasks to be performed on local grid
1883!> \param rs_descs the grids
1884!> \param ntasks Number of tasks for local processing
1885!> \param natoms ...
1886!> \param nimages ...
1887!> \param tasks the task set generated on this processor
1888!> \param rval ...
1889!> \param atom_pair_send ...
1890!> \param atom_pair_recv ...
1891!> \param symmetric ...
1892!> \param reorder_rs_grid_ranks ...
1893!> \param skip_load_balance_distributed ...
1894!> \par History
1895!> none
1896!> \author MattW 21/11/2007
1897! **************************************************************************************************
1898 SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
1899 tasks, atom_pair_send, atom_pair_recv, &
1900 symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
1901
1902 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1903 POINTER :: rs_descs
1904 INTEGER :: ntasks, natoms
1905 TYPE(task_type), DIMENSION(:), POINTER :: tasks
1906 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
1907 LOGICAL, INTENT(IN) :: symmetric, reorder_rs_grid_ranks, &
1908 skip_load_balance_distributed
1909
1910 CHARACTER(LEN=*), PARAMETER :: routinen = 'distribute_tasks'
1911
1912 INTEGER :: handle, igrid_level, irank, ntasks_recv
1913 INTEGER(KIND=int_8) :: load_gap, max_load, replicated_load
1914 INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: total_loads, total_loads_tmp, trial_loads
1915 INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: loads
1916 INTEGER, ALLOCATABLE, DIMENSION(:) :: indices, real2virtual, total_index
1917 LOGICAL :: distributed_grids, fixed_first_grid
1918 TYPE(realspace_grid_desc_type), POINTER :: desc
1919 TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1920
1921 CALL timeset(routinen, handle)
1922
1923 cpassert(ASSOCIATED(tasks))
1924
1925 ! *** figure out if we have distributed grids
1926 distributed_grids = .false.
1927 DO igrid_level = 1, SIZE(rs_descs)
1928 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1929 distributed_grids = .true.
1930 END IF
1931 END DO
1932 desc => rs_descs(1)%rs_desc
1933
1934 IF (distributed_grids) THEN
1935
1936 ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
1937 ALLOCATE (total_loads(0:desc%group_size - 1))
1938
1939 total_loads = 0
1940
1941 ! First round of balancing on the distributed grids
1942 ! we just balance each of the distributed grids independently
1943 DO igrid_level = 1, SIZE(rs_descs)
1944 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1945
1946 IF (.NOT. skip_load_balance_distributed) &
1947 CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
1948
1949 CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1950 tasks, use_reordered_ranks=.false.)
1951
1952 total_loads(:) = total_loads + loads(:, igrid_level)
1953
1954 END IF
1955 END DO
1956
1957 ! calculate the total load of replicated tasks, so we can decide if it is worth
1958 ! reordering the distributed grid levels
1959
1960 replicated_load = 0
1961 DO igrid_level = 1, SIZE(rs_descs)
1962 IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
1963 CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1964 tasks, use_reordered_ranks=.false.)
1965 replicated_load = replicated_load + sum(loads(:, igrid_level))
1966 END IF
1967 END DO
1968
1969 !IF (desc%my_pos==0) THEN
1970 ! WRITE(*,*) "Total replicated load is ",replicated_load
1971 !END IF
1972
1973 ! Now we adjust the rank ordering based on the current loads
1974 ! we leave the first distributed level and all the replicated levels in the default order
1975 IF (reorder_rs_grid_ranks) THEN
1976 fixed_first_grid = .false.
1977 DO igrid_level = 1, SIZE(rs_descs)
1978 IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1979 IF (fixed_first_grid .EQV. .false.) THEN
1980 total_loads(:) = loads(:, igrid_level)
1981 fixed_first_grid = .true.
1982 ELSE
1983 ALLOCATE (trial_loads(0:desc%group_size - 1))
1984
1985 trial_loads(:) = total_loads + loads(:, igrid_level)
1986 max_load = maxval(trial_loads)
1987 load_gap = 0
1988 DO irank = 0, desc%group_size - 1
1989 load_gap = load_gap + max_load - trial_loads(irank)
1990 END DO
1991
1992 ! If there is not enough replicated load to load balance well enough
1993 ! then we will reorder this grid level
1994 IF (load_gap > replicated_load*1.05_dp) THEN
1995
1996 ALLOCATE (indices(0:desc%group_size - 1))
1997 ALLOCATE (total_index(0:desc%group_size - 1))
1998 ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
1999 ALLOCATE (real2virtual(0:desc%group_size - 1))
2000
2001 total_loads_tmp(:) = total_loads
2002 CALL sort(total_loads_tmp, desc%group_size, total_index)
2003 CALL sort(loads(:, igrid_level), desc%group_size, indices)
2004
2005 ! Reorder so that the rank with smallest load on this grid level is paired with
2006 ! the highest load in total
2007 DO irank = 0, desc%group_size - 1
2008 total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
2009 loads(desc%group_size - irank - 1, igrid_level)
2010 real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
2011 END DO
2012
2013 CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
2014
2015 DEALLOCATE (indices)
2016 DEALLOCATE (total_index)
2017 DEALLOCATE (total_loads_tmp)
2018 DEALLOCATE (real2virtual)
2019 ELSE
2020 total_loads(:) = trial_loads
2021 END IF
2022
2023 DEALLOCATE (trial_loads)
2024
2025 END IF
2026 END IF
2027 END DO
2028 END IF
2029
2030 ! Now we use the replicated tasks to balance out the rest of the load
2031 CALL load_balance_replicated(rs_descs, ntasks, tasks)
2032
2033 !total_loads = 0
2034 !DO igrid_level=1,SIZE(rs_descs)
2035 ! CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
2036 ! tasks, use_reordered_ranks=.TRUE.)
2037 ! total_loads = total_loads + loads(:, igrid_level)
2038 !END DO
2039
2040 !IF (desc%my_pos==0) THEN
2041 ! WRITE(*,*) ""
2042 ! WRITE(*,*) "At the end of the load balancing procedure"
2043 ! WRITE(*,*) "Maximum load:",MAXVAL(total_loads)
2044 ! WRITE(*,*) "Average load:",SUM(total_loads)/SIZE(total_loads)
2045 ! WRITE(*,*) "Minimum load:",MINVAL(total_loads)
2046 !ENDIF
2047
2048 ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
2049 CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
2050
2051 !
2052 ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
2053 ! we will be sending. These lists are needed for redistribute_matrix.
2054 !
2055 CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.true., symmetric=symmetric, rs_descs=rs_descs)
2056
2057 ! natom_send=SIZE(atom_pair_send)
2058 ! CALL desc%group%sum(natom_send)
2059 ! IF (desc%my_pos==0) THEN
2060 ! WRITE(*,*) ""
2061 ! WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
2062 ! ENDIF
2063
2064 CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.false., symmetric=symmetric, rs_descs=rs_descs)
2065
2066 ! cleanup, at this point we don't need the original tasks anymore
2067 DEALLOCATE (tasks)
2068 DEALLOCATE (loads)
2069 DEALLOCATE (total_loads)
2070
2071 ELSE
2072 tasks_recv => tasks
2073 ntasks_recv = ntasks
2074 CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.false., symmetric=symmetric, rs_descs=rs_descs)
2075 ! not distributed, hence atom_pair_send not needed
2076 END IF
2077
2078 ! here we sort the task list we will process locally.
2079 ALLOCATE (indices(ntasks_recv))
2080 CALL tasks_sort(tasks_recv, ntasks_recv, indices)
2081 DEALLOCATE (indices)
2082
2083 !
2084 ! final lists are ready
2085 !
2086
2087 tasks => tasks_recv
2088 ntasks = ntasks_recv
2089
2090 CALL timestop(handle)
2091
2092 END SUBROUTINE distribute_tasks
2093
2094! **************************************************************************************************
2095!> \brief ...
2096!> \param atom_pair ...
2097!> \param my_tasks ...
2098!> \param send ...
2099!> \param symmetric ...
2100!> \param natoms ...
2101!> \param nimages ...
2102!> \param rs_descs ...
2103! **************************************************************************************************
2104 SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
2105
2106 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair
2107 TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: tasks
2108 INTEGER, INTENT(IN) :: ntasks
2109 LOGICAL, INTENT(IN) :: send, symmetric
2110 TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
2111
2112 INTEGER :: i, ilevel, iatom, jatom, npairs, virt_rank
2113 INTEGER, DIMENSION(:), ALLOCATABLE :: indices
2114 TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE :: atom_pair_tmp
2115
2116 cpassert(.NOT. ASSOCIATED(atom_pair))
2117 IF (ntasks == 0) THEN
2118 ALLOCATE (atom_pair(0))
2119 RETURN
2120 END IF
2121
2122 ! calculate list of atom pairs
2123 ! fill pair list taking into account symmetry
2124 ALLOCATE (atom_pair_tmp(ntasks))
2125 DO i = 1, ntasks
2126 atom_pair_tmp(i)%image = tasks(i)%image
2127 iatom = tasks(i)%iatom
2128 jatom = tasks(i)%jatom
2129 IF (symmetric .AND. iatom > jatom) THEN
2130 ! iatom / jatom swapped
2131 atom_pair_tmp(i)%row = jatom
2132 atom_pair_tmp(i)%col = iatom
2133 ELSE
2134 atom_pair_tmp(i)%row = iatom
2135 atom_pair_tmp(i)%col = jatom
2136 END IF
2137
2138 IF (send) THEN
2139 ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
2140 ! actually has the correct part of the rs_grid to do the mapping
2141 ilevel = tasks(i)%grid_level
2142 virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
2143 atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
2144 ELSE
2145 ! If we are receiving, then no conversion is needed as the rank is that of the process with the
2146 ! required matrix block, and the ordering of the rs grid is irrelevant
2147 atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
2148 END IF
2149 END DO
2150
2151 ! find unique atom pairs that I'm sending/receiving
2152 ALLOCATE (indices(ntasks))
2153 CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
2154 npairs = 1
2155 tasks(indices(1))%pair_index = 1
2156 DO i = 2, ntasks
2157 IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
2158 npairs = npairs + 1
2159 atom_pair_tmp(npairs) = atom_pair_tmp(i)
2160 END IF
2161 tasks(indices(i))%pair_index = npairs
2162 END DO
2163 DEALLOCATE (indices)
2164
2165 ! Copy unique pairs to final location.
2166 ALLOCATE (atom_pair(npairs))
2167 atom_pair(:) = atom_pair_tmp(:npairs)
2168 DEALLOCATE (atom_pair_tmp)
2169
2170 END SUBROUTINE get_atom_pair
2171
2172! **************************************************************************************************
2173!> \brief redistributes the matrix so that it can be used in realspace operations
2174!> i.e. according to the task lists for collocate and integrate.
2175!> This routine can become a bottleneck in large calculations.
2176!> \param rs_descs ...
2177!> \param pmats ...
2178!> \param atom_pair_send ...
2179!> \param atom_pair_recv ...
2180!> \param natoms ...
2181!> \param nimages ...
2182!> \param scatter ...
2183!> \param hmats ...
2184! **************************************************************************************************
2185 SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
2186 nimages, scatter, hmats)
2187
2188 TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
2189 POINTER :: rs_descs
2190 TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: pmats
2191 TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
2192 INTEGER :: nimages
2193 LOGICAL :: scatter
2194 TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
2195 POINTER :: hmats
2196
2197 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_distribute_matrix'
2198
2199 INTEGER :: acol, arow, handle, i, img, j, k, l, me, &
2200 nblkcols_total, nblkrows_total, ncol, &
2201 nrow, nthread, nthread_left
2202 INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
2203 recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
2204 send_pair_disps, send_sizes
2205 INTEGER, DIMENSION(:), POINTER :: col_blk_size, row_blk_size
2206 LOGICAL :: found
2207 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recv_buf_r, send_buf_r
2208 REAL(kind=dp), DIMENSION(:, :), POINTER :: h_block, p_block
2209 TYPE(dbcsr_type), POINTER :: hmat, pmat
2210 TYPE(realspace_grid_desc_type), POINTER :: desc
2211
2212!$ INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2213
2214 CALL timeset(routinen, handle)
2215
2216 IF (.NOT. scatter) THEN
2217 cpassert(PRESENT(hmats))
2218 END IF
2219
2220 desc => rs_descs(1)%rs_desc
2221 me = desc%my_pos + 1
2222
2223 ! allocate local arrays
2224 ALLOCATE (send_sizes(desc%group_size))
2225 ALLOCATE (recv_sizes(desc%group_size))
2226 ALLOCATE (send_disps(desc%group_size))
2227 ALLOCATE (recv_disps(desc%group_size))
2228 ALLOCATE (send_pair_count(desc%group_size))
2229 ALLOCATE (recv_pair_count(desc%group_size))
2230 ALLOCATE (send_pair_disps(desc%group_size))
2231 ALLOCATE (recv_pair_disps(desc%group_size))
2232
2233 pmat => pmats(1)%matrix
2234 CALL dbcsr_get_info(pmat, &
2235 row_blk_size=row_blk_size, &
2236 col_blk_size=col_blk_size, &
2237 nblkrows_total=nblkrows_total, &
2238 nblkcols_total=nblkcols_total)
2239 ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
2240 first_col(nblkcols_total), last_col(nblkcols_total))
2241 CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
2242 CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
2243
2244 ! set up send buffer sizes
2245 send_sizes = 0
2246 send_pair_count = 0
2247 DO i = 1, SIZE(atom_pair_send)
2248 k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
2249 arow = atom_pair_send(i)%row
2250 acol = atom_pair_send(i)%col
2251 nrow = last_row(arow) - first_row(arow) + 1
2252 ncol = last_col(acol) - first_col(acol) + 1
2253 send_sizes(k) = send_sizes(k) + nrow*ncol
2254 send_pair_count(k) = send_pair_count(k) + 1
2255 END DO
2256
2257 send_disps = 0
2258 send_pair_disps = 0
2259 DO i = 2, desc%group_size
2260 send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
2261 send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
2262 END DO
2263
2264 ALLOCATE (send_buf_r(sum(send_sizes)))
2265
2266 ! set up recv buffer
2267
2268 recv_sizes = 0
2269 recv_pair_count = 0
2270 DO i = 1, SIZE(atom_pair_recv)
2271 k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
2272 arow = atom_pair_recv(i)%row
2273 acol = atom_pair_recv(i)%col
2274 nrow = last_row(arow) - first_row(arow) + 1
2275 ncol = last_col(acol) - first_col(acol) + 1
2276 recv_sizes(k) = recv_sizes(k) + nrow*ncol
2277 recv_pair_count(k) = recv_pair_count(k) + 1
2278 END DO
2279
2280 recv_disps = 0
2281 recv_pair_disps = 0
2282 DO i = 2, desc%group_size
2283 recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
2284 recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
2285 END DO
2286 ALLOCATE (recv_buf_r(sum(recv_sizes)))
2287
2288!$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
2289!$OMP SHARED(desc,send_pair_count,send_pair_disps,nimages),&
2290!$OMP SHARED(last_row,first_row,last_col,first_col),&
2291!$OMP SHARED(pmats,send_buf_r,send_disps,send_sizes),&
2292!$OMP SHARED(atom_pair_send,me,hmats,nblkrows_total),&
2293!$OMP SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
2294!$OMP SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
2295!$OMP PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
2296!$OMP PRIVATE(nthread,h_block,nthread_left,hmat,pmat)
2297
2298 nthread = 1
2299!$ nthread = omp_get_num_threads()
2300 nthread_left = 1
2301!$ nthread_left = MAX(1, nthread - 1)
2302
2303 ! do packing
2304!$OMP DO schedule(guided)
2305 DO l = 1, desc%group_size
2306 IF (l .EQ. me) cycle
2307 send_sizes(l) = 0
2308 DO i = 1, send_pair_count(l)
2309 arow = atom_pair_send(send_pair_disps(l) + i)%row
2310 acol = atom_pair_send(send_pair_disps(l) + i)%col
2311 img = atom_pair_send(send_pair_disps(l) + i)%image
2312 nrow = last_row(arow) - first_row(arow) + 1
2313 ncol = last_col(acol) - first_col(acol) + 1
2314 pmat => pmats(img)%matrix
2315 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2316 cpassert(found)
2317
2318 DO k = 1, ncol
2319 DO j = 1, nrow
2320 send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
2321 END DO
2322 END DO
2323 send_sizes(l) = send_sizes(l) + nrow*ncol
2324 END DO
2325 END DO
2326!$OMP END DO
2327
2328 IF (.NOT. scatter) THEN
2329 ! We need locks to protect concurrent summation into H
2330!$OMP SINGLE
2331!$ ALLOCATE (locks(nthread*10))
2332!$OMP END SINGLE
2333
2334!$OMP DO
2335!$ do i = 1, nthread*10
2336!$ call omp_init_lock(locks(i))
2337!$ end do
2338!$OMP END DO
2339 END IF
2340
2341!$OMP MASTER
2342 ! do communication
2343 CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
2344 recv_buf_r, recv_sizes, recv_disps)
2345!$OMP END MASTER
2346
2347 ! If this is a scatter, then no need to copy local blocks,
2348 ! If not, we sum them directly into H (bypassing the alltoall)
2349 IF (.NOT. scatter) THEN
2350
2351 ! Distribute work over remaining threads assuming one is still in the alltoall
2352!$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
2353 DO i = 1, send_pair_count(me)
2354 arow = atom_pair_send(send_pair_disps(me) + i)%row
2355 acol = atom_pair_send(send_pair_disps(me) + i)%col
2356 img = atom_pair_send(send_pair_disps(me) + i)%image
2357 nrow = last_row(arow) - first_row(arow) + 1
2358 ncol = last_col(acol) - first_col(acol) + 1
2359 hmat => hmats(img)%matrix
2360 pmat => pmats(img)%matrix
2361 CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, block=h_block, found=found)
2362 cpassert(found)
2363 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2364 cpassert(found)
2365
2366!$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2367 DO k = 1, ncol
2368 DO j = 1, nrow
2369 h_block(j, k) = h_block(j, k) + p_block(j, k)
2370 END DO
2371 END DO
2372!$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2373 END DO
2374!$OMP END DO
2375 ELSE
2376 ! We will insert new blocks into P, so create mutable work matrices
2377 DO img = 1, nimages
2378 pmat => pmats(img)%matrix
2379 CALL dbcsr_work_create(pmat, work_mutable=.true., &
2380 nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
2381 n=nthread)
2382 END DO
2383 END IF
2384
2385! wait for comm and setup to finish
2386!$OMP BARRIER
2387
2388 !do unpacking
2389!$OMP DO schedule(guided)
2390 DO l = 1, desc%group_size
2391 IF (l .EQ. me) cycle
2392 recv_sizes(l) = 0
2393 DO i = 1, recv_pair_count(l)
2394 arow = atom_pair_recv(recv_pair_disps(l) + i)%row
2395 acol = atom_pair_recv(recv_pair_disps(l) + i)%col
2396 img = atom_pair_recv(recv_pair_disps(l) + i)%image
2397 nrow = last_row(arow) - first_row(arow) + 1
2398 ncol = last_col(acol) - first_col(acol) + 1
2399 pmat => pmats(img)%matrix
2400 NULLIFY (p_block)
2401 CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2402
2403 IF (PRESENT(hmats)) THEN
2404 hmat => hmats(img)%matrix
2405 CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, block=h_block, found=found)
2406 cpassert(found)
2407 END IF
2408
2409 IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
2410 CALL dbcsr_put_block(pmat, arow, acol, &
2411 block=recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol))
2412 END IF
2413 IF (.NOT. scatter) THEN
2414!$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2415 DO k = 1, ncol
2416 DO j = 1, nrow
2417 h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
2418 END DO
2419 END DO
2420!$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2421 END IF
2422 recv_sizes(l) = recv_sizes(l) + nrow*ncol
2423 END DO
2424 END DO
2425!$OMP END DO
2426
2427!$ IF (.not. scatter) THEN
2428!$OMP DO
2429!$ do i = 1, nthread*10
2430!$ call omp_destroy_lock(locks(i))
2431!$ end do
2432!$OMP END DO
2433!$ END IF
2434
2435!$OMP SINGLE
2436!$ IF (.not. scatter) THEN
2437!$ DEALLOCATE (locks)
2438!$ END IF
2439!$OMP END SINGLE NOWAIT
2440
2441 IF (scatter) THEN
2442 ! Blocks were added to P
2443 DO img = 1, nimages
2444 pmat => pmats(img)%matrix
2445 CALL dbcsr_finalize(pmat)
2446 END DO
2447 END IF
2448!$OMP END PARALLEL
2449
2450 DEALLOCATE (send_buf_r)
2451 DEALLOCATE (recv_buf_r)
2452
2453 DEALLOCATE (send_sizes)
2454 DEALLOCATE (recv_sizes)
2455 DEALLOCATE (send_disps)
2456 DEALLOCATE (recv_disps)
2457 DEALLOCATE (send_pair_count)
2458 DEALLOCATE (recv_pair_count)
2459 DEALLOCATE (send_pair_disps)
2460 DEALLOCATE (recv_pair_disps)
2461
2462 DEALLOCATE (first_row, last_row, first_col, last_col)
2463
2464 CALL timestop(handle)
2465
2466 END SUBROUTINE rs_distribute_matrix
2467
2468! **************************************************************************************************
2469!> \brief Calculates offsets and sizes for rs_scatter_matrix and rs_copy_matrix.
2470!> \author Ole Schuett
2471! **************************************************************************************************
2472 SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
2473 pair_offsets, rank_offsets, rank_sizes, buffer_size)
2474 TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: pairs
2475 INTEGER, DIMENSION(:), INTENT(IN) :: nsgf
2476 INTEGER, INTENT(IN) :: group_size
2477 INTEGER, DIMENSION(:), POINTER :: pair_offsets, rank_offsets, rank_sizes
2478 INTEGER, INTENT(INOUT) :: buffer_size
2479
2480 INTEGER :: acol, arow, i, block_size, total_size, k, prev_k
2481
2482 IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
2483 IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
2484 IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
2485
2486 ! calculate buffer_size and pair_offsets
2487 ALLOCATE (pair_offsets(SIZE(pairs)))
2488 total_size = 0
2489 DO i = 1, SIZE(pairs)
2490 pair_offsets(i) = total_size
2491 arow = pairs(i)%row
2492 acol = pairs(i)%col
2493 block_size = nsgf(arow)*nsgf(acol)
2494 total_size = total_size + block_size
2495 END DO
2496 buffer_size = total_size
2497
2498 ! calculate rank_offsets and rank_sizes
2499 ALLOCATE (rank_offsets(group_size))
2500 ALLOCATE (rank_sizes(group_size))
2501 rank_offsets = 0
2502 rank_sizes = 0
2503 IF (SIZE(pairs) > 0) THEN
2504 prev_k = pairs(1)%rank + 1
2505 DO i = 1, SIZE(pairs)
2506 k = pairs(i)%rank + 1
2507 cpassert(k >= prev_k) ! expecting the pairs to be ordered by rank
2508 IF (k > prev_k) THEN
2509 rank_offsets(k) = pair_offsets(i)
2510 rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
2511 prev_k = k
2512 END IF
2513 END DO
2514 rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
2515 END IF
2516
2517 END SUBROUTINE rs_calc_offsets
2518
2519! **************************************************************************************************
2520!> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
2521!> \author Ole Schuett
2522! **************************************************************************************************
2523 SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
2524 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2525 TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2526 TYPE(task_list_type), INTENT(IN) :: task_list
2527 TYPE(mp_comm_type), INTENT(IN) :: group
2528
2529 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_scatter_matrices'
2530
2531 INTEGER :: handle
2532 REAL(kind=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2533
2534 CALL timeset(routinen, handle)
2535 ALLOCATE (buffer_send(task_list%buffer_size_send))
2536
2537 ! pack dbcsr blocks into send buffer
2538 cpassert(ASSOCIATED(task_list%atom_pair_send))
2539 CALL rs_pack_buffer(src_matrices=src_matrices, &
2540 dest_buffer=buffer_send, &
2541 atom_pair=task_list%atom_pair_send, &
2542 pair_offsets=task_list%pair_offsets_send)
2543
2544 ! mpi all-to-all communication, receiving directly into blocks_recv%buffer.
2545 CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
2546 dest_buffer%host_buffer, &
2547 task_list%rank_sizes_recv, task_list%rank_offsets_recv)
2548
2549 DEALLOCATE (buffer_send)
2550 CALL timestop(handle)
2551
2552 END SUBROUTINE rs_scatter_matrices
2553
2554! **************************************************************************************************
2555!> \brief Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
2556!> \author Ole Schuett
2557! **************************************************************************************************
2558 SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
2559 TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2560 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2561 TYPE(task_list_type), INTENT(IN) :: task_list
2562 TYPE(mp_comm_type), INTENT(IN) :: group
2563
2564 CHARACTER(LEN=*), PARAMETER :: routinen = 'rs_gather_matrices'
2565
2566 INTEGER :: handle
2567 REAL(kind=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2568
2569 CALL timeset(routinen, handle)
2570
2571 ! Caution: The meaning of send and recv are reversed in this routine.
2572 ALLOCATE (buffer_send(task_list%buffer_size_send)) ! e.g. this is actually used for receiving
2573
2574 ! mpi all-to-all communication
2575 CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
2576 buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
2577
2578 ! unpack dbcsr blocks from send buffer
2579 cpassert(ASSOCIATED(task_list%atom_pair_send))
2580 CALL rs_unpack_buffer(src_buffer=buffer_send, &
2581 dest_matrices=dest_matrices, &
2582 atom_pair=task_list%atom_pair_send, &
2583 pair_offsets=task_list%pair_offsets_send)
2584
2585 DEALLOCATE (buffer_send)
2586 CALL timestop(handle)
2587
2588 END SUBROUTINE rs_gather_matrices
2589
2590! **************************************************************************************************
2591!> \brief Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
2592!> \author Ole Schuett
2593! **************************************************************************************************
2594 SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
2595 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2596 TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2597 TYPE(task_list_type), INTENT(IN) :: task_list
2598
2599 CALL rs_pack_buffer(src_matrices=src_matrices, &
2600 dest_buffer=dest_buffer%host_buffer, &
2601 atom_pair=task_list%atom_pair_recv, &
2602 pair_offsets=task_list%pair_offsets_recv)
2603
2604 END SUBROUTINE rs_copy_to_buffer
2605
2606! **************************************************************************************************
2607!> \brief Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
2608!> \author Ole Schuett
2609! **************************************************************************************************
2610 SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
2611 TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2612 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2613 TYPE(task_list_type), INTENT(IN) :: task_list
2614
2615 CALL rs_unpack_buffer(src_buffer=src_buffer%host_buffer, &
2616 dest_matrices=dest_matrices, &
2617 atom_pair=task_list%atom_pair_recv, &
2618 pair_offsets=task_list%pair_offsets_recv)
2619
2620 END SUBROUTINE rs_copy_to_matrices
2621
2622! **************************************************************************************************
2623!> \brief Helper routine for rs_scatter_matrix and rs_copy_to_buffer.
2624!> \author Ole Schuett
2625! **************************************************************************************************
2626 SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
2627 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2628 REAL(kind=dp), DIMENSION(:), INTENT(INOUT) :: dest_buffer
2629 TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2630 INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2631
2632 INTEGER :: acol, arow, img, i, offset, block_size
2633 LOGICAL :: found
2634 REAL(kind=dp), DIMENSION(:, :), POINTER :: block
2635
2636!$OMP PARALLEL DEFAULT(NONE), &
2637!$OMP SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
2638!$OMP PRIVATE(acol,arow,img,i,offset,block_size,found,block)
2639!$OMP DO schedule(guided)
2640 DO i = 1, SIZE(atom_pair)
2641 arow = atom_pair(i)%row
2642 acol = atom_pair(i)%col
2643 img = atom_pair(i)%image
2644 CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=arow, col=acol, &
2645 block=block, found=found)
2646 cpassert(found)
2647 block_size = SIZE(block)
2648 offset = pair_offsets(i)
2649 dest_buffer(offset + 1:offset + block_size) = reshape(block, shape=(/block_size/))
2650 END DO
2651!$OMP END DO
2652!$OMP END PARALLEL
2653
2654 END SUBROUTINE rs_pack_buffer
2655
2656! **************************************************************************************************
2657!> \brief Helper routine for rs_gather_matrix and rs_copy_to_matrices.
2658!> \author Ole Schuett
2659! **************************************************************************************************
2660 SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
2661 REAL(kind=dp), DIMENSION(:), INTENT(IN) :: src_buffer
2662 TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2663 TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2664 INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2665
2666 INTEGER :: acol, arow, img, i, offset, &
2667 nrows, ncols, lock_num
2668 LOGICAL :: found
2669 REAL(kind=dp), DIMENSION(:, :), POINTER :: block
2670 INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2671
2672 ! initialize locks
2673 ALLOCATE (locks(10*omp_get_max_threads()))
2674 DO i = 1, SIZE(locks)
2675 CALL omp_init_lock(locks(i))
2676 END DO
2677
2678!$OMP PARALLEL DEFAULT(NONE), &
2679!$OMP SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
2680!$OMP PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
2681!$OMP DO schedule(guided)
2682 DO i = 1, SIZE(atom_pair)
2683 arow = atom_pair(i)%row
2684 acol = atom_pair(i)%col
2685 img = atom_pair(i)%image
2686 CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
2687 block=block, found=found)
2688 cpassert(found)
2689 nrows = SIZE(block, 1)
2690 ncols = SIZE(block, 2)
2691 offset = pair_offsets(i)
2692 lock_num = modulo(arow, SIZE(locks)) + 1 ! map matrix rows round-robin to available locks
2693
2694 CALL omp_set_lock(locks(lock_num))
2695 block = block + reshape(src_buffer(offset + 1:offset + nrows*ncols), shape=(/nrows, ncols/))
2696 CALL omp_unset_lock(locks(lock_num))
2697 END DO
2698!$OMP END DO
2699!$OMP END PARALLEL
2700
2701 ! destroy locks
2702 DO i = 1, SIZE(locks)
2703 CALL omp_destroy_lock(locks(i))
2704 END DO
2705 DEALLOCATE (locks)
2706
2707 END SUBROUTINE rs_unpack_buffer
2708
2709! **************************************************************************************************
2710!> \brief determines the rank of the processor who's real rs grid contains point
2711!> \param rs_desc ...
2712!> \param igrid_level ...
2713!> \param n_levels ...
2714!> \param cube_center ...
2715!> \param ntasks ...
2716!> \param tasks ...
2717!> \param lb_cube ...
2718!> \param ub_cube ...
2719!> \param added_tasks ...
2720!> \par History
2721!> 11.2007 created [MattW]
2722!> 10.2008 rewritten [Joost VandeVondele]
2723!> \author MattW
2724! **************************************************************************************************
2725 SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
2726 lb_cube, ub_cube, added_tasks)
2727
2728 TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2729 INTEGER, INTENT(IN) :: igrid_level, n_levels
2730 INTEGER, DIMENSION(3), INTENT(IN) :: cube_center
2731 INTEGER, INTENT(INOUT) :: ntasks
2732 TYPE(task_type), DIMENSION(:), POINTER :: tasks
2733 INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
2734 INTEGER, INTENT(OUT) :: added_tasks
2735
2736 INTEGER, PARAMETER :: add_tasks = 1000
2737 REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
2738
2739 INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
2740 lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
2741 INTEGER :: bit_pattern
2742 LOGICAL :: dir_periodic(3)
2743
2744 coord(1) = rs_desc%x2coord(cube_center(1))
2745 coord(2) = rs_desc%y2coord(cube_center(2))
2746 coord(3) = rs_desc%z2coord(cube_center(3))
2747 dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
2748
2749 ! the real cube coordinates
2750 lbc = lb_cube + cube_center
2751 ubc = ub_cube + cube_center
2752
2753 IF (all((rs_desc%lb_global(:, dest) - rs_desc%border) .LE. lbc) .AND. &
2754 all((rs_desc%ub_global(:, dest) + rs_desc%border) .GE. ubc)) THEN
2755 !standard distributed collocation/integration
2756 tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
2757 tasks(ntasks)%dist_type = 1
2758 tasks(ntasks)%subpatch_pattern = 0
2759 added_tasks = 1
2760
2761 ! here we figure out if there is an alternate location for this task
2762 ! i.e. even though the cube_center is not in the real local domain,
2763 ! it might fully fit in the halo of the neighbor
2764 ! if its radius is smaller than the maximum radius
2765 ! the list of possible neighbors is stored here as a bit pattern
2766 ! to reconstruct what the rank of the neigbor is,
2767 ! we can use (note this requires the correct rs_grid)
2768 ! IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/-1,0,0/))
2769 ! IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/+1,0,0/))
2770 ! IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,-1,0/))
2771 ! IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,+1,0/))
2772 ! IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,0,-1/))
2773 ! IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,0,+1/))
2774 bit_index = 0
2775 bit_pattern = 0
2776 DO i = 1, 3
2777 IF (rs_desc%perd(i) == 1) THEN
2778 bit_pattern = ibclr(bit_pattern, bit_index)
2779 bit_index = bit_index + 1
2780 bit_pattern = ibclr(bit_pattern, bit_index)
2781 bit_index = bit_index + 1
2782 ELSE
2783 ! fits the left neighbor ?
2784 IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
2785 bit_pattern = ibset(bit_pattern, bit_index)
2786 bit_index = bit_index + 1
2787 ELSE
2788 bit_pattern = ibclr(bit_pattern, bit_index)
2789 bit_index = bit_index + 1
2790 END IF
2791 ! fits the right neighbor ?
2792 IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
2793 bit_pattern = ibset(bit_pattern, bit_index)
2794 bit_index = bit_index + 1
2795 ELSE
2796 bit_pattern = ibclr(bit_pattern, bit_index)
2797 bit_index = bit_index + 1
2798 END IF
2799 END IF
2800 END DO
2801 tasks(ntasks)%subpatch_pattern = bit_pattern
2802
2803 ELSE
2804 ! generalised collocation/integration needed
2805 ! first we figure out how many neighbors we have to add to include the lbc/ubc
2806 ! in the available domains (inclusive of halo points)
2807 ! first we 'ignore' periodic boundary conditions
2808 ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
2809 lb_coord = coord
2810 ub_coord = coord
2811 lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
2812 ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
2813 DO i = 1, 3
2814 ! only if the grid is not periodic in this direction we need to take care of adding neighbors
2815 IF (rs_desc%perd(i) == 0) THEN
2816 ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
2817 DO
2818 IF (lb_domain(i) > lbc(i)) THEN
2819 lb_coord(i) = lb_coord(i) - 1
2820 icoord = modulo(lb_coord, rs_desc%group_dim)
2821 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2822 lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2823 ELSE
2824 EXIT
2825 END IF
2826 END DO
2827 ! same for the upper bound
2828 DO
2829 IF (ub_domain(i) < ubc(i)) THEN
2830 ub_coord(i) = ub_coord(i) + 1
2831 icoord = modulo(ub_coord, rs_desc%group_dim)
2832 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2833 ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2834 ELSE
2835 EXIT
2836 END IF
2837 END DO
2838 END IF
2839 END DO
2840
2841 ! some care is needed for the periodic boundaries
2842 DO i = 1, 3
2843 IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
2844 dir_periodic(i) = .true.
2845 lb_coord(i) = 0
2846 ub_coord(i) = rs_desc%group_dim(i) - 1
2847 ELSE
2848 dir_periodic(i) = .false.
2849 END IF
2850 END DO
2851
2852 added_tasks = product(ub_coord - lb_coord + 1)
2853 itask = ntasks
2854 ntasks = ntasks + added_tasks - 1
2855 IF (ntasks > SIZE(tasks)) THEN
2856 curr_tasks = int((SIZE(tasks) + add_tasks)*mult_tasks)
2857 CALL reallocate_tasks(tasks, curr_tasks)
2858 END IF
2859 DO iz = lb_coord(3), ub_coord(3)
2860 DO iy = lb_coord(2), ub_coord(2)
2861 DO ix = lb_coord(1), ub_coord(1)
2862 icoord = modulo((/ix, iy, iz/), rs_desc%group_dim)
2863 idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2864 tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
2865 tasks(itask)%dist_type = 2
2866 tasks(itask)%subpatch_pattern = 0
2867 ! encode the domain size for this task
2868 ! if the bit is set, we need to add the border in that direction
2869 IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
2870 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 0)
2871 IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
2872 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 1)
2873 IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
2874 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 2)
2875 IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
2876 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 3)
2877 IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
2878 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 4)
2879 IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
2880 tasks(itask)%subpatch_pattern = ibset(tasks(itask)%subpatch_pattern, 5)
2881 itask = itask + 1
2882 END DO
2883 END DO
2884 END DO
2885 END IF
2886
2887 END SUBROUTINE rs_find_node
2888
2889! **************************************************************************************************
2890!> \brief utility functions for encoding the grid level with a rank, allowing
2891!> different grid levels to maintain a different rank ordering without
2892!> losing information. These encoded_ints are stored in the tasks(1:2,:) array
2893!> \param rank ...
2894!> \param grid_level ...
2895!> \param n_levels ...
2896!> \return ...
2897!> \par History
2898!> 4.2009 created [Iain Bethune]
2899!> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2900! **************************************************************************************************
2901 FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
2902
2903 INTEGER, INTENT(IN) :: rank, grid_level, n_levels
2904 INTEGER :: encoded_int
2905
2906! ordered so can still sort by rank
2907
2908 encoded_int = rank*n_levels + grid_level - 1
2909
2910 END FUNCTION
2911
2912! **************************************************************************************************
2913!> \brief ...
2914!> \param encoded_int ...
2915!> \param n_levels ...
2916!> \return ...
2917! **************************************************************************************************
2918 FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
2919
2920 INTEGER, INTENT(IN) :: encoded_int
2921 INTEGER, INTENT(IN) :: n_levels
2922 INTEGER :: rank
2923
2924 rank = int(encoded_int/n_levels)
2925
2926 END FUNCTION
2927
2928! **************************************************************************************************
2929!> \brief ...
2930!> \param encoded_int ...
2931!> \param n_levels ...
2932!> \return ...
2933! **************************************************************************************************
2934 FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
2935
2936 INTEGER, INTENT(IN) :: encoded_int
2937 INTEGER, INTENT(IN) :: n_levels
2938 INTEGER :: grid_level
2939
2940 grid_level = int(modulo(encoded_int, n_levels)) + 1
2941
2942 END FUNCTION decode_level
2943
2944! **************************************************************************************************
2945!> \brief Sort pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) for which all atom pairs are
2946!> grouped, and for each atom pair all set pairs are grouped and for each set pair,
2947!> all pgfs are grouped.
2948!> This yields the right order of the tasks for collocation after the sort
2949!> in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
2950!> The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
2951!> that a given density matrix block will be decontracted several times,
2952!> but cache effects on the grid make up for this.
2953!> \param a ...
2954!> \param b ...
2955!> \return ...
2956!> \author Ole Schuett
2957! **************************************************************************************************
2958 PURE FUNCTION tasks_less_than(a, b) RESULT(res)
2959 TYPE(task_type), INTENT(IN) :: a, b
2960 LOGICAL :: res
2961
2962 IF (a%grid_level /= b%grid_level) THEN
2963 res = a%grid_level < b%grid_level
2964
2965 ELSE IF (a%image /= b%image) THEN
2966 res = a%image < b%image
2967
2968 ELSE IF (a%iatom /= b%iatom) THEN
2969 res = a%iatom < b%iatom
2970
2971 ELSE IF (a%jatom /= b%jatom) THEN
2972 res = a%jatom < b%jatom
2973
2974 ELSE IF (a%iset /= b%iset) THEN
2975 res = a%iset < b%iset
2976
2977 ELSE IF (a%jset /= b%jset) THEN
2978 res = a%jset < b%jset
2979
2980 ELSE IF (a%ipgf /= b%ipgf) THEN
2981 res = a%ipgf < b%ipgf
2982
2983 ELSE
2984 res = a%jpgf < b%jpgf
2985
2986 END IF
2987 END FUNCTION tasks_less_than
2988
2989
2990! **************************************************************************************************
2991!> \brief Sorts an array inplace using a combination of merge- and bubble-sort.
2992!> It also returns the indices, which the elements had before the sort.
2993!> \param arr the array to sort
2994!> \param n length of array
2995!> \param indices returns elements-indices before the sort
2996!> \par History
2997!> 12.2012 created [ole]
2998!> \author Ole Schuett
2999! **************************************************************************************************
3000 SUBROUTINE tasks_sort(arr, n, indices)
3001 INTEGER, INTENT(IN) :: n
3002 TYPE(task_type), DIMENSION(1:n), INTENT(INOUT) :: arr
3003 integer, DIMENSION(1:n), INTENT(INOUT) :: indices
3004
3005 INTEGER :: i
3006 TYPE(task_type), ALLOCATABLE :: tmp_arr(:)
3007 INTEGER, ALLOCATABLE :: tmp_idx(:)
3008
3009 IF (n > 1) THEN
3010 ! scratch space used during the merge step
3011 ALLOCATE (tmp_arr((n + 1)/2), tmp_idx((n + 1)/2))
3012
3013 indices = (/(i, i=1, n)/)
3014
3015 CALL tasks_sort_low(arr(1:n), indices, tmp_arr, tmp_idx)
3016
3017 DEALLOCATE (tmp_arr, tmp_idx)
3018 ELSE IF (n > 0) THEN ! can be a frequent case in cp2k
3019 indices(1) = 1
3020 END IF
3021
3022 END SUBROUTINE tasks_sort
3023
3024! **************************************************************************************************
3025!> \brief The actual sort routing. Only tasks_sort and itself should call this.
3026!> \param arr the array to sort
3027!> \param indices elements-indices before the sort
3028!> \param tmp_arr scratch space
3029!> \param tmp_idx scratch space
3030!> \par History
3031!> 12.2012 created [ole]
3032!> \author Ole Schuett
3033! **************************************************************************************************
3034 RECURSIVE SUBROUTINE tasks_sort_low(arr, indices, tmp_arr, tmp_idx)
3035 TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: arr
3036 INTEGER, DIMENSION(size(arr)), INTENT(INOUT) :: indices
3037 TYPE(task_type), DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_arr
3038 INTEGER, DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_idx
3039 TYPE(task_type) :: a
3040 INTEGER :: t, m, i, j, k
3041 LOGICAL :: swapped
3042 ! a,t: used during swaping of elements in arr and indices
3043
3044 swapped = .true.
3045
3046 ! If only a few elements are left we switch to bubble-sort for efficiency.
3047 IF (size(arr) <= 7) THEN ! 7 seems to be a good choice for the moment
3048 DO j = size(arr) - 1, 1, -1
3049 swapped = .false.
3050 DO i = 1, j
3051 IF (tasks_less_than(arr(i + 1), arr(i))) THEN
3052 ! swap arr(i) with arr(i+1)
3053 a = arr(i)
3054 arr(i) = arr(i + 1)
3055 arr(i + 1) = a
3056 ! swap indices(i) with indices(i+1)
3057 t = indices(i)
3058 indices(i) = indices(i + 1)
3059 indices(i + 1) = t
3060 swapped = .true.
3061 END IF
3062 END DO
3063 IF (.NOT. swapped) EXIT
3064 END DO
3065 RETURN
3066 END IF
3067
3068 ! split list in half and recursively sort both sublists
3069 m = (size(arr) + 1)/2 ! index where we're going to divide the list in two
3070 CALL tasks_sort_low(arr(1:m), indices(1:m), tmp_arr, tmp_idx)
3071 CALL tasks_sort_low(arr(m + 1:), indices(m + 1:), tmp_arr, tmp_idx)
3072
3073 ! Check for a special case: Can we just concate the two sorted sublists?
3074 ! This leads to O(n) scaling if the input is already sorted.
3075 IF (tasks_less_than(arr(m + 1), arr(m))) THEN
3076 ! ...no - let's merge the two sorted sublists arr(:m) and arr(m+1:)
3077 ! Merge will be performed directly in arr. Need backup of first sublist.
3078 tmp_arr(1:m) = arr(1:m)
3079 tmp_idx(1:m) = indices(1:m)
3080 i = 1; ! number of elemens consumed from 1st sublist
3081 j = 1; ! number of elemens consumed from 2nd sublist
3082 k = 1; ! number of elemens already merged
3083
3084 DO WHILE (i <= m .and. j <= size(arr) - m)
3085 IF (tasks_less_than(arr(m + j), tmp_arr(i))) THEN
3086 arr(k) = arr(m + j)
3087 indices(k) = indices(m + j)
3088 j = j + 1
3089 ELSE
3090 arr(k) = tmp_arr(i)
3091 indices(k) = tmp_idx(i)
3092 i = i + 1
3093 END IF
3094 k = k + 1
3095 END DO
3096
3097 ! One of the two sublist is now empty.
3098 ! Copy possibly remaining tail of 1st sublist
3099 DO WHILE (i <= m)
3100 arr(k) = tmp_arr(i)
3101 indices(k) = tmp_idx(i)
3102 i = i + 1
3103 k = k + 1
3104 END DO
3105 ! The possibly remaining tail of 2nd sublist is already at the right spot.
3106 END IF
3107
3108 END SUBROUTINE tasks_sort_low
3109
3110
3111! **************************************************************************************************
3112!> \brief Order atom pairs to find duplicates.
3113!> \param a ...
3114!> \param b ...
3115!> \return ...
3116!> \author Ole Schuett
3117! **************************************************************************************************
3118 PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
3119 TYPE(atom_pair_type), INTENT(IN) :: a, b
3120 LOGICAL :: res
3121
3122 IF (a%rank /= b%rank) THEN
3123 res = a%rank < b%rank
3124
3125 ELSE IF (a%row /= b%row) THEN
3126 res = a%row < b%row
3127
3128 ELSE IF (a%col /= b%col) THEN
3129 res = a%col < b%col
3130
3131 ELSE
3132 res = a%image < b%image
3133
3134 END IF
3135 END FUNCTION atom_pair_less_than
3136
3137
3138! **************************************************************************************************
3139!> \brief Sorts an array inplace using a combination of merge- and bubble-sort.
3140!> It also returns the indices, which the elements had before the sort.
3141!> \param arr the array to sort
3142!> \param n length of array
3143!> \param indices returns elements-indices before the sort
3144!> \par History
3145!> 12.2012 created [ole]
3146!> \author Ole Schuett
3147! **************************************************************************************************
3148 SUBROUTINE atom_pair_sort(arr, n, indices)
3149 INTEGER, INTENT(IN) :: n
3150 TYPE(atom_pair_type), DIMENSION(1:n), INTENT(INOUT) :: arr
3151 integer, DIMENSION(1:n), INTENT(INOUT) :: indices
3152
3153 INTEGER :: i
3154 TYPE(atom_pair_type), ALLOCATABLE :: tmp_arr(:)
3155 INTEGER, ALLOCATABLE :: tmp_idx(:)
3156
3157 IF (n > 1) THEN
3158 ! scratch space used during the merge step
3159 ALLOCATE (tmp_arr((n + 1)/2), tmp_idx((n + 1)/2))
3160
3161 indices = (/(i, i=1, n)/)
3162
3163 CALL atom_pair_sort_low(arr(1:n), indices, tmp_arr, tmp_idx)
3164
3165 DEALLOCATE (tmp_arr, tmp_idx)
3166 ELSE IF (n > 0) THEN ! can be a frequent case in cp2k
3167 indices(1) = 1
3168 END IF
3169
3170 END SUBROUTINE atom_pair_sort
3171
3172! **************************************************************************************************
3173!> \brief The actual sort routing. Only atom_pair_sort and itself should call this.
3174!> \param arr the array to sort
3175!> \param indices elements-indices before the sort
3176!> \param tmp_arr scratch space
3177!> \param tmp_idx scratch space
3178!> \par History
3179!> 12.2012 created [ole]
3180!> \author Ole Schuett
3181! **************************************************************************************************
3182 RECURSIVE SUBROUTINE atom_pair_sort_low(arr, indices, tmp_arr, tmp_idx)
3183 TYPE(atom_pair_type), DIMENSION(:), INTENT(INOUT) :: arr
3184 INTEGER, DIMENSION(size(arr)), INTENT(INOUT) :: indices
3185 TYPE(atom_pair_type), DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_arr
3186 INTEGER, DIMENSION((size(arr) + 1)/2), INTENT(INOUT) :: tmp_idx
3187 TYPE(atom_pair_type) :: a
3188 INTEGER :: t, m, i, j, k
3189 LOGICAL :: swapped
3190 ! a,t: used during swaping of elements in arr and indices
3191
3192 swapped = .true.
3193
3194 ! If only a few elements are left we switch to bubble-sort for efficiency.
3195 IF (size(arr) <= 7) THEN ! 7 seems to be a good choice for the moment
3196 DO j = size(arr) - 1, 1, -1
3197 swapped = .false.
3198 DO i = 1, j
3199 IF (atom_pair_less_than(arr(i + 1), arr(i))) THEN
3200 ! swap arr(i) with arr(i+1)
3201 a = arr(i)
3202 arr(i) = arr(i + 1)
3203 arr(i + 1) = a
3204 ! swap indices(i) with indices(i+1)
3205 t = indices(i)
3206 indices(i) = indices(i + 1)
3207 indices(i + 1) = t
3208 swapped = .true.
3209 END IF
3210 END DO
3211 IF (.NOT. swapped) EXIT
3212 END DO
3213 RETURN
3214 END IF
3215
3216 ! split list in half and recursively sort both sublists
3217 m = (size(arr) + 1)/2 ! index where we're going to divide the list in two
3218 CALL atom_pair_sort_low(arr(1:m), indices(1:m), tmp_arr, tmp_idx)
3219 CALL atom_pair_sort_low(arr(m + 1:), indices(m + 1:), tmp_arr, tmp_idx)
3220
3221 ! Check for a special case: Can we just concate the two sorted sublists?
3222 ! This leads to O(n) scaling if the input is already sorted.
3223 IF (atom_pair_less_than(arr(m + 1), arr(m))) THEN
3224 ! ...no - let's merge the two sorted sublists arr(:m) and arr(m+1:)
3225 ! Merge will be performed directly in arr. Need backup of first sublist.
3226 tmp_arr(1:m) = arr(1:m)
3227 tmp_idx(1:m) = indices(1:m)
3228 i = 1; ! number of elemens consumed from 1st sublist
3229 j = 1; ! number of elemens consumed from 2nd sublist
3230 k = 1; ! number of elemens already merged
3231
3232 DO WHILE (i <= m .and. j <= size(arr) - m)
3233 IF (atom_pair_less_than(arr(m + j), tmp_arr(i))) THEN
3234 arr(k) = arr(m + j)
3235 indices(k) = indices(m + j)
3236 j = j + 1
3237 ELSE
3238 arr(k) = tmp_arr(i)
3239 indices(k) = tmp_idx(i)
3240 i = i + 1
3241 END IF
3242 k = k + 1
3243 END DO
3244
3245 ! One of the two sublist is now empty.
3246 ! Copy possibly remaining tail of 1st sublist
3247 DO WHILE (i <= m)
3248 arr(k) = tmp_arr(i)
3249 indices(k) = tmp_idx(i)
3250 i = i + 1
3251 k = k + 1
3252 END DO
3253 ! The possibly remaining tail of 2nd sublist is already at the right spot.
3254 END IF
3255
3256 END SUBROUTINE atom_pair_sort_low
3257
3258
3259END MODULE task_list_methods
void grid_create_basis_set(const int nset, const int nsgf, const int maxco, const int maxpgf, const int lmin[nset], const int lmax[nset], const int npgf[nset], const int nsgf_set[nset], const int first_sgf[nset], const double sphi[nsgf][maxco], const double zet[nset][maxpgf], grid_basis_set **basis_set_out)
Allocates a basis set which can be passed to grid_create_task_list. See grid_task_list....
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
All kind of helpful little routines.
Definition ao_util.F:14
real(kind=dp) function, public exp_radius_very_extended(la_min, la_max, lb_min, lb_max, pab, o1, o2, ra, rb, rp, zetp, eps, prefactor, cutoff, epsabs)
computes the radius of the Gaussian outside of which it is smaller than eps
Definition ao_util.F:209
subroutine, public get_gto_basis_set(gto_basis_set, name, aliases, norm_type, kind_radius, ncgf, nset, nsgf, cgf_symbol, sgf_symbol, norm_cgf, set_radius, lmax, lmin, lx, ly, lz, m, ncgf_set, npgf, nsgf_set, nshell, cphi, pgf_radius, sphi, scon, zet, first_cgf, first_sgf, l, last_cgf, last_sgf, n, gcc, maxco, maxl, maxpgf, maxsgf_set, maxshell, maxso, nco_sum, npgf_sum, nshell_sum, maxder, short_kind_radius)
...
Handles all functions related to the CELL.
Definition cell_types.F:15
Defines control structures, which contain the parameters and the settings for the DFT-based calculati...
for a given dr()/dh(r) this will provide the bounds to be used if one wants to go over a sphere-subre...
Definition cube_utils.F:18
subroutine, public compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
unifies the computation of the cube center, so that differences in implementation,...
Definition cube_utils.F:68
subroutine, public return_cube(info, radius, lb_cube, ub_cube, sphere_bounds)
...
Definition cube_utils.F:140
subroutine, public return_cube_nonortho(info, radius, lb, ub, rp)
...
Definition cube_utils.F:91
integer function, public gaussian_gridlevel(gridlevel_info, exponent)
...
Fortran API for the grid package, which is written in C.
Definition grid_api.F:12
subroutine, public grid_create_task_list(ntasks, natoms, nkinds, nblocks, block_offsets, atom_positions, atom_kinds, basis_sets, level_list, iatom_list, jatom_list, iset_list, jset_list, ipgf_list, jpgf_list, border_mask_list, block_num_list, radius_list, rab_list, rs_grids, task_list)
Allocates a task list which can be passed to grid_collocate_task_list.
Definition grid_api.F:726
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
integer, parameter, public default_string_length
Definition kinds.F:57
Types and basic routines needed for a kpoint calculation.
subroutine, public get_kpoint_info(kpoint, kp_scheme, nkp_grid, kp_shift, symmetry, verbose, full_grid, use_real_wfn, eps_geo, parallel_group_size, kp_range, nkp, xkp, wkp, para_env, blacs_env_all, para_env_kp, para_env_inter_kp, blacs_env, kp_env, kp_aux_env, mpools, iogrp, nkp_groups, kp_dist, cell_to_index, index_to_cell, sab_nl, sab_nl_nosym)
Retrieve information from a kpoint environment.
An array-based list which grows on demand. When the internal array is full, a new array of twice the ...
Definition list.F:24
Utility routines for the memory handling.
Interface to the message passing library MPI.
Fortran API for the offload package, which is written in C.
Definition offload_api.F:12
subroutine, public offload_create_buffer(length, buffer)
Allocates a buffer of given length, ie. number of elements.
Define methods related to particle_type.
subroutine, public get_particle_set(particle_set, qs_kind_set, first_sgf, last_sgf, nsgf, nmao, basis)
Get the components of a particle set.
Define the data structure for the particle information.
container for various plainwaves related things
subroutine, public pw_env_get(pw_env, pw_pools, cube_info, gridlevel_info, auxbas_pw_pool, auxbas_grid, auxbas_rs_desc, auxbas_rs_grid, rs_descs, rs_grids, xc_pw_pool, vdw_pw_pool, poisson_env, interp_section)
returns the various attributes of the pw env
Define the quickstep kind type and their sub types.
subroutine, public get_qs_kind(qs_kind, basis_set, basis_type, ncgf, nsgf, all_potential, tnadd_potential, gth_potential, sgp_potential, upf_potential, se_parameter, dftb_parameter, xtb_parameter, dftb3_param, zeff, elec_conf, mao, lmax_dftb, alpha_core_charge, ccore_charge, core_charge, core_charge_radius, paw_proj_set, paw_atom, hard_radius, hard0_radius, max_rad_local, covalent_radius, vdw_radius, gpw_r3d_rs_type_forced, harmonics, max_iso_not0, max_s_harm, grid_atom, ngrid_ang, ngrid_rad, lmax_rho0, dft_plus_u_atom, l_of_dft_plus_u, n_of_dft_plus_u, u_minus_j, u_of_dft_plus_u, j_of_dft_plus_u, alpha_of_dft_plus_u, beta_of_dft_plus_u, j0_of_dft_plus_u, occupation_of_dft_plus_u, dispersion, bs_occupation, magnetization, no_optimize, addel, laddel, naddel, orbitals, max_scf, eps_scf, smear, u_ramping, u_minus_j_target, eps_u_ramping, init_u_ramping_each_scf, reltmat, ghost, floating, name, element_symbol, pao_basis_size, pao_potentials, pao_descriptors, nelec)
Get attributes of an atomic kind.
subroutine, public get_ks_env(ks_env, v_hartree_rspace, s_mstruct_changed, rho_changed, potential_changed, forces_up_to_date, complex_ks, matrix_h, matrix_h_im, matrix_ks, matrix_ks_im, matrix_vxc, kinetic, matrix_s, matrix_s_ri_aux, matrix_w, matrix_p_mp2, matrix_p_mp2_admm, matrix_h_kp, matrix_h_im_kp, matrix_ks_kp, matrix_vxc_kp, kinetic_kp, matrix_s_kp, matrix_w_kp, matrix_s_ri_aux_kp, matrix_ks_im_kp, rho, rho_xc, vppl, rho_core, rho_nlcc, rho_nlcc_g, vee, neighbor_list_id, sab_orb, sab_all, sac_ae, sac_ppl, sac_lri, sap_ppnl, sap_oce, sab_lrc, sab_se, sab_xtbe, sab_tbe, sab_core, sab_xb, sab_xtb_nonbond, sab_vdw, sab_scp, sab_almo, sab_kp, sab_kp_nosym, task_list, task_list_soft, kpoints, do_kpoints, atomic_kind_set, qs_kind_set, cell, cell_ref, use_ref_cell, particle_set, energy, force, local_particles, local_molecules, molecule_kind_set, molecule_set, subsys, cp_subsys, virial, results, atprop, nkind, natom, dft_control, dbcsr_dist, distribution_2d, pw_env, para_env, blacs_env, nelectron_total, nelectron_spin)
...
Define the neighbor list data types and the corresponding functionality.
subroutine, public rs_grid_create(rs, desc)
...
pure integer function, public rs_grid_locate_rank(rs_desc, rank_in, shift)
returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in only possible if...
pure subroutine, public rs_grid_reorder_ranks(desc, real2virtual)
Defines a new ordering of ranks on this realspace grid, recalculating the data bounds and reallocatin...
subroutine, public rs_grid_release(rs_grid)
releases the given rs grid (see doc/ReferenceCounting.html)
generate the tasks lists used by collocate and integrate routines
subroutine, public rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
subroutine, public generate_qs_task_list(ks_env, task_list, reorder_rs_grid_ranks, skip_load_balance_distributed, soft_valid, basis_type, pw_env_external, sab_orb_external)
...
subroutine, public rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
subroutine, public rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, nimages, scatter, hmats)
redistributes the matrix so that it can be used in realspace operations i.e. according to the task li...
subroutine, public distribute_tasks(rs_descs, ntasks, natoms, tasks, atom_pair_send, atom_pair_recv, symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
Assembles tasks to be performed on local grid.
subroutine, public rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
subroutine, public task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, cube_info, gridlevel_info, cindex, iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
...
subroutine, public rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
types for task lists
subroutine, public serialize_task(task, serialized_task)
Serialize a task into an integer array. Used for MPI communication.
subroutine, public deserialize_task(task, serialized_task)
De-serialize a task from an integer array. Used for MPI communication.
subroutine, public reallocate_tasks(tasks, new_size)
Grow an array of tasks while preserving the existing entries.
integer, parameter, public task_size_in_int8
All kind of helpful little routines.
Definition util.F:14
Type defining parameters related to the simulation cell.
Definition cell_types.F:55
Contains information about kpoints.
contained for different pw related things
Provides all information about a quickstep kind.
calculation environment to calculate the ks matrix, holds all the needed vars. assumes that the core ...