(git:ed6f26b)
Loading...
Searching...
No Matches
hfx_load_balance_methods.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Routines for optimizing load balance between processes in HFX calculations
10!> \par History
11!> 04.2008 created [Manuel Guidon]
12!> \author Manuel Guidon
13! **************************************************************************************************
15 USE cell_types, ONLY: cell_type
16 USE cp_files, ONLY: close_file, &
21 USE hfx_types, ONLY: &
27 USE kinds, ONLY: dp, &
28 int_8
30 USE parallel_rng_types, ONLY: uniform, &
33 USE util, ONLY: sort
34#include "./base/base_uses.f90"
35
36 IMPLICIT NONE
37 PRIVATE
38
39 PUBLIC :: hfx_load_balance, &
42
43 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
44
45 REAL(kind=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
46 -1.91570128356921242e-002_dp, -1.6668495454436603_dp, &
47 1.7512639006523709_dp, -9.76074323945336081e-002_dp, &
48 2.6230786127311889_dp, -0.31870737623014189_dp, &
49 7.9588203912690973_dp, 1.8331423413134813_dp, &
50 -0.15427618665346299_dp, 0.19749436090711650_dp/)
51 REAL(kind=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
52 -9.36564055598656797e-002_dp, 0.64284973765086939_dp, &
53 1.0137565430060556_dp, -6.80088178288954567e-003_dp, &
54 1.1692629207374552_dp, -2.6314710080507573_dp, &
55 19.237814781880786_dp, 1.0505934173661349_dp, &
56 0.80382371955699250_dp, 0.49903401991818103_dp/)
57 REAL(kind=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350e-002_dp, 0.38073304105744837_dp/)
58 REAL(kind=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
59 -9.41673106447732111e-002_dp, 0.94568006899317825_dp, &
60 -1.4511897117448544_dp, 0.59178934677316952_dp, &
61 2.7291149361757236_dp, -0.50555512044800210_dp, &
62 8.3508180969609871_dp, 1.6829982496141809_dp, &
63 -0.74895370472152600_dp, 0.43801726744197500_dp/)
64 REAL(kind=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
65 5.33216585432061581e-003_dp, 0.45572145697283628_dp, &
66 1.8119743851500618_dp, -0.12533918548421166_dp, &
67 -1.4040312084552751_dp, -4.5331650463917859_dp, &
68 12.593431549069477_dp, 1.1311978374487595_dp, &
69 1.4245996087624646_dp, 1.1425350529853495_dp/)
70 REAL(kind=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)
71
72!***
73
74CONTAINS
75
76! **************************************************************************************************
77!> \brief Distributes the computation of eri's to all available processes.
78!> \param x_data Object that stores the indices array
79!> \param eps_schwarz screening parameter
80!> \param particle_set , atomic_kind_set, para_env ...
81!> \param max_set Maximum number of set to be considered
82!> \param para_env para_env
83!> \param coeffs_set screening functions
84!> \param coeffs_kind screening functions
85!> \param is_assoc_atomic_block_global KS-matrix sparsity
86!> \param do_periodic flag for periodicity
87!> \param load_balance_parameter Parameters for Monte-Carlo routines
88!> \param kind_of helper array for mapping
89!> \param basis_parameter Basis set parameters
90!> \param pmax_set Initial screening matrix
91!> \param pmax_atom ...
92!> \param i_thread Process ID of current Thread
93!> \param n_threads Total Number of Threads
94!> \param cell cell
95!> \param do_p_screening Flag for initial p screening
96!> \param map_atom_to_kind_atom ...
97!> \param nkind ...
98!> \param eval_type ...
99!> \param pmax_block ...
100!> \param use_virial ...
101!> \par History
102!> 06.2007 created [Manuel Guidon]
103!> 08.2007 new parallel scheme [Manuel Guidon]
104!> 09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
105!> 11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
106!> 02.2009 completely refactored [Manuel Guidon]
107!> \author Manuel Guidon
108!> \note
109!> The optimization is done via a binning procedure followed by simple
110!> Monte Carlo procedure:
111!> In a first step the total amount of integrals in the system is calculated,
112!> taking into account the sparsity of the KS-matrix , the screening based
113!> on near/farfield approximations and if desired the screening on an initial
114!> density matrix.
115!> In a second step, bins are generate that contain approximately the same number
116!> of integrals, and a cost for these bins is estimated (currently the number of integrals)
117!> In a third step, a Monte Carlo procedure optimizes the assignment
118!> of the different loads to each process
119!> At the end each process owns an unique array of *atomic* indices-ranges
120!> that are used to decide whether a process has to calculate a certain
121!> bunch of integrals or not
122! **************************************************************************************************
123 SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
124 coeffs_set, coeffs_kind, &
125 is_assoc_atomic_block_global, do_periodic, &
126 load_balance_parameter, kind_of, basis_parameter, pmax_set, &
127 pmax_atom, i_thread, n_threads, cell, &
128 do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
129 pmax_block, use_virial)
130 TYPE(hfx_type), POINTER :: x_data
131 REAL(dp), INTENT(IN) :: eps_schwarz
132 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
133 INTEGER, INTENT(IN) :: max_set
134 TYPE(mp_para_env_type), POINTER :: para_env
135 TYPE(hfx_screen_coeff_type), &
136 DIMENSION(:, :, :, :), POINTER :: coeffs_set
137 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
138 POINTER :: coeffs_kind
139 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
140 LOGICAL :: do_periodic
141 TYPE(hfx_load_balance_type), POINTER :: load_balance_parameter
142 INTEGER :: kind_of(*)
143 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
144 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
145 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
146 INTEGER, INTENT(IN) :: i_thread, n_threads
147 TYPE(cell_type), POINTER :: cell
148 LOGICAL, INTENT(IN) :: do_p_screening
149 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
150 INTEGER, INTENT(IN) :: nkind, eval_type
151 REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
152 LOGICAL, INTENT(IN) :: use_virial
153
154 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_load_balance'
155
156 CHARACTER(LEN=512) :: error_msg
157 INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
158 handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
159 jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
160 latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
161 new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
162 objective_block_size, objective_nblocks, source, total_blocks
163 TYPE(mp_request_type), DIMENSION(2) :: req
164 INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
165 distribution_counter_end, distribution_counter_start, global_quartet_counter, &
166 local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
167 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer_in, buffer_out
168 INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
169 sendbuffer, swapbuffer
170 INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
171 INTEGER(int_8), SAVE :: shm_global_quartet_counter, &
172 shm_local_quartet_counter
173 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, tmp_index, tmp_pos, &
174 to_be_sorted
175 INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
176 INTEGER, SAVE :: shm_nblocks
177 LOGICAL :: changed, last_bin_needs_to_be_filled, &
178 optimized
179 LOGICAL, DIMENSION(:, :), POINTER, SAVE :: atomic_pair_list
180 REAL(dp) :: coeffs_kind_max0, log10_eps_schwarz, &
181 log_2, pmax_blocks
182 TYPE(hfx_block_range_type), DIMENSION(:), POINTER :: blocks_guess, tmp_blocks, tmp_blocks2
183 TYPE(hfx_block_range_type), DIMENSION(:), &
184 POINTER, SAVE :: shm_blocks
185 TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
186 TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
187 SAVE :: full_dist
188 TYPE(pair_list_type) :: list_ij, list_kl
189 TYPE(pair_set_list_type), ALLOCATABLE, &
190 DIMENSION(:) :: set_list_ij, set_list_kl
191
192!$OMP BARRIER
193!$OMP MASTER
194 CALL timeset(routinen, handle)
195!$OMP END MASTER
196!$OMP BARRIER
197
198 log10_eps_schwarz = log10(eps_schwarz)
199 log_2 = log10(2.0_dp)
200 coeffs_kind_max0 = maxval(coeffs_kind(:, :)%x(2))
201 ncpu = para_env%num_pe
202 n_processes = ncpu*n_threads
203 natom = SIZE(particle_set)
204
205 block_size = load_balance_parameter%block_size
206 ALLOCATE (set_list_ij((max_set*block_size)**2))
207 ALLOCATE (set_list_kl((max_set*block_size)**2))
208
209 IF (.NOT. load_balance_parameter%blocks_initialized) THEN
210!$OMP BARRIER
211!$OMP MASTER
212 CALL timeset(routinen//"_range", handle_range)
213
214 nblocks = max((natom + block_size - 1)/block_size, 1)
215 ALLOCATE (blocks_guess(nblocks))
216 ALLOCATE (tmp_blocks(natom))
217 ALLOCATE (tmp_blocks2(natom))
218
219 pmax_blocks = 0.0_dp
220 SELECT CASE (eval_type)
221 CASE (hfx_do_eval_energy)
222 atomic_pair_list => x_data%atomic_pair_list
223 CASE (hfx_do_eval_forces)
224 atomic_pair_list => x_data%atomic_pair_list_forces
225 END SELECT
226 atomic_pair_list = .true.
227 CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
228 list_ij, list_kl, set_list_ij, set_list_kl, &
229 particle_set, &
230 coeffs_set, coeffs_kind, &
231 is_assoc_atomic_block_global, do_periodic, &
232 kind_of, basis_parameter, pmax_set, pmax_atom, &
233 pmax_blocks, cell, &
234 do_p_screening, map_atom_to_kind_atom, eval_type, &
235 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
236
237 total_block_self_cost = 0
238
239 DO i = 1, nblocks
240 total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
241 END DO
242
243 CALL para_env%sum(total_block_self_cost)
244
245 objective_block_size = load_balance_parameter%block_size
246 objective_nblocks = max((natom + objective_block_size - 1)/objective_block_size, 1)
247
248 self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
249
250 DO i = 1, nblocks
251 tmp_blocks2(i) = blocks_guess(i)
252 END DO
253
254 optimized = .false.
255 i = 0
256 DO WHILE (.NOT. optimized)
257 i = i + 1
258 current_block_id = 0
259 changed = .false.
260 DO atom_block = 1, nblocks
261 current_block_id = current_block_id + 1
262 iatom_start = tmp_blocks2(atom_block)%istart
263 iatom_end = tmp_blocks2(atom_block)%iend
264 IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
265 changed = .true.
266 new_iatom_start = iatom_start
267 new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
268 new_jatom_start = new_iatom_end + 1
269 new_jatom_end = iatom_end
270 tmp_blocks(current_block_id)%istart = new_iatom_start
271 tmp_blocks(current_block_id)%iend = new_iatom_end
272 tmp_blocks(current_block_id)%cost = estimate_block_cost( &
273 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
274 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
275 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
276 particle_set, &
277 coeffs_set, coeffs_kind, &
278 is_assoc_atomic_block_global, do_periodic, &
279 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
280 cell, &
281 do_p_screening, map_atom_to_kind_atom, eval_type, &
282 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
283 current_block_id = current_block_id + 1
284 tmp_blocks(current_block_id)%istart = new_jatom_start
285 tmp_blocks(current_block_id)%iend = new_jatom_end
286 tmp_blocks(current_block_id)%cost = estimate_block_cost( &
287 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
288 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
289 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
290 particle_set, &
291 coeffs_set, coeffs_kind, &
292 is_assoc_atomic_block_global, do_periodic, &
293 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
294 cell, &
295 do_p_screening, map_atom_to_kind_atom, eval_type, &
296 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
297 ELSE
298 tmp_blocks(current_block_id)%istart = iatom_start
299 tmp_blocks(current_block_id)%iend = iatom_end
300 tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
301 END IF
302 END DO
303 IF (.NOT. changed) optimized = .true.
304 IF (i > 20) optimized = .true.
305 nblocks = current_block_id
306 DO atom_block = 1, nblocks
307 tmp_blocks2(atom_block) = tmp_blocks(atom_block)
308 END DO
309 END DO
310
311 DEALLOCATE (tmp_blocks2)
312
313 ! ** count number of non empty blocks on each node
314 non_empty_blocks = 0
315 DO atom_block = 1, nblocks
316 IF (tmp_blocks(atom_block)%istart == 0) cycle
317 non_empty_blocks = non_empty_blocks + 1
318 END DO
319
320 ALLOCATE (rcount(ncpu))
321 rcount = 0
322 rcount(para_env%mepos + 1) = non_empty_blocks
323 CALL para_env%sum(rcount)
324
325 ! ** sum all non_empty_blocks
326 total_blocks = 0
327 DO i = 1, ncpu
328 total_blocks = total_blocks + rcount(i)
329 END DO
330
331 ! ** calculate offsets
332 ALLOCATE (rdispl(ncpu))
333 rcount(:) = rcount(:)*3
334 rdispl(1) = 0
335 DO i = 2, ncpu
336 rdispl(i) = rdispl(i - 1) + rcount(i - 1)
337 END DO
338
339 ALLOCATE (buffer_in(3*non_empty_blocks))
340
341 non_empty_blocks = 0
342 DO atom_block = 1, nblocks
343 IF (tmp_blocks(atom_block)%istart == 0) cycle
344 buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
345 buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
346 buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
347 non_empty_blocks = non_empty_blocks + 1
348 END DO
349
350 nblocks = total_blocks
351
352 ALLOCATE (tmp_blocks2(nblocks))
353
354 ALLOCATE (buffer_out(3*nblocks))
355
356 ! ** Gather all three arrays
357 CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
358
359 DO i = 1, nblocks
360 tmp_blocks2(i)%istart = int(buffer_out((i - 1)*3 + 1))
361 tmp_blocks2(i)%iend = int(buffer_out((i - 1)*3 + 2))
362 tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
363 END DO
364
365 ! ** Now we sort the blocks
366 ALLOCATE (to_be_sorted(nblocks))
367 ALLOCATE (tmp_index(nblocks))
368
369 DO atom_block = 1, nblocks
370 to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
371 END DO
372
373 CALL sort(to_be_sorted, nblocks, tmp_index)
374
375 ALLOCATE (x_data%blocks(nblocks))
376
377 DO atom_block = 1, nblocks
378 x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
379 END DO
380
381 shm_blocks => x_data%blocks
382 shm_nblocks = nblocks
383
384 ! ** Set nblocks in structure
385 load_balance_parameter%nblocks = nblocks
386
387 DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
388
389 DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
390
391 load_balance_parameter%blocks_initialized = .true.
392
393 x_data%blocks = shm_blocks
394 load_balance_parameter%nblocks = shm_nblocks
395 load_balance_parameter%blocks_initialized = .true.
396
397 ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
398 x_data%pmax_block = 0.0_dp
399 pmax_block => x_data%pmax_block
400 CALL timestop(handle_range)
401!$OMP END MASTER
402!$OMP BARRIER
403
404 IF (.NOT. load_balance_parameter%blocks_initialized) THEN
405 ALLOCATE (x_data%blocks(shm_nblocks))
406 x_data%blocks = shm_blocks
407 load_balance_parameter%nblocks = shm_nblocks
408 load_balance_parameter%blocks_initialized = .true.
409 END IF
410 !! ** precalculate maximum density matrix elements in blocks
411!$OMP BARRIER
412 END IF
413
414!$OMP BARRIER
415!$OMP MASTER
416 pmax_block => x_data%pmax_block
417 pmax_block = 0.0_dp
418 IF (do_p_screening) THEN
419 DO iatom_block = 1, shm_nblocks
420 iatom_start = x_data%blocks(iatom_block)%istart
421 iatom_end = x_data%blocks(iatom_block)%iend
422 DO jatom_block = 1, shm_nblocks
423 jatom_start = x_data%blocks(jatom_block)%istart
424 jatom_end = x_data%blocks(jatom_block)%iend
425 pmax_block(iatom_block, jatom_block) = maxval(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
426 END DO
427 END DO
428 END IF
429
430 SELECT CASE (eval_type)
431 CASE (hfx_do_eval_energy)
432 atomic_pair_list => x_data%atomic_pair_list
433 CASE (hfx_do_eval_forces)
434 atomic_pair_list => x_data%atomic_pair_list_forces
435 END SELECT
436 CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
437 do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
438 x_data%blocks)
439
440!$OMP END MASTER
441!$OMP BARRIER
442
443 !! If there is only 1 cpu skip the binning
444 IF (n_processes == 1) THEN
445 ALLOCATE (tmp_dist(1))
446 tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
447 tmp_dist(1)%istart = 0_int_8
448 ptr_to_tmp_dist => tmp_dist(:)
449 SELECT CASE (eval_type)
450 CASE (hfx_do_eval_energy)
451 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
452 CASE (hfx_do_eval_forces)
453 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
454 END SELECT
455 DEALLOCATE (tmp_dist)
456 ELSE
457 !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
458!$OMP BARRIER
459!$OMP MASTER
460 CALL timeset(routinen//"_count", handle_inner)
461!$OMP END MASTER
462!$OMP BARRIER
463
464 cost_per_core = 0_int_8
465 my_process_id = para_env%mepos*n_threads + i_thread
466 nblocks = load_balance_parameter%nblocks
467
468 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
469
470 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
471 tmp_block = atom_block/nblocks
472 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
473 IF (latom_block < katom_block) cycle
474 tmp_block = tmp_block/nblocks
475 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
476 tmp_block = tmp_block/nblocks
477 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
478 IF (jatom_block < iatom_block) cycle
479
480 iatom_start = x_data%blocks(iatom_block)%istart
481 iatom_end = x_data%blocks(iatom_block)%iend
482 jatom_start = x_data%blocks(jatom_block)%istart
483 jatom_end = x_data%blocks(jatom_block)%iend
484 katom_start = x_data%blocks(katom_block)%istart
485 katom_end = x_data%blocks(katom_block)%iend
486 latom_start = x_data%blocks(latom_block)%istart
487 latom_end = x_data%blocks(latom_block)%iend
488
489 SELECT CASE (eval_type)
490 CASE (hfx_do_eval_energy)
491 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
492 pmax_block(latom_block, jatom_block), &
493 pmax_block(latom_block, iatom_block), &
494 pmax_block(katom_block, jatom_block))
495 CASE (hfx_do_eval_forces)
496 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
497 pmax_block(latom_block, jatom_block), &
498 pmax_block(latom_block, iatom_block) + &
499 pmax_block(katom_block, jatom_block))
500 END SELECT
501
502 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
503
504 cost_per_core = cost_per_core &
505 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
506 iatom_start, iatom_end, jatom_start, jatom_end, &
507 katom_start, katom_end, latom_start, latom_end, &
508 particle_set, &
509 coeffs_set, coeffs_kind, &
510 is_assoc_atomic_block_global, do_periodic, &
511 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
512 cell, &
513 do_p_screening, map_atom_to_kind_atom, eval_type, &
514 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
515
516 END DO ! atom_block
517
518 nbins = load_balance_parameter%nbins
519 cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
520
521!$OMP BARRIER
522!$OMP MASTER
523 CALL timestop(handle_inner)
524!$OMP END MASTER
525!$OMP BARRIER
526
527! new load balancing test
528 IF (.false.) THEN
529 CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
530 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
531 particle_set, &
532 coeffs_set, coeffs_kind, &
533 is_assoc_atomic_block_global, do_periodic, &
534 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
535 cell, x_data, para_env, pmax_block, &
536 do_p_screening, map_atom_to_kind_atom, eval_type, &
537 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
538 END IF
539
540!$OMP BARRIER
541!$OMP MASTER
542 CALL timeset(routinen//"_bin", handle_inner)
543!$OMP END MASTER
544!$OMP BARRIER
545
546 ALLOCATE (binned_dist(nbins))
547 binned_dist(:)%istart = -1_int_8
548 binned_dist(:)%number_of_atom_quartets = 0_int_8
549 binned_dist(:)%cost = 0_int_8
550 binned_dist(:)%time_first_scf = 0.0_dp
551 binned_dist(:)%time_other_scf = 0.0_dp
552 binned_dist(:)%time_forces = 0.0_dp
553
554 current_cost = 0
555 mepos = 1
556 distribution_counter_start = 1
557 distribution_counter_end = 0
558 ibin = 1
559
560 global_quartet_counter = 0
561 local_quartet_counter = 0
562 last_bin_needs_to_be_filled = .false.
563 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
564 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
565 tmp_block = atom_block/nblocks
566 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
567 IF (latom_block < katom_block) cycle
568 tmp_block = tmp_block/nblocks
569 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
570 tmp_block = tmp_block/nblocks
571 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
572 IF (jatom_block < iatom_block) cycle
573
574 distribution_counter_end = distribution_counter_end + 1
575 global_quartet_counter = global_quartet_counter + 1
576 last_bin_needs_to_be_filled = .true.
577
578 IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
579
580 iatom_start = x_data%blocks(iatom_block)%istart
581 iatom_end = x_data%blocks(iatom_block)%iend
582 jatom_start = x_data%blocks(jatom_block)%istart
583 jatom_end = x_data%blocks(jatom_block)%iend
584 katom_start = x_data%blocks(katom_block)%istart
585 katom_end = x_data%blocks(katom_block)%iend
586 latom_start = x_data%blocks(latom_block)%istart
587 latom_end = x_data%blocks(latom_block)%iend
588
589 SELECT CASE (eval_type)
590 CASE (hfx_do_eval_energy)
591 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
592 pmax_block(latom_block, jatom_block), &
593 pmax_block(latom_block, iatom_block), &
594 pmax_block(katom_block, jatom_block))
595 CASE (hfx_do_eval_forces)
596 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
597 pmax_block(latom_block, jatom_block), &
598 pmax_block(latom_block, iatom_block) + &
599 pmax_block(katom_block, jatom_block))
600 END SELECT
601
602 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
603
604 current_cost = current_cost &
605 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
606 iatom_start, iatom_end, jatom_start, jatom_end, &
607 katom_start, katom_end, latom_start, latom_end, &
608 particle_set, &
609 coeffs_set, coeffs_kind, &
610 is_assoc_atomic_block_global, do_periodic, &
611 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
612 cell, &
613 do_p_screening, map_atom_to_kind_atom, eval_type, &
614 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
615
616 IF (current_cost >= cost_per_bin) THEN
617 IF (ibin == nbins) THEN
618 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
619 distribution_counter_end - distribution_counter_start + 1
620 ELSE
621 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
622 END IF
623 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
624 ibin = min(ibin + 1, nbins)
625 distribution_counter_start = distribution_counter_end + 1
626 current_cost = 0
627 last_bin_needs_to_be_filled = .false.
628 END IF
629 END DO
630
631!$OMP BARRIER
632!$OMP MASTER
633 CALL timestop(handle_inner)
634 CALL timeset(routinen//"_dist", handle_inner)
635!$OMP END MASTER
636!$OMP BARRIER
637 !! Fill the last bin if necessary
638 IF (last_bin_needs_to_be_filled) THEN
639 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
640 IF (ibin == nbins) THEN
641 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
642 distribution_counter_end - distribution_counter_start + 1
643 ELSE
644 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
645 END IF
646 END IF
647
648 !! Sanity-Check
649 DO ibin = 1, nbins
650 local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
651 END DO
652!$OMP BARRIER
653!$OMP MASTER
654 shm_local_quartet_counter = 0
655 shm_global_quartet_counter = 0
656!$OMP END MASTER
657!$OMP BARRIER
658!$OMP ATOMIC
659 shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
660!$OMP ATOMIC
661 shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
662
663!$OMP BARRIER
664!$OMP MASTER
665 CALL para_env%sum(shm_local_quartet_counter)
666 CALL para_env%sum(shm_global_quartet_counter)
667 IF (para_env%is_source()) THEN
668 IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
669 WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
670 "Number of local quartets (", shm_local_quartet_counter, &
671 ") and number of global quartets (", shm_global_quartet_counter, &
672 ") are different. Please send in a bug report."
673 cpabort(error_msg)
674 END IF
675 END IF
676!$OMP END MASTER
677
678!$OMP BARRIER
679!$OMP MASTER
680 ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
681 cost_matrix = 0
682!$OMP END MASTER
683!$OMP BARRIER
684 icpu = para_env%mepos + 1
685 DO i = 1, nbins
686 cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
687 END DO
688 mepos = para_env%mepos
689!$OMP BARRIER
690
691!$OMP MASTER
692 ! sync before/after ring of isendrecv
693 CALL para_env%sync()
694
695 ALLOCATE (sendbuffer(nbins*n_threads))
696 ALLOCATE (recbuffer(nbins*n_threads))
697
698 sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
699
700 dest = modulo(mepos + 1, ncpu)
701 source = modulo(mepos - 1, ncpu)
702 DO icpu = 0, ncpu - 1
703 IF (icpu .NE. ncpu - 1) THEN
704 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
705 req(1), req(2), 13)
706 END IF
707 data_from = modulo(mepos - icpu, ncpu)
708 cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
709 IF (icpu .NE. ncpu - 1) THEN
710 CALL mp_waitall(req)
711 END IF
712 swapbuffer => sendbuffer
713 sendbuffer => recbuffer
714 recbuffer => swapbuffer
715 END DO
716 DEALLOCATE (recbuffer, sendbuffer)
717!$OMP END MASTER
718!$OMP BARRIER
719
720!$OMP BARRIER
721!$OMP MASTER
722 CALL timestop(handle_inner)
723 CALL timeset(routinen//"_opt", handle_inner)
724!$OMP END MASTER
725!$OMP BARRIER
726
727 !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
728!$OMP BARRIER
729 ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
730 local_cost_matrix = cost_matrix
731!$OMP MASTER
732 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
733
734 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
735 shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
736
737 CALL timestop(handle_inner)
738 CALL timeset(routinen//"_redist", handle_inner)
739 !! Collect local data to global array
740 ALLOCATE (full_dist(ncpu*n_threads, nbins))
741
742 full_dist(:, :)%istart = 0_int_8
743 full_dist(:, :)%number_of_atom_quartets = 0_int_8
744 full_dist(:, :)%cost = 0_int_8
745 full_dist(:, :)%time_first_scf = 0.0_dp
746 full_dist(:, :)%time_other_scf = 0.0_dp
747 full_dist(:, :)%time_forces = 0.0_dp
748!$OMP END MASTER
749!$OMP BARRIER
750 mepos = para_env%mepos + 1
751 full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
752
753!$OMP BARRIER
754!$OMP MASTER
755 ALLOCATE (sendbuffer(3*nbins*n_threads))
756 ALLOCATE (recbuffer(3*nbins*n_threads))
757 mepos = para_env%mepos
758 DO j = 1, n_threads
759 DO i = 1, nbins
760 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
761 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
762 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
763 END DO
764 END DO
765
766 ! sync before/after ring of isendrecv
767 CALL para_env%sync()
768 dest = modulo(mepos + 1, ncpu)
769 source = modulo(mepos - 1, ncpu)
770 DO icpu = 0, ncpu - 1
771 IF (icpu .NE. ncpu - 1) THEN
772 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
773 req(1), req(2), 13)
774 END IF
775 data_from = modulo(mepos - icpu, ncpu)
776 DO j = 1, n_threads
777 DO i = 1, nbins
778 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
779 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
780 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
781 END DO
782 END DO
783
784 IF (icpu .NE. ncpu - 1) THEN
785 CALL mp_waitall(req)
786 END IF
787 swapbuffer => sendbuffer
788 sendbuffer => recbuffer
789 recbuffer => swapbuffer
790 END DO
791 DEALLOCATE (recbuffer, sendbuffer)
792
793 ! sync before/after ring of isendrecv
794 CALL para_env%sync()
795!$OMP END MASTER
796!$OMP BARRIER
797 !! reorder the distribution according to the distribution vector
798 ALLOCATE (tmp_pos(ncpu*n_threads))
799 tmp_pos = 1
800 ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
801
802 tmp_dist(:)%istart = 0_int_8
803 tmp_dist(:)%number_of_atom_quartets = 0_int_8
804 tmp_dist(:)%cost = 0_int_8
805 tmp_dist(:)%time_first_scf = 0.0_dp
806 tmp_dist(:)%time_other_scf = 0.0_dp
807 tmp_dist(:)%time_forces = 0.0_dp
808
809 DO icpu = 1, n_processes
810 DO i = 1, nbins
811 mepos = my_process_id + 1
812 IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
813 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
814 tmp_pos(mepos) = tmp_pos(mepos) + 1
815 END IF
816 END DO
817 END DO
818
819 !! Assign the load to each process
820 NULLIFY (ptr_to_tmp_dist)
821 mepos = my_process_id + 1
822 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
823 SELECT CASE (eval_type)
824 CASE (hfx_do_eval_energy)
825 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
826 CASE (hfx_do_eval_forces)
827 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
828 END SELECT
829
830!$OMP BARRIER
831!$OMP MASTER
832 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
833!$OMP END MASTER
834!$OMP BARRIER
835 DEALLOCATE (tmp_dist, tmp_pos)
836 DEALLOCATE (binned_dist, local_cost_matrix)
837 DEALLOCATE (set_list_ij, set_list_kl)
838
839!$OMP BARRIER
840!$OMP MASTER
841 CALL timestop(handle_inner)
842!$OMP END MASTER
843!$OMP BARRIER
844 END IF
845!$OMP BARRIER
846!$OMP MASTER
847 CALL timestop(handle)
848!$OMP END MASTER
849!$OMP BARRIER
850 END SUBROUTINE hfx_load_balance
851
852! **************************************************************************************************
853!> \brief Reference implementation of new recursive load balancing routine
854!> Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
855!> each process in a P-Q grid such that every process has more or less the
856!> same amount of work. Has no output at the moment (not used) but writes
857!> its computed load balance values into a file. Possible output is ready
858!> to use in the two arrays p_atom_blocks & q_atom_blocks
859!> \param n_processes ...
860!> \param my_process_id ...
861!> \param nblocks ...
862!> \param natom ...
863!> \param nkind ...
864!> \param list_ij ...
865!> \param list_kl ...
866!> \param set_list_ij ...
867!> \param set_list_kl ...
868!> \param particle_set ...
869!> \param coeffs_set ...
870!> \param coeffs_kind ...
871!> \param is_assoc_atomic_block_global ...
872!> \param do_periodic ...
873!> \param kind_of ...
874!> \param basis_parameter ...
875!> \param pmax_set ...
876!> \param pmax_atom ...
877!> \param pmax_blocks ...
878!> \param cell ...
879!> \param x_data ...
880!> \param para_env ...
881!> \param pmax_block ...
882!> \param do_p_screening ...
883!> \param map_atom_to_kind_atom ...
884!> \param eval_type ...
885!> \param log10_eps_schwarz ...
886!> \param log_2 ...
887!> \param coeffs_kind_max0 ...
888!> \param use_virial ...
889!> \param atomic_pair_list ...
890!> \par History
891!> 03.2011 created [Michael Steinlechner]
892!> \author Michael Steinlechner
893! **************************************************************************************************
894
895 SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
896 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
897 particle_set, &
898 coeffs_set, coeffs_kind, &
899 is_assoc_atomic_block_global, do_periodic, &
900 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
901 cell, x_data, para_env, pmax_block, &
902 do_p_screening, map_atom_to_kind_atom, eval_type, &
903 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
904
905! input variables:
906 INTEGER, INTENT(IN) :: n_processes, my_process_id, nblocks, &
907 natom, nkind
908 TYPE(pair_list_type), INTENT(IN) :: list_ij, list_kl
909 TYPE(pair_set_list_type), ALLOCATABLE, &
910 DIMENSION(:), INTENT(IN) :: set_list_ij, set_list_kl
911 TYPE(particle_type), DIMENSION(:), INTENT(IN), &
912 POINTER :: particle_set
913 TYPE(hfx_screen_coeff_type), &
914 DIMENSION(:, :, :, :), INTENT(IN), POINTER :: coeffs_set
915 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
916 INTENT(IN), POINTER :: coeffs_kind
917 INTEGER, DIMENSION(:, :), INTENT(IN) :: is_assoc_atomic_block_global
918 LOGICAL, INTENT(IN) :: do_periodic
919 INTEGER, INTENT(IN) :: kind_of(*)
920 TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
921 POINTER :: basis_parameter
922 TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
923 POINTER :: pmax_set
924 REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_atom
925 REAL(dp) :: pmax_blocks
926 TYPE(cell_type), INTENT(IN), POINTER :: cell
927 TYPE(hfx_type), INTENT(IN), POINTER :: x_data
928 TYPE(mp_para_env_type), INTENT(IN) :: para_env
929 REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_block
930 LOGICAL, INTENT(IN) :: do_p_screening
931 INTEGER, DIMENSION(:), INTENT(IN), POINTER :: map_atom_to_kind_atom
932 INTEGER, INTENT(IN) :: eval_type
933 REAL(dp), INTENT(IN) :: log10_eps_schwarz, log_2, &
934 coeffs_kind_max0
935 LOGICAL, INTENT(IN) :: use_virial
936 LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER :: atomic_pair_list
937
938 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_recursive_load_balance'
939
940 INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
941 jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
942 np, nq, numbins, p, q, sizep, sizeq, unit_nr
943 INTEGER(int_8) :: local_cost, pidx, qidx, sump, sumq
944 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: local_cost_vector
945 INTEGER, ALLOCATABLE, DIMENSION(:) :: blocksize, p_atom_blocks, permute, &
946 q_atom_blocks
947 REAL(dp) :: maximum, mean
948
949! internal variables:
950
951!$OMP BARRIER
952!$OMP MASTER
953 CALL timeset(routinen, handle)
954!$OMP END MASTER
955!$OMP BARRIER
956
957 ! calculate best p/q distribution grid for the n_processes
958 CALL hfx_calculate_pq(p, q, numbins, n_processes)
959
960 ALLOCATE (blocksize(numbins))
961 ALLOCATE (permute(nblocks**2))
962 DO i = 1, nblocks**2
963 permute(i) = i
964 END DO
965
966 ! call the main recursive permutation routine.
967 ! Output:
968 ! blocksize :: vector (size numBins) with the sizes for each column/row block
969 ! permute :: permutation vector
970 CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numbins, &
971 permute, 1, &
972 my_process_id, n_processes, nblocks, &
973 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
974 particle_set, &
975 coeffs_set, coeffs_kind, &
976 is_assoc_atomic_block_global, do_periodic, &
977 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
978 cell, x_data, para_env, pmax_block, &
979 do_p_screening, map_atom_to_kind_atom, eval_type, &
980 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
981
982 ! number of blocks per processor in p-direction (vertical)
983 np = numbins/p
984 ! number of blocks per processor in q-direction (horizontal)
985 nq = numbins/q
986
987 ! calc own position in P-Q-processor grid (PQ-grid is column-major)
988 pidx = modulo(int(my_process_id), int(p)) + 1
989 qidx = my_process_id/p + 1
990
991 sizep = sum(blocksize((np*(pidx - 1) + 1):(np*pidx)))
992 sizeq = sum(blocksize((nq*(qidx - 1) + 1):(nq*qidx)))
993
994 sump = sum(blocksize(1:(np*(pidx - 1))))
995 sumq = sum(blocksize(1:(nq*(qidx - 1))))
996
997 ALLOCATE (p_atom_blocks(sizep))
998 ALLOCATE (q_atom_blocks(sizeq))
999
1000 p_atom_blocks(:) = permute((sump + 1):(sump + sizep))
1001 q_atom_blocks(:) = permute((sumq + 1):(sumq + sizeq))
1002
1003 ! from here on, we are actually finished, each process has been
1004 ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
1005 ! what follows is just a small routine to calculate the local cost
1006 ! for each processor which is then written to a file.
1007
1008 ! calculate local cost for each processor!
1009 ! ****************************************
1010 local_cost = 0
1011 DO i = 1, sizep
1012 DO j = 1, sizeq
1013
1014 ! get corresponding 4D block indices out of our own P-Q-block
1015 latom_block = modulo(q_atom_blocks(j), nblocks)
1016 iatom_block = q_atom_blocks(j)/nblocks + 1
1017 jatom_block = modulo(p_atom_blocks(i), nblocks)
1018 katom_block = p_atom_blocks(i)/nblocks + 1
1019
1020 ! symmetry checks.
1021 IF (latom_block < katom_block) cycle
1022 IF (jatom_block < iatom_block) cycle
1023
1024 iatom_start = x_data%blocks(iatom_block)%istart
1025 iatom_end = x_data%blocks(iatom_block)%iend
1026 jatom_start = x_data%blocks(jatom_block)%istart
1027 jatom_end = x_data%blocks(jatom_block)%iend
1028 katom_start = x_data%blocks(katom_block)%istart
1029 katom_end = x_data%blocks(katom_block)%iend
1030 latom_start = x_data%blocks(latom_block)%istart
1031 latom_end = x_data%blocks(latom_block)%iend
1032
1033 ! whatever.
1034 SELECT CASE (eval_type)
1035 CASE (hfx_do_eval_energy)
1036 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1037 pmax_block(latom_block, jatom_block), &
1038 pmax_block(latom_block, iatom_block), &
1039 pmax_block(katom_block, jatom_block))
1040 CASE (hfx_do_eval_forces)
1041 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1042 pmax_block(latom_block, jatom_block), &
1043 pmax_block(latom_block, iatom_block) + &
1044 pmax_block(katom_block, jatom_block))
1045 END SELECT
1046
1047 ! screening.
1048 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1049
1050 ! estimate the cost of this atom_block.
1051 local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1052 set_list_kl, &
1053 iatom_start, iatom_end, jatom_start, jatom_end, &
1054 katom_start, katom_end, latom_start, latom_end, &
1055 particle_set, &
1056 coeffs_set, coeffs_kind, &
1057 is_assoc_atomic_block_global, do_periodic, &
1058 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1059 cell, &
1060 do_p_screening, map_atom_to_kind_atom, eval_type, &
1061 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1062 END DO
1063 END DO
1064
1065 ALLOCATE (local_cost_vector(n_processes))
1066 local_cost_vector = 0
1067 local_cost_vector(my_process_id + 1) = local_cost
1068 CALL para_env%sum(local_cost_vector)
1069
1070 mean = sum(local_cost_vector)/n_processes
1071 maximum = maxval(local_cost_vector)
1072
1073!$OMP BARRIER
1074!$OMP MASTER
1075 ! only output once
1076 IF (my_process_id == 0) THEN
1077 CALL open_file(unit_number=unit_nr, file_name="loads.dat")
1078 WRITE (unit_nr, *) 'maximum cost:', maximum
1079 WRITE (unit_nr, *) 'mean cost:', mean
1080 WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
1081 WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
1082 DO i = 1, n_processes
1083 WRITE (unit_nr, *) local_cost_vector(i)
1084 END DO
1085 CALL close_file(unit_nr)
1086 END IF
1087!$OMP END MASTER
1088!$OMP BARRIER
1089
1090 DEALLOCATE (local_cost_vector)
1091 DEALLOCATE (p_atom_blocks, q_atom_blocks)
1092 DEALLOCATE (blocksize, permute)
1093
1094!$OMP BARRIER
1095!$OMP MASTER
1096 CALL timestop(handle)
1097!$OMP END MASTER
1098!$OMP BARRIER
1099
1100 END SUBROUTINE hfx_recursive_load_balance
1101
1102! **************************************************************************************************
1103!> \brief Small routine to calculate the optimal P-Q-processor grid distribution
1104!> for a given number of processors N
1105!> and the corresponding number of Bins for the load balancing routine
1106!> \param p number of rows on P-Q process grid (output)
1107!> \param q number of columns on P-Q process grid (output)
1108!> \param nBins number of Bins (output)
1109!> \param N number of processes (input)
1110!> \par History
1111!> 03.2011 created [Michael Steinlechner]
1112!> \author Michael Steinlechner
1113! **************************************************************************************************
1114 SUBROUTINE hfx_calculate_pq(p, q, nBins, N)
1115
1116 INTEGER, INTENT(OUT) :: p, q, nbins
1117 INTEGER, INTENT(IN) :: n
1118
1119 INTEGER :: a, b, k
1120 REAL(dp) :: sqn
1121
1122 k = 2
1123 sqn = sqrt(real(n, kind=dp))
1124 p = 1
1125
1126 DO WHILE (real(k, kind=dp) <= sqn)
1127 IF (modulo(n, k) == 0) THEN
1128 p = k
1129 END IF
1130 k = k + 1
1131 END DO
1132 q = n/p
1133
1134 ! now compute the least common multiple of p & q to get the number of necessary bins
1135 ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
1136 ! and use euclid's algorithm for GCD computation.
1137 a = p
1138 b = q
1139
1140 DO WHILE (b .NE. 0)
1141 IF (a > b) THEN
1142 a = a - b
1143 ELSE
1144 b = b - a
1145 END IF
1146 END DO
1147 ! gcd(p,q) is now saved in a
1148
1149 nbins = p*q/a
1150
1151 END SUBROUTINE
1152
1153! **************************************************************************************************
1154!> \brief Recursive permutation routine for the load balancing of the integral
1155!> computation
1156!> \param blocksize vector of blocksizes, size(nProc), which contains for
1157!> each process the local blocksize (OUTPUT)
1158!> \param blockstart starting row/column idx of the block which is to be examined
1159!> at this point (INPUT)
1160!> \param blockend ending row/column idx of the block which is to be examined
1161!> (INPUT)
1162!> \param nProc_in number of bins into which the current block has to be divided
1163!> (INPUT)
1164!> \param permute permutation vector which balances column/row cost
1165!> size(nblocks^2). (OUTPUT)
1166!> \param step ...
1167!> \param my_process_id ...
1168!> \param n_processes ...
1169!> \param nblocks ...
1170!> \param natom ...
1171!> \param nkind ...
1172!> \param list_ij ...
1173!> \param list_kl ...
1174!> \param set_list_ij ...
1175!> \param set_list_kl ...
1176!> \param particle_set ...
1177!> \param coeffs_set ...
1178!> \param coeffs_kind ...
1179!> \param is_assoc_atomic_block_global ...
1180!> \param do_periodic ...
1181!> \param kind_of ...
1182!> \param basis_parameter ...
1183!> \param pmax_set ...
1184!> \param pmax_atom ...
1185!> \param pmax_blocks ...
1186!> \param cell ...
1187!> \param x_data ...
1188!> \param para_env ...
1189!> \param pmax_block ...
1190!> \param do_p_screening ...
1191!> \param map_atom_to_kind_atom ...
1192!> \param eval_type ...
1193!> \param log10_eps_schwarz ...
1194!> \param log_2 ...
1195!> \param coeffs_kind_max0 ...
1196!> \param use_virial ...
1197!> \param atomic_pair_list ...
1198!> \par History
1199!> 03.2011 created [Michael Steinlechner]
1200!> \author Michael Steinlechner
1201! **************************************************************************************************
1202 RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
1203 permute, step, &
1204 my_process_id, n_processes, nblocks, &
1205 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1206 particle_set, &
1207 coeffs_set, coeffs_kind, &
1208 is_assoc_atomic_block_global, do_periodic, &
1209 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1210 cell, x_data, para_env, pmax_block, &
1211 do_p_screening, map_atom_to_kind_atom, eval_type, &
1212 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1213
1214 INTEGER :: nproc_in, blockend, blockstart
1215 INTEGER, DIMENSION(nProc_in) :: blocksize
1216 INTEGER :: nblocks, n_processes, my_process_id
1217 INTEGER, INTENT(IN) :: step
1218 INTEGER, DIMENSION(nblocks*nblocks) :: permute
1219 INTEGER :: natom
1220 INTEGER, INTENT(IN) :: nkind
1221 TYPE(pair_list_type) :: list_ij, list_kl
1222 TYPE(pair_set_list_type), ALLOCATABLE, &
1223 DIMENSION(:) :: set_list_ij, set_list_kl
1224 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1225 TYPE(hfx_screen_coeff_type), &
1226 DIMENSION(:, :, :, :), POINTER :: coeffs_set
1227 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
1228 POINTER :: coeffs_kind
1229 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1230 LOGICAL :: do_periodic
1231 INTEGER :: kind_of(*)
1232 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1233 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1234 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1235 REAL(dp) :: pmax_blocks
1236 TYPE(cell_type), POINTER :: cell
1237 TYPE(hfx_type), POINTER :: x_data
1238 TYPE(mp_para_env_type), INTENT(IN) :: para_env
1239 REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
1240 LOGICAL, INTENT(IN) :: do_p_screening
1241 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1242 INTEGER, INTENT(IN) :: eval_type
1243 REAL(dp) :: log10_eps_schwarz, log_2, &
1244 coeffs_kind_max0
1245 LOGICAL, INTENT(IN) :: use_virial
1246 LOGICAL, DIMENSION(:, :), POINTER :: atomic_pair_list
1247
1248 INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
1249 jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
1250 latom_end, latom_start, nbins, nproc, row, startoffset
1251 INTEGER(int_8) :: atom_block, tmp_block
1252 INTEGER, ALLOCATABLE, DIMENSION(:) :: ithblocksize, localblocksize
1253 INTEGER, DIMENSION(blockend - blockstart + 1) :: bin_perm, tmp_perm
1254 REAL(dp) :: partialcost
1255 REAL(dp), DIMENSION(nblocks*nblocks) :: cost_vector
1256
1257 nproc = nproc_in
1258 cost_vector = 0.0_dp
1259
1260! loop over local atom_blocks.
1261 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
1262
1263! get corresponding 4D block indices
1264 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
1265 tmp_block = atom_block/nblocks
1266 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1267 IF (latom_block < katom_block) cycle
1268 tmp_block = tmp_block/nblocks
1269 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1270 tmp_block = tmp_block/nblocks
1271 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1272 IF (jatom_block < iatom_block) cycle
1273
1274! get 2D indices of this atom_block (with permutation applied)
1275! for this, we need to invert the permutation, this means
1276! find position in permutation vector where value==idx
1277
1278 row = (katom_block - 1)*nblocks + jatom_block
1279 inv_perm = 1
1280 DO WHILE (permute(inv_perm) .NE. row)
1281 inv_perm = inv_perm + 1
1282 END DO
1283 row = inv_perm
1284
1285 col = (iatom_block - 1)*nblocks + latom_block
1286 inv_perm = 1
1287 DO WHILE (permute(inv_perm) .NE. col)
1288 inv_perm = inv_perm + 1
1289 END DO
1290 col = inv_perm
1291
1292! if row/col outside our current diagonal block, skip calculation.
1293 IF (col < blockstart .OR. col > blockend) cycle
1294 IF (row < blockstart .OR. row > blockend) cycle
1295
1296 iatom_start = x_data%blocks(iatom_block)%istart
1297 iatom_end = x_data%blocks(iatom_block)%iend
1298 jatom_start = x_data%blocks(jatom_block)%istart
1299 jatom_end = x_data%blocks(jatom_block)%iend
1300 katom_start = x_data%blocks(katom_block)%istart
1301 katom_end = x_data%blocks(katom_block)%iend
1302 latom_start = x_data%blocks(latom_block)%istart
1303 latom_end = x_data%blocks(latom_block)%iend
1304
1305! whatever.
1306 SELECT CASE (eval_type)
1307 CASE (hfx_do_eval_energy)
1308 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1309 pmax_block(latom_block, jatom_block), &
1310 pmax_block(latom_block, iatom_block), &
1311 pmax_block(katom_block, jatom_block))
1312 CASE (hfx_do_eval_forces)
1313 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1314 pmax_block(latom_block, jatom_block), &
1315 pmax_block(latom_block, iatom_block) + &
1316 pmax_block(katom_block, jatom_block))
1317 END SELECT
1318
1319! screening.
1320 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1321
1322! every second recursion step, compute row sum instead of column sum
1323
1324 IF (modulo(step, 2) .EQ. 0) THEN
1325 idx = row
1326 ELSE
1327 idx = col
1328 END IF
1329
1330! estimate the cost of this atom_block.
1331 partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1332 set_list_kl, &
1333 iatom_start, iatom_end, jatom_start, jatom_end, &
1334 katom_start, katom_end, latom_start, latom_end, &
1335 particle_set, &
1336 coeffs_set, coeffs_kind, &
1337 is_assoc_atomic_block_global, do_periodic, &
1338 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1339 cell, &
1340 do_p_screening, map_atom_to_kind_atom, eval_type, &
1341 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1342
1343 cost_vector(idx) = cost_vector(idx) + partialcost
1344 END DO ! atom_block
1345
1346! sum costvector over all processes
1347 CALL para_env%sum(cost_vector)
1348
1349! calculate next prime factor of nProc
1350 nbins = 2
1351 DO WHILE (modulo(int(nproc), int(nbins)) .NE. 0)
1352 nbins = nbins + 1
1353 END DO
1354
1355 nproc = nproc/nbins
1356
1357! ... do the binning...
1358
1359 ALLOCATE (localblocksize(nbins))
1360 CALL hfx_permute_binning(nbins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
1361
1362!... and update the permutation vector
1363
1364 tmp_perm = permute(blockstart:blockend)
1365 permute(blockstart:blockend) = tmp_perm(bin_perm)
1366
1367! split recursion into the nBins Bins
1368 IF (nproc > 1) THEN
1369 ALLOCATE (ithblocksize(nproc))
1370 DO i = 1, nbins
1371 startoffset = sum(localblocksize(1:(i - 1)))
1372 endoffset = sum(localblocksize(1:i)) - 1
1373
1374 CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nproc, &
1375 permute, step + 1, &
1376 my_process_id, n_processes, nblocks, &
1377 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1378 particle_set, &
1379 coeffs_set, coeffs_kind, &
1380 is_assoc_atomic_block_global, do_periodic, &
1381 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1382 cell, x_data, para_env, pmax_block, &
1383 do_p_screening, map_atom_to_kind_atom, eval_type, &
1384 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1385 blocksize(((i - 1)*nproc + 1):(i*nproc)) = ithblocksize
1386 END DO
1387 DEALLOCATE (ithblocksize)
1388 ELSE
1389 DO i = 1, nbins
1390 blocksize(i) = localblocksize(i)
1391 END DO
1392 END IF
1393
1394 DEALLOCATE (localblocksize)
1395
1396 END SUBROUTINE hfx_recursive_permute
1397
1398! **************************************************************************************************
1399!> \brief small binning routine for the recursive load balancing
1400!>
1401!> \param nBins number of Bins (INPUT)
1402!> \param costvector vector of current row/column costs which have to be binned (INPUT)
1403!> \param maxbinsize upper bound for bin size (INPUT)
1404!> \param perm resulting permutation due to be binning routine (OUTPUT)
1405!> \param block_count vector of size(nbins) which contains the size of each bin (OUTPUT)
1406!> \par History
1407!> 03.2011 created [Michael Steinlechner]
1408!> \author Michael Steinlechner
1409! **************************************************************************************************
1410 SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
1411
1412 INTEGER, INTENT(IN) :: nbins, maxbinsize
1413 REAL(dp), DIMENSION(maxbinsize), INTENT(IN) :: costvector
1414 INTEGER, DIMENSION(maxbinsize), INTENT(OUT) :: perm
1415 INTEGER, DIMENSION(nBins), INTENT(OUT) :: block_count
1416
1417 INTEGER :: i, j, mod_idx, offset
1418 INTEGER, DIMENSION(nBins, maxbinsize) :: bin
1419 INTEGER, DIMENSION(nBins) :: bin_idx
1420 INTEGER, DIMENSION(maxbinsize) :: idx
1421 REAL(dp), DIMENSION(maxbinsize) :: vec
1422 REAL(dp), DIMENSION(nBins) :: bincosts
1423
1424! be careful not to change costvector (copy it!)
1425
1426 vec = costvector
1427 block_count = 0
1428 bincosts = 0
1429
1430 !sort the array (ascending)
1431 CALL sort(vec, maxbinsize, idx)
1432
1433 ! count the loop down to distribute the largest cols/rows first
1434 DO i = maxbinsize, 1, -1
1435 IF (vec(i) == 0) THEN
1436 ! spread zero-cost col/rows evenly among procs
1437 mod_idx = modulo(i, nbins) + 1 !(note the fortran offset by one!)
1438 block_count(mod_idx) = block_count(mod_idx) + 1
1439 bin(mod_idx, block_count(mod_idx)) = idx(i)
1440 ELSE
1441 ! sort the bins so that the one with the lowest cost is at the
1442 ! first place, where we then assign the current col/row
1443 CALL sort(bincosts, nbins, bin_idx)
1444 block_count = block_count(bin_idx)
1445 bin = bin(bin_idx, :)
1446
1447 bincosts(1) = bincosts(1) + vec(i)
1448 block_count(1) = block_count(1) + 1
1449 bin(1, block_count(1)) = idx(i)
1450 END IF
1451 END DO
1452
1453 ! construct permutation vector from the binning
1454 offset = 0
1455 DO i = 1, nbins
1456 DO j = 1, block_count(i)
1457 perm(offset + j) = bin(i, j)
1458 END DO
1459 offset = offset + block_count(i)
1460 END DO
1461
1462 END SUBROUTINE hfx_permute_binning
1463
1464! **************************************************************************************************
1465!> \brief Cheap way of redistributing the eri's
1466!> \param x_data Object that stores the indices array
1467!> \param para_env para_env
1468!> \param load_balance_parameter contains parmameter for Monte-Carlo routines
1469!> \param i_thread current thread ID
1470!> \param n_threads Total Number of threads
1471!> \param eval_type ...
1472!> \par History
1473!> 12.2007 created [Manuel Guidon]
1474!> 02.2009 optimize Memory Usage [Manuel Guidon]
1475!> \author Manuel Guidon
1476!> \note
1477!> The cost matrix is given by the walltime for each bin that is measured
1478!> during the calculation
1479! **************************************************************************************************
1480 SUBROUTINE hfx_update_load_balance(x_data, para_env, &
1481 load_balance_parameter, &
1482 i_thread, n_threads, eval_type)
1483
1484 TYPE(hfx_type), POINTER :: x_data
1485 TYPE(mp_para_env_type), INTENT(IN) :: para_env
1486 TYPE(hfx_load_balance_type) :: load_balance_parameter
1487 INTEGER, INTENT(IN) :: i_thread, n_threads, eval_type
1488
1489 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_update_load_balance'
1490
1491 INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
1492 my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
1493 TYPE(mp_request_type), DIMENSION(2) :: req
1494 INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
1495 sendbuffer, swapbuffer
1496 INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
1497 INTEGER, ALLOCATABLE, DIMENSION(:) :: tmp_pos
1498 INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: bins_per_rank
1499 INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE :: bin_histogram
1500 INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
1501 INTEGER, SAVE :: max_bin_size
1502 TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
1503 TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
1504 SAVE :: full_dist
1505
1506!$OMP BARRIER
1507!$OMP MASTER
1508 CALL timeset(routinen, handle)
1509!$OMP END MASTER
1510!$OMP BARRIER
1511
1512 ncpu = para_env%num_pe
1513 n_processes = ncpu*n_threads
1514 !! If there is only 1 cpu skip the binning
1515 IF (n_processes == 1) THEN
1516 ALLOCATE (tmp_dist(1))
1517 tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
1518 tmp_dist(1)%istart = 0_int_8
1519 ptr_to_tmp_dist => tmp_dist(:)
1520 SELECT CASE (eval_type)
1521 CASE (hfx_do_eval_energy)
1522 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1523 CASE (hfx_do_eval_forces)
1524 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1525 END SELECT
1526 DEALLOCATE (tmp_dist)
1527 ELSE
1528 mepos = para_env%mepos
1529 my_process_id = para_env%mepos*n_threads + i_thread
1530 nbins = load_balance_parameter%nbins
1531!$OMP MASTER
1532 ALLOCATE (bin_histogram(n_processes, 2))
1533 bin_histogram = 0
1534!$OMP END MASTER
1535!$OMP BARRIER
1536 SELECT CASE (eval_type)
1537 CASE (hfx_do_eval_energy)
1538 my_bin_size = SIZE(x_data%distribution_energy)
1539 CASE (hfx_do_eval_forces)
1540 my_bin_size = SIZE(x_data%distribution_forces)
1541 END SELECT
1542 bin_histogram(my_process_id + 1, 1) = my_bin_size
1543!$OMP BARRIER
1544!$OMP MASTER
1545 CALL para_env%sum(bin_histogram(:, 1))
1546 bin_histogram(1, 2) = bin_histogram(1, 1)
1547 DO iprocess = 2, n_processes
1548 bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
1549 END DO
1550
1551 max_bin_size = maxval(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
1552 CALL para_env%max(max_bin_size)
1553!$OMP END MASTER
1554!$OMP BARRIER
1555 ALLOCATE (binned_dist(my_bin_size))
1556 !! Use old binned_dist, but with timings cost
1557 SELECT CASE (eval_type)
1558 CASE (hfx_do_eval_energy)
1559 binned_dist = x_data%distribution_energy
1560 CASE (hfx_do_eval_forces)
1561 binned_dist = x_data%distribution_forces
1562 END SELECT
1563
1564 DO ibin = 1, my_bin_size
1565 IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
1566 binned_dist(ibin)%cost = 0
1567 ELSE
1568 SELECT CASE (eval_type)
1569 CASE (hfx_do_eval_energy)
1570 IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
1571 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_first_scf + &
1572 binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1573 ELSE
1574 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1575 END IF
1576 CASE (hfx_do_eval_forces)
1577 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
1578 END SELECT
1579 END IF
1580 END DO
1581!$OMP BARRIER
1582!$OMP MASTER
1583 !! store all local results in a big cost matrix
1584 ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
1585 cost_matrix = 0
1586 ALLOCATE (sendbuffer(max_bin_size*n_threads))
1587 ALLOCATE (recbuffer(max_bin_size*n_threads))
1588!$OMP END MASTER
1589!$OMP BARRIER
1590 my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
1591 icpu = para_env%mepos + 1
1592 DO i = 1, my_bin_size
1593 cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
1594 END DO
1595
1596 mepos = para_env%mepos
1597!$OMP BARRIER
1598!$OMP MASTER
1599 ALLOCATE (bins_per_rank(ncpu))
1600 bins_per_rank = 0
1601 DO icpu = 1, ncpu
1602 bins_per_rank(icpu) = sum(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
1603 END DO
1604 sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
1605 cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
1606
1607 dest = modulo(mepos + 1, ncpu)
1608 source = modulo(mepos - 1, ncpu)
1609 ! sync before/after ring of isendrecv
1610 CALL para_env%sync()
1611 DO icpu = 0, ncpu - 1
1612 IF (icpu .NE. ncpu - 1) THEN
1613 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1614 req(1), req(2), 13)
1615 END IF
1616 data_from = modulo(mepos - icpu, ncpu)
1617 start_idx = sum(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
1618 end_idx = start_idx + bins_per_rank(data_from + 1) - 1
1619 cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
1620
1621 IF (icpu .NE. ncpu - 1) THEN
1622 CALL mp_waitall(req)
1623 END IF
1624 swapbuffer => sendbuffer
1625 sendbuffer => recbuffer
1626 recbuffer => swapbuffer
1627 END DO
1628 DEALLOCATE (recbuffer, sendbuffer)
1629 ! sync before/after ring of isendrecv
1630 CALL para_env%sync()
1631!$OMP END MASTER
1632!$OMP BARRIER
1633 ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
1634 local_cost_matrix = cost_matrix
1635!$OMP MASTER
1636 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
1637 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
1638 shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
1639
1640 ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
1641
1642 full_dist(:, :)%istart = 0_int_8
1643 full_dist(:, :)%number_of_atom_quartets = 0_int_8
1644 full_dist(:, :)%cost = 0_int_8
1645 full_dist(:, :)%time_first_scf = 0.0_dp
1646 full_dist(:, :)%time_other_scf = 0.0_dp
1647 full_dist(:, :)%time_forces = 0.0_dp
1648!$OMP END MASTER
1649
1650!$OMP BARRIER
1651 mepos = para_env%mepos + 1
1652 full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
1653!$OMP BARRIER
1654!$OMP MASTER
1655 ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
1656 ALLOCATE (recbuffer(3*max_bin_size*n_threads))
1657 mepos = para_env%mepos
1658 DO j = 1, n_threads
1659 DO i = 1, max_bin_size
1660 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
1661 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
1662 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
1663 END DO
1664 END DO
1665 dest = modulo(mepos + 1, ncpu)
1666 source = modulo(mepos - 1, ncpu)
1667 ! sync before/after ring of isendrecv
1668 CALL para_env%sync()
1669 DO icpu = 0, ncpu - 1
1670 IF (icpu .NE. ncpu - 1) THEN
1671 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1672 req(1), req(2), 13)
1673 END IF
1674 data_from = modulo(mepos - icpu, ncpu)
1675 DO j = 1, n_threads
1676 DO i = 1, max_bin_size
1677 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
1678 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
1679 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
1680 END DO
1681 END DO
1682
1683 IF (icpu .NE. ncpu - 1) THEN
1684 CALL mp_waitall(req)
1685 END IF
1686 swapbuffer => sendbuffer
1687 sendbuffer => recbuffer
1688 recbuffer => swapbuffer
1689 END DO
1690 ! sync before/after ring of isendrecv
1691 DEALLOCATE (recbuffer, sendbuffer)
1692 CALL para_env%sync()
1693!$OMP END MASTER
1694!$OMP BARRIER
1695 !! reorder the distribution according to the distribution vector
1696 ALLOCATE (tmp_pos(ncpu*n_threads))
1697 tmp_pos = 1
1698 ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
1699
1700 tmp_dist(:)%istart = 0_int_8
1701 tmp_dist(:)%number_of_atom_quartets = 0_int_8
1702 tmp_dist(:)%cost = 0_int_8
1703 tmp_dist(:)%time_first_scf = 0.0_dp
1704 tmp_dist(:)%time_other_scf = 0.0_dp
1705 tmp_dist(:)%time_forces = 0.0_dp
1706
1707 mepos = my_process_id + 1
1708 DO icpu = 1, n_processes
1709 DO i = 1, bin_histogram(icpu, 1)
1710 IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
1711 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
1712 tmp_pos(mepos) = tmp_pos(mepos) + 1
1713 END IF
1714 END DO
1715 END DO
1716
1717 !! Assign the load to each process
1718 NULLIFY (ptr_to_tmp_dist)
1719 mepos = my_process_id + 1
1720 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
1721 SELECT CASE (eval_type)
1722 CASE (hfx_do_eval_energy)
1723 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1724 CASE (hfx_do_eval_forces)
1725 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1726 END SELECT
1727
1728!$OMP BARRIER
1729!$OMP MASTER
1730 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
1731 DEALLOCATE (bins_per_rank, bin_histogram)
1732!$OMP END MASTER
1733!$OMP BARRIER
1734 DEALLOCATE (tmp_dist, tmp_pos)
1735 DEALLOCATE (binned_dist, local_cost_matrix)
1736 END IF
1737!$OMP BARRIER
1738!$OMP MASTER
1739 CALL timestop(handle)
1740!$OMP END MASTER
1741!$OMP BARRIER
1742
1743 END SUBROUTINE hfx_update_load_balance
1744
1745! **************************************************************************************************
1746!> \brief estimates the cost of a set quartet with info available at load balance time
1747!> i.e. without much info on the primitives primitives
1748!> \param nsa ...
1749!> \param nsb ...
1750!> \param nsc ...
1751!> \param nsd ...
1752!> \param npgfa ...
1753!> \param npgfb ...
1754!> \param npgfc ...
1755!> \param npgfd ...
1756!> \param ratio ...
1757!> \param p1 ...
1758!> \param p2 ...
1759!> \param p3 ...
1760!> \return ...
1761!> \par History
1762!> 08.2009 created Joost VandeVondele
1763!> \author Joost VandeVondele
1764! **************************************************************************************************
1765 FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
1766 IMPLICIT NONE
1767 REAL(kind=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
1768 INTEGER(KIND=int_8) :: res
1769 REAL(kind=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
1770
1771 INTEGER :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
1772
1773 estimate1 = estimate_basic(p1)
1774 estimate2 = estimate_basic(p2)
1775 mu = log(abs(1.0e6_dp*p3(1)) + 1)
1776 sigma = p3(2)*0.1_dp*mu
1777 switch = 1.0_dp/(1.0_dp + exp((log(estimate1) - mu)/sigma))
1778 estimate = estimate1*(1.0_dp - switch) + estimate2*switch
1779 res = int(estimate*0.001_dp, kind=int_8) + 1
1780
1781 CONTAINS
1782
1783! **************************************************************************************************
1784!> \brief ...
1785!> \param p ...
1786!> \return ...
1787! **************************************************************************************************
1788 REAL(kind=dp) FUNCTION estimate_basic(p) RESULT(res)
1789 REAL(kind=dp) :: p(12)
1790
1791 REAL(kind=dp) :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
1792 p7, p8, p9
1793
1794 p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
1795 p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
1796 p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
1797 res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
1798 poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
1799 poly2(npgfd, p4, p5, p6)*exp(-p7*ratio + p8*ratio**2) + &
1800 1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
1801 res = 1 + abs(res)
1802 END FUNCTION estimate_basic
1803
1804! **************************************************************************************************
1805!> \brief ...
1806!> \param x ...
1807!> \param a0 ...
1808!> \param a1 ...
1809!> \param a2 ...
1810!> \return ...
1811! **************************************************************************************************
1812 REAL(kind=dp) FUNCTION poly2(x, a0, a1, a2)
1813 INTEGER, INTENT(IN) :: x
1814 REAL(kind=dp), INTENT(IN) :: a0, a1, a2
1815 REAL(kind=dp) :: r
1816
1817 r = real(x, kind=dp)
1818 poly2 = a0 + (a1 + a2*r)*r
1819 END FUNCTION poly2
1820
1821 END FUNCTION cost_model
1822! **************************************************************************************************
1823!> \brief Minimizes the maximum cost per cpu by shuffling around all bins
1824!> \param total_number_of_bins ...
1825!> \param number_of_processes ...
1826!> \param bin_costs costs per bin
1827!> \param distribution_vector will contain the final distribution
1828!> \param do_randomize ...
1829!> \par History
1830!> 03.2009 created from a hack by Joost [Manuel Guidon]
1831!> \author Manuel Guidon
1832! **************************************************************************************************
1833 SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
1834 distribution_vector, do_randomize)
1835 INTEGER :: total_number_of_bins, number_of_processes
1836 INTEGER(int_8), DIMENSION(:), POINTER :: bin_costs
1837 INTEGER, DIMENSION(:), POINTER :: distribution_vector
1838 LOGICAL, INTENT(IN) :: do_randomize
1839
1840 INTEGER :: i, itmp, j, nstep
1841 INTEGER(int_8), DIMENSION(:), POINTER :: my_cost_cpu, tmp_cost, tmp_cpu_cost
1842 INTEGER, DIMENSION(:), POINTER :: tmp_cpu_index, tmp_index
1843 TYPE(rng_stream_type), ALLOCATABLE :: rng_stream
1844
1845 nstep = max(1, int(number_of_processes)/2)
1846
1847 ALLOCATE (tmp_cost(total_number_of_bins))
1848 ALLOCATE (tmp_index(total_number_of_bins))
1849 ALLOCATE (tmp_cpu_cost(number_of_processes))
1850 ALLOCATE (tmp_cpu_index(number_of_processes))
1851 ALLOCATE (my_cost_cpu(number_of_processes))
1852 tmp_cost = bin_costs
1853
1854 CALL sort(tmp_cost, total_number_of_bins, tmp_index)
1855 my_cost_cpu = 0
1856 !
1857 ! assign the largest remaining bin to the CPU with the smallest load
1858 ! gives near perfect distributions for a sufficient number of bins ...
1859 ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
1860 ! each cpu a similar number of tasks.
1861 ! it also avoids degenerate cases where thousands of zero sized tasks
1862 ! are assigned to the same (least loaded) cpu
1863 !
1864 IF (do_randomize) &
1865 rng_stream = rng_stream_type(name="uniform_rng", &
1866 distribution_type=uniform)
1867
1868 DO i = total_number_of_bins, 1, -nstep
1869 tmp_cpu_cost = my_cost_cpu
1870 CALL sort(tmp_cpu_cost, int(number_of_processes), tmp_cpu_index)
1871 IF (do_randomize) THEN
1872 CALL rng_stream%shuffle(tmp_cpu_index(1:min(i, nstep)))
1873 END IF
1874 DO j = 1, min(i, nstep)
1875 itmp = tmp_cpu_index(j)
1876 distribution_vector(tmp_index(i - j + 1)) = itmp
1877 my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
1878 END DO
1879 END DO
1880
1881 DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
1882 DEALLOCATE (tmp_cpu_index, my_cost_cpu)
1883 END SUBROUTINE optimize_distribution
1884
1885! **************************************************************************************************
1886!> \brief Given a 2d index pair, this function returns a 1d index pair for
1887!> a symmetric upper triangle NxN matrix
1888!> The compiler should inline this function, therefore it appears in
1889!> several modules
1890!> \param i 2d index
1891!> \param j 2d index
1892!> \param N matrix size
1893!> \return ...
1894!> \par History
1895!> 03.2009 created [Manuel Guidon]
1896!> \author Manuel Guidon
1897! **************************************************************************************************
1898 PURE FUNCTION get_1d_idx(i, j, N)
1899 INTEGER, INTENT(IN) :: i, j
1900 INTEGER(int_8), INTENT(IN) :: n
1901 INTEGER(int_8) :: get_1d_idx
1902
1903 INTEGER(int_8) :: min_ij
1904
1905 min_ij = min(i, j)
1906 get_1d_idx = min_ij*n + max(i, j) - (min_ij - 1)*min_ij/2 - n
1907
1908 END FUNCTION get_1d_idx
1909
1910! **************************************************************************************************
1911!> \brief ...
1912!> \param natom ...
1913!> \param nkind ...
1914!> \param list_ij ...
1915!> \param list_kl ...
1916!> \param set_list_ij ...
1917!> \param set_list_kl ...
1918!> \param iatom_start ...
1919!> \param iatom_end ...
1920!> \param jatom_start ...
1921!> \param jatom_end ...
1922!> \param katom_start ...
1923!> \param katom_end ...
1924!> \param latom_start ...
1925!> \param latom_end ...
1926!> \param particle_set ...
1927!> \param coeffs_set ...
1928!> \param coeffs_kind ...
1929!> \param is_assoc_atomic_block_global ...
1930!> \param do_periodic ...
1931!> \param kind_of ...
1932!> \param basis_parameter ...
1933!> \param pmax_set ...
1934!> \param pmax_atom ...
1935!> \param pmax_blocks ...
1936!> \param cell ...
1937!> \param do_p_screening ...
1938!> \param map_atom_to_kind_atom ...
1939!> \param eval_type ...
1940!> \param log10_eps_schwarz ...
1941!> \param log_2 ...
1942!> \param coeffs_kind_max0 ...
1943!> \param use_virial ...
1944!> \param atomic_pair_list ...
1945!> \return ...
1946! **************************************************************************************************
1947 FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1948 iatom_start, iatom_end, jatom_start, jatom_end, &
1949 katom_start, katom_end, latom_start, latom_end, &
1950 particle_set, &
1951 coeffs_set, coeffs_kind, &
1952 is_assoc_atomic_block_global, do_periodic, &
1953 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1954 cell, &
1955 do_p_screening, map_atom_to_kind_atom, eval_type, &
1956 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
1957 atomic_pair_list)
1958
1959 INTEGER, INTENT(IN) :: natom, nkind
1960 TYPE(pair_list_type) :: list_ij, list_kl
1961 TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
1962 INTEGER, INTENT(IN) :: iatom_start, iatom_end, jatom_start, &
1963 jatom_end, katom_start, katom_end, &
1964 latom_start, latom_end
1965 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1966 TYPE(hfx_screen_coeff_type), &
1967 DIMENSION(:, :, :, :), POINTER :: coeffs_set
1968 TYPE(hfx_screen_coeff_type), &
1969 DIMENSION(nkind, nkind) :: coeffs_kind
1970 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1971 LOGICAL :: do_periodic
1972 INTEGER :: kind_of(*)
1973 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1974 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1975 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1976 REAL(dp) :: pmax_blocks
1977 TYPE(cell_type), POINTER :: cell
1978 LOGICAL, INTENT(IN) :: do_p_screening
1979 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1980 INTEGER, INTENT(IN) :: eval_type
1981 REAL(dp) :: log10_eps_schwarz, log_2, &
1982 coeffs_kind_max0
1983 LOGICAL, INTENT(IN) :: use_virial
1984 LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
1985 INTEGER(int_8) :: estimate_block_cost
1986
1987 INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
1988 i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
1989 jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
1990 INTEGER, DIMENSION(:), POINTER :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
1991 nsgfb, nsgfc, nsgfd
1992 REAL(dp) :: actual_pmax_atom, cost_tmp, max_val1, &
1993 max_val2, pmax_entry, rab2, rcd2, &
1994 screen_kind_ij, screen_kind_kl
1995 REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
1996
1997 estimate_block_cost = 0_int_8
1998
1999 CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
2000 kind_of, basis_parameter, particle_set, &
2001 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2002 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2003
2004 CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
2005 kind_of, basis_parameter, particle_set, &
2006 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2007 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2008
2009 DO i_list_ij = 1, list_ij%n_element
2010 iatom = list_ij%elements(i_list_ij)%pair(1)
2011 jatom = list_ij%elements(i_list_ij)%pair(2)
2012 i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
2013 i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
2014 ikind = list_ij%elements(i_list_ij)%kind_pair(1)
2015 jkind = list_ij%elements(i_list_ij)%kind_pair(2)
2016 rab2 = list_ij%elements(i_list_ij)%dist2
2017
2018 nsgfa => basis_parameter(ikind)%nsgf
2019 nsgfb => basis_parameter(jkind)%nsgf
2020 npgfa => basis_parameter(ikind)%npgf
2021 npgfb => basis_parameter(jkind)%npgf
2022
2023 DO i_list_kl = 1, list_kl%n_element
2024
2025 katom = list_kl%elements(i_list_kl)%pair(1)
2026 latom = list_kl%elements(i_list_kl)%pair(2)
2027
2028 IF (.NOT. (katom + latom <= iatom + jatom)) cycle
2029 IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) cycle
2030
2031 IF (eval_type == hfx_do_eval_forces) THEN
2032 IF (.NOT. use_virial) THEN
2033 IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) cycle
2034 END IF
2035 END IF
2036
2037 i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
2038 i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
2039 kkind = list_kl%elements(i_list_kl)%kind_pair(1)
2040 lkind = list_kl%elements(i_list_kl)%kind_pair(2)
2041 rcd2 = list_kl%elements(i_list_kl)%dist2
2042
2043 nsgfc => basis_parameter(kkind)%nsgf
2044 nsgfd => basis_parameter(lkind)%nsgf
2045 npgfc => basis_parameter(kkind)%npgf
2046 npgfd => basis_parameter(lkind)%npgf
2047
2048 IF (do_p_screening) THEN
2049 actual_pmax_atom = max(pmax_atom(katom, iatom), &
2050 pmax_atom(latom, jatom), &
2051 pmax_atom(latom, iatom), &
2052 pmax_atom(katom, jatom))
2053 ELSE
2054 actual_pmax_atom = 0.0_dp
2055 END IF
2056
2057 screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
2058 coeffs_kind(jkind, ikind)%x(2)
2059 screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
2060 coeffs_kind(lkind, kkind)%x(2)
2061 IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2062
2063 IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
2064 is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
2065 is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
2066 is_assoc_atomic_block_global(latom, jatom) >= 1)) cycle
2067
2068 IF (do_p_screening) THEN
2069 SELECT CASE (eval_type)
2070 CASE (hfx_do_eval_energy)
2071 swap_id = 0
2072 kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))
2073 IF (ikind >= kkind) THEN
2074 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2075 map_atom_to_kind_atom(katom), &
2076 map_atom_to_kind_atom(iatom))
2077 ELSE
2078 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2079 map_atom_to_kind_atom(iatom), &
2080 map_atom_to_kind_atom(katom))
2081 swap_id = swap_id + 1
2082 END IF
2083 kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))
2084 IF (jkind >= lkind) THEN
2085 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2086 map_atom_to_kind_atom(latom), &
2087 map_atom_to_kind_atom(jatom))
2088 ELSE
2089 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2090 map_atom_to_kind_atom(jatom), &
2091 map_atom_to_kind_atom(latom))
2092 swap_id = swap_id + 2
2093 END IF
2094 kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))
2095 IF (ikind >= lkind) THEN
2096 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2097 map_atom_to_kind_atom(latom), &
2098 map_atom_to_kind_atom(iatom))
2099 ELSE
2100 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2101 map_atom_to_kind_atom(iatom), &
2102 map_atom_to_kind_atom(latom))
2103 swap_id = swap_id + 4
2104 END IF
2105 kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))
2106 IF (jkind >= kkind) THEN
2107 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2108 map_atom_to_kind_atom(katom), &
2109 map_atom_to_kind_atom(jatom))
2110 ELSE
2111 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2112 map_atom_to_kind_atom(jatom), &
2113 map_atom_to_kind_atom(katom))
2114 swap_id = swap_id + 8
2115 END IF
2116 CASE (hfx_do_eval_forces)
2117 swap_id = 16
2118 kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))
2119 IF (ikind >= kkind) THEN
2120 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2121 map_atom_to_kind_atom(katom), &
2122 map_atom_to_kind_atom(iatom))
2123 ELSE
2124 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2125 map_atom_to_kind_atom(iatom), &
2126 map_atom_to_kind_atom(katom))
2127 swap_id = swap_id + 1
2128 END IF
2129 kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))
2130 IF (jkind >= lkind) THEN
2131 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2132 map_atom_to_kind_atom(latom), &
2133 map_atom_to_kind_atom(jatom))
2134 ELSE
2135 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2136 map_atom_to_kind_atom(jatom), &
2137 map_atom_to_kind_atom(latom))
2138 swap_id = swap_id + 2
2139 END IF
2140 kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))
2141 IF (ikind >= lkind) THEN
2142 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2143 map_atom_to_kind_atom(latom), &
2144 map_atom_to_kind_atom(iatom))
2145 ELSE
2146 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2147 map_atom_to_kind_atom(iatom), &
2148 map_atom_to_kind_atom(latom))
2149 swap_id = swap_id + 4
2150 END IF
2151 kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))
2152 IF (jkind >= kkind) THEN
2153 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2154 map_atom_to_kind_atom(katom), &
2155 map_atom_to_kind_atom(jatom))
2156 ELSE
2157 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2158 map_atom_to_kind_atom(jatom), &
2159 map_atom_to_kind_atom(katom))
2160 swap_id = swap_id + 8
2161 END IF
2162 END SELECT
2163 END IF
2164
2165 DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
2166 iset = set_list_ij(i_set_list_ij)%pair(1)
2167 jset = set_list_ij(i_set_list_ij)%pair(2)
2168
2169 max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
2170 coeffs_set(jset, iset, jkind, ikind)%x(2)
2171
2172 IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2173 DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
2174 kset = set_list_kl(i_set_list_kl)%pair(1)
2175 lset = set_list_kl(i_set_list_kl)%pair(2)
2176
2177 max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
2178 coeffs_set(lset, kset, lkind, kkind)%x(2))
2179
2180 IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) cycle
2181 IF (do_p_screening) THEN
2182 CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
2183 iset, jset, kset, lset, &
2184 pmax_entry, swap_id)
2185 IF (eval_type == hfx_do_eval_forces) THEN
2186 pmax_entry = log_2 + pmax_entry
2187 END IF
2188 ELSE
2189 pmax_entry = 0.0_dp
2190 END IF
2191 max_val2 = max_val2 + pmax_entry
2192 IF (max_val2 < log10_eps_schwarz) cycle
2193 SELECT CASE (eval_type)
2194 CASE (hfx_do_eval_energy)
2195 cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2196 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2197 max_val2/log10_eps_schwarz, &
2199 estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)
2200 CASE (hfx_do_eval_forces)
2201 cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2202 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2203 max_val2/log10_eps_schwarz, &
2204 p1_forces, p2_forces, p3_forces)
2205 estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)
2206 END SELECT
2207 END DO ! i_set_list_kl
2208 END DO ! i_set_list_ij
2209 END DO ! i_list_kl
2210 END DO ! i_list_ij
2211
2212 END FUNCTION estimate_block_cost
2213
2214! **************************************************************************************************
2215!> \brief ...
2216!> \param nkind ...
2217!> \param para_env ...
2218!> \param natom ...
2219!> \param block_size ...
2220!> \param nblock ...
2221!> \param blocks ...
2222!> \param list_ij ...
2223!> \param list_kl ...
2224!> \param set_list_ij ...
2225!> \param set_list_kl ...
2226!> \param particle_set ...
2227!> \param coeffs_set ...
2228!> \param coeffs_kind ...
2229!> \param is_assoc_atomic_block_global ...
2230!> \param do_periodic ...
2231!> \param kind_of ...
2232!> \param basis_parameter ...
2233!> \param pmax_set ...
2234!> \param pmax_atom ...
2235!> \param pmax_blocks ...
2236!> \param cell ...
2237!> \param do_p_screening ...
2238!> \param map_atom_to_kind_atom ...
2239!> \param eval_type ...
2240!> \param log10_eps_schwarz ...
2241!> \param log_2 ...
2242!> \param coeffs_kind_max0 ...
2243!> \param use_virial ...
2244!> \param atomic_pair_list ...
2245! **************************************************************************************************
2246 SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
2247 list_ij, list_kl, set_list_ij, set_list_kl, &
2248 particle_set, &
2249 coeffs_set, coeffs_kind, &
2250 is_assoc_atomic_block_global, do_periodic, &
2251 kind_of, basis_parameter, pmax_set, pmax_atom, &
2252 pmax_blocks, cell, &
2253 do_p_screening, map_atom_to_kind_atom, eval_type, &
2254 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
2255 atomic_pair_list)
2256
2257 INTEGER, INTENT(IN) :: nkind
2258 TYPE(mp_para_env_type), INTENT(IN) :: para_env
2259 INTEGER :: natom, block_size, nblock
2260 TYPE(hfx_block_range_type), DIMENSION(1:nblock) :: blocks
2261 TYPE(pair_list_type) :: list_ij, list_kl
2262 TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
2263 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
2264 TYPE(hfx_screen_coeff_type), &
2265 DIMENSION(:, :, :, :), POINTER :: coeffs_set
2266 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
2267 POINTER :: coeffs_kind
2268 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
2269 LOGICAL :: do_periodic
2270 INTEGER :: kind_of(*)
2271 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
2272 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
2273 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
2274 REAL(dp) :: pmax_blocks
2275 TYPE(cell_type), POINTER :: cell
2276 LOGICAL, INTENT(IN) :: do_p_screening
2277 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
2278 INTEGER, INTENT(IN) :: eval_type
2279 REAL(dp) :: log10_eps_schwarz, log_2, &
2280 coeffs_kind_max0
2281 LOGICAL, INTENT(IN) :: use_virial
2282 LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
2283
2284 INTEGER :: atom_block, i, iatom_block, iatom_end, &
2285 iatom_start, my_cpu_rank, ncpus
2286
2287 DO atom_block = 0, nblock - 1
2288 iatom_block = modulo(atom_block, nblock) + 1
2289 iatom_start = (iatom_block - 1)*block_size + 1
2290 iatom_end = min(iatom_block*block_size, natom)
2291 blocks(atom_block + 1)%istart = iatom_start
2292 blocks(atom_block + 1)%iend = iatom_end
2293 blocks(atom_block + 1)%cost = 0_int_8
2294 END DO
2295
2296 ncpus = para_env%num_pe
2297 my_cpu_rank = para_env%mepos
2298 DO i = 1, nblock
2299 IF (modulo(i, ncpus) /= my_cpu_rank) THEN
2300 blocks(i)%istart = 0
2301 blocks(i)%iend = 0
2302 cycle
2303 END IF
2304 iatom_start = blocks(i)%istart
2305 iatom_end = blocks(i)%iend
2306 blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
2307 iatom_start, iatom_end, iatom_start, iatom_end, &
2308 iatom_start, iatom_end, iatom_start, iatom_end, &
2309 particle_set, &
2310 coeffs_set, coeffs_kind, &
2311 is_assoc_atomic_block_global, do_periodic, &
2312 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
2313 cell, &
2314 do_p_screening, map_atom_to_kind_atom, eval_type, &
2315 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
2316
2317 END DO
2318 END SUBROUTINE init_blocks
2319
2320! **************************************************************************************************
2321!> \brief ...
2322!> \param para_env ...
2323!> \param x_data ...
2324!> \param iw ...
2325!> \param n_threads ...
2326!> \param i_thread ...
2327!> \param eval_type ...
2328! **************************************************************************************************
2329 SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
2330 eval_type)
2331
2332 TYPE(mp_para_env_type), INTENT(IN) :: para_env
2333 TYPE(hfx_type), POINTER :: x_data
2334 INTEGER, INTENT(IN) :: iw, n_threads, i_thread, eval_type
2335
2336 INTEGER :: i, j, k, my_rank, nbins, nranks, &
2337 total_bins
2338 INTEGER(int_8) :: avg_bin, avg_rank, max_bin, max_rank, &
2339 min_bin, min_rank, sum_bin, sum_rank
2340 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer, buffer_in, buffer_out, summary
2341 INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE :: shm_cost_vector
2342 INTEGER, ALLOCATABLE, DIMENSION(:) :: bins_per_rank, rdispl, sort_idx
2343 INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: shm_bins_per_rank, shm_displ
2344
2345 SELECT CASE (eval_type)
2346 CASE (hfx_do_eval_energy)
2347 nbins = SIZE(x_data%distribution_energy)
2348 CASE (hfx_do_eval_forces)
2349 nbins = SIZE(x_data%distribution_forces)
2350 END SELECT
2351
2352!$OMP MASTER
2353 ALLOCATE (shm_bins_per_rank(n_threads))
2354 ALLOCATE (shm_displ(n_threads + 1))
2355!$OMP END MASTER
2356!$OMP BARRIER
2357
2358 shm_bins_per_rank(i_thread + 1) = nbins
2359!$OMP BARRIER
2360 nbins = 0
2361 DO i = 1, n_threads
2362 nbins = nbins + shm_bins_per_rank(i)
2363 END DO
2364 my_rank = para_env%mepos
2365 nranks = para_env%num_pe
2366
2367!$OMP BARRIER
2368!$OMP MASTER
2369 ALLOCATE (bins_per_rank(nranks))
2370 bins_per_rank = 0
2371
2372 bins_per_rank(my_rank + 1) = nbins
2373
2374 CALL para_env%sum(bins_per_rank)
2375
2376 total_bins = 0
2377 DO i = 1, nranks
2378 total_bins = total_bins + bins_per_rank(i)
2379 END DO
2380
2381 ALLOCATE (shm_cost_vector(2*total_bins))
2382 shm_cost_vector = -1_int_8
2383 shm_displ(1) = 1
2384 DO i = 2, n_threads
2385 shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
2386 END DO
2387 shm_displ(n_threads + 1) = nbins + 1
2388!$OMP END MASTER
2389!$OMP BARRIER
2390 j = 0
2391 SELECT CASE (eval_type)
2392 CASE (hfx_do_eval_energy)
2393 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2394 j = j + 1
2395 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
2396 shm_cost_vector(2*i) = int(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, kind=int_8)
2397 END DO
2398 CASE (hfx_do_eval_forces)
2399 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2400 j = j + 1
2401 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
2402 shm_cost_vector(2*i) = int(x_data%distribution_forces(j)%time_forces*10000.0_dp, kind=int_8)
2403 END DO
2404 END SELECT
2405!$OMP BARRIER
2406!$OMP MASTER
2407 ! ** calculate offsets
2408 ALLOCATE (rdispl(nranks))
2409 bins_per_rank(:) = bins_per_rank(:)*2
2410 rdispl(1) = 0
2411 DO i = 2, nranks
2412 rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
2413 END DO
2414
2415 ALLOCATE (buffer_in(2*nbins))
2416 ALLOCATE (buffer_out(2*total_bins))
2417
2418 DO i = 1, nbins
2419 buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
2420 buffer_in(2*i) = shm_cost_vector(2*i)
2421 END DO
2422
2423 CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
2424
2425 IF (iw > 0) THEN
2426
2427 ALLOCATE (summary(2*nranks))
2428 summary = 0_int_8
2429
2430 WRITE (iw, '( /, 1X, 79("-") )')
2431 WRITE (iw, '( " -", 77X, "-" )')
2432 SELECT CASE (eval_type)
2433 CASE (hfx_do_eval_energy)
2434 WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
2435 CASE (hfx_do_eval_forces)
2436 WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
2437 END SELECT
2438 WRITE (iw, '( " -", 77X, "-" )')
2439 WRITE (iw, '( 1X, 79("-") )')
2440
2441 WRITE (iw, fmt="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
2442 WRITE (iw, '( 1X, 79("-"), / )')
2443 k = 0
2444 DO i = 1, nranks
2445 DO j = 1, bins_per_rank(i)/2
2446 k = k + 1
2447 WRITE (iw, fmt="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
2448 i - 1, j, buffer_out(2*(k - 1) + 1), real(buffer_out(2*k), dp)/10000.0_dp
2449 summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
2450 summary(2*i) = summary(2*i) + buffer_out(2*k)
2451 END DO
2452 END DO
2453
2454 !** Summary
2455 max_bin = 0_int_8
2456 min_bin = huge(min_bin)
2457 sum_bin = 0_int_8
2458 DO i = 1, total_bins
2459 sum_bin = sum_bin + buffer_out(2*i)
2460 max_bin = max(max_bin, buffer_out(2*i))
2461 min_bin = min(min_bin, buffer_out(2*i))
2462 END DO
2463 avg_bin = sum_bin/total_bins
2464
2465 max_rank = 0_int_8
2466 min_rank = huge(min_rank)
2467 sum_rank = 0_int_8
2468 DO i = 1, nranks
2469 sum_rank = sum_rank + summary(2*i)
2470 max_rank = max(max_rank, summary(2*i))
2471 min_rank = min(min_rank, summary(2*i))
2472 END DO
2473 avg_rank = sum_rank/nranks
2474
2475 WRITE (iw, fmt='(/,T3,A,/)') "SUMMARY:"
2476 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max bin", real(max_bin, dp)/10000.0_dp
2477 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min bin", real(min_bin, dp)/10000.0_dp
2478 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum bin", real(sum_bin, dp)/10000.0_dp
2479 WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg bin", real(avg_bin, dp)/10000.0_dp
2480 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max rank", real(max_rank, dp)/10000.0_dp
2481 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min rank", real(min_rank, dp)/10000.0_dp
2482 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum rank", real(sum_rank, dp)/10000.0_dp
2483 WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg rank", real(avg_rank, dp)/10000.0_dp
2484
2485 ALLOCATE (buffer(nranks))
2486 ALLOCATE (sort_idx(nranks))
2487
2488 DO i = 1, nranks
2489 buffer(i) = summary(2*i)
2490 END DO
2491
2492 CALL sort(buffer, nranks, sort_idx)
2493
2494 WRITE (iw, fmt="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
2495 DO i = nranks, 1, -1
2496 WRITE (iw, fmt="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), real(buffer(i), dp)/10000.0_dp
2497 END DO
2498
2499 DEALLOCATE (summary, buffer, sort_idx)
2500
2501 END IF
2502
2503 DEALLOCATE (buffer_in, buffer_out, rdispl)
2504
2505 CALL para_env%sync()
2506
2507 DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
2508!$OMP END MASTER
2509!$OMP BARRIER
2510
2511 END SUBROUTINE collect_load_balance_info
2512
2513
2514! **************************************************************************************************
2515!> \brief This routine calculates the maximum density matrix element, when
2516!> screening on an initial density matrix is applied. Due to symmetry of
2517!> the ERI's, there are always 4 matrix elements to be considered.
2518!> CASE 0-15 belong to an energy calculation (linear screening)
2519!> CASE 16-31 belong to a force calculation (square screening)
2520!> \param ptr_p_1 Pointers to atomic density matrices
2521!> \param ptr_p_2 Pointers to atomic density matrices
2522!> \param ptr_p_3 Pointers to atomic density matrices
2523!> \param ptr_p_4 Pointers to atomic density matrices
2524!> \param iset Current set
2525!> \param jset Current set
2526!> \param kset Current set
2527!> \param lset Current set
2528!> \param pmax_val value to be calculated
2529!> \param swap_id Defines how the matrices are accessed
2530!> \par History
2531!> 06.2009 created [Manuel Guidon]
2532!> \author Manuel Guidon
2533! **************************************************************************************************
2534PURE SUBROUTINE get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, iset, jset, kset, lset, pmax_val, swap_id)
2535
2536 REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
2537 INTEGER, INTENT(IN) :: iset, jset, kset, lset
2538
2539 REAL(dp), INTENT(OUT) :: pmax_val
2540 INTEGER, INTENT(IN) :: swap_id
2541
2542 REAL(dp) :: pmax_1, pmax_2, pmax_3, pmax_4
2543
2544 SELECT CASE (swap_id)
2545 CASE (0)
2546 pmax_1 = ptr_p_1(kset, iset)
2547 pmax_2 = ptr_p_2(lset, jset)
2548 pmax_3 = ptr_p_3(lset, iset)
2549 pmax_4 = ptr_p_4(kset, jset)
2550 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2551 CASE (1)
2552 pmax_1 = ptr_p_1(iset, kset)
2553 pmax_2 = ptr_p_2(lset, jset)
2554 pmax_3 = ptr_p_3(lset, iset)
2555 pmax_4 = ptr_p_4(kset, jset)
2556 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2557 CASE (2)
2558 pmax_1 = ptr_p_1(kset, iset)
2559 pmax_2 = ptr_p_2(jset, lset)
2560 pmax_3 = ptr_p_3(lset, iset)
2561 pmax_4 = ptr_p_4(kset, jset)
2562 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2563 CASE (3)
2564 pmax_1 = ptr_p_1(iset, kset)
2565 pmax_2 = ptr_p_2(jset, lset)
2566 pmax_3 = ptr_p_3(lset, iset)
2567 pmax_4 = ptr_p_4(kset, jset)
2568 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2569 CASE (4)
2570 pmax_1 = ptr_p_1(kset, iset)
2571 pmax_2 = ptr_p_2(lset, jset)
2572 pmax_3 = ptr_p_3(iset, lset)
2573 pmax_4 = ptr_p_4(kset, jset)
2574 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2575 CASE (5)
2576 pmax_1 = ptr_p_1(iset, kset)
2577 pmax_2 = ptr_p_2(lset, jset)
2578 pmax_3 = ptr_p_3(iset, lset)
2579 pmax_4 = ptr_p_4(kset, jset)
2580 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2581 CASE (6)
2582 pmax_1 = ptr_p_1(kset, iset)
2583 pmax_2 = ptr_p_2(jset, lset)
2584 pmax_3 = ptr_p_3(iset, lset)
2585 pmax_4 = ptr_p_4(kset, jset)
2586 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2587 CASE (7)
2588 pmax_1 = ptr_p_1(iset, kset)
2589 pmax_2 = ptr_p_2(jset, lset)
2590 pmax_3 = ptr_p_3(iset, lset)
2591 pmax_4 = ptr_p_4(kset, jset)
2592 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2593 CASE (8)
2594 pmax_1 = ptr_p_1(kset, iset)
2595 pmax_2 = ptr_p_2(lset, jset)
2596 pmax_3 = ptr_p_3(lset, iset)
2597 pmax_4 = ptr_p_4(jset, kset)
2598 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2599 CASE (9)
2600 pmax_1 = ptr_p_1(iset, kset)
2601 pmax_2 = ptr_p_2(lset, jset)
2602 pmax_3 = ptr_p_3(lset, iset)
2603 pmax_4 = ptr_p_4(jset, kset)
2604 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2605 CASE (10)
2606 pmax_1 = ptr_p_1(kset, iset)
2607 pmax_2 = ptr_p_2(jset, lset)
2608 pmax_3 = ptr_p_3(lset, iset)
2609 pmax_4 = ptr_p_4(jset, kset)
2610 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2611 CASE (11)
2612 pmax_1 = ptr_p_1(iset, kset)
2613 pmax_2 = ptr_p_2(jset, lset)
2614 pmax_3 = ptr_p_3(lset, iset)
2615 pmax_4 = ptr_p_4(jset, kset)
2616 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2617 CASE (12)
2618 pmax_1 = ptr_p_1(kset, iset)
2619 pmax_2 = ptr_p_2(lset, jset)
2620 pmax_3 = ptr_p_3(iset, lset)
2621 pmax_4 = ptr_p_4(jset, kset)
2622 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2623 CASE (13)
2624 pmax_1 = ptr_p_1(iset, kset)
2625 pmax_2 = ptr_p_2(lset, jset)
2626 pmax_3 = ptr_p_3(iset, lset)
2627 pmax_4 = ptr_p_4(jset, kset)
2628 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2629 CASE (14)
2630 pmax_1 = ptr_p_1(kset, iset)
2631 pmax_2 = ptr_p_2(jset, lset)
2632 pmax_3 = ptr_p_3(iset, lset)
2633 pmax_4 = ptr_p_4(jset, kset)
2634 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2635 CASE (15)
2636 pmax_1 = ptr_p_1(iset, kset)
2637 pmax_2 = ptr_p_2(jset, lset)
2638 pmax_3 = ptr_p_3(iset, lset)
2639 pmax_4 = ptr_p_4(jset, kset)
2640 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2641 CASE (16)
2642 pmax_1 = ptr_p_1(kset, iset)
2643 pmax_2 = ptr_p_2(lset, jset)
2644 pmax_3 = ptr_p_3(lset, iset)
2645 pmax_4 = ptr_p_4(kset, jset)
2646 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2647 CASE (17)
2648 pmax_1 = ptr_p_1(iset, kset)
2649 pmax_2 = ptr_p_2(lset, jset)
2650 pmax_3 = ptr_p_3(lset, iset)
2651 pmax_4 = ptr_p_4(kset, jset)
2652 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2653 CASE (18)
2654 pmax_1 = ptr_p_1(kset, iset)
2655 pmax_2 = ptr_p_2(jset, lset)
2656 pmax_3 = ptr_p_3(lset, iset)
2657 pmax_4 = ptr_p_4(kset, jset)
2658 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2659 CASE (19)
2660 pmax_1 = ptr_p_1(iset, kset)
2661 pmax_2 = ptr_p_2(jset, lset)
2662 pmax_3 = ptr_p_3(lset, iset)
2663 pmax_4 = ptr_p_4(kset, jset)
2664 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2665 CASE (20)
2666 pmax_1 = ptr_p_1(kset, iset)
2667 pmax_2 = ptr_p_2(lset, jset)
2668 pmax_3 = ptr_p_3(iset, lset)
2669 pmax_4 = ptr_p_4(kset, jset)
2670 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2671 CASE (21)
2672 pmax_1 = ptr_p_1(iset, kset)
2673 pmax_2 = ptr_p_2(lset, jset)
2674 pmax_3 = ptr_p_3(iset, lset)
2675 pmax_4 = ptr_p_4(kset, jset)
2676 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2677 CASE (22)
2678 pmax_1 = ptr_p_1(kset, iset)
2679 pmax_2 = ptr_p_2(jset, lset)
2680 pmax_3 = ptr_p_3(iset, lset)
2681 pmax_4 = ptr_p_4(kset, jset)
2682 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2683 CASE (23)
2684 pmax_1 = ptr_p_1(iset, kset)
2685 pmax_2 = ptr_p_2(jset, lset)
2686 pmax_3 = ptr_p_3(iset, lset)
2687 pmax_4 = ptr_p_4(kset, jset)
2688 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2689 CASE (24)
2690 pmax_1 = ptr_p_1(kset, iset)
2691 pmax_2 = ptr_p_2(lset, jset)
2692 pmax_3 = ptr_p_3(lset, iset)
2693 pmax_4 = ptr_p_4(jset, kset)
2694 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2695 CASE (25)
2696 pmax_1 = ptr_p_1(iset, kset)
2697 pmax_2 = ptr_p_2(lset, jset)
2698 pmax_3 = ptr_p_3(lset, iset)
2699 pmax_4 = ptr_p_4(jset, kset)
2700 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2701 CASE (26)
2702 pmax_1 = ptr_p_1(kset, iset)
2703 pmax_2 = ptr_p_2(jset, lset)
2704 pmax_3 = ptr_p_3(lset, iset)
2705 pmax_4 = ptr_p_4(jset, kset)
2706 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2707 CASE (27)
2708 pmax_1 = ptr_p_1(iset, kset)
2709 pmax_2 = ptr_p_2(jset, lset)
2710 pmax_3 = ptr_p_3(lset, iset)
2711 pmax_4 = ptr_p_4(jset, kset)
2712 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2713 CASE (28)
2714 pmax_1 = ptr_p_1(kset, iset)
2715 pmax_2 = ptr_p_2(lset, jset)
2716 pmax_3 = ptr_p_3(iset, lset)
2717 pmax_4 = ptr_p_4(jset, kset)
2718 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2719 CASE (29)
2720 pmax_1 = ptr_p_1(iset, kset)
2721 pmax_2 = ptr_p_2(lset, jset)
2722 pmax_3 = ptr_p_3(iset, lset)
2723 pmax_4 = ptr_p_4(jset, kset)
2724 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2725 CASE (30)
2726 pmax_1 = ptr_p_1(kset, iset)
2727 pmax_2 = ptr_p_2(jset, lset)
2728 pmax_3 = ptr_p_3(iset, lset)
2729 pmax_4 = ptr_p_4(jset, kset)
2730 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2731 CASE (31)
2732 pmax_1 = ptr_p_1(iset, kset)
2733 pmax_2 = ptr_p_2(jset, lset)
2734 pmax_3 = ptr_p_3(iset, lset)
2735 pmax_4 = ptr_p_4(jset, kset)
2736 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2737 END SELECT
2738
2739END SUBROUTINE get_pmax_val
2740
2741
2742END MODULE hfx_load_balance_methods
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
Handles all functions related to the CELL.
Definition cell_types.F:15
Utility routines to open and close files. Tracking of preconnections.
Definition cp_files.F:16
subroutine, public open_file(file_name, file_status, file_form, file_action, file_position, file_pad, unit_number, debug, skip_get_unit_number, file_access)
Opens the requested file using a free unit number.
Definition cp_files.F:308
subroutine, public close_file(unit_number, file_status, keep_preconnection)
Close an open file given by its logical unit number. Optionally, keep the file and unit preconnected.
Definition cp_files.F:119
Routines for optimizing load balance between processes in HFX calculations.
real(kind=dp), dimension(12), parameter, public p1_energy
real(kind=dp), dimension(2), parameter, public p3_energy
real(kind=dp), dimension(12), parameter, public p2_energy
subroutine, public collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, eval_type)
...
subroutine, public hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, coeffs_set, coeffs_kind, is_assoc_atomic_block_global, do_periodic, load_balance_parameter, kind_of, basis_parameter, pmax_set, pmax_atom, i_thread, n_threads, cell, do_p_screening, map_atom_to_kind_atom, nkind, eval_type, pmax_block, use_virial)
Distributes the computation of eri's to all available processes.
integer(kind=int_8) function, public cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3)
estimates the cost of a set quartet with info available at load balance time i.e. without much info o...
subroutine, public hfx_update_load_balance(x_data, para_env, load_balance_parameter, i_thread, n_threads, eval_type)
Cheap way of redistributing the eri's.
Routines for optimizing load balance between processes in HFX calculations.
subroutine, public build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, blocks)
...
subroutine, public build_pair_list(natom, list, set_list, i_start, i_end, j_start, j_end, kind_of, basis_parameter, particle_set, do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
...
Types and set/get functions for HFX.
Definition hfx_types.F:15
subroutine, public hfx_set_distr_energy(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the energy
Definition hfx_types.F:2567
subroutine, public hfx_set_distr_forces(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the forces
Definition hfx_types.F:2587
collects all constants needed in input so that they can be used without circular dependencies
integer, parameter, public hfx_do_eval_energy
integer, parameter, public hfx_do_eval_forces
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
Interface to the message passing library MPI.
Parallel (pseudo)random number generator (RNG) for multiple streams and substreams of random numbers.
integer, parameter, public uniform
Define the data structure for the particle information.
All kind of helpful little routines.
Definition util.F:14
Type defining parameters related to the simulation cell.
Definition cell_types.F:55
stores some data used in construction of Kohn-Sham matrix
Definition hfx_types.F:509
stores all the informations relevant to an mpi environment