(git:1f9fd2c)
Loading...
Searching...
No Matches
hfx_load_balance_methods.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2026 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Routines for optimizing load balance between processes in HFX calculations
10!> \par History
11!> 04.2008 created [Manuel Guidon]
12!> \author Manuel Guidon
13! **************************************************************************************************
15 USE cell_types, ONLY: cell_type
16 USE cp_files, ONLY: close_file, &
21 USE hfx_types, ONLY: &
27 USE kinds, ONLY: dp, &
28 int_8
30 USE parallel_rng_types, ONLY: uniform, &
33 USE util, ONLY: sort
34#include "./base/base_uses.f90"
35
36 IMPLICIT NONE
37 PRIVATE
38
39 PUBLIC :: hfx_load_balance, &
42
43 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
44
45 REAL(kind=dp), PARAMETER :: p1_energy(12) = [2.9461408209700424_dp, 1.0624718662999657_dp, &
46 -1.91570128356921242e-002_dp, -1.6668495454436603_dp, &
47 1.7512639006523709_dp, -9.76074323945336081e-002_dp, &
48 2.6230786127311889_dp, -0.31870737623014189_dp, &
49 7.9588203912690973_dp, 1.8331423413134813_dp, &
50 -0.15427618665346299_dp, 0.19749436090711650_dp]
51 REAL(kind=dp), PARAMETER :: p2_energy(12) = [2.3104682960662593_dp, 1.8744052737304417_dp, &
52 -9.36564055598656797e-002_dp, 0.64284973765086939_dp, &
53 1.0137565430060556_dp, -6.80088178288954567e-003_dp, &
54 1.1692629207374552_dp, -2.6314710080507573_dp, &
55 19.237814781880786_dp, 1.0505934173661349_dp, &
56 0.80382371955699250_dp, 0.49903401991818103_dp]
57 REAL(kind=dp), PARAMETER :: p3_energy(2) = [7.82336287670072350e-002_dp, 0.38073304105744837_dp]
58 REAL(kind=dp), PARAMETER :: p1_forces(12) = [2.5746279948798874_dp, 1.3420575378609276_dp, &
59 -9.41673106447732111e-002_dp, 0.94568006899317825_dp, &
60 -1.4511897117448544_dp, 0.59178934677316952_dp, &
61 2.7291149361757236_dp, -0.50555512044800210_dp, &
62 8.3508180969609871_dp, 1.6829982496141809_dp, &
63 -0.74895370472152600_dp, 0.43801726744197500_dp]
64 REAL(kind=dp), PARAMETER :: p2_forces(12) = [2.6398568961569020_dp, 2.3024918834564101_dp, &
65 5.33216585432061581e-003_dp, 0.45572145697283628_dp, &
66 1.8119743851500618_dp, -0.12533918548421166_dp, &
67 -1.4040312084552751_dp, -4.5331650463917859_dp, &
68 12.593431549069477_dp, 1.1311978374487595_dp, &
69 1.4245996087624646_dp, 1.1425350529853495_dp]
70 REAL(kind=dp), PARAMETER :: p3_forces(2) = [0.12051930516830946_dp, 1.3828051586144336_dp]
71
72!***
73
74CONTAINS
75
76! **************************************************************************************************
77!> \brief Distributes the computation of eri's to all available processes.
78!> \param x_data Object that stores the indices array
79!> \param eps_schwarz screening parameter
80!> \param particle_set , atomic_kind_set, para_env ...
81!> \param max_set Maximum number of set to be considered
82!> \param para_env para_env
83!> \param coeffs_set screening functions
84!> \param coeffs_kind screening functions
85!> \param is_assoc_atomic_block_global KS-matrix sparsity
86!> \param do_periodic flag for periodicity
87!> \param load_balance_parameter Parameters for Monte-Carlo routines
88!> \param kind_of helper array for mapping
89!> \param basis_parameter Basis set parameters
90!> \param pmax_set Initial screening matrix
91!> \param pmax_atom ...
92!> \param i_thread Process ID of current Thread
93!> \param n_threads Total Number of Threads
94!> \param cell cell
95!> \param do_p_screening Flag for initial p screening
96!> \param map_atom_to_kind_atom ...
97!> \param nkind ...
98!> \param eval_type ...
99!> \param pmax_block ...
100!> \param use_virial ...
101!> \par History
102!> 06.2007 created [Manuel Guidon]
103!> 08.2007 new parallel scheme [Manuel Guidon]
104!> 09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
105!> 11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
106!> 02.2009 completely refactored [Manuel Guidon]
107!> \author Manuel Guidon
108!> \note
109!> The optimization is done via a binning procedure followed by simple
110!> Monte Carlo procedure:
111!> In a first step the total amount of integrals in the system is calculated,
112!> taking into account the sparsity of the KS-matrix , the screening based
113!> on near/farfield approximations and if desired the screening on an initial
114!> density matrix.
115!> In a second step, bins are generate that contain approximately the same number
116!> of integrals, and a cost for these bins is estimated (currently the number of integrals)
117!> In a third step, a Monte Carlo procedure optimizes the assignment
118!> of the different loads to each process
119!> At the end each process owns an unique array of *atomic* indices-ranges
120!> that are used to decide whether a process has to calculate a certain
121!> bunch of integrals or not
122! **************************************************************************************************
123 SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
124 coeffs_set, coeffs_kind, &
125 is_assoc_atomic_block_global, do_periodic, &
126 load_balance_parameter, kind_of, basis_parameter, pmax_set, &
127 pmax_atom, i_thread, n_threads, cell, &
128 do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
129 pmax_block, use_virial)
130 TYPE(hfx_type), POINTER :: x_data
131 REAL(dp), INTENT(IN) :: eps_schwarz
132 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
133 INTEGER, INTENT(IN) :: max_set
134 TYPE(mp_para_env_type), POINTER :: para_env
135 TYPE(hfx_screen_coeff_type), &
136 DIMENSION(:, :, :, :), POINTER :: coeffs_set
137 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
138 POINTER :: coeffs_kind
139 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
140 LOGICAL :: do_periodic
141 TYPE(hfx_load_balance_type), POINTER :: load_balance_parameter
142 INTEGER :: kind_of(*)
143 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
144 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
145 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
146 INTEGER, INTENT(IN) :: i_thread, n_threads
147 TYPE(cell_type), POINTER :: cell
148 LOGICAL, INTENT(IN) :: do_p_screening
149 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
150 INTEGER, INTENT(IN) :: nkind, eval_type
151 REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
152 LOGICAL, INTENT(IN) :: use_virial
153
154 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_load_balance'
155
156 CHARACTER(LEN=512) :: error_msg
157 INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
158 handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
159 jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
160 latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
161 new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
162 objective_block_size, objective_nblocks, source, total_blocks
163 TYPE(mp_request_type), DIMENSION(2) :: req
164 INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
165 distribution_counter_end, distribution_counter_start, global_quartet_counter, &
166 local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
167 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer_in, buffer_out
168 INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
169 sendbuffer, swapbuffer
170 INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
171 INTEGER(int_8), SAVE :: shm_global_quartet_counter, &
172 shm_local_quartet_counter
173 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, tmp_index, tmp_pos, &
174 to_be_sorted
175 INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
176 INTEGER, SAVE :: shm_nblocks
177 LOGICAL :: changed, last_bin_needs_to_be_filled, &
178 optimized
179 LOGICAL, DIMENSION(:, :), POINTER, SAVE :: atomic_pair_list
180 REAL(dp) :: coeffs_kind_max0, log10_eps_schwarz, &
181 log_2, pmax_blocks
182 TYPE(hfx_block_range_type), DIMENSION(:), POINTER :: blocks_guess, tmp_blocks, tmp_blocks2
183 TYPE(hfx_block_range_type), DIMENSION(:), &
184 POINTER, SAVE :: shm_blocks
185 TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
186 TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
187 SAVE :: full_dist
188 TYPE(pair_list_type) :: list_ij, list_kl
189 TYPE(pair_set_list_type), ALLOCATABLE, &
190 DIMENSION(:) :: set_list_ij, set_list_kl
191
192!$OMP BARRIER
193!$OMP MASTER
194 CALL timeset(routinen, handle)
195!$OMP END MASTER
196!$OMP BARRIER
197
198 log10_eps_schwarz = log10(eps_schwarz)
199 log_2 = log10(2.0_dp)
200 coeffs_kind_max0 = maxval(coeffs_kind(:, :)%x(2))
201 ncpu = para_env%num_pe
202 n_processes = ncpu*n_threads
203 natom = SIZE(particle_set)
204
205 block_size = load_balance_parameter%block_size
206 ALLOCATE (set_list_ij((max_set*block_size)**2))
207 ALLOCATE (set_list_kl((max_set*block_size)**2))
208
209 IF (.NOT. load_balance_parameter%blocks_initialized) THEN
210!$OMP BARRIER
211!$OMP MASTER
212 CALL timeset(routinen//"_range", handle_range)
213
214 nblocks = max((natom + block_size - 1)/block_size, 1)
215 ALLOCATE (blocks_guess(nblocks))
216 ALLOCATE (tmp_blocks(natom))
217 ALLOCATE (tmp_blocks2(natom))
218
219 pmax_blocks = 0.0_dp
220 SELECT CASE (eval_type)
221 CASE (hfx_do_eval_energy)
222 atomic_pair_list => x_data%atomic_pair_list
223 CASE (hfx_do_eval_forces)
224 atomic_pair_list => x_data%atomic_pair_list_forces
225 END SELECT
226 atomic_pair_list = .true.
227 CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
228 list_ij, list_kl, set_list_ij, set_list_kl, &
229 particle_set, &
230 coeffs_set, coeffs_kind, &
231 is_assoc_atomic_block_global, do_periodic, &
232 kind_of, basis_parameter, pmax_set, pmax_atom, &
233 pmax_blocks, cell, &
234 do_p_screening, map_atom_to_kind_atom, eval_type, &
235 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
236
237 total_block_self_cost = 0
238
239 DO i = 1, nblocks
240 total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
241 END DO
242
243 CALL para_env%sum(total_block_self_cost)
244
245 objective_block_size = load_balance_parameter%block_size
246 objective_nblocks = max((natom + objective_block_size - 1)/objective_block_size, 1)
247
248 self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
249
250 DO i = 1, nblocks
251 tmp_blocks2(i) = blocks_guess(i)
252 END DO
253
254 optimized = .false.
255 i = 0
256 DO WHILE (.NOT. optimized)
257 i = i + 1
258 current_block_id = 0
259 changed = .false.
260 DO atom_block = 1, nblocks
261 current_block_id = current_block_id + 1
262 iatom_start = tmp_blocks2(atom_block)%istart
263 iatom_end = tmp_blocks2(atom_block)%iend
264 IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
265 changed = .true.
266 new_iatom_start = iatom_start
267 new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
268 new_jatom_start = new_iatom_end + 1
269 new_jatom_end = iatom_end
270 tmp_blocks(current_block_id)%istart = new_iatom_start
271 tmp_blocks(current_block_id)%iend = new_iatom_end
272 tmp_blocks(current_block_id)%cost = estimate_block_cost( &
273 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
274 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
275 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
276 particle_set, &
277 coeffs_set, coeffs_kind, &
278 is_assoc_atomic_block_global, do_periodic, &
279 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
280 cell, &
281 do_p_screening, map_atom_to_kind_atom, eval_type, &
282 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
283 current_block_id = current_block_id + 1
284 tmp_blocks(current_block_id)%istart = new_jatom_start
285 tmp_blocks(current_block_id)%iend = new_jatom_end
286 tmp_blocks(current_block_id)%cost = estimate_block_cost( &
287 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
288 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
289 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
290 particle_set, &
291 coeffs_set, coeffs_kind, &
292 is_assoc_atomic_block_global, do_periodic, &
293 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
294 cell, &
295 do_p_screening, map_atom_to_kind_atom, eval_type, &
296 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
297 ELSE
298 tmp_blocks(current_block_id)%istart = iatom_start
299 tmp_blocks(current_block_id)%iend = iatom_end
300 tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
301 END IF
302 END DO
303 IF (.NOT. changed) optimized = .true.
304 IF (i > 20) optimized = .true.
305 nblocks = current_block_id
306 DO atom_block = 1, nblocks
307 tmp_blocks2(atom_block) = tmp_blocks(atom_block)
308 END DO
309 END DO
310
311 DEALLOCATE (tmp_blocks2)
312
313 ! ** count number of non empty blocks on each node
314 non_empty_blocks = 0
315 DO atom_block = 1, nblocks
316 IF (tmp_blocks(atom_block)%istart == 0) cycle
317 non_empty_blocks = non_empty_blocks + 1
318 END DO
319
320 ALLOCATE (rcount(ncpu))
321 rcount = 0
322 rcount(para_env%mepos + 1) = non_empty_blocks
323 CALL para_env%sum(rcount)
324
325 ! ** sum all non_empty_blocks
326 total_blocks = 0
327 DO i = 1, ncpu
328 total_blocks = total_blocks + rcount(i)
329 END DO
330
331 ! ** calculate offsets
332 ALLOCATE (rdispl(ncpu))
333 rcount(:) = rcount(:)*3
334 rdispl(1) = 0
335 DO i = 2, ncpu
336 rdispl(i) = rdispl(i - 1) + rcount(i - 1)
337 END DO
338
339 ALLOCATE (buffer_in(3*non_empty_blocks))
340
341 non_empty_blocks = 0
342 DO atom_block = 1, nblocks
343 IF (tmp_blocks(atom_block)%istart == 0) cycle
344 buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
345 buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
346 buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
347 non_empty_blocks = non_empty_blocks + 1
348 END DO
349
350 nblocks = total_blocks
351
352 ALLOCATE (tmp_blocks2(nblocks))
353
354 ALLOCATE (buffer_out(3*nblocks))
355
356 ! ** Gather all three arrays
357 CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
358
359 DO i = 1, nblocks
360 tmp_blocks2(i)%istart = int(buffer_out((i - 1)*3 + 1))
361 tmp_blocks2(i)%iend = int(buffer_out((i - 1)*3 + 2))
362 tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
363 END DO
364
365 ! ** Now we sort the blocks
366 ALLOCATE (to_be_sorted(nblocks))
367 ALLOCATE (tmp_index(nblocks))
368
369 DO atom_block = 1, nblocks
370 to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
371 END DO
372
373 CALL sort(to_be_sorted, nblocks, tmp_index)
374
375 ALLOCATE (x_data%blocks(nblocks))
376
377 DO atom_block = 1, nblocks
378 x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
379 END DO
380
381 shm_blocks => x_data%blocks
382 shm_nblocks = nblocks
383
384 ! ** Set nblocks in structure
385 load_balance_parameter%nblocks = nblocks
386
387 DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
388
389 DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
390
391 load_balance_parameter%blocks_initialized = .true.
392
393 x_data%blocks = shm_blocks
394 load_balance_parameter%nblocks = shm_nblocks
395 load_balance_parameter%blocks_initialized = .true.
396
397 ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
398 x_data%pmax_block = 0.0_dp
399 pmax_block => x_data%pmax_block
400 CALL timestop(handle_range)
401!$OMP END MASTER
402!$OMP BARRIER
403
404 IF (.NOT. load_balance_parameter%blocks_initialized) THEN
405 ALLOCATE (x_data%blocks(shm_nblocks))
406 x_data%blocks = shm_blocks
407 load_balance_parameter%nblocks = shm_nblocks
408 load_balance_parameter%blocks_initialized = .true.
409 END IF
410 !! ** precalculate maximum density matrix elements in blocks
411!$OMP BARRIER
412 END IF
413
414!$OMP BARRIER
415!$OMP MASTER
416 pmax_block => x_data%pmax_block
417 pmax_block = 0.0_dp
418 IF (do_p_screening) THEN
419 DO iatom_block = 1, shm_nblocks
420 iatom_start = x_data%blocks(iatom_block)%istart
421 iatom_end = x_data%blocks(iatom_block)%iend
422 DO jatom_block = 1, shm_nblocks
423 jatom_start = x_data%blocks(jatom_block)%istart
424 jatom_end = x_data%blocks(jatom_block)%iend
425 pmax_block(iatom_block, jatom_block) = maxval(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
426 END DO
427 END DO
428 END IF
429
430 SELECT CASE (eval_type)
431 CASE (hfx_do_eval_energy)
432 atomic_pair_list => x_data%atomic_pair_list
433 CASE (hfx_do_eval_forces)
434 atomic_pair_list => x_data%atomic_pair_list_forces
435 END SELECT
436 CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
437 do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
438 x_data%blocks)
439
440!$OMP END MASTER
441!$OMP BARRIER
442
443 !! If there is only 1 cpu skip the binning
444 IF (n_processes == 1) THEN
445 ALLOCATE (tmp_dist(1))
446 tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
447 tmp_dist(1)%istart = 0_int_8
448 ptr_to_tmp_dist => tmp_dist(:)
449 SELECT CASE (eval_type)
450 CASE (hfx_do_eval_energy)
451 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
452 CASE (hfx_do_eval_forces)
453 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
454 END SELECT
455 DEALLOCATE (tmp_dist)
456 ELSE
457 !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
458!$OMP BARRIER
459!$OMP MASTER
460 CALL timeset(routinen//"_count", handle_inner)
461!$OMP END MASTER
462!$OMP BARRIER
463
464 cost_per_core = 0_int_8
465 my_process_id = para_env%mepos*n_threads + i_thread
466 nblocks = load_balance_parameter%nblocks
467
468 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
469
470 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
471 tmp_block = atom_block/nblocks
472 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
473 IF (latom_block < katom_block) cycle
474 tmp_block = tmp_block/nblocks
475 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
476 tmp_block = tmp_block/nblocks
477 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
478 IF (jatom_block < iatom_block) cycle
479
480 iatom_start = x_data%blocks(iatom_block)%istart
481 iatom_end = x_data%blocks(iatom_block)%iend
482 jatom_start = x_data%blocks(jatom_block)%istart
483 jatom_end = x_data%blocks(jatom_block)%iend
484 katom_start = x_data%blocks(katom_block)%istart
485 katom_end = x_data%blocks(katom_block)%iend
486 latom_start = x_data%blocks(latom_block)%istart
487 latom_end = x_data%blocks(latom_block)%iend
488
489 SELECT CASE (eval_type)
490 CASE (hfx_do_eval_energy)
491 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
492 pmax_block(latom_block, jatom_block), &
493 pmax_block(latom_block, iatom_block), &
494 pmax_block(katom_block, jatom_block))
495 CASE (hfx_do_eval_forces)
496 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
497 pmax_block(latom_block, jatom_block), &
498 pmax_block(latom_block, iatom_block) + &
499 pmax_block(katom_block, jatom_block))
500 END SELECT
501
502 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
503
504 cost_per_core = cost_per_core &
505 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
506 iatom_start, iatom_end, jatom_start, jatom_end, &
507 katom_start, katom_end, latom_start, latom_end, &
508 particle_set, &
509 coeffs_set, coeffs_kind, &
510 is_assoc_atomic_block_global, do_periodic, &
511 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
512 cell, &
513 do_p_screening, map_atom_to_kind_atom, eval_type, &
514 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
515
516 END DO ! atom_block
517
518 nbins = load_balance_parameter%nbins
519 cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
520
521!$OMP BARRIER
522!$OMP MASTER
523 CALL timestop(handle_inner)
524!$OMP END MASTER
525!$OMP BARRIER
526
527! new load balancing test
528 IF (.false.) THEN
529 CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
530 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
531 particle_set, &
532 coeffs_set, coeffs_kind, &
533 is_assoc_atomic_block_global, do_periodic, &
534 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
535 cell, x_data, para_env, pmax_block, &
536 do_p_screening, map_atom_to_kind_atom, eval_type, &
537 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
538 END IF
539
540!$OMP BARRIER
541!$OMP MASTER
542 CALL timeset(routinen//"_bin", handle_inner)
543!$OMP END MASTER
544!$OMP BARRIER
545
546 ALLOCATE (binned_dist(nbins))
547 binned_dist(:)%istart = -1_int_8
548 binned_dist(:)%number_of_atom_quartets = 0_int_8
549 binned_dist(:)%cost = 0_int_8
550 binned_dist(:)%time_first_scf = 0.0_dp
551 binned_dist(:)%time_other_scf = 0.0_dp
552 binned_dist(:)%time_forces = 0.0_dp
553
554 current_cost = 0
555 mepos = 1
556 distribution_counter_start = 1
557 distribution_counter_end = 0
558 ibin = 1
559
560 global_quartet_counter = 0
561 local_quartet_counter = 0
562 last_bin_needs_to_be_filled = .false.
563 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
564 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
565 tmp_block = atom_block/nblocks
566 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
567 IF (latom_block < katom_block) cycle
568 tmp_block = tmp_block/nblocks
569 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
570 tmp_block = tmp_block/nblocks
571 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
572 IF (jatom_block < iatom_block) cycle
573
574 distribution_counter_end = distribution_counter_end + 1
575 global_quartet_counter = global_quartet_counter + 1
576 last_bin_needs_to_be_filled = .true.
577
578 IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
579
580 iatom_start = x_data%blocks(iatom_block)%istart
581 iatom_end = x_data%blocks(iatom_block)%iend
582 jatom_start = x_data%blocks(jatom_block)%istart
583 jatom_end = x_data%blocks(jatom_block)%iend
584 katom_start = x_data%blocks(katom_block)%istart
585 katom_end = x_data%blocks(katom_block)%iend
586 latom_start = x_data%blocks(latom_block)%istart
587 latom_end = x_data%blocks(latom_block)%iend
588
589 SELECT CASE (eval_type)
590 CASE (hfx_do_eval_energy)
591 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
592 pmax_block(latom_block, jatom_block), &
593 pmax_block(latom_block, iatom_block), &
594 pmax_block(katom_block, jatom_block))
595 CASE (hfx_do_eval_forces)
596 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
597 pmax_block(latom_block, jatom_block), &
598 pmax_block(latom_block, iatom_block) + &
599 pmax_block(katom_block, jatom_block))
600 END SELECT
601
602 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
603
604 current_cost = current_cost &
605 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
606 iatom_start, iatom_end, jatom_start, jatom_end, &
607 katom_start, katom_end, latom_start, latom_end, &
608 particle_set, &
609 coeffs_set, coeffs_kind, &
610 is_assoc_atomic_block_global, do_periodic, &
611 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
612 cell, &
613 do_p_screening, map_atom_to_kind_atom, eval_type, &
614 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
615
616 IF (current_cost >= cost_per_bin) THEN
617 IF (ibin == nbins) THEN
618 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
619 distribution_counter_end - distribution_counter_start + 1
620 ELSE
621 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
622 END IF
623 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
624 ibin = min(ibin + 1, nbins)
625 distribution_counter_start = distribution_counter_end + 1
626 current_cost = 0
627 last_bin_needs_to_be_filled = .false.
628 END IF
629 END DO
630
631!$OMP BARRIER
632!$OMP MASTER
633 CALL timestop(handle_inner)
634 CALL timeset(routinen//"_dist", handle_inner)
635!$OMP END MASTER
636!$OMP BARRIER
637 !! Fill the last bin if necessary
638 IF (last_bin_needs_to_be_filled) THEN
639 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
640 IF (ibin == nbins) THEN
641 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
642 distribution_counter_end - distribution_counter_start + 1
643 ELSE
644 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
645 END IF
646 END IF
647
648 !! Sanity-Check
649 DO ibin = 1, nbins
650 local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
651 END DO
652!$OMP BARRIER
653!$OMP MASTER
654 shm_local_quartet_counter = 0
655 shm_global_quartet_counter = 0
656!$OMP END MASTER
657!$OMP BARRIER
658!$OMP ATOMIC
659 shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
660!$OMP ATOMIC
661 shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
662
663!$OMP BARRIER
664!$OMP MASTER
665 CALL para_env%sum(shm_local_quartet_counter)
666 CALL para_env%sum(shm_global_quartet_counter)
667 IF (para_env%is_source()) THEN
668 IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
669 WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
670 "Number of local quartets (", shm_local_quartet_counter, &
671 ") and number of global quartets (", shm_global_quartet_counter, &
672 ") are different. Please send in a bug report."
673 cpabort(error_msg)
674 END IF
675 END IF
676!$OMP END MASTER
677
678!$OMP BARRIER
679!$OMP MASTER
680 ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
681 cost_matrix = 0
682!$OMP END MASTER
683!$OMP BARRIER
684 icpu = para_env%mepos + 1
685 DO i = 1, nbins
686 cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
687 END DO
688 mepos = para_env%mepos
689!$OMP BARRIER
690
691!$OMP MASTER
692 ! sync before/after ring of isendrecv
693 CALL para_env%sync()
694
695 ALLOCATE (sendbuffer(nbins*n_threads))
696 ALLOCATE (recbuffer(nbins*n_threads))
697
698 sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
699
700 dest = modulo(mepos + 1, ncpu)
701 source = modulo(mepos - 1, ncpu)
702 DO icpu = 0, ncpu - 1
703 IF (icpu /= ncpu - 1) THEN
704 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
705 req(1), req(2), 13)
706 END IF
707 data_from = modulo(mepos - icpu, ncpu)
708 cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
709 IF (icpu /= ncpu - 1) THEN
710 CALL mp_waitall(req)
711 END IF
712 swapbuffer => sendbuffer
713 sendbuffer => recbuffer
714 recbuffer => swapbuffer
715 END DO
716 DEALLOCATE (recbuffer, sendbuffer)
717!$OMP END MASTER
718!$OMP BARRIER
719
720!$OMP BARRIER
721!$OMP MASTER
722 CALL timestop(handle_inner)
723 CALL timeset(routinen//"_opt", handle_inner)
724!$OMP END MASTER
725!$OMP BARRIER
726
727 !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
728!$OMP BARRIER
729 ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
730 local_cost_matrix = cost_matrix
731!$OMP MASTER
732 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
733
734 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
735 shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
736
737 CALL timestop(handle_inner)
738 CALL timeset(routinen//"_redist", handle_inner)
739 !! Collect local data to global array
740 ALLOCATE (full_dist(ncpu*n_threads, nbins))
741
742 full_dist(:, :)%istart = 0_int_8
743 full_dist(:, :)%number_of_atom_quartets = 0_int_8
744 full_dist(:, :)%cost = 0_int_8
745 full_dist(:, :)%time_first_scf = 0.0_dp
746 full_dist(:, :)%time_other_scf = 0.0_dp
747 full_dist(:, :)%time_forces = 0.0_dp
748!$OMP END MASTER
749!$OMP BARRIER
750 mepos = para_env%mepos + 1
751 full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
752
753!$OMP BARRIER
754!$OMP MASTER
755 ALLOCATE (sendbuffer(3*nbins*n_threads))
756 ALLOCATE (recbuffer(3*nbins*n_threads))
757 mepos = para_env%mepos
758 DO j = 1, n_threads
759 DO i = 1, nbins
760 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
761 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
762 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
763 END DO
764 END DO
765
766 ! sync before/after ring of isendrecv
767 CALL para_env%sync()
768 dest = modulo(mepos + 1, ncpu)
769 source = modulo(mepos - 1, ncpu)
770 DO icpu = 0, ncpu - 1
771 IF (icpu /= ncpu - 1) THEN
772 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
773 req(1), req(2), 13)
774 END IF
775 data_from = modulo(mepos - icpu, ncpu)
776 DO j = 1, n_threads
777 DO i = 1, nbins
778 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
779 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
780 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
781 END DO
782 END DO
783
784 IF (icpu /= ncpu - 1) THEN
785 CALL mp_waitall(req)
786 END IF
787 swapbuffer => sendbuffer
788 sendbuffer => recbuffer
789 recbuffer => swapbuffer
790 END DO
791 DEALLOCATE (recbuffer, sendbuffer)
792
793 ! sync before/after ring of isendrecv
794 CALL para_env%sync()
795!$OMP END MASTER
796!$OMP BARRIER
797 !! reorder the distribution according to the distribution vector
798 ALLOCATE (tmp_pos(ncpu*n_threads))
799 tmp_pos = 1
800 ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
801
802 tmp_dist(:)%istart = 0_int_8
803 tmp_dist(:)%number_of_atom_quartets = 0_int_8
804 tmp_dist(:)%cost = 0_int_8
805 tmp_dist(:)%time_first_scf = 0.0_dp
806 tmp_dist(:)%time_other_scf = 0.0_dp
807 tmp_dist(:)%time_forces = 0.0_dp
808
809 DO icpu = 1, n_processes
810 DO i = 1, nbins
811 mepos = my_process_id + 1
812 IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
813 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
814 tmp_pos(mepos) = tmp_pos(mepos) + 1
815 END IF
816 END DO
817 END DO
818
819 !! Assign the load to each process
820 NULLIFY (ptr_to_tmp_dist)
821 mepos = my_process_id + 1
822 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
823 SELECT CASE (eval_type)
824 CASE (hfx_do_eval_energy)
825 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
826 CASE (hfx_do_eval_forces)
827 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
828 END SELECT
829
830!$OMP BARRIER
831!$OMP MASTER
832 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
833!$OMP END MASTER
834!$OMP BARRIER
835 DEALLOCATE (tmp_dist, tmp_pos)
836 DEALLOCATE (binned_dist, local_cost_matrix)
837 DEALLOCATE (set_list_ij, set_list_kl)
838
839!$OMP BARRIER
840!$OMP MASTER
841 CALL timestop(handle_inner)
842!$OMP END MASTER
843!$OMP BARRIER
844 END IF
845!$OMP BARRIER
846!$OMP MASTER
847 CALL timestop(handle)
848!$OMP END MASTER
849!$OMP BARRIER
850 END SUBROUTINE hfx_load_balance
851
852! **************************************************************************************************
853!> \brief Reference implementation of new recursive load balancing routine
854!> Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
855!> each process in a P-Q grid such that every process has more or less the
856!> same amount of work. Has no output at the moment (not used) but writes
857!> its computed load balance values into a file. Possible output is ready
858!> to use in the two arrays p_atom_blocks & q_atom_blocks
859!> \param n_processes ...
860!> \param my_process_id ...
861!> \param nblocks ...
862!> \param natom ...
863!> \param nkind ...
864!> \param list_ij ...
865!> \param list_kl ...
866!> \param set_list_ij ...
867!> \param set_list_kl ...
868!> \param particle_set ...
869!> \param coeffs_set ...
870!> \param coeffs_kind ...
871!> \param is_assoc_atomic_block_global ...
872!> \param do_periodic ...
873!> \param kind_of ...
874!> \param basis_parameter ...
875!> \param pmax_set ...
876!> \param pmax_atom ...
877!> \param pmax_blocks ...
878!> \param cell ...
879!> \param x_data ...
880!> \param para_env ...
881!> \param pmax_block ...
882!> \param do_p_screening ...
883!> \param map_atom_to_kind_atom ...
884!> \param eval_type ...
885!> \param log10_eps_schwarz ...
886!> \param log_2 ...
887!> \param coeffs_kind_max0 ...
888!> \param use_virial ...
889!> \param atomic_pair_list ...
890!> \par History
891!> 03.2011 created [Michael Steinlechner]
892!> \author Michael Steinlechner
893! **************************************************************************************************
894
895 SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
896 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
897 particle_set, &
898 coeffs_set, coeffs_kind, &
899 is_assoc_atomic_block_global, do_periodic, &
900 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
901 cell, x_data, para_env, pmax_block, &
902 do_p_screening, map_atom_to_kind_atom, eval_type, &
903 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
904
905! input variables:
906 INTEGER, INTENT(IN) :: n_processes, my_process_id, nblocks, &
907 natom, nkind
908 TYPE(pair_list_type), INTENT(IN) :: list_ij, list_kl
909 TYPE(pair_set_list_type), ALLOCATABLE, &
910 DIMENSION(:), INTENT(IN) :: set_list_ij, set_list_kl
911 TYPE(particle_type), DIMENSION(:), INTENT(IN), &
912 POINTER :: particle_set
913 TYPE(hfx_screen_coeff_type), &
914 DIMENSION(:, :, :, :), INTENT(IN), POINTER :: coeffs_set
915 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
916 INTENT(IN), POINTER :: coeffs_kind
917 INTEGER, DIMENSION(:, :), INTENT(IN) :: is_assoc_atomic_block_global
918 LOGICAL, INTENT(IN) :: do_periodic
919 INTEGER, INTENT(IN) :: kind_of(*)
920 TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
921 POINTER :: basis_parameter
922 TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
923 POINTER :: pmax_set
924 REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_atom
925 REAL(dp) :: pmax_blocks
926 TYPE(cell_type), INTENT(IN), POINTER :: cell
927 TYPE(hfx_type), INTENT(IN), POINTER :: x_data
928 TYPE(mp_para_env_type), INTENT(IN) :: para_env
929 REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_block
930 LOGICAL, INTENT(IN) :: do_p_screening
931 INTEGER, DIMENSION(:), INTENT(IN), POINTER :: map_atom_to_kind_atom
932 INTEGER, INTENT(IN) :: eval_type
933 REAL(dp), INTENT(IN) :: log10_eps_schwarz, log_2, &
934 coeffs_kind_max0
935 LOGICAL, INTENT(IN) :: use_virial
936 LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER :: atomic_pair_list
937
938 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_recursive_load_balance'
939
940 INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
941 jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
942 np, nq, numbins, p, q, sizep, sizeq, unit_nr
943 INTEGER(int_8) :: local_cost, pidx, qidx, sump, sumq
944 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: local_cost_vector
945 INTEGER, ALLOCATABLE, DIMENSION(:) :: blocksize, p_atom_blocks, permute, &
946 q_atom_blocks
947 REAL(dp) :: maximum, mean
948
949! internal variables:
950
951!$OMP BARRIER
952!$OMP MASTER
953 CALL timeset(routinen, handle)
954!$OMP END MASTER
955!$OMP BARRIER
956
957 ! calculate best p/q distribution grid for the n_processes
958 CALL hfx_calculate_pq(p, q, numbins, n_processes)
959
960 ALLOCATE (blocksize(numbins))
961 ALLOCATE (permute(nblocks**2))
962 DO i = 1, nblocks**2
963 permute(i) = i
964 END DO
965
966 ! call the main recursive permutation routine.
967 ! Output:
968 ! blocksize :: vector (size numBins) with the sizes for each column/row block
969 ! permute :: permutation vector
970 CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numbins, &
971 permute, 1, &
972 my_process_id, n_processes, nblocks, &
973 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
974 particle_set, &
975 coeffs_set, coeffs_kind, &
976 is_assoc_atomic_block_global, do_periodic, &
977 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
978 cell, x_data, para_env, pmax_block, &
979 do_p_screening, map_atom_to_kind_atom, eval_type, &
980 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
981
982 ! number of blocks per processor in p-direction (vertical)
983 np = numbins/p
984 ! number of blocks per processor in q-direction (horizontal)
985 nq = numbins/q
986
987 ! calc own position in P-Q-processor grid (PQ-grid is column-major)
988 pidx = modulo(int(my_process_id), int(p)) + 1
989 qidx = my_process_id/p + 1
990
991 sizep = sum(blocksize((np*(pidx - 1) + 1):(np*pidx)))
992 sizeq = sum(blocksize((nq*(qidx - 1) + 1):(nq*qidx)))
993
994 sump = sum(blocksize(1:(np*(pidx - 1))))
995 sumq = sum(blocksize(1:(nq*(qidx - 1))))
996
997 ALLOCATE (p_atom_blocks(sizep))
998 ALLOCATE (q_atom_blocks(sizeq))
999
1000 p_atom_blocks(:) = permute((sump + 1):(sump + sizep))
1001 q_atom_blocks(:) = permute((sumq + 1):(sumq + sizeq))
1002
1003 ! from here on, we are actually finished, each process has been
1004 ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
1005 ! what follows is just a small routine to calculate the local cost
1006 ! for each processor which is then written to a file.
1007
1008 ! calculate local cost for each processor!
1009 ! ****************************************
1010 local_cost = 0
1011 DO i = 1, sizep
1012 DO j = 1, sizeq
1013
1014 ! get corresponding 4D block indices out of our own P-Q-block
1015 latom_block = modulo(q_atom_blocks(j), nblocks)
1016 iatom_block = q_atom_blocks(j)/nblocks + 1
1017 jatom_block = modulo(p_atom_blocks(i), nblocks)
1018 katom_block = p_atom_blocks(i)/nblocks + 1
1019
1020 ! symmetry checks.
1021 IF (latom_block < katom_block) cycle
1022 IF (jatom_block < iatom_block) cycle
1023
1024 iatom_start = x_data%blocks(iatom_block)%istart
1025 iatom_end = x_data%blocks(iatom_block)%iend
1026 jatom_start = x_data%blocks(jatom_block)%istart
1027 jatom_end = x_data%blocks(jatom_block)%iend
1028 katom_start = x_data%blocks(katom_block)%istart
1029 katom_end = x_data%blocks(katom_block)%iend
1030 latom_start = x_data%blocks(latom_block)%istart
1031 latom_end = x_data%blocks(latom_block)%iend
1032
1033 ! whatever.
1034 SELECT CASE (eval_type)
1035 CASE (hfx_do_eval_energy)
1036 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1037 pmax_block(latom_block, jatom_block), &
1038 pmax_block(latom_block, iatom_block), &
1039 pmax_block(katom_block, jatom_block))
1040 CASE (hfx_do_eval_forces)
1041 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1042 pmax_block(latom_block, jatom_block), &
1043 pmax_block(latom_block, iatom_block) + &
1044 pmax_block(katom_block, jatom_block))
1045 END SELECT
1046
1047 ! screening.
1048 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1049
1050 ! estimate the cost of this atom_block.
1051 local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1052 set_list_kl, &
1053 iatom_start, iatom_end, jatom_start, jatom_end, &
1054 katom_start, katom_end, latom_start, latom_end, &
1055 particle_set, &
1056 coeffs_set, coeffs_kind, &
1057 is_assoc_atomic_block_global, do_periodic, &
1058 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1059 cell, &
1060 do_p_screening, map_atom_to_kind_atom, eval_type, &
1061 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1062 END DO
1063 END DO
1064
1065 ALLOCATE (local_cost_vector(n_processes))
1066 local_cost_vector = 0
1067 local_cost_vector(my_process_id + 1) = local_cost
1068 CALL para_env%sum(local_cost_vector)
1069
1070 mean = sum(local_cost_vector)/n_processes
1071 maximum = maxval(local_cost_vector)
1072
1073!$OMP BARRIER
1074!$OMP MASTER
1075 ! only output once
1076 IF (my_process_id == 0) THEN
1077 CALL open_file(unit_number=unit_nr, file_name="loads.dat")
1078 WRITE (unit_nr, *) 'maximum cost:', maximum
1079 WRITE (unit_nr, *) 'mean cost:', mean
1080 WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
1081 WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
1082 DO i = 1, n_processes
1083 WRITE (unit_nr, *) local_cost_vector(i)
1084 END DO
1085 CALL close_file(unit_nr)
1086 END IF
1087!$OMP END MASTER
1088!$OMP BARRIER
1089
1090 DEALLOCATE (local_cost_vector)
1091 DEALLOCATE (p_atom_blocks, q_atom_blocks)
1092 DEALLOCATE (blocksize, permute)
1093
1094!$OMP BARRIER
1095!$OMP MASTER
1096 CALL timestop(handle)
1097!$OMP END MASTER
1098!$OMP BARRIER
1099
1100 END SUBROUTINE hfx_recursive_load_balance
1101
1102! **************************************************************************************************
1103!> \brief Small routine to calculate the optimal P-Q-processor grid distribution
1104!> for a given number of processors N
1105!> and the corresponding number of Bins for the load balancing routine
1106!> \param p number of rows on P-Q process grid (output)
1107!> \param q number of columns on P-Q process grid (output)
1108!> \param nBins number of Bins (output)
1109!> \param N number of processes (input)
1110!> \par History
1111!> 03.2011 created [Michael Steinlechner]
1112!> \author Michael Steinlechner
1113! **************************************************************************************************
1114 SUBROUTINE hfx_calculate_pq(p, q, nBins, N)
1115
1116 INTEGER, INTENT(OUT) :: p, q, nbins
1117 INTEGER, INTENT(IN) :: n
1118
1119 INTEGER :: a, b, k
1120 REAL(dp) :: sqn
1121
1122 k = 2
1123 sqn = sqrt(real(n, kind=dp))
1124 p = 1
1125
1126 DO WHILE (real(k, kind=dp) <= sqn)
1127 IF (modulo(n, k) == 0) THEN
1128 p = k
1129 END IF
1130 k = k + 1
1131 END DO
1132 q = n/p
1133
1134 ! now compute the least common multiple of p & q to get the number of necessary bins
1135 ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
1136 ! and use euclid's algorithm for GCD computation.
1137 a = p
1138 b = q
1139
1140 DO WHILE (b /= 0)
1141 IF (a > b) THEN
1142 a = a - b
1143 ELSE
1144 b = b - a
1145 END IF
1146 END DO
1147 ! gcd(p,q) is now saved in a
1148
1149 nbins = p*q/a
1150
1151 END SUBROUTINE hfx_calculate_pq
1152
1153! **************************************************************************************************
1154!> \brief Recursive permutation routine for the load balancing of the integral
1155!> computation
1156!> \param blocksize vector of blocksizes, size(nProc), which contains for
1157!> each process the local blocksize (OUTPUT)
1158!> \param blockstart starting row/column idx of the block which is to be examined
1159!> at this point (INPUT)
1160!> \param blockend ending row/column idx of the block which is to be examined
1161!> (INPUT)
1162!> \param nProc_in number of bins into which the current block has to be divided
1163!> (INPUT)
1164!> \param permute permutation vector which balances column/row cost
1165!> size(nblocks^2). (OUTPUT)
1166!> \param step ...
1167!> \param my_process_id ...
1168!> \param n_processes ...
1169!> \param nblocks ...
1170!> \param natom ...
1171!> \param nkind ...
1172!> \param list_ij ...
1173!> \param list_kl ...
1174!> \param set_list_ij ...
1175!> \param set_list_kl ...
1176!> \param particle_set ...
1177!> \param coeffs_set ...
1178!> \param coeffs_kind ...
1179!> \param is_assoc_atomic_block_global ...
1180!> \param do_periodic ...
1181!> \param kind_of ...
1182!> \param basis_parameter ...
1183!> \param pmax_set ...
1184!> \param pmax_atom ...
1185!> \param pmax_blocks ...
1186!> \param cell ...
1187!> \param x_data ...
1188!> \param para_env ...
1189!> \param pmax_block ...
1190!> \param do_p_screening ...
1191!> \param map_atom_to_kind_atom ...
1192!> \param eval_type ...
1193!> \param log10_eps_schwarz ...
1194!> \param log_2 ...
1195!> \param coeffs_kind_max0 ...
1196!> \param use_virial ...
1197!> \param atomic_pair_list ...
1198!> \par History
1199!> 03.2011 created [Michael Steinlechner]
1200!> \author Michael Steinlechner
1201! **************************************************************************************************
1202 RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
1203 permute, step, &
1204 my_process_id, n_processes, nblocks, &
1205 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1206 particle_set, &
1207 coeffs_set, coeffs_kind, &
1208 is_assoc_atomic_block_global, do_periodic, &
1209 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1210 cell, x_data, para_env, pmax_block, &
1211 do_p_screening, map_atom_to_kind_atom, eval_type, &
1212 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1213
1214 INTEGER :: nproc_in, blockend, blockstart
1215 INTEGER, DIMENSION(nProc_in) :: blocksize
1216 INTEGER :: nblocks, n_processes, my_process_id
1217 INTEGER, INTENT(IN) :: step
1218 INTEGER, DIMENSION(nblocks*nblocks) :: permute
1219 INTEGER :: natom
1220 INTEGER, INTENT(IN) :: nkind
1221 TYPE(pair_list_type) :: list_ij, list_kl
1222 TYPE(pair_set_list_type), ALLOCATABLE, &
1223 DIMENSION(:) :: set_list_ij, set_list_kl
1224 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1225 TYPE(hfx_screen_coeff_type), &
1226 DIMENSION(:, :, :, :), POINTER :: coeffs_set
1227 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
1228 POINTER :: coeffs_kind
1229 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1230 LOGICAL :: do_periodic
1231 INTEGER :: kind_of(*)
1232 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1233 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1234 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1235 REAL(dp) :: pmax_blocks
1236 TYPE(cell_type), POINTER :: cell
1237 TYPE(hfx_type), POINTER :: x_data
1238 TYPE(mp_para_env_type), INTENT(IN) :: para_env
1239 REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
1240 LOGICAL, INTENT(IN) :: do_p_screening
1241 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1242 INTEGER, INTENT(IN) :: eval_type
1243 REAL(dp) :: log10_eps_schwarz, log_2, &
1244 coeffs_kind_max0
1245 LOGICAL, INTENT(IN) :: use_virial
1246 LOGICAL, DIMENSION(:, :), POINTER :: atomic_pair_list
1247
1248 INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
1249 jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
1250 latom_end, latom_start, nbins, nproc, row, startoffset
1251 INTEGER(int_8) :: atom_block, tmp_block
1252 INTEGER, ALLOCATABLE, DIMENSION(:) :: ithblocksize, localblocksize
1253 INTEGER, DIMENSION(blockend - blockstart + 1) :: bin_perm, tmp_perm
1254 REAL(dp) :: partialcost
1255 REAL(dp), DIMENSION(nblocks*nblocks) :: cost_vector
1256
1257 nproc = nproc_in
1258 cost_vector = 0.0_dp
1259
1260! loop over local atom_blocks.
1261 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
1262
1263! get corresponding 4D block indices
1264 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
1265 tmp_block = atom_block/nblocks
1266 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1267 IF (latom_block < katom_block) cycle
1268 tmp_block = tmp_block/nblocks
1269 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1270 tmp_block = tmp_block/nblocks
1271 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1272 IF (jatom_block < iatom_block) cycle
1273
1274! get 2D indices of this atom_block (with permutation applied)
1275! for this, we need to invert the permutation, this means
1276! find position in permutation vector where value==idx
1277
1278 row = (katom_block - 1)*nblocks + jatom_block
1279 inv_perm = 1
1280 DO WHILE (permute(inv_perm) /= row)
1281 inv_perm = inv_perm + 1
1282 END DO
1283 row = inv_perm
1284
1285 col = (iatom_block - 1)*nblocks + latom_block
1286 inv_perm = 1
1287 DO WHILE (permute(inv_perm) /= col)
1288 inv_perm = inv_perm + 1
1289 END DO
1290 col = inv_perm
1291
1292! if row/col outside our current diagonal block, skip calculation.
1293 IF (col < blockstart .OR. col > blockend) cycle
1294 IF (row < blockstart .OR. row > blockend) cycle
1295
1296 iatom_start = x_data%blocks(iatom_block)%istart
1297 iatom_end = x_data%blocks(iatom_block)%iend
1298 jatom_start = x_data%blocks(jatom_block)%istart
1299 jatom_end = x_data%blocks(jatom_block)%iend
1300 katom_start = x_data%blocks(katom_block)%istart
1301 katom_end = x_data%blocks(katom_block)%iend
1302 latom_start = x_data%blocks(latom_block)%istart
1303 latom_end = x_data%blocks(latom_block)%iend
1304
1305! whatever.
1306 SELECT CASE (eval_type)
1307 CASE (hfx_do_eval_energy)
1308 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1309 pmax_block(latom_block, jatom_block), &
1310 pmax_block(latom_block, iatom_block), &
1311 pmax_block(katom_block, jatom_block))
1312 CASE (hfx_do_eval_forces)
1313 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1314 pmax_block(latom_block, jatom_block), &
1315 pmax_block(latom_block, iatom_block) + &
1316 pmax_block(katom_block, jatom_block))
1317 END SELECT
1318
1319! screening.
1320 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1321
1322! every second recursion step, compute row sum instead of column sum
1323
1324 IF (modulo(step, 2) == 0) THEN
1325 idx = row
1326 ELSE
1327 idx = col
1328 END IF
1329
1330! estimate the cost of this atom_block.
1331 partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1332 set_list_kl, &
1333 iatom_start, iatom_end, jatom_start, jatom_end, &
1334 katom_start, katom_end, latom_start, latom_end, &
1335 particle_set, &
1336 coeffs_set, coeffs_kind, &
1337 is_assoc_atomic_block_global, do_periodic, &
1338 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1339 cell, &
1340 do_p_screening, map_atom_to_kind_atom, eval_type, &
1341 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1342
1343 cost_vector(idx) = cost_vector(idx) + partialcost
1344 END DO ! atom_block
1345
1346! sum costvector over all processes
1347 CALL para_env%sum(cost_vector)
1348
1349! calculate next prime factor of nProc
1350 nbins = 2
1351 DO WHILE (modulo(int(nproc), int(nbins)) /= 0)
1352 nbins = nbins + 1
1353 END DO
1354
1355 nproc = nproc/nbins
1356
1357! ... do the binning...
1358
1359 ALLOCATE (localblocksize(nbins))
1360 CALL hfx_permute_binning(nbins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
1361
1362!... and update the permutation vector
1363
1364 tmp_perm = permute(blockstart:blockend)
1365 permute(blockstart:blockend) = tmp_perm(bin_perm)
1366
1367! split recursion into the nBins Bins
1368 IF (nproc > 1) THEN
1369 ALLOCATE (ithblocksize(nproc))
1370 DO i = 1, nbins
1371 startoffset = sum(localblocksize(1:(i - 1)))
1372 endoffset = sum(localblocksize(1:i)) - 1
1373
1374 CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nproc, &
1375 permute, step + 1, &
1376 my_process_id, n_processes, nblocks, &
1377 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1378 particle_set, &
1379 coeffs_set, coeffs_kind, &
1380 is_assoc_atomic_block_global, do_periodic, &
1381 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1382 cell, x_data, para_env, pmax_block, &
1383 do_p_screening, map_atom_to_kind_atom, eval_type, &
1384 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1385 blocksize(((i - 1)*nproc + 1):(i*nproc)) = ithblocksize
1386 END DO
1387 DEALLOCATE (ithblocksize)
1388 ELSE
1389 DO i = 1, nbins
1390 blocksize(i) = localblocksize(i)
1391 END DO
1392 END IF
1393
1394 DEALLOCATE (localblocksize)
1395
1396 END SUBROUTINE hfx_recursive_permute
1397
1398! **************************************************************************************************
1399!> \brief small binning routine for the recursive load balancing
1400!>
1401!> \param nBins number of Bins (INPUT)
1402!> \param costvector vector of current row/column costs which have to be binned (INPUT)
1403!> \param maxbinsize upper bound for bin size (INPUT)
1404!> \param perm resulting permutation due to be binning routine (OUTPUT)
1405!> \param block_count vector of size(nbins) which contains the size of each bin (OUTPUT)
1406!> \par History
1407!> 03.2011 created [Michael Steinlechner]
1408!> \author Michael Steinlechner
1409! **************************************************************************************************
1410 SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
1411
1412 INTEGER, INTENT(IN) :: nbins, maxbinsize
1413 REAL(dp), DIMENSION(maxbinsize), INTENT(IN) :: costvector
1414 INTEGER, DIMENSION(maxbinsize), INTENT(OUT) :: perm
1415 INTEGER, DIMENSION(nBins), INTENT(OUT) :: block_count
1416
1417 INTEGER :: i, j, mod_idx, offset
1418 INTEGER, DIMENSION(nBins, maxbinsize) :: bin
1419 INTEGER, DIMENSION(nBins) :: bin_idx
1420 INTEGER, DIMENSION(maxbinsize) :: idx
1421 REAL(dp), DIMENSION(maxbinsize) :: vec
1422 REAL(dp), DIMENSION(nBins) :: bincosts
1423
1424! be careful not to change costvector (copy it!)
1425
1426 vec = costvector
1427 block_count = 0
1428 bincosts = 0
1429
1430 !sort the array (ascending)
1431 CALL sort(vec, maxbinsize, idx)
1432
1433 ! count the loop down to distribute the largest cols/rows first
1434 DO i = maxbinsize, 1, -1
1435 IF (vec(i) == 0) THEN
1436 ! spread zero-cost col/rows evenly among procs
1437 mod_idx = modulo(i, nbins) + 1 !(note the fortran offset by one!)
1438 block_count(mod_idx) = block_count(mod_idx) + 1
1439 bin(mod_idx, block_count(mod_idx)) = idx(i)
1440 ELSE
1441 ! sort the bins so that the one with the lowest cost is at the
1442 ! first place, where we then assign the current col/row
1443 CALL sort(bincosts, nbins, bin_idx)
1444 block_count = block_count(bin_idx)
1445 bin = bin(bin_idx, :)
1446
1447 bincosts(1) = bincosts(1) + vec(i)
1448 block_count(1) = block_count(1) + 1
1449 bin(1, block_count(1)) = idx(i)
1450 END IF
1451 END DO
1452
1453 ! construct permutation vector from the binning
1454 offset = 0
1455 DO i = 1, nbins
1456 DO j = 1, block_count(i)
1457 perm(offset + j) = bin(i, j)
1458 END DO
1459 offset = offset + block_count(i)
1460 END DO
1461
1462 END SUBROUTINE hfx_permute_binning
1463
1464! **************************************************************************************************
1465!> \brief Cheap way of redistributing the eri's
1466!> \param x_data Object that stores the indices array
1467!> \param para_env para_env
1468!> \param load_balance_parameter contains parmameter for Monte-Carlo routines
1469!> \param i_thread current thread ID
1470!> \param n_threads Total Number of threads
1471!> \param eval_type ...
1472!> \par History
1473!> 12.2007 created [Manuel Guidon]
1474!> 02.2009 optimize Memory Usage [Manuel Guidon]
1475!> \author Manuel Guidon
1476!> \note
1477!> The cost matrix is given by the walltime for each bin that is measured
1478!> during the calculation
1479! **************************************************************************************************
1480 SUBROUTINE hfx_update_load_balance(x_data, para_env, &
1481 load_balance_parameter, &
1482 i_thread, n_threads, eval_type)
1483
1484 TYPE(hfx_type), POINTER :: x_data
1485 TYPE(mp_para_env_type), INTENT(IN) :: para_env
1486 TYPE(hfx_load_balance_type) :: load_balance_parameter
1487 INTEGER, INTENT(IN) :: i_thread, n_threads, eval_type
1488
1489 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_update_load_balance'
1490
1491 INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
1492 my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
1493 TYPE(mp_request_type), DIMENSION(2) :: req
1494 INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
1495 sendbuffer, swapbuffer
1496 INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
1497 INTEGER, ALLOCATABLE, DIMENSION(:) :: tmp_pos
1498 INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: bins_per_rank
1499 INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE :: bin_histogram
1500 INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
1501 INTEGER, SAVE :: max_bin_size
1502 TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
1503 TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
1504 SAVE :: full_dist
1505
1506!$OMP BARRIER
1507!$OMP MASTER
1508 CALL timeset(routinen, handle)
1509!$OMP END MASTER
1510!$OMP BARRIER
1511
1512 ncpu = para_env%num_pe
1513 n_processes = ncpu*n_threads
1514 !! If there is only 1 cpu skip the binning
1515 IF (n_processes == 1) THEN
1516 ALLOCATE (tmp_dist(1))
1517 tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
1518 tmp_dist(1)%istart = 0_int_8
1519 ptr_to_tmp_dist => tmp_dist(:)
1520 SELECT CASE (eval_type)
1521 CASE (hfx_do_eval_energy)
1522 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1523 CASE (hfx_do_eval_forces)
1524 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1525 END SELECT
1526 DEALLOCATE (tmp_dist)
1527 ELSE
1528 mepos = para_env%mepos
1529 my_process_id = para_env%mepos*n_threads + i_thread
1530 nbins = load_balance_parameter%nbins
1531!$OMP MASTER
1532 ALLOCATE (bin_histogram(n_processes, 2))
1533 bin_histogram = 0
1534!$OMP END MASTER
1535!$OMP BARRIER
1536 SELECT CASE (eval_type)
1537 CASE (hfx_do_eval_energy)
1538 my_bin_size = SIZE(x_data%distribution_energy)
1539 CASE (hfx_do_eval_forces)
1540 my_bin_size = SIZE(x_data%distribution_forces)
1541 END SELECT
1542 bin_histogram(my_process_id + 1, 1) = my_bin_size
1543!$OMP BARRIER
1544!$OMP MASTER
1545 CALL para_env%sum(bin_histogram(:, 1))
1546 bin_histogram(1, 2) = bin_histogram(1, 1)
1547 DO iprocess = 2, n_processes
1548 bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
1549 END DO
1550
1551 max_bin_size = maxval(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
1552 CALL para_env%max(max_bin_size)
1553!$OMP END MASTER
1554!$OMP BARRIER
1555 ALLOCATE (binned_dist(my_bin_size))
1556 !! Use old binned_dist, but with timings cost
1557 SELECT CASE (eval_type)
1558 CASE (hfx_do_eval_energy)
1559 binned_dist = x_data%distribution_energy
1560 CASE (hfx_do_eval_forces)
1561 binned_dist = x_data%distribution_forces
1562 END SELECT
1563
1564 DO ibin = 1, my_bin_size
1565 IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
1566 binned_dist(ibin)%cost = 0
1567 ELSE
1568 SELECT CASE (eval_type)
1569 CASE (hfx_do_eval_energy)
1570 IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
1571 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_first_scf + &
1572 binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1573 ELSE
1574 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1575 END IF
1576 CASE (hfx_do_eval_forces)
1577 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
1578 END SELECT
1579 END IF
1580 END DO
1581!$OMP BARRIER
1582!$OMP MASTER
1583 !! store all local results in a big cost matrix
1584 ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
1585 cost_matrix = 0
1586 ALLOCATE (sendbuffer(max_bin_size*n_threads))
1587 ALLOCATE (recbuffer(max_bin_size*n_threads))
1588!$OMP END MASTER
1589!$OMP BARRIER
1590 my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
1591 icpu = para_env%mepos + 1
1592 DO i = 1, my_bin_size
1593 cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
1594 END DO
1595
1596 mepos = para_env%mepos
1597!$OMP BARRIER
1598!$OMP MASTER
1599 ALLOCATE (bins_per_rank(ncpu))
1600 bins_per_rank = 0
1601 DO icpu = 1, ncpu
1602 bins_per_rank(icpu) = sum(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
1603 END DO
1604 sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
1605 cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
1606
1607 dest = modulo(mepos + 1, ncpu)
1608 source = modulo(mepos - 1, ncpu)
1609 ! sync before/after ring of isendrecv
1610 CALL para_env%sync()
1611 DO icpu = 0, ncpu - 1
1612 IF (icpu /= ncpu - 1) THEN
1613 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1614 req(1), req(2), 13)
1615 END IF
1616 data_from = modulo(mepos - icpu, ncpu)
1617 start_idx = sum(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
1618 end_idx = start_idx + bins_per_rank(data_from + 1) - 1
1619 cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
1620
1621 IF (icpu /= ncpu - 1) THEN
1622 CALL mp_waitall(req)
1623 END IF
1624 swapbuffer => sendbuffer
1625 sendbuffer => recbuffer
1626 recbuffer => swapbuffer
1627 END DO
1628 DEALLOCATE (recbuffer, sendbuffer)
1629 ! sync before/after ring of isendrecv
1630 CALL para_env%sync()
1631!$OMP END MASTER
1632!$OMP BARRIER
1633 ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
1634 local_cost_matrix = cost_matrix
1635!$OMP MASTER
1636 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
1637 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
1638 shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
1639
1640 ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
1641
1642 full_dist(:, :)%istart = 0_int_8
1643 full_dist(:, :)%number_of_atom_quartets = 0_int_8
1644 full_dist(:, :)%cost = 0_int_8
1645 full_dist(:, :)%time_first_scf = 0.0_dp
1646 full_dist(:, :)%time_other_scf = 0.0_dp
1647 full_dist(:, :)%time_forces = 0.0_dp
1648!$OMP END MASTER
1649
1650!$OMP BARRIER
1651 mepos = para_env%mepos + 1
1652 full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
1653!$OMP BARRIER
1654!$OMP MASTER
1655 ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
1656 ALLOCATE (recbuffer(3*max_bin_size*n_threads))
1657 mepos = para_env%mepos
1658 DO j = 1, n_threads
1659 DO i = 1, max_bin_size
1660 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
1661 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
1662 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
1663 END DO
1664 END DO
1665 dest = modulo(mepos + 1, ncpu)
1666 source = modulo(mepos - 1, ncpu)
1667 ! sync before/after ring of isendrecv
1668 CALL para_env%sync()
1669 DO icpu = 0, ncpu - 1
1670 IF (icpu /= ncpu - 1) THEN
1671 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1672 req(1), req(2), 13)
1673 END IF
1674 data_from = modulo(mepos - icpu, ncpu)
1675 DO j = 1, n_threads
1676 DO i = 1, max_bin_size
1677 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
1678 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
1679 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
1680 END DO
1681 END DO
1682
1683 IF (icpu /= ncpu - 1) THEN
1684 CALL mp_waitall(req)
1685 END IF
1686 swapbuffer => sendbuffer
1687 sendbuffer => recbuffer
1688 recbuffer => swapbuffer
1689 END DO
1690 ! sync before/after ring of isendrecv
1691 DEALLOCATE (recbuffer, sendbuffer)
1692 CALL para_env%sync()
1693!$OMP END MASTER
1694!$OMP BARRIER
1695 !! reorder the distribution according to the distribution vector
1696 ALLOCATE (tmp_pos(ncpu*n_threads))
1697 tmp_pos = 1
1698 ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
1699
1700 tmp_dist(:)%istart = 0_int_8
1701 tmp_dist(:)%number_of_atom_quartets = 0_int_8
1702 tmp_dist(:)%cost = 0_int_8
1703 tmp_dist(:)%time_first_scf = 0.0_dp
1704 tmp_dist(:)%time_other_scf = 0.0_dp
1705 tmp_dist(:)%time_forces = 0.0_dp
1706
1707 mepos = my_process_id + 1
1708 DO icpu = 1, n_processes
1709 DO i = 1, bin_histogram(icpu, 1)
1710 IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
1711 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
1712 tmp_pos(mepos) = tmp_pos(mepos) + 1
1713 END IF
1714 END DO
1715 END DO
1716
1717 !! Assign the load to each process
1718 NULLIFY (ptr_to_tmp_dist)
1719 mepos = my_process_id + 1
1720 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
1721 SELECT CASE (eval_type)
1722 CASE (hfx_do_eval_energy)
1723 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1724 CASE (hfx_do_eval_forces)
1725 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1726 END SELECT
1727
1728!$OMP BARRIER
1729!$OMP MASTER
1730 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
1731 DEALLOCATE (bins_per_rank, bin_histogram)
1732!$OMP END MASTER
1733!$OMP BARRIER
1734 DEALLOCATE (tmp_dist, tmp_pos)
1735 DEALLOCATE (binned_dist, local_cost_matrix)
1736 END IF
1737!$OMP BARRIER
1738!$OMP MASTER
1739 CALL timestop(handle)
1740!$OMP END MASTER
1741!$OMP BARRIER
1742
1743 END SUBROUTINE hfx_update_load_balance
1744
1745! **************************************************************************************************
1746!> \brief estimates the cost of a set quartet with info available at load balance time
1747!> i.e. without much info on the primitives primitives
1748!> \param nsa ...
1749!> \param nsb ...
1750!> \param nsc ...
1751!> \param nsd ...
1752!> \param npgfa ...
1753!> \param npgfb ...
1754!> \param npgfc ...
1755!> \param npgfd ...
1756!> \param ratio ...
1757!> \param p1 ...
1758!> \param p2 ...
1759!> \param p3 ...
1760!> \return ...
1761!> \par History
1762!> 08.2009 created Joost VandeVondele
1763!> \author Joost VandeVondele
1764! **************************************************************************************************
1765 FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
1766 REAL(kind=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
1767 INTEGER(KIND=int_8) :: res
1768 REAL(kind=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
1769
1770 INTEGER :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
1771
1772 estimate1 = estimate_basic(p1)
1773 estimate2 = estimate_basic(p2)
1774 mu = log(abs(1.0e6_dp*p3(1)) + 1)
1775 sigma = p3(2)*0.1_dp*mu
1776 switch = 1.0_dp/(1.0_dp + exp((log(estimate1) - mu)/sigma))
1777 estimate = estimate1*(1.0_dp - switch) + estimate2*switch
1778 res = int(estimate*0.001_dp, kind=int_8) + 1
1779
1780 CONTAINS
1781
1782! **************************************************************************************************
1783!> \brief ...
1784!> \param p ...
1785!> \return ...
1786! **************************************************************************************************
1787 REAL(kind=dp) FUNCTION estimate_basic(p) RESULT(res)
1788 REAL(kind=dp) :: p(12)
1789
1790 REAL(kind=dp) :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
1791 p7, p8, p9
1792
1793 p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
1794 p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
1795 p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
1796 res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
1797 poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
1798 poly2(npgfd, p4, p5, p6)*exp(-p7*ratio + p8*ratio**2) + &
1799 1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
1800 res = 1 + abs(res)
1801 END FUNCTION estimate_basic
1802
1803! **************************************************************************************************
1804!> \brief ...
1805!> \param x ...
1806!> \param a0 ...
1807!> \param a1 ...
1808!> \param a2 ...
1809!> \return ...
1810! **************************************************************************************************
1811 REAL(kind=dp) FUNCTION poly2(x, a0, a1, a2)
1812 INTEGER, INTENT(IN) :: x
1813 REAL(kind=dp), INTENT(IN) :: a0, a1, a2
1814 REAL(kind=dp) :: r
1815
1816 r = real(x, kind=dp)
1817 poly2 = a0 + (a1 + a2*r)*r
1818 END FUNCTION poly2
1819
1820 END FUNCTION cost_model
1821! **************************************************************************************************
1822!> \brief Minimizes the maximum cost per cpu by shuffling around all bins
1823!> \param total_number_of_bins ...
1824!> \param number_of_processes ...
1825!> \param bin_costs costs per bin
1826!> \param distribution_vector will contain the final distribution
1827!> \param do_randomize ...
1828!> \par History
1829!> 03.2009 created from a hack by Joost [Manuel Guidon]
1830!> \author Manuel Guidon
1831! **************************************************************************************************
1832 SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
1833 distribution_vector, do_randomize)
1834 INTEGER :: total_number_of_bins, number_of_processes
1835 INTEGER(int_8), DIMENSION(:), POINTER :: bin_costs
1836 INTEGER, DIMENSION(:), POINTER :: distribution_vector
1837 LOGICAL, INTENT(IN) :: do_randomize
1838
1839 INTEGER :: i, itmp, j, nstep
1840 INTEGER(int_8), DIMENSION(:), POINTER :: my_cost_cpu, tmp_cost, tmp_cpu_cost
1841 INTEGER, DIMENSION(:), POINTER :: tmp_cpu_index, tmp_index
1842 TYPE(rng_stream_type), ALLOCATABLE :: rng_stream
1843
1844 nstep = max(1, int(number_of_processes)/2)
1845
1846 ALLOCATE (tmp_cost(total_number_of_bins))
1847 ALLOCATE (tmp_index(total_number_of_bins))
1848 ALLOCATE (tmp_cpu_cost(number_of_processes))
1849 ALLOCATE (tmp_cpu_index(number_of_processes))
1850 ALLOCATE (my_cost_cpu(number_of_processes))
1851 tmp_cost = bin_costs
1852
1853 CALL sort(tmp_cost, total_number_of_bins, tmp_index)
1854 my_cost_cpu = 0
1855 !
1856 ! assign the largest remaining bin to the CPU with the smallest load
1857 ! gives near perfect distributions for a sufficient number of bins ...
1858 ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
1859 ! each cpu a similar number of tasks.
1860 ! it also avoids degenerate cases where thousands of zero sized tasks
1861 ! are assigned to the same (least loaded) cpu
1862 !
1863 IF (do_randomize) &
1864 rng_stream = rng_stream_type(name="uniform_rng", &
1865 distribution_type=uniform)
1866
1867 DO i = total_number_of_bins, 1, -nstep
1868 tmp_cpu_cost = my_cost_cpu
1869 CALL sort(tmp_cpu_cost, int(number_of_processes), tmp_cpu_index)
1870 IF (do_randomize) THEN
1871 CALL rng_stream%shuffle(tmp_cpu_index(1:min(i, nstep)))
1872 END IF
1873 DO j = 1, min(i, nstep)
1874 itmp = tmp_cpu_index(j)
1875 distribution_vector(tmp_index(i - j + 1)) = itmp
1876 my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
1877 END DO
1878 END DO
1879
1880 DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
1881 DEALLOCATE (tmp_cpu_index, my_cost_cpu)
1882 END SUBROUTINE optimize_distribution
1883
1884! **************************************************************************************************
1885!> \brief Given a 2d index pair, this function returns a 1d index pair for
1886!> a symmetric upper triangle NxN matrix
1887!> The compiler should inline this function, therefore it appears in
1888!> several modules
1889!> \param i 2d index
1890!> \param j 2d index
1891!> \param N matrix size
1892!> \return ...
1893!> \par History
1894!> 03.2009 created [Manuel Guidon]
1895!> \author Manuel Guidon
1896! **************************************************************************************************
1897 PURE FUNCTION get_1d_idx(i, j, N)
1898 INTEGER, INTENT(IN) :: i, j
1899 INTEGER(int_8), INTENT(IN) :: n
1900 INTEGER(int_8) :: get_1d_idx
1901
1902 INTEGER(int_8) :: min_ij
1903
1904 min_ij = min(i, j)
1905 get_1d_idx = min_ij*n + max(i, j) - (min_ij - 1)*min_ij/2 - n
1906
1907 END FUNCTION get_1d_idx
1908
1909! **************************************************************************************************
1910!> \brief ...
1911!> \param natom ...
1912!> \param nkind ...
1913!> \param list_ij ...
1914!> \param list_kl ...
1915!> \param set_list_ij ...
1916!> \param set_list_kl ...
1917!> \param iatom_start ...
1918!> \param iatom_end ...
1919!> \param jatom_start ...
1920!> \param jatom_end ...
1921!> \param katom_start ...
1922!> \param katom_end ...
1923!> \param latom_start ...
1924!> \param latom_end ...
1925!> \param particle_set ...
1926!> \param coeffs_set ...
1927!> \param coeffs_kind ...
1928!> \param is_assoc_atomic_block_global ...
1929!> \param do_periodic ...
1930!> \param kind_of ...
1931!> \param basis_parameter ...
1932!> \param pmax_set ...
1933!> \param pmax_atom ...
1934!> \param pmax_blocks ...
1935!> \param cell ...
1936!> \param do_p_screening ...
1937!> \param map_atom_to_kind_atom ...
1938!> \param eval_type ...
1939!> \param log10_eps_schwarz ...
1940!> \param log_2 ...
1941!> \param coeffs_kind_max0 ...
1942!> \param use_virial ...
1943!> \param atomic_pair_list ...
1944!> \return ...
1945! **************************************************************************************************
1946 FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1947 iatom_start, iatom_end, jatom_start, jatom_end, &
1948 katom_start, katom_end, latom_start, latom_end, &
1949 particle_set, &
1950 coeffs_set, coeffs_kind, &
1951 is_assoc_atomic_block_global, do_periodic, &
1952 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1953 cell, &
1954 do_p_screening, map_atom_to_kind_atom, eval_type, &
1955 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
1956 atomic_pair_list)
1957
1958 INTEGER, INTENT(IN) :: natom, nkind
1959 TYPE(pair_list_type) :: list_ij, list_kl
1960 TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
1961 INTEGER, INTENT(IN) :: iatom_start, iatom_end, jatom_start, &
1962 jatom_end, katom_start, katom_end, &
1963 latom_start, latom_end
1964 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1965 TYPE(hfx_screen_coeff_type), &
1966 DIMENSION(:, :, :, :), POINTER :: coeffs_set
1967 TYPE(hfx_screen_coeff_type), &
1968 DIMENSION(nkind, nkind) :: coeffs_kind
1969 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1970 LOGICAL :: do_periodic
1971 INTEGER :: kind_of(*)
1972 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1973 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1974 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1975 REAL(dp) :: pmax_blocks
1976 TYPE(cell_type), POINTER :: cell
1977 LOGICAL, INTENT(IN) :: do_p_screening
1978 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1979 INTEGER, INTENT(IN) :: eval_type
1980 REAL(dp) :: log10_eps_schwarz, log_2, &
1981 coeffs_kind_max0
1982 LOGICAL, INTENT(IN) :: use_virial
1983 LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
1984 INTEGER(int_8) :: estimate_block_cost
1985
1986 INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
1987 i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
1988 jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
1989 INTEGER, DIMENSION(:), POINTER :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
1990 nsgfb, nsgfc, nsgfd
1991 REAL(dp) :: actual_pmax_atom, cost_tmp, max_val1, &
1992 max_val2, pmax_entry, rab2, rcd2, &
1993 screen_kind_ij, screen_kind_kl
1994 REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
1995
1996 estimate_block_cost = 0_int_8
1997
1998 CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
1999 kind_of, basis_parameter, particle_set, &
2000 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2001 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2002
2003 CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
2004 kind_of, basis_parameter, particle_set, &
2005 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2006 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2007
2008 DO i_list_ij = 1, list_ij%n_element
2009 iatom = list_ij%elements(i_list_ij)%pair(1)
2010 jatom = list_ij%elements(i_list_ij)%pair(2)
2011 i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
2012 i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
2013 ikind = list_ij%elements(i_list_ij)%kind_pair(1)
2014 jkind = list_ij%elements(i_list_ij)%kind_pair(2)
2015 rab2 = list_ij%elements(i_list_ij)%dist2
2016
2017 nsgfa => basis_parameter(ikind)%nsgf
2018 nsgfb => basis_parameter(jkind)%nsgf
2019 npgfa => basis_parameter(ikind)%npgf
2020 npgfb => basis_parameter(jkind)%npgf
2021
2022 DO i_list_kl = 1, list_kl%n_element
2023
2024 katom = list_kl%elements(i_list_kl)%pair(1)
2025 latom = list_kl%elements(i_list_kl)%pair(2)
2026
2027 IF (.NOT. (katom + latom <= iatom + jatom)) cycle
2028 IF (((iatom + jatom) == (katom + latom)) .AND. (katom < iatom)) cycle
2029
2030 IF (eval_type == hfx_do_eval_forces) THEN
2031 IF (.NOT. use_virial) THEN
2032 IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) cycle
2033 END IF
2034 END IF
2035
2036 i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
2037 i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
2038 kkind = list_kl%elements(i_list_kl)%kind_pair(1)
2039 lkind = list_kl%elements(i_list_kl)%kind_pair(2)
2040 rcd2 = list_kl%elements(i_list_kl)%dist2
2041
2042 nsgfc => basis_parameter(kkind)%nsgf
2043 nsgfd => basis_parameter(lkind)%nsgf
2044 npgfc => basis_parameter(kkind)%npgf
2045 npgfd => basis_parameter(lkind)%npgf
2046
2047 IF (do_p_screening) THEN
2048 actual_pmax_atom = max(pmax_atom(katom, iatom), &
2049 pmax_atom(latom, jatom), &
2050 pmax_atom(latom, iatom), &
2051 pmax_atom(katom, jatom))
2052 ELSE
2053 actual_pmax_atom = 0.0_dp
2054 END IF
2055
2056 screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
2057 coeffs_kind(jkind, ikind)%x(2)
2058 screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
2059 coeffs_kind(lkind, kkind)%x(2)
2060 IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2061
2062 IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
2063 is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
2064 is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
2065 is_assoc_atomic_block_global(latom, jatom) >= 1)) cycle
2066
2067 IF (do_p_screening) THEN
2068 SELECT CASE (eval_type)
2069 CASE (hfx_do_eval_energy)
2070 swap_id = 0
2071 kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))
2072 IF (ikind >= kkind) THEN
2073 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2074 map_atom_to_kind_atom(katom), &
2075 map_atom_to_kind_atom(iatom))
2076 ELSE
2077 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2078 map_atom_to_kind_atom(iatom), &
2079 map_atom_to_kind_atom(katom))
2080 swap_id = swap_id + 1
2081 END IF
2082 kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))
2083 IF (jkind >= lkind) THEN
2084 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2085 map_atom_to_kind_atom(latom), &
2086 map_atom_to_kind_atom(jatom))
2087 ELSE
2088 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2089 map_atom_to_kind_atom(jatom), &
2090 map_atom_to_kind_atom(latom))
2091 swap_id = swap_id + 2
2092 END IF
2093 kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))
2094 IF (ikind >= lkind) THEN
2095 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2096 map_atom_to_kind_atom(latom), &
2097 map_atom_to_kind_atom(iatom))
2098 ELSE
2099 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2100 map_atom_to_kind_atom(iatom), &
2101 map_atom_to_kind_atom(latom))
2102 swap_id = swap_id + 4
2103 END IF
2104 kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))
2105 IF (jkind >= kkind) THEN
2106 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2107 map_atom_to_kind_atom(katom), &
2108 map_atom_to_kind_atom(jatom))
2109 ELSE
2110 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2111 map_atom_to_kind_atom(jatom), &
2112 map_atom_to_kind_atom(katom))
2113 swap_id = swap_id + 8
2114 END IF
2115 CASE (hfx_do_eval_forces)
2116 swap_id = 16
2117 kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))
2118 IF (ikind >= kkind) THEN
2119 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2120 map_atom_to_kind_atom(katom), &
2121 map_atom_to_kind_atom(iatom))
2122 ELSE
2123 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2124 map_atom_to_kind_atom(iatom), &
2125 map_atom_to_kind_atom(katom))
2126 swap_id = swap_id + 1
2127 END IF
2128 kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))
2129 IF (jkind >= lkind) THEN
2130 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2131 map_atom_to_kind_atom(latom), &
2132 map_atom_to_kind_atom(jatom))
2133 ELSE
2134 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2135 map_atom_to_kind_atom(jatom), &
2136 map_atom_to_kind_atom(latom))
2137 swap_id = swap_id + 2
2138 END IF
2139 kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))
2140 IF (ikind >= lkind) THEN
2141 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2142 map_atom_to_kind_atom(latom), &
2143 map_atom_to_kind_atom(iatom))
2144 ELSE
2145 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2146 map_atom_to_kind_atom(iatom), &
2147 map_atom_to_kind_atom(latom))
2148 swap_id = swap_id + 4
2149 END IF
2150 kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))
2151 IF (jkind >= kkind) THEN
2152 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2153 map_atom_to_kind_atom(katom), &
2154 map_atom_to_kind_atom(jatom))
2155 ELSE
2156 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2157 map_atom_to_kind_atom(jatom), &
2158 map_atom_to_kind_atom(katom))
2159 swap_id = swap_id + 8
2160 END IF
2161 END SELECT
2162 END IF
2163
2164 DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
2165 iset = set_list_ij(i_set_list_ij)%pair(1)
2166 jset = set_list_ij(i_set_list_ij)%pair(2)
2167
2168 max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
2169 coeffs_set(jset, iset, jkind, ikind)%x(2)
2170
2171 IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2172 DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
2173 kset = set_list_kl(i_set_list_kl)%pair(1)
2174 lset = set_list_kl(i_set_list_kl)%pair(2)
2175
2176 max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
2177 coeffs_set(lset, kset, lkind, kkind)%x(2))
2178
2179 IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) cycle
2180 IF (do_p_screening) THEN
2181 CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
2182 iset, jset, kset, lset, &
2183 pmax_entry, swap_id)
2184 IF (eval_type == hfx_do_eval_forces) THEN
2185 pmax_entry = log_2 + pmax_entry
2186 END IF
2187 ELSE
2188 pmax_entry = 0.0_dp
2189 END IF
2190 max_val2 = max_val2 + pmax_entry
2191 IF (max_val2 < log10_eps_schwarz) cycle
2192 SELECT CASE (eval_type)
2193 CASE (hfx_do_eval_energy)
2194 cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2195 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2196 max_val2/log10_eps_schwarz, &
2198 estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)
2199 CASE (hfx_do_eval_forces)
2200 cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2201 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2202 max_val2/log10_eps_schwarz, &
2203 p1_forces, p2_forces, p3_forces)
2204 estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)
2205 END SELECT
2206 END DO ! i_set_list_kl
2207 END DO ! i_set_list_ij
2208 END DO ! i_list_kl
2209 END DO ! i_list_ij
2210
2211 END FUNCTION estimate_block_cost
2212
2213! **************************************************************************************************
2214!> \brief ...
2215!> \param nkind ...
2216!> \param para_env ...
2217!> \param natom ...
2218!> \param block_size ...
2219!> \param nblock ...
2220!> \param blocks ...
2221!> \param list_ij ...
2222!> \param list_kl ...
2223!> \param set_list_ij ...
2224!> \param set_list_kl ...
2225!> \param particle_set ...
2226!> \param coeffs_set ...
2227!> \param coeffs_kind ...
2228!> \param is_assoc_atomic_block_global ...
2229!> \param do_periodic ...
2230!> \param kind_of ...
2231!> \param basis_parameter ...
2232!> \param pmax_set ...
2233!> \param pmax_atom ...
2234!> \param pmax_blocks ...
2235!> \param cell ...
2236!> \param do_p_screening ...
2237!> \param map_atom_to_kind_atom ...
2238!> \param eval_type ...
2239!> \param log10_eps_schwarz ...
2240!> \param log_2 ...
2241!> \param coeffs_kind_max0 ...
2242!> \param use_virial ...
2243!> \param atomic_pair_list ...
2244! **************************************************************************************************
2245 SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
2246 list_ij, list_kl, set_list_ij, set_list_kl, &
2247 particle_set, &
2248 coeffs_set, coeffs_kind, &
2249 is_assoc_atomic_block_global, do_periodic, &
2250 kind_of, basis_parameter, pmax_set, pmax_atom, &
2251 pmax_blocks, cell, &
2252 do_p_screening, map_atom_to_kind_atom, eval_type, &
2253 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
2254 atomic_pair_list)
2255
2256 INTEGER, INTENT(IN) :: nkind
2257 TYPE(mp_para_env_type), INTENT(IN) :: para_env
2258 INTEGER :: natom, block_size, nblock
2259 TYPE(hfx_block_range_type), DIMENSION(1:nblock) :: blocks
2260 TYPE(pair_list_type) :: list_ij, list_kl
2261 TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
2262 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
2263 TYPE(hfx_screen_coeff_type), &
2264 DIMENSION(:, :, :, :), POINTER :: coeffs_set
2265 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
2266 POINTER :: coeffs_kind
2267 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
2268 LOGICAL :: do_periodic
2269 INTEGER :: kind_of(*)
2270 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
2271 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
2272 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
2273 REAL(dp) :: pmax_blocks
2274 TYPE(cell_type), POINTER :: cell
2275 LOGICAL, INTENT(IN) :: do_p_screening
2276 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
2277 INTEGER, INTENT(IN) :: eval_type
2278 REAL(dp) :: log10_eps_schwarz, log_2, &
2279 coeffs_kind_max0
2280 LOGICAL, INTENT(IN) :: use_virial
2281 LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
2282
2283 INTEGER :: atom_block, i, iatom_block, iatom_end, &
2284 iatom_start, my_cpu_rank, ncpus
2285
2286 DO atom_block = 0, nblock - 1
2287 iatom_block = modulo(atom_block, nblock) + 1
2288 iatom_start = (iatom_block - 1)*block_size + 1
2289 iatom_end = min(iatom_block*block_size, natom)
2290 blocks(atom_block + 1)%istart = iatom_start
2291 blocks(atom_block + 1)%iend = iatom_end
2292 blocks(atom_block + 1)%cost = 0_int_8
2293 END DO
2294
2295 ncpus = para_env%num_pe
2296 my_cpu_rank = para_env%mepos
2297 DO i = 1, nblock
2298 IF (modulo(i, ncpus) /= my_cpu_rank) THEN
2299 blocks(i)%istart = 0
2300 blocks(i)%iend = 0
2301 cycle
2302 END IF
2303 iatom_start = blocks(i)%istart
2304 iatom_end = blocks(i)%iend
2305 blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
2306 iatom_start, iatom_end, iatom_start, iatom_end, &
2307 iatom_start, iatom_end, iatom_start, iatom_end, &
2308 particle_set, &
2309 coeffs_set, coeffs_kind, &
2310 is_assoc_atomic_block_global, do_periodic, &
2311 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
2312 cell, &
2313 do_p_screening, map_atom_to_kind_atom, eval_type, &
2314 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
2315
2316 END DO
2317 END SUBROUTINE init_blocks
2318
2319! **************************************************************************************************
2320!> \brief ...
2321!> \param para_env ...
2322!> \param x_data ...
2323!> \param iw ...
2324!> \param n_threads ...
2325!> \param i_thread ...
2326!> \param eval_type ...
2327! **************************************************************************************************
2328 SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
2329 eval_type)
2330
2331 TYPE(mp_para_env_type), INTENT(IN) :: para_env
2332 TYPE(hfx_type), POINTER :: x_data
2333 INTEGER, INTENT(IN) :: iw, n_threads, i_thread, eval_type
2334
2335 INTEGER :: i, j, k, my_rank, nbins, nranks, &
2336 total_bins
2337 INTEGER(int_8) :: avg_bin, avg_rank, max_bin, max_rank, &
2338 min_bin, min_rank, sum_bin, sum_rank
2339 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer, buffer_in, buffer_out, summary
2340 INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE :: shm_cost_vector
2341 INTEGER, ALLOCATABLE, DIMENSION(:) :: bins_per_rank, rdispl, sort_idx
2342 INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: shm_bins_per_rank, shm_displ
2343
2344 SELECT CASE (eval_type)
2345 CASE (hfx_do_eval_energy)
2346 nbins = SIZE(x_data%distribution_energy)
2347 CASE (hfx_do_eval_forces)
2348 nbins = SIZE(x_data%distribution_forces)
2349 END SELECT
2350
2351!$OMP MASTER
2352 ALLOCATE (shm_bins_per_rank(n_threads))
2353 ALLOCATE (shm_displ(n_threads + 1))
2354!$OMP END MASTER
2355!$OMP BARRIER
2356
2357 shm_bins_per_rank(i_thread + 1) = nbins
2358!$OMP BARRIER
2359 nbins = 0
2360 DO i = 1, n_threads
2361 nbins = nbins + shm_bins_per_rank(i)
2362 END DO
2363 my_rank = para_env%mepos
2364 nranks = para_env%num_pe
2365
2366!$OMP BARRIER
2367!$OMP MASTER
2368 ALLOCATE (bins_per_rank(nranks))
2369 bins_per_rank = 0
2370
2371 bins_per_rank(my_rank + 1) = nbins
2372
2373 CALL para_env%sum(bins_per_rank)
2374
2375 total_bins = 0
2376 DO i = 1, nranks
2377 total_bins = total_bins + bins_per_rank(i)
2378 END DO
2379
2380 ALLOCATE (shm_cost_vector(2*total_bins))
2381 shm_cost_vector = -1_int_8
2382 shm_displ(1) = 1
2383 DO i = 2, n_threads
2384 shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
2385 END DO
2386 shm_displ(n_threads + 1) = nbins + 1
2387!$OMP END MASTER
2388!$OMP BARRIER
2389 j = 0
2390 SELECT CASE (eval_type)
2391 CASE (hfx_do_eval_energy)
2392 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2393 j = j + 1
2394 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
2395 shm_cost_vector(2*i) = int(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, kind=int_8)
2396 END DO
2397 CASE (hfx_do_eval_forces)
2398 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2399 j = j + 1
2400 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
2401 shm_cost_vector(2*i) = int(x_data%distribution_forces(j)%time_forces*10000.0_dp, kind=int_8)
2402 END DO
2403 END SELECT
2404!$OMP BARRIER
2405!$OMP MASTER
2406 ! ** calculate offsets
2407 ALLOCATE (rdispl(nranks))
2408 bins_per_rank(:) = bins_per_rank(:)*2
2409 rdispl(1) = 0
2410 DO i = 2, nranks
2411 rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
2412 END DO
2413
2414 ALLOCATE (buffer_in(2*nbins))
2415 ALLOCATE (buffer_out(2*total_bins))
2416
2417 DO i = 1, nbins
2418 buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
2419 buffer_in(2*i) = shm_cost_vector(2*i)
2420 END DO
2421
2422 CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
2423
2424 IF (iw > 0) THEN
2425
2426 ALLOCATE (summary(2*nranks))
2427 summary = 0_int_8
2428
2429 WRITE (iw, '( /, 1X, 79("-") )')
2430 WRITE (iw, '( " -", 77X, "-" )')
2431 SELECT CASE (eval_type)
2432 CASE (hfx_do_eval_energy)
2433 WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
2434 CASE (hfx_do_eval_forces)
2435 WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
2436 END SELECT
2437 WRITE (iw, '( " -", 77X, "-" )')
2438 WRITE (iw, '( 1X, 79("-") )')
2439
2440 WRITE (iw, fmt="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
2441 WRITE (iw, '( 1X, 79("-"), / )')
2442 k = 0
2443 DO i = 1, nranks
2444 DO j = 1, bins_per_rank(i)/2
2445 k = k + 1
2446 WRITE (iw, fmt="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
2447 i - 1, j, buffer_out(2*(k - 1) + 1), real(buffer_out(2*k), dp)/10000.0_dp
2448 summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
2449 summary(2*i) = summary(2*i) + buffer_out(2*k)
2450 END DO
2451 END DO
2452
2453 !** Summary
2454 max_bin = 0_int_8
2455 min_bin = huge(min_bin)
2456 sum_bin = 0_int_8
2457 DO i = 1, total_bins
2458 sum_bin = sum_bin + buffer_out(2*i)
2459 max_bin = max(max_bin, buffer_out(2*i))
2460 min_bin = min(min_bin, buffer_out(2*i))
2461 END DO
2462 avg_bin = sum_bin/total_bins
2463
2464 max_rank = 0_int_8
2465 min_rank = huge(min_rank)
2466 sum_rank = 0_int_8
2467 DO i = 1, nranks
2468 sum_rank = sum_rank + summary(2*i)
2469 max_rank = max(max_rank, summary(2*i))
2470 min_rank = min(min_rank, summary(2*i))
2471 END DO
2472 avg_rank = sum_rank/nranks
2473
2474 WRITE (iw, fmt='(/,T3,A,/)') "SUMMARY:"
2475 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max bin", real(max_bin, dp)/10000.0_dp
2476 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min bin", real(min_bin, dp)/10000.0_dp
2477 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum bin", real(sum_bin, dp)/10000.0_dp
2478 WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg bin", real(avg_bin, dp)/10000.0_dp
2479 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max rank", real(max_rank, dp)/10000.0_dp
2480 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min rank", real(min_rank, dp)/10000.0_dp
2481 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum rank", real(sum_rank, dp)/10000.0_dp
2482 WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg rank", real(avg_rank, dp)/10000.0_dp
2483
2484 ALLOCATE (buffer(nranks))
2485 ALLOCATE (sort_idx(nranks))
2486
2487 DO i = 1, nranks
2488 buffer(i) = summary(2*i)
2489 END DO
2490
2491 CALL sort(buffer, nranks, sort_idx)
2492
2493 WRITE (iw, fmt="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
2494 DO i = nranks, 1, -1
2495 WRITE (iw, fmt="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), real(buffer(i), dp)/10000.0_dp
2496 END DO
2497
2498 DEALLOCATE (summary, buffer, sort_idx)
2499
2500 END IF
2501
2502 DEALLOCATE (buffer_in, buffer_out, rdispl)
2503
2504 CALL para_env%sync()
2505
2506 DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
2507!$OMP END MASTER
2508!$OMP BARRIER
2509
2510 END SUBROUTINE collect_load_balance_info
2511
2512
2513! **************************************************************************************************
2514!> \brief This routine calculates the maximum density matrix element, when
2515!> screening on an initial density matrix is applied. Due to symmetry of
2516!> the ERI's, there are always 4 matrix elements to be considered.
2517!> CASE 0-15 belong to an energy calculation (linear screening)
2518!> CASE 16-31 belong to a force calculation (square screening)
2519!> \param ptr_p_1 Pointers to atomic density matrices
2520!> \param ptr_p_2 Pointers to atomic density matrices
2521!> \param ptr_p_3 Pointers to atomic density matrices
2522!> \param ptr_p_4 Pointers to atomic density matrices
2523!> \param iset Current set
2524!> \param jset Current set
2525!> \param kset Current set
2526!> \param lset Current set
2527!> \param pmax_val value to be calculated
2528!> \param swap_id Defines how the matrices are accessed
2529!> \par History
2530!> 06.2009 created [Manuel Guidon]
2531!> \author Manuel Guidon
2532! **************************************************************************************************
2533PURE SUBROUTINE get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, iset, jset, kset, lset, pmax_val, swap_id)
2534
2535 REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
2536 INTEGER, INTENT(IN) :: iset, jset, kset, lset
2537
2538 REAL(dp), INTENT(OUT) :: pmax_val
2539 INTEGER, INTENT(IN) :: swap_id
2540
2541 REAL(dp) :: pmax_1, pmax_2, pmax_3, pmax_4
2542
2543 SELECT CASE (swap_id)
2544 CASE (0)
2545 pmax_1 = ptr_p_1(kset, iset)
2546 pmax_2 = ptr_p_2(lset, jset)
2547 pmax_3 = ptr_p_3(lset, iset)
2548 pmax_4 = ptr_p_4(kset, jset)
2549 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2550 CASE (1)
2551 pmax_1 = ptr_p_1(iset, kset)
2552 pmax_2 = ptr_p_2(lset, jset)
2553 pmax_3 = ptr_p_3(lset, iset)
2554 pmax_4 = ptr_p_4(kset, jset)
2555 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2556 CASE (2)
2557 pmax_1 = ptr_p_1(kset, iset)
2558 pmax_2 = ptr_p_2(jset, lset)
2559 pmax_3 = ptr_p_3(lset, iset)
2560 pmax_4 = ptr_p_4(kset, jset)
2561 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2562 CASE (3)
2563 pmax_1 = ptr_p_1(iset, kset)
2564 pmax_2 = ptr_p_2(jset, lset)
2565 pmax_3 = ptr_p_3(lset, iset)
2566 pmax_4 = ptr_p_4(kset, jset)
2567 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2568 CASE (4)
2569 pmax_1 = ptr_p_1(kset, iset)
2570 pmax_2 = ptr_p_2(lset, jset)
2571 pmax_3 = ptr_p_3(iset, lset)
2572 pmax_4 = ptr_p_4(kset, jset)
2573 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2574 CASE (5)
2575 pmax_1 = ptr_p_1(iset, kset)
2576 pmax_2 = ptr_p_2(lset, jset)
2577 pmax_3 = ptr_p_3(iset, lset)
2578 pmax_4 = ptr_p_4(kset, jset)
2579 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2580 CASE (6)
2581 pmax_1 = ptr_p_1(kset, iset)
2582 pmax_2 = ptr_p_2(jset, lset)
2583 pmax_3 = ptr_p_3(iset, lset)
2584 pmax_4 = ptr_p_4(kset, jset)
2585 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2586 CASE (7)
2587 pmax_1 = ptr_p_1(iset, kset)
2588 pmax_2 = ptr_p_2(jset, lset)
2589 pmax_3 = ptr_p_3(iset, lset)
2590 pmax_4 = ptr_p_4(kset, jset)
2591 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2592 CASE (8)
2593 pmax_1 = ptr_p_1(kset, iset)
2594 pmax_2 = ptr_p_2(lset, jset)
2595 pmax_3 = ptr_p_3(lset, iset)
2596 pmax_4 = ptr_p_4(jset, kset)
2597 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2598 CASE (9)
2599 pmax_1 = ptr_p_1(iset, kset)
2600 pmax_2 = ptr_p_2(lset, jset)
2601 pmax_3 = ptr_p_3(lset, iset)
2602 pmax_4 = ptr_p_4(jset, kset)
2603 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2604 CASE (10)
2605 pmax_1 = ptr_p_1(kset, iset)
2606 pmax_2 = ptr_p_2(jset, lset)
2607 pmax_3 = ptr_p_3(lset, iset)
2608 pmax_4 = ptr_p_4(jset, kset)
2609 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2610 CASE (11)
2611 pmax_1 = ptr_p_1(iset, kset)
2612 pmax_2 = ptr_p_2(jset, lset)
2613 pmax_3 = ptr_p_3(lset, iset)
2614 pmax_4 = ptr_p_4(jset, kset)
2615 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2616 CASE (12)
2617 pmax_1 = ptr_p_1(kset, iset)
2618 pmax_2 = ptr_p_2(lset, jset)
2619 pmax_3 = ptr_p_3(iset, lset)
2620 pmax_4 = ptr_p_4(jset, kset)
2621 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2622 CASE (13)
2623 pmax_1 = ptr_p_1(iset, kset)
2624 pmax_2 = ptr_p_2(lset, jset)
2625 pmax_3 = ptr_p_3(iset, lset)
2626 pmax_4 = ptr_p_4(jset, kset)
2627 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2628 CASE (14)
2629 pmax_1 = ptr_p_1(kset, iset)
2630 pmax_2 = ptr_p_2(jset, lset)
2631 pmax_3 = ptr_p_3(iset, lset)
2632 pmax_4 = ptr_p_4(jset, kset)
2633 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2634 CASE (15)
2635 pmax_1 = ptr_p_1(iset, kset)
2636 pmax_2 = ptr_p_2(jset, lset)
2637 pmax_3 = ptr_p_3(iset, lset)
2638 pmax_4 = ptr_p_4(jset, kset)
2639 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2640 CASE (16)
2641 pmax_1 = ptr_p_1(kset, iset)
2642 pmax_2 = ptr_p_2(lset, jset)
2643 pmax_3 = ptr_p_3(lset, iset)
2644 pmax_4 = ptr_p_4(kset, jset)
2645 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2646 CASE (17)
2647 pmax_1 = ptr_p_1(iset, kset)
2648 pmax_2 = ptr_p_2(lset, jset)
2649 pmax_3 = ptr_p_3(lset, iset)
2650 pmax_4 = ptr_p_4(kset, jset)
2651 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2652 CASE (18)
2653 pmax_1 = ptr_p_1(kset, iset)
2654 pmax_2 = ptr_p_2(jset, lset)
2655 pmax_3 = ptr_p_3(lset, iset)
2656 pmax_4 = ptr_p_4(kset, jset)
2657 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2658 CASE (19)
2659 pmax_1 = ptr_p_1(iset, kset)
2660 pmax_2 = ptr_p_2(jset, lset)
2661 pmax_3 = ptr_p_3(lset, iset)
2662 pmax_4 = ptr_p_4(kset, jset)
2663 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2664 CASE (20)
2665 pmax_1 = ptr_p_1(kset, iset)
2666 pmax_2 = ptr_p_2(lset, jset)
2667 pmax_3 = ptr_p_3(iset, lset)
2668 pmax_4 = ptr_p_4(kset, jset)
2669 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2670 CASE (21)
2671 pmax_1 = ptr_p_1(iset, kset)
2672 pmax_2 = ptr_p_2(lset, jset)
2673 pmax_3 = ptr_p_3(iset, lset)
2674 pmax_4 = ptr_p_4(kset, jset)
2675 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2676 CASE (22)
2677 pmax_1 = ptr_p_1(kset, iset)
2678 pmax_2 = ptr_p_2(jset, lset)
2679 pmax_3 = ptr_p_3(iset, lset)
2680 pmax_4 = ptr_p_4(kset, jset)
2681 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2682 CASE (23)
2683 pmax_1 = ptr_p_1(iset, kset)
2684 pmax_2 = ptr_p_2(jset, lset)
2685 pmax_3 = ptr_p_3(iset, lset)
2686 pmax_4 = ptr_p_4(kset, jset)
2687 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2688 CASE (24)
2689 pmax_1 = ptr_p_1(kset, iset)
2690 pmax_2 = ptr_p_2(lset, jset)
2691 pmax_3 = ptr_p_3(lset, iset)
2692 pmax_4 = ptr_p_4(jset, kset)
2693 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2694 CASE (25)
2695 pmax_1 = ptr_p_1(iset, kset)
2696 pmax_2 = ptr_p_2(lset, jset)
2697 pmax_3 = ptr_p_3(lset, iset)
2698 pmax_4 = ptr_p_4(jset, kset)
2699 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2700 CASE (26)
2701 pmax_1 = ptr_p_1(kset, iset)
2702 pmax_2 = ptr_p_2(jset, lset)
2703 pmax_3 = ptr_p_3(lset, iset)
2704 pmax_4 = ptr_p_4(jset, kset)
2705 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2706 CASE (27)
2707 pmax_1 = ptr_p_1(iset, kset)
2708 pmax_2 = ptr_p_2(jset, lset)
2709 pmax_3 = ptr_p_3(lset, iset)
2710 pmax_4 = ptr_p_4(jset, kset)
2711 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2712 CASE (28)
2713 pmax_1 = ptr_p_1(kset, iset)
2714 pmax_2 = ptr_p_2(lset, jset)
2715 pmax_3 = ptr_p_3(iset, lset)
2716 pmax_4 = ptr_p_4(jset, kset)
2717 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2718 CASE (29)
2719 pmax_1 = ptr_p_1(iset, kset)
2720 pmax_2 = ptr_p_2(lset, jset)
2721 pmax_3 = ptr_p_3(iset, lset)
2722 pmax_4 = ptr_p_4(jset, kset)
2723 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2724 CASE (30)
2725 pmax_1 = ptr_p_1(kset, iset)
2726 pmax_2 = ptr_p_2(jset, lset)
2727 pmax_3 = ptr_p_3(iset, lset)
2728 pmax_4 = ptr_p_4(jset, kset)
2729 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2730 CASE (31)
2731 pmax_1 = ptr_p_1(iset, kset)
2732 pmax_2 = ptr_p_2(jset, lset)
2733 pmax_3 = ptr_p_3(iset, lset)
2734 pmax_4 = ptr_p_4(jset, kset)
2735 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2736 END SELECT
2737
2738END SUBROUTINE get_pmax_val
2739
2740
2741END MODULE hfx_load_balance_methods
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
Handles all functions related to the CELL.
Definition cell_types.F:15
Utility routines to open and close files. Tracking of preconnections.
Definition cp_files.F:16
subroutine, public open_file(file_name, file_status, file_form, file_action, file_position, file_pad, unit_number, debug, skip_get_unit_number, file_access)
Opens the requested file using a free unit number.
Definition cp_files.F:311
subroutine, public close_file(unit_number, file_status, keep_preconnection)
Close an open file given by its logical unit number. Optionally, keep the file and unit preconnected.
Definition cp_files.F:122
Routines for optimizing load balance between processes in HFX calculations.
real(kind=dp), dimension(12), parameter, public p1_energy
real(kind=dp), dimension(2), parameter, public p3_energy
real(kind=dp), dimension(12), parameter, public p2_energy
subroutine, public collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, eval_type)
...
subroutine, public hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, coeffs_set, coeffs_kind, is_assoc_atomic_block_global, do_periodic, load_balance_parameter, kind_of, basis_parameter, pmax_set, pmax_atom, i_thread, n_threads, cell, do_p_screening, map_atom_to_kind_atom, nkind, eval_type, pmax_block, use_virial)
Distributes the computation of eri's to all available processes.
integer(kind=int_8) function, public cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3)
estimates the cost of a set quartet with info available at load balance time i.e. without much info o...
subroutine, public hfx_update_load_balance(x_data, para_env, load_balance_parameter, i_thread, n_threads, eval_type)
Cheap way of redistributing the eri's.
Routines for optimizing load balance between processes in HFX calculations.
subroutine, public build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, blocks)
...
subroutine, public build_pair_list(natom, list, set_list, i_start, i_end, j_start, j_end, kind_of, basis_parameter, particle_set, do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
...
Types and set/get functions for HFX.
Definition hfx_types.F:16
subroutine, public hfx_set_distr_energy(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the energy
Definition hfx_types.F:2639
subroutine, public hfx_set_distr_forces(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the forces
Definition hfx_types.F:2659
collects all constants needed in input so that they can be used without circular dependencies
integer, parameter, public hfx_do_eval_energy
integer, parameter, public hfx_do_eval_forces
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
Interface to the message passing library MPI.
Parallel (pseudo)random number generator (RNG) for multiple streams and substreams of random numbers.
integer, parameter, public uniform
Define the data structure for the particle information.
All kind of helpful little routines.
Definition util.F:14
Type defining parameters related to the simulation cell.
Definition cell_types.F:60
stores some data used in construction of Kohn-Sham matrix
Definition hfx_types.F:514
stores all the informations relevant to an mpi environment