(git:374b731)
Loading...
Searching...
No Matches
hfx_load_balance_methods.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Routines for optimizing load balance between processes in HFX calculations
10!> \par History
11!> 04.2008 created [Manuel Guidon]
12!> \author Manuel Guidon
13! **************************************************************************************************
15 USE cell_types, ONLY: cell_type
16 USE cp_files, ONLY: close_file, &
21 USE hfx_types, ONLY: &
27 USE kinds, ONLY: dp, &
28 int_8
30 USE parallel_rng_types, ONLY: uniform, &
33 USE util, ONLY: sort
34#include "./base/base_uses.f90"
35
36 IMPLICIT NONE
37 PRIVATE
38
39 PUBLIC :: hfx_load_balance, &
42
43 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
44
45 REAL(kind=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
46 -1.91570128356921242e-002_dp, -1.6668495454436603_dp, &
47 1.7512639006523709_dp, -9.76074323945336081e-002_dp, &
48 2.6230786127311889_dp, -0.31870737623014189_dp, &
49 7.9588203912690973_dp, 1.8331423413134813_dp, &
50 -0.15427618665346299_dp, 0.19749436090711650_dp/)
51 REAL(kind=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
52 -9.36564055598656797e-002_dp, 0.64284973765086939_dp, &
53 1.0137565430060556_dp, -6.80088178288954567e-003_dp, &
54 1.1692629207374552_dp, -2.6314710080507573_dp, &
55 19.237814781880786_dp, 1.0505934173661349_dp, &
56 0.80382371955699250_dp, 0.49903401991818103_dp/)
57 REAL(kind=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350e-002_dp, 0.38073304105744837_dp/)
58 REAL(kind=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
59 -9.41673106447732111e-002_dp, 0.94568006899317825_dp, &
60 -1.4511897117448544_dp, 0.59178934677316952_dp, &
61 2.7291149361757236_dp, -0.50555512044800210_dp, &
62 8.3508180969609871_dp, 1.6829982496141809_dp, &
63 -0.74895370472152600_dp, 0.43801726744197500_dp/)
64 REAL(kind=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
65 5.33216585432061581e-003_dp, 0.45572145697283628_dp, &
66 1.8119743851500618_dp, -0.12533918548421166_dp, &
67 -1.4040312084552751_dp, -4.5331650463917859_dp, &
68 12.593431549069477_dp, 1.1311978374487595_dp, &
69 1.4245996087624646_dp, 1.1425350529853495_dp/)
70 REAL(kind=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)
71
72!***
73
74CONTAINS
75
76! **************************************************************************************************
77!> \brief Distributes the computation of eri's to all available processes.
78!> \param x_data Object that stores the indices array
79!> \param eps_schwarz screening parameter
80!> \param particle_set , atomic_kind_set, para_env ...
81!> \param max_set Maximum number of set to be considered
82!> \param para_env para_env
83!> \param coeffs_set screening functions
84!> \param coeffs_kind screening functions
85!> \param is_assoc_atomic_block_global KS-matrix sparsity
86!> \param do_periodic flag for periodicity
87!> \param load_balance_parameter Parameters for Monte-Carlo routines
88!> \param kind_of helper array for mapping
89!> \param basis_parameter Basis set parameters
90!> \param pmax_set Initial screening matrix
91!> \param pmax_atom ...
92!> \param i_thread Process ID of current Thread
93!> \param n_threads Total Number of Threads
94!> \param cell cell
95!> \param do_p_screening Flag for initial p screening
96!> \param map_atom_to_kind_atom ...
97!> \param nkind ...
98!> \param eval_type ...
99!> \param pmax_block ...
100!> \param use_virial ...
101!> \par History
102!> 06.2007 created [Manuel Guidon]
103!> 08.2007 new parallel scheme [Manuel Guidon]
104!> 09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
105!> 11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
106!> 02.2009 completely refactored [Manuel Guidon]
107!> \author Manuel Guidon
108!> \note
109!> The optimization is done via a binning procedure followed by simple
110!> Monte Carlo procedure:
111!> In a first step the total amount of integrals in the system is calculated,
112!> taking into account the sparsity of the KS-matrix , the screening based
113!> on near/farfield approximations and if desired the screening on an initial
114!> density matrix.
115!> In a second step, bins are generate that contain approximately the same number
116!> of integrals, and a cost for these bins is estimated (currently the number of integrals)
117!> In a third step, a Monte Carlo procedure optimizes the assignment
118!> of the different loads to each process
119!> At the end each process owns an unique array of *atomic* indices-ranges
120!> that are used to decide whether a process has to calculate a certain
121!> bunch of integrals or not
122! **************************************************************************************************
123 SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
124 coeffs_set, coeffs_kind, &
125 is_assoc_atomic_block_global, do_periodic, &
126 load_balance_parameter, kind_of, basis_parameter, pmax_set, &
127 pmax_atom, i_thread, n_threads, cell, &
128 do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
129 pmax_block, use_virial)
130 TYPE(hfx_type), POINTER :: x_data
131 REAL(dp), INTENT(IN) :: eps_schwarz
132 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
133 INTEGER, INTENT(IN) :: max_set
134 TYPE(mp_para_env_type), POINTER :: para_env
135 TYPE(hfx_screen_coeff_type), &
136 DIMENSION(:, :, :, :), POINTER :: coeffs_set
137 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
138 POINTER :: coeffs_kind
139 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
140 LOGICAL :: do_periodic
141 TYPE(hfx_load_balance_type), POINTER :: load_balance_parameter
142 INTEGER :: kind_of(*)
143 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
144 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
145 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
146 INTEGER, INTENT(IN) :: i_thread, n_threads
147 TYPE(cell_type), POINTER :: cell
148 LOGICAL, INTENT(IN) :: do_p_screening
149 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
150 INTEGER, INTENT(IN) :: nkind, eval_type
151 REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
152 LOGICAL, INTENT(IN) :: use_virial
153
154 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_load_balance'
155
156 CHARACTER(LEN=512) :: error_msg
157 INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
158 handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
159 jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
160 latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
161 new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
162 objective_block_size, objective_nblocks, source, total_blocks
163 TYPE(mp_request_type), DIMENSION(2) :: req
164 INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
165 distribution_counter_end, distribution_counter_start, global_quartet_counter, &
166 local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
167 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer_in, buffer_out
168 INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
169 sendbuffer, swapbuffer
170 INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
171 INTEGER(int_8), SAVE :: shm_global_quartet_counter, &
172 shm_local_quartet_counter
173 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, tmp_index, tmp_pos, &
174 to_be_sorted
175 INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
176 INTEGER, SAVE :: shm_nblocks
177 LOGICAL :: changed, last_bin_needs_to_be_filled, &
178 optimized
179 LOGICAL, DIMENSION(:, :), POINTER, SAVE :: atomic_pair_list
180 REAL(dp) :: coeffs_kind_max0, log10_eps_schwarz, &
181 log_2, pmax_blocks
182 TYPE(hfx_block_range_type), DIMENSION(:), POINTER :: blocks_guess, tmp_blocks, tmp_blocks2
183 TYPE(hfx_block_range_type), DIMENSION(:), &
184 POINTER, SAVE :: shm_blocks
185 TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
186 TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
187 SAVE :: full_dist
188 TYPE(pair_list_type) :: list_ij, list_kl
189 TYPE(pair_set_list_type), ALLOCATABLE, &
190 DIMENSION(:) :: set_list_ij, set_list_kl
191
192!$OMP BARRIER
193!$OMP MASTER
194 CALL timeset(routinen, handle)
195!$OMP END MASTER
196!$OMP BARRIER
197
198 log10_eps_schwarz = log10(eps_schwarz)
199 log_2 = log10(2.0_dp)
200 coeffs_kind_max0 = maxval(coeffs_kind(:, :)%x(2))
201 ncpu = para_env%num_pe
202 n_processes = ncpu*n_threads
203 natom = SIZE(particle_set)
204
205 block_size = load_balance_parameter%block_size
206 ALLOCATE (set_list_ij((max_set*block_size)**2))
207 ALLOCATE (set_list_kl((max_set*block_size)**2))
208
209 IF (.NOT. load_balance_parameter%blocks_initialized) THEN
210!$OMP BARRIER
211!$OMP MASTER
212 CALL timeset(routinen//"_range", handle_range)
213
214 nblocks = max((natom + block_size - 1)/block_size, 1)
215 ALLOCATE (blocks_guess(nblocks))
216 ALLOCATE (tmp_blocks(natom))
217 ALLOCATE (tmp_blocks2(natom))
218
219 pmax_blocks = 0.0_dp
220 SELECT CASE (eval_type)
221 CASE (hfx_do_eval_energy)
222 atomic_pair_list => x_data%atomic_pair_list
223 CASE (hfx_do_eval_forces)
224 atomic_pair_list => x_data%atomic_pair_list_forces
225 END SELECT
226 atomic_pair_list = .true.
227 CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
228 list_ij, list_kl, set_list_ij, set_list_kl, &
229 particle_set, &
230 coeffs_set, coeffs_kind, &
231 is_assoc_atomic_block_global, do_periodic, &
232 kind_of, basis_parameter, pmax_set, pmax_atom, &
233 pmax_blocks, cell, &
234 do_p_screening, map_atom_to_kind_atom, eval_type, &
235 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
236
237 total_block_self_cost = 0
238
239 DO i = 1, nblocks
240 total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
241 END DO
242
243 CALL para_env%sum(total_block_self_cost)
244
245 objective_block_size = load_balance_parameter%block_size
246 objective_nblocks = max((natom + objective_block_size - 1)/objective_block_size, 1)
247
248 self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
249
250 DO i = 1, nblocks
251 tmp_blocks2(i) = blocks_guess(i)
252 END DO
253
254 optimized = .false.
255 i = 0
256 DO WHILE (.NOT. optimized)
257 i = i + 1
258 current_block_id = 0
259 changed = .false.
260 DO atom_block = 1, nblocks
261 current_block_id = current_block_id + 1
262 iatom_start = tmp_blocks2(atom_block)%istart
263 iatom_end = tmp_blocks2(atom_block)%iend
264 IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
265 changed = .true.
266 new_iatom_start = iatom_start
267 new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
268 new_jatom_start = new_iatom_end + 1
269 new_jatom_end = iatom_end
270 tmp_blocks(current_block_id)%istart = new_iatom_start
271 tmp_blocks(current_block_id)%iend = new_iatom_end
272 tmp_blocks(current_block_id)%cost = estimate_block_cost( &
273 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
274 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
275 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
276 particle_set, &
277 coeffs_set, coeffs_kind, &
278 is_assoc_atomic_block_global, do_periodic, &
279 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
280 cell, &
281 do_p_screening, map_atom_to_kind_atom, eval_type, &
282 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
283 current_block_id = current_block_id + 1
284 tmp_blocks(current_block_id)%istart = new_jatom_start
285 tmp_blocks(current_block_id)%iend = new_jatom_end
286 tmp_blocks(current_block_id)%cost = estimate_block_cost( &
287 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
288 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
289 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
290 particle_set, &
291 coeffs_set, coeffs_kind, &
292 is_assoc_atomic_block_global, do_periodic, &
293 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
294 cell, &
295 do_p_screening, map_atom_to_kind_atom, eval_type, &
296 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
297 ELSE
298 tmp_blocks(current_block_id)%istart = iatom_start
299 tmp_blocks(current_block_id)%iend = iatom_end
300 tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
301 END IF
302 END DO
303 IF (.NOT. changed) optimized = .true.
304 IF (i > 20) optimized = .true.
305 nblocks = current_block_id
306 DO atom_block = 1, nblocks
307 tmp_blocks2(atom_block) = tmp_blocks(atom_block)
308 END DO
309 END DO
310
311 DEALLOCATE (tmp_blocks2)
312
313 ! ** count number of non empty blocks on each node
314 non_empty_blocks = 0
315 DO atom_block = 1, nblocks
316 IF (tmp_blocks(atom_block)%istart == 0) cycle
317 non_empty_blocks = non_empty_blocks + 1
318 END DO
319
320 ALLOCATE (rcount(ncpu))
321 rcount = 0
322 rcount(para_env%mepos + 1) = non_empty_blocks
323 CALL para_env%sum(rcount)
324
325 ! ** sum all non_empty_blocks
326 total_blocks = 0
327 DO i = 1, ncpu
328 total_blocks = total_blocks + rcount(i)
329 END DO
330
331 ! ** calculate offsets
332 ALLOCATE (rdispl(ncpu))
333 rcount(:) = rcount(:)*3
334 rdispl(1) = 0
335 DO i = 2, ncpu
336 rdispl(i) = rdispl(i - 1) + rcount(i - 1)
337 END DO
338
339 ALLOCATE (buffer_in(3*non_empty_blocks))
340
341 non_empty_blocks = 0
342 DO atom_block = 1, nblocks
343 IF (tmp_blocks(atom_block)%istart == 0) cycle
344 buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
345 buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
346 buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
347 non_empty_blocks = non_empty_blocks + 1
348 END DO
349
350 nblocks = total_blocks
351
352 ALLOCATE (tmp_blocks2(nblocks))
353
354 ALLOCATE (buffer_out(3*nblocks))
355
356 ! ** Gather all three arrays
357 CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
358
359 DO i = 1, nblocks
360 tmp_blocks2(i)%istart = int(buffer_out((i - 1)*3 + 1))
361 tmp_blocks2(i)%iend = int(buffer_out((i - 1)*3 + 2))
362 tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
363 END DO
364
365 ! ** Now we sort the blocks
366 ALLOCATE (to_be_sorted(nblocks))
367 ALLOCATE (tmp_index(nblocks))
368
369 DO atom_block = 1, nblocks
370 to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
371 END DO
372
373 CALL sort(to_be_sorted, nblocks, tmp_index)
374
375 ALLOCATE (x_data%blocks(nblocks))
376
377 DO atom_block = 1, nblocks
378 x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
379 END DO
380
381 shm_blocks => x_data%blocks
382 shm_nblocks = nblocks
383
384 ! ** Set nblocks in structure
385 load_balance_parameter%nblocks = nblocks
386
387 DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
388
389 DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
390
391 load_balance_parameter%blocks_initialized = .true.
392
393 x_data%blocks = shm_blocks
394 load_balance_parameter%nblocks = shm_nblocks
395 load_balance_parameter%blocks_initialized = .true.
396
397 ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
398 x_data%pmax_block = 0.0_dp
399 pmax_block => x_data%pmax_block
400 CALL timestop(handle_range)
401!$OMP END MASTER
402!$OMP BARRIER
403
404 IF (.NOT. load_balance_parameter%blocks_initialized) THEN
405 ALLOCATE (x_data%blocks(shm_nblocks))
406 x_data%blocks = shm_blocks
407 load_balance_parameter%nblocks = shm_nblocks
408 load_balance_parameter%blocks_initialized = .true.
409 END IF
410 !! ** precalculate maximum density matrix elements in blocks
411!$OMP BARRIER
412 END IF
413
414!$OMP BARRIER
415!$OMP MASTER
416 pmax_block => x_data%pmax_block
417 pmax_block = 0.0_dp
418 IF (do_p_screening) THEN
419 DO iatom_block = 1, shm_nblocks
420 iatom_start = x_data%blocks(iatom_block)%istart
421 iatom_end = x_data%blocks(iatom_block)%iend
422 DO jatom_block = 1, shm_nblocks
423 jatom_start = x_data%blocks(jatom_block)%istart
424 jatom_end = x_data%blocks(jatom_block)%iend
425 pmax_block(iatom_block, jatom_block) = maxval(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
426 END DO
427 END DO
428 END IF
429
430 SELECT CASE (eval_type)
431 CASE (hfx_do_eval_energy)
432 atomic_pair_list => x_data%atomic_pair_list
433 CASE (hfx_do_eval_forces)
434 atomic_pair_list => x_data%atomic_pair_list_forces
435 END SELECT
436 CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
437 do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
438 x_data%blocks)
439
440!$OMP END MASTER
441!$OMP BARRIER
442
443 !! If there is only 1 cpu skip the binning
444 IF (n_processes == 1) THEN
445 ALLOCATE (tmp_dist(1))
446 tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
447 tmp_dist(1)%istart = 0_int_8
448 ptr_to_tmp_dist => tmp_dist(:)
449 SELECT CASE (eval_type)
450 CASE (hfx_do_eval_energy)
451 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
452 CASE (hfx_do_eval_forces)
453 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
454 END SELECT
455 DEALLOCATE (tmp_dist)
456 ELSE
457 !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
458!$OMP BARRIER
459!$OMP MASTER
460 CALL timeset(routinen//"_count", handle_inner)
461!$OMP END MASTER
462!$OMP BARRIER
463
464 cost_per_core = 0_int_8
465 my_process_id = para_env%mepos*n_threads + i_thread
466 nblocks = load_balance_parameter%nblocks
467
468 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
469
470 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
471 tmp_block = atom_block/nblocks
472 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
473 IF (latom_block < katom_block) cycle
474 tmp_block = tmp_block/nblocks
475 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
476 tmp_block = tmp_block/nblocks
477 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
478 IF (jatom_block < iatom_block) cycle
479
480 iatom_start = x_data%blocks(iatom_block)%istart
481 iatom_end = x_data%blocks(iatom_block)%iend
482 jatom_start = x_data%blocks(jatom_block)%istart
483 jatom_end = x_data%blocks(jatom_block)%iend
484 katom_start = x_data%blocks(katom_block)%istart
485 katom_end = x_data%blocks(katom_block)%iend
486 latom_start = x_data%blocks(latom_block)%istart
487 latom_end = x_data%blocks(latom_block)%iend
488
489 SELECT CASE (eval_type)
490 CASE (hfx_do_eval_energy)
491 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
492 pmax_block(latom_block, jatom_block), &
493 pmax_block(latom_block, iatom_block), &
494 pmax_block(katom_block, jatom_block))
495 CASE (hfx_do_eval_forces)
496 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
497 pmax_block(latom_block, jatom_block), &
498 pmax_block(latom_block, iatom_block) + &
499 pmax_block(katom_block, jatom_block))
500 END SELECT
501
502 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
503
504 cost_per_core = cost_per_core &
505 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
506 iatom_start, iatom_end, jatom_start, jatom_end, &
507 katom_start, katom_end, latom_start, latom_end, &
508 particle_set, &
509 coeffs_set, coeffs_kind, &
510 is_assoc_atomic_block_global, do_periodic, &
511 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
512 cell, &
513 do_p_screening, map_atom_to_kind_atom, eval_type, &
514 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
515
516 END DO ! atom_block
517
518 nbins = load_balance_parameter%nbins
519 cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
520
521!$OMP BARRIER
522!$OMP MASTER
523 CALL timestop(handle_inner)
524!$OMP END MASTER
525!$OMP BARRIER
526
527! new load balancing test
528 IF (.false.) THEN
529 CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
530 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
531 particle_set, &
532 coeffs_set, coeffs_kind, &
533 is_assoc_atomic_block_global, do_periodic, &
534 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
535 cell, x_data, para_env, pmax_block, &
536 do_p_screening, map_atom_to_kind_atom, eval_type, &
537 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
538 END IF
539
540!$OMP BARRIER
541!$OMP MASTER
542 CALL timeset(routinen//"_bin", handle_inner)
543!$OMP END MASTER
544!$OMP BARRIER
545
546 ALLOCATE (binned_dist(nbins))
547 binned_dist(:)%istart = -1_int_8
548 binned_dist(:)%number_of_atom_quartets = 0_int_8
549 binned_dist(:)%cost = 0_int_8
550 binned_dist(:)%time_first_scf = 0.0_dp
551 binned_dist(:)%time_other_scf = 0.0_dp
552 binned_dist(:)%time_forces = 0.0_dp
553
554 current_cost = 0
555 mepos = 1
556 distribution_counter_start = 1
557 distribution_counter_end = 0
558 ibin = 1
559
560 global_quartet_counter = 0
561 local_quartet_counter = 0
562 last_bin_needs_to_be_filled = .false.
563 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
564 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
565 tmp_block = atom_block/nblocks
566 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
567 IF (latom_block < katom_block) cycle
568 tmp_block = tmp_block/nblocks
569 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
570 tmp_block = tmp_block/nblocks
571 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
572 IF (jatom_block < iatom_block) cycle
573
574 distribution_counter_end = distribution_counter_end + 1
575 global_quartet_counter = global_quartet_counter + 1
576 last_bin_needs_to_be_filled = .true.
577
578 IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
579
580 iatom_start = x_data%blocks(iatom_block)%istart
581 iatom_end = x_data%blocks(iatom_block)%iend
582 jatom_start = x_data%blocks(jatom_block)%istart
583 jatom_end = x_data%blocks(jatom_block)%iend
584 katom_start = x_data%blocks(katom_block)%istart
585 katom_end = x_data%blocks(katom_block)%iend
586 latom_start = x_data%blocks(latom_block)%istart
587 latom_end = x_data%blocks(latom_block)%iend
588
589 SELECT CASE (eval_type)
590 CASE (hfx_do_eval_energy)
591 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
592 pmax_block(latom_block, jatom_block), &
593 pmax_block(latom_block, iatom_block), &
594 pmax_block(katom_block, jatom_block))
595 CASE (hfx_do_eval_forces)
596 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
597 pmax_block(latom_block, jatom_block), &
598 pmax_block(latom_block, iatom_block) + &
599 pmax_block(katom_block, jatom_block))
600 END SELECT
601
602 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
603
604 current_cost = current_cost &
605 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
606 iatom_start, iatom_end, jatom_start, jatom_end, &
607 katom_start, katom_end, latom_start, latom_end, &
608 particle_set, &
609 coeffs_set, coeffs_kind, &
610 is_assoc_atomic_block_global, do_periodic, &
611 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
612 cell, &
613 do_p_screening, map_atom_to_kind_atom, eval_type, &
614 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
615
616 IF (current_cost >= cost_per_bin) THEN
617 IF (ibin == nbins) THEN
618 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
619 distribution_counter_end - distribution_counter_start + 1
620 ELSE
621 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
622 END IF
623 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
624 ibin = min(ibin + 1, nbins)
625 distribution_counter_start = distribution_counter_end + 1
626 current_cost = 0
627 last_bin_needs_to_be_filled = .false.
628 END IF
629 END DO
630
631!$OMP BARRIER
632!$OMP MASTER
633 CALL timestop(handle_inner)
634 CALL timeset(routinen//"_dist", handle_inner)
635!$OMP END MASTER
636!$OMP BARRIER
637 !! Fill the last bin if necessary
638 IF (last_bin_needs_to_be_filled) THEN
639 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
640 IF (ibin == nbins) THEN
641 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
642 distribution_counter_end - distribution_counter_start + 1
643 ELSE
644 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
645 END IF
646 END IF
647
648 !! Sanity-Check
649 DO ibin = 1, nbins
650 local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
651 END DO
652!$OMP BARRIER
653!$OMP MASTER
654 shm_local_quartet_counter = 0
655 shm_global_quartet_counter = 0
656!$OMP END MASTER
657!$OMP BARRIER
658!$OMP ATOMIC
659 shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
660!$OMP ATOMIC
661 shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
662
663!$OMP BARRIER
664!$OMP MASTER
665 CALL para_env%sum(shm_local_quartet_counter)
666 CALL para_env%sum(shm_global_quartet_counter)
667 IF (para_env%is_source()) THEN
668 IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
669 WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
670 "Number of local quartets (", shm_local_quartet_counter, &
671 ") and number of global quartets (", shm_global_quartet_counter, &
672 ") are different. Please send in a bug report."
673 cpabort(error_msg)
674 END IF
675 END IF
676!$OMP END MASTER
677
678!$OMP BARRIER
679!$OMP MASTER
680 ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
681 cost_matrix = 0
682!$OMP END MASTER
683!$OMP BARRIER
684 icpu = para_env%mepos + 1
685 DO i = 1, nbins
686 cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
687 END DO
688 mepos = para_env%mepos
689!$OMP BARRIER
690
691!$OMP MASTER
692 ! sync before/after ring of isendrecv
693 CALL para_env%sync()
694
695 ALLOCATE (sendbuffer(nbins*n_threads))
696 ALLOCATE (recbuffer(nbins*n_threads))
697
698 sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
699
700 dest = modulo(mepos + 1, ncpu)
701 source = modulo(mepos - 1, ncpu)
702 DO icpu = 0, ncpu - 1
703 IF (icpu .NE. ncpu - 1) THEN
704 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
705 req(1), req(2), 13)
706 END IF
707 data_from = modulo(mepos - icpu, ncpu)
708 cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
709 IF (icpu .NE. ncpu - 1) THEN
710 CALL mp_waitall(req)
711 END IF
712 swapbuffer => sendbuffer
713 sendbuffer => recbuffer
714 recbuffer => swapbuffer
715 END DO
716 DEALLOCATE (recbuffer, sendbuffer)
717!$OMP END MASTER
718!$OMP BARRIER
719
720!$OMP BARRIER
721!$OMP MASTER
722 CALL timestop(handle_inner)
723 CALL timeset(routinen//"_opt", handle_inner)
724!$OMP END MASTER
725!$OMP BARRIER
726
727 !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
728!$OMP BARRIER
729 ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
730 local_cost_matrix = cost_matrix
731!$OMP MASTER
732 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
733
734 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
735 shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
736
737 CALL timestop(handle_inner)
738 CALL timeset(routinen//"_redist", handle_inner)
739 !! Collect local data to global array
740 ALLOCATE (full_dist(ncpu*n_threads, nbins))
741
742 full_dist(:, :)%istart = 0_int_8
743 full_dist(:, :)%number_of_atom_quartets = 0_int_8
744 full_dist(:, :)%cost = 0_int_8
745 full_dist(:, :)%time_first_scf = 0.0_dp
746 full_dist(:, :)%time_other_scf = 0.0_dp
747 full_dist(:, :)%time_forces = 0.0_dp
748!$OMP END MASTER
749!$OMP BARRIER
750 mepos = para_env%mepos + 1
751 full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
752
753!$OMP BARRIER
754!$OMP MASTER
755 ALLOCATE (sendbuffer(3*nbins*n_threads))
756 ALLOCATE (recbuffer(3*nbins*n_threads))
757 mepos = para_env%mepos
758 DO j = 1, n_threads
759 DO i = 1, nbins
760 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
761 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
762 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
763 END DO
764 END DO
765
766 ! sync before/after ring of isendrecv
767 CALL para_env%sync()
768 dest = modulo(mepos + 1, ncpu)
769 source = modulo(mepos - 1, ncpu)
770 DO icpu = 0, ncpu - 1
771 IF (icpu .NE. ncpu - 1) THEN
772 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
773 req(1), req(2), 13)
774 END IF
775 data_from = modulo(mepos - icpu, ncpu)
776 DO j = 1, n_threads
777 DO i = 1, nbins
778 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
779 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
780 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
781 END DO
782 END DO
783
784 IF (icpu .NE. ncpu - 1) THEN
785 CALL mp_waitall(req)
786 END IF
787 swapbuffer => sendbuffer
788 sendbuffer => recbuffer
789 recbuffer => swapbuffer
790 END DO
791 DEALLOCATE (recbuffer, sendbuffer)
792
793 ! sync before/after ring of isendrecv
794 CALL para_env%sync()
795!$OMP END MASTER
796!$OMP BARRIER
797 !! reorder the distribution according to the distribution vector
798 ALLOCATE (tmp_pos(ncpu*n_threads))
799 tmp_pos = 1
800 ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
801
802 tmp_dist(:)%istart = 0_int_8
803 tmp_dist(:)%number_of_atom_quartets = 0_int_8
804 tmp_dist(:)%cost = 0_int_8
805 tmp_dist(:)%time_first_scf = 0.0_dp
806 tmp_dist(:)%time_other_scf = 0.0_dp
807 tmp_dist(:)%time_forces = 0.0_dp
808
809 DO icpu = 1, n_processes
810 DO i = 1, nbins
811 mepos = my_process_id + 1
812 IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
813 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
814 tmp_pos(mepos) = tmp_pos(mepos) + 1
815 END IF
816 END DO
817 END DO
818
819 !! Assign the load to each process
820 NULLIFY (ptr_to_tmp_dist)
821 mepos = my_process_id + 1
822 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
823 SELECT CASE (eval_type)
824 CASE (hfx_do_eval_energy)
825 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
826 CASE (hfx_do_eval_forces)
827 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
828 END SELECT
829
830!$OMP BARRIER
831!$OMP MASTER
832 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
833!$OMP END MASTER
834!$OMP BARRIER
835 DEALLOCATE (tmp_dist, tmp_pos)
836 DEALLOCATE (binned_dist, local_cost_matrix)
837 DEALLOCATE (set_list_ij, set_list_kl)
838
839!$OMP BARRIER
840!$OMP MASTER
841 CALL timestop(handle_inner)
842!$OMP END MASTER
843!$OMP BARRIER
844 END IF
845!$OMP BARRIER
846!$OMP MASTER
847 CALL timestop(handle)
848!$OMP END MASTER
849!$OMP BARRIER
850 END SUBROUTINE hfx_load_balance
851
852! **************************************************************************************************
853!> \brief Reference implementation of new recursive load balancing routine
854!> Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
855!> each process in a P-Q grid such that every process has more or less the
856!> same amount of work. Has no output at the moment (not used) but writes
857!> its computed load balance values into a file. Possible output is ready
858!> to use in the two arrays p_atom_blocks & q_atom_blocks
859!> \param n_processes ...
860!> \param my_process_id ...
861!> \param nblocks ...
862!> \param natom ...
863!> \param nkind ...
864!> \param list_ij ...
865!> \param list_kl ...
866!> \param set_list_ij ...
867!> \param set_list_kl ...
868!> \param particle_set ...
869!> \param coeffs_set ...
870!> \param coeffs_kind ...
871!> \param is_assoc_atomic_block_global ...
872!> \param do_periodic ...
873!> \param kind_of ...
874!> \param basis_parameter ...
875!> \param pmax_set ...
876!> \param pmax_atom ...
877!> \param pmax_blocks ...
878!> \param cell ...
879!> \param x_data ...
880!> \param para_env ...
881!> \param pmax_block ...
882!> \param do_p_screening ...
883!> \param map_atom_to_kind_atom ...
884!> \param eval_type ...
885!> \param log10_eps_schwarz ...
886!> \param log_2 ...
887!> \param coeffs_kind_max0 ...
888!> \param use_virial ...
889!> \param atomic_pair_list ...
890!> \par History
891!> 03.2011 created [Michael Steinlechner]
892!> \author Michael Steinlechner
893! **************************************************************************************************
894
895 SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
896 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
897 particle_set, &
898 coeffs_set, coeffs_kind, &
899 is_assoc_atomic_block_global, do_periodic, &
900 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
901 cell, x_data, para_env, pmax_block, &
902 do_p_screening, map_atom_to_kind_atom, eval_type, &
903 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
904
905! input variables:
906 INTEGER, INTENT(IN) :: n_processes, my_process_id, nblocks, &
907 natom, nkind
908 TYPE(pair_list_type), INTENT(IN) :: list_ij, list_kl
909 TYPE(pair_set_list_type), ALLOCATABLE, &
910 DIMENSION(:), INTENT(IN) :: set_list_ij, set_list_kl
911 TYPE(particle_type), DIMENSION(:), INTENT(IN), &
912 POINTER :: particle_set
913 TYPE(hfx_screen_coeff_type), &
914 DIMENSION(:, :, :, :), INTENT(IN), POINTER :: coeffs_set
915 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
916 INTENT(IN), POINTER :: coeffs_kind
917 INTEGER, DIMENSION(:, :), INTENT(IN) :: is_assoc_atomic_block_global
918 LOGICAL, INTENT(IN) :: do_periodic
919 INTEGER, INTENT(IN) :: kind_of(*)
920 TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
921 POINTER :: basis_parameter
922 TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
923 POINTER :: pmax_set
924 REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_atom
925 REAL(dp) :: pmax_blocks
926 TYPE(cell_type), INTENT(IN), POINTER :: cell
927 TYPE(hfx_type), INTENT(IN), POINTER :: x_data
928 TYPE(mp_para_env_type), INTENT(IN) :: para_env
929 REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_block
930 LOGICAL, INTENT(IN) :: do_p_screening
931 INTEGER, DIMENSION(:), INTENT(IN), POINTER :: map_atom_to_kind_atom
932 INTEGER, INTENT(IN) :: eval_type
933 REAL(dp), INTENT(IN) :: log10_eps_schwarz, log_2, &
934 coeffs_kind_max0
935 LOGICAL, INTENT(IN) :: use_virial
936 LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER :: atomic_pair_list
937
938 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_recursive_load_balance'
939
940 INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
941 jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
942 np, nq, numbins, p, q, sizep, sizeq, unit_nr
943 INTEGER(int_8) :: local_cost, pidx, qidx, sump, sumq
944 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: local_cost_vector
945 INTEGER, ALLOCATABLE, DIMENSION(:) :: blocksize, p_atom_blocks, permute, &
946 q_atom_blocks
947 REAL(dp) :: maximum, mean
948
949! internal variables:
950
951!$OMP BARRIER
952!$OMP MASTER
953 CALL timeset(routinen, handle)
954!$OMP END MASTER
955!$OMP BARRIER
956
957 ! calculate best p/q distribution grid for the n_processes
958 CALL hfx_calculate_pq(p, q, numbins, n_processes)
959
960 ALLOCATE (blocksize(numbins))
961 ALLOCATE (permute(nblocks**2))
962 DO i = 1, nblocks**2
963 permute(i) = i
964 END DO
965
966 ! call the main recursive permutation routine.
967 ! Output:
968 ! blocksize :: vector (size numBins) with the sizes for each column/row block
969 ! permute :: permutation vector
970 CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numbins, &
971 permute, 1, &
972 my_process_id, n_processes, nblocks, &
973 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
974 particle_set, &
975 coeffs_set, coeffs_kind, &
976 is_assoc_atomic_block_global, do_periodic, &
977 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
978 cell, x_data, para_env, pmax_block, &
979 do_p_screening, map_atom_to_kind_atom, eval_type, &
980 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
981
982 ! number of blocks per processor in p-direction (vertical)
983 np = numbins/p
984 ! number of blocks per processor in q-direction (horizontal)
985 nq = numbins/q
986
987 ! calc own position in P-Q-processor grid (PQ-grid is column-major)
988 pidx = modulo(int(my_process_id), int(p)) + 1
989 qidx = my_process_id/p + 1
990
991 sizep = sum(blocksize((np*(pidx - 1) + 1):(np*pidx)))
992 sizeq = sum(blocksize((nq*(qidx - 1) + 1):(nq*qidx)))
993
994 sump = sum(blocksize(1:(np*(pidx - 1))))
995 sumq = sum(blocksize(1:(nq*(qidx - 1))))
996
997 ALLOCATE (p_atom_blocks(sizep))
998 ALLOCATE (q_atom_blocks(sizeq))
999
1000 p_atom_blocks(:) = permute((sump + 1):(sump + sizep))
1001 q_atom_blocks(:) = permute((sumq + 1):(sumq + sizeq))
1002
1003 ! from here on, we are actually finished, each process has been
1004 ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
1005 ! what follows is just a small routine to calculate the local cost
1006 ! for each processor which is then written to a file.
1007
1008 ! calculate local cost for each processor!
1009 ! ****************************************
1010 local_cost = 0
1011 DO i = 1, sizep
1012 DO j = 1, sizeq
1013
1014 ! get corresponding 4D block indices out of our own P-Q-block
1015 latom_block = modulo(q_atom_blocks(j), nblocks)
1016 iatom_block = q_atom_blocks(j)/nblocks + 1
1017 jatom_block = modulo(p_atom_blocks(i), nblocks)
1018 katom_block = p_atom_blocks(i)/nblocks + 1
1019
1020 ! symmetry checks.
1021 IF (latom_block < katom_block) cycle
1022 IF (jatom_block < iatom_block) cycle
1023
1024 iatom_start = x_data%blocks(iatom_block)%istart
1025 iatom_end = x_data%blocks(iatom_block)%iend
1026 jatom_start = x_data%blocks(jatom_block)%istart
1027 jatom_end = x_data%blocks(jatom_block)%iend
1028 katom_start = x_data%blocks(katom_block)%istart
1029 katom_end = x_data%blocks(katom_block)%iend
1030 latom_start = x_data%blocks(latom_block)%istart
1031 latom_end = x_data%blocks(latom_block)%iend
1032
1033 ! whatever.
1034 SELECT CASE (eval_type)
1035 CASE (hfx_do_eval_energy)
1036 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1037 pmax_block(latom_block, jatom_block), &
1038 pmax_block(latom_block, iatom_block), &
1039 pmax_block(katom_block, jatom_block))
1040 CASE (hfx_do_eval_forces)
1041 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1042 pmax_block(latom_block, jatom_block), &
1043 pmax_block(latom_block, iatom_block) + &
1044 pmax_block(katom_block, jatom_block))
1045 END SELECT
1046
1047 ! screening.
1048 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1049
1050 ! estimate the cost of this atom_block.
1051 local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1052 set_list_kl, &
1053 iatom_start, iatom_end, jatom_start, jatom_end, &
1054 katom_start, katom_end, latom_start, latom_end, &
1055 particle_set, &
1056 coeffs_set, coeffs_kind, &
1057 is_assoc_atomic_block_global, do_periodic, &
1058 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1059 cell, &
1060 do_p_screening, map_atom_to_kind_atom, eval_type, &
1061 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1062 END DO
1063 END DO
1064
1065 ALLOCATE (local_cost_vector(n_processes))
1066 local_cost_vector = 0
1067 local_cost_vector(my_process_id + 1) = local_cost
1068 CALL para_env%sum(local_cost_vector)
1069
1070 mean = sum(local_cost_vector)/n_processes
1071 maximum = maxval(local_cost_vector)
1072
1073!$OMP BARRIER
1074!$OMP MASTER
1075 ! only output once
1076 IF (my_process_id == 0) THEN
1077 CALL open_file(unit_number=unit_nr, file_name="loads.dat")
1078 WRITE (unit_nr, *) 'maximum cost:', maximum
1079 WRITE (unit_nr, *) 'mean cost:', mean
1080 WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
1081 WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
1082 DO i = 1, n_processes
1083 WRITE (unit_nr, *) local_cost_vector(i)
1084 END DO
1085 CALL close_file(unit_nr)
1086 END IF
1087!$OMP END MASTER
1088!$OMP BARRIER
1089
1090 DEALLOCATE (local_cost_vector)
1091 DEALLOCATE (p_atom_blocks, q_atom_blocks)
1092 DEALLOCATE (blocksize, permute)
1093
1094!$OMP BARRIER
1095!$OMP MASTER
1096 CALL timestop(handle)
1097!$OMP END MASTER
1098!$OMP BARRIER
1099
1100 END SUBROUTINE hfx_recursive_load_balance
1101
1102! **************************************************************************************************
1103!> \brief Small routine to calculate the optimal P-Q-processor grid distribution
1104!> for a given number of processors N
1105!> and the corresponding number of Bins for the load balancing routine
1106!> \param p number of rows on P-Q process grid (output)
1107!> \param q number of columns on P-Q process grid (output)
1108!> \param nBins number of Bins (output)
1109!> \param N number of processes (input)
1110!> \par History
1111!> 03.2011 created [Michael Steinlechner]
1112!> \author Michael Steinlechner
1113! **************************************************************************************************
1114 SUBROUTINE hfx_calculate_pq(p, q, nBins, N)
1115
1116 INTEGER, INTENT(OUT) :: p, q, nbins
1117 INTEGER, INTENT(IN) :: n
1118
1119 INTEGER :: a, b, k
1120 REAL(dp) :: sqn
1121
1122 k = 2
1123 sqn = sqrt(real(n, kind=dp))
1124 p = 1
1125
1126 DO WHILE (real(k, kind=dp) <= sqn)
1127 IF (modulo(n, k) == 0) THEN
1128 p = k
1129 END IF
1130 k = k + 1
1131 END DO
1132 q = n/p
1133
1134 ! now compute the least common multiple of p & q to get the number of necessary bins
1135 ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
1136 ! and use euclid's algorithm for GCD computation.
1137 a = p
1138 b = q
1139
1140 DO WHILE (b .NE. 0)
1141 IF (a > b) THEN
1142 a = a - b
1143 ELSE
1144 b = b - a
1145 END IF
1146 END DO
1147 ! gcd(p,q) is now saved in a
1148
1149 nbins = p*q/a
1150
1151 END SUBROUTINE
1152
1153! **************************************************************************************************
1154!> \brief Recursive permutation routine for the load balancing of the integral
1155!> computation
1156!> \param blocksize vector of blocksizes, size(nProc), which contains for
1157!> each process the local blocksize (OUTPUT)
1158!> \param blockstart starting row/column idx of the block which is to be examined
1159!> at this point (INPUT)
1160!> \param blockend ending row/column idx of the block which is to be examined
1161!> (INPUT)
1162!> \param nProc_in number of bins into which the current block has to be divided
1163!> (INPUT)
1164!> \param permute permutation vector which balances column/row cost
1165!> size(nblocks^2). (OUTPUT)
1166!> \param step ...
1167!> \param my_process_id ...
1168!> \param n_processes ...
1169!> \param nblocks ...
1170!> \param natom ...
1171!> \param nkind ...
1172!> \param list_ij ...
1173!> \param list_kl ...
1174!> \param set_list_ij ...
1175!> \param set_list_kl ...
1176!> \param particle_set ...
1177!> \param coeffs_set ...
1178!> \param coeffs_kind ...
1179!> \param is_assoc_atomic_block_global ...
1180!> \param do_periodic ...
1181!> \param kind_of ...
1182!> \param basis_parameter ...
1183!> \param pmax_set ...
1184!> \param pmax_atom ...
1185!> \param pmax_blocks ...
1186!> \param cell ...
1187!> \param x_data ...
1188!> \param para_env ...
1189!> \param pmax_block ...
1190!> \param do_p_screening ...
1191!> \param map_atom_to_kind_atom ...
1192!> \param eval_type ...
1193!> \param log10_eps_schwarz ...
1194!> \param log_2 ...
1195!> \param coeffs_kind_max0 ...
1196!> \param use_virial ...
1197!> \param atomic_pair_list ...
1198!> \par History
1199!> 03.2011 created [Michael Steinlechner]
1200!> \author Michael Steinlechner
1201! **************************************************************************************************
1202 RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
1203 permute, step, &
1204 my_process_id, n_processes, nblocks, &
1205 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1206 particle_set, &
1207 coeffs_set, coeffs_kind, &
1208 is_assoc_atomic_block_global, do_periodic, &
1209 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1210 cell, x_data, para_env, pmax_block, &
1211 do_p_screening, map_atom_to_kind_atom, eval_type, &
1212 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1213
1214 INTEGER :: nproc_in, blockend, blockstart
1215 INTEGER, DIMENSION(nProc_in) :: blocksize
1216 INTEGER :: nblocks, n_processes, my_process_id
1217 INTEGER, INTENT(IN) :: step
1218 INTEGER, DIMENSION(nblocks*nblocks) :: permute
1219 INTEGER :: natom
1220 INTEGER, INTENT(IN) :: nkind
1221 TYPE(pair_list_type) :: list_ij, list_kl
1222 TYPE(pair_set_list_type), ALLOCATABLE, &
1223 DIMENSION(:) :: set_list_ij, set_list_kl
1224 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1225 TYPE(hfx_screen_coeff_type), &
1226 DIMENSION(:, :, :, :), POINTER :: coeffs_set
1227 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
1228 POINTER :: coeffs_kind
1229 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1230 LOGICAL :: do_periodic
1231 INTEGER :: kind_of(*)
1232 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1233 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1234 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1235 REAL(dp) :: pmax_blocks
1236 TYPE(cell_type), POINTER :: cell
1237 TYPE(hfx_type), POINTER :: x_data
1238 TYPE(mp_para_env_type), INTENT(IN) :: para_env
1239 REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
1240 LOGICAL, INTENT(IN) :: do_p_screening
1241 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1242 INTEGER, INTENT(IN) :: eval_type
1243 REAL(dp) :: log10_eps_schwarz, log_2, &
1244 coeffs_kind_max0
1245 LOGICAL, INTENT(IN) :: use_virial
1246 LOGICAL, DIMENSION(:, :), POINTER :: atomic_pair_list
1247
1248 INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
1249 jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
1250 latom_end, latom_start, nbins, nproc, row, startoffset
1251 INTEGER(int_8) :: atom_block, tmp_block
1252 INTEGER, ALLOCATABLE, DIMENSION(:) :: ithblocksize, localblocksize
1253 INTEGER, DIMENSION(blockend - blockstart + 1) :: bin_perm, tmp_perm
1254 REAL(dp) :: partialcost
1255 REAL(dp), DIMENSION(nblocks*nblocks) :: cost_vector
1256
1257 nproc = nproc_in
1258 cost_vector = 0.0_dp
1259
1260! loop over local atom_blocks.
1261 DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
1262
1263! get corresponding 4D block indices
1264 latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
1265 tmp_block = atom_block/nblocks
1266 katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1267 IF (latom_block < katom_block) cycle
1268 tmp_block = tmp_block/nblocks
1269 jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1270 tmp_block = tmp_block/nblocks
1271 iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1272 IF (jatom_block < iatom_block) cycle
1273
1274! get 2D indices of this atom_block (with permutation applied)
1275! for this, we need to invert the permutation, this means
1276! find position in permutation vector where value==idx
1277
1278 row = (katom_block - 1)*nblocks + jatom_block
1279 inv_perm = 1
1280 DO WHILE (permute(inv_perm) .NE. row)
1281 inv_perm = inv_perm + 1
1282 END DO
1283 row = inv_perm
1284
1285 col = (iatom_block - 1)*nblocks + latom_block
1286 inv_perm = 1
1287 DO WHILE (permute(inv_perm) .NE. col)
1288 inv_perm = inv_perm + 1
1289 END DO
1290 col = inv_perm
1291
1292! if row/col outside our current diagonal block, skip calculation.
1293 IF (col < blockstart .OR. col > blockend) cycle
1294 IF (row < blockstart .OR. row > blockend) cycle
1295
1296 iatom_start = x_data%blocks(iatom_block)%istart
1297 iatom_end = x_data%blocks(iatom_block)%iend
1298 jatom_start = x_data%blocks(jatom_block)%istart
1299 jatom_end = x_data%blocks(jatom_block)%iend
1300 katom_start = x_data%blocks(katom_block)%istart
1301 katom_end = x_data%blocks(katom_block)%iend
1302 latom_start = x_data%blocks(latom_block)%istart
1303 latom_end = x_data%blocks(latom_block)%iend
1304
1305! whatever.
1306 SELECT CASE (eval_type)
1307 CASE (hfx_do_eval_energy)
1308 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1309 pmax_block(latom_block, jatom_block), &
1310 pmax_block(latom_block, iatom_block), &
1311 pmax_block(katom_block, jatom_block))
1312 CASE (hfx_do_eval_forces)
1313 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1314 pmax_block(latom_block, jatom_block), &
1315 pmax_block(latom_block, iatom_block) + &
1316 pmax_block(katom_block, jatom_block))
1317 END SELECT
1318
1319! screening.
1320 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1321
1322! every second recursion step, compute row sum instead of column sum
1323
1324 IF (modulo(step, 2) .EQ. 0) THEN
1325 idx = row
1326 ELSE
1327 idx = col
1328 END IF
1329
1330! estimate the cost of this atom_block.
1331 partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1332 set_list_kl, &
1333 iatom_start, iatom_end, jatom_start, jatom_end, &
1334 katom_start, katom_end, latom_start, latom_end, &
1335 particle_set, &
1336 coeffs_set, coeffs_kind, &
1337 is_assoc_atomic_block_global, do_periodic, &
1338 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1339 cell, &
1340 do_p_screening, map_atom_to_kind_atom, eval_type, &
1341 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1342
1343 cost_vector(idx) = cost_vector(idx) + partialcost
1344 END DO ! atom_block
1345
1346! sum costvector over all processes
1347 CALL para_env%sum(cost_vector)
1348
1349! calculate next prime factor of nProc
1350 nbins = 2
1351 DO WHILE (modulo(int(nproc), int(nbins)) .NE. 0)
1352 nbins = nbins + 1
1353 END DO
1354
1355 nproc = nproc/nbins
1356
1357! ... do the binning...
1358
1359 ALLOCATE (localblocksize(nbins))
1360 CALL hfx_permute_binning(nbins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
1361
1362!... and update the permutation vector
1363
1364 tmp_perm = permute(blockstart:blockend)
1365 permute(blockstart:blockend) = tmp_perm(bin_perm)
1366
1367! split recursion into the nBins Bins
1368 IF (nproc > 1) THEN
1369 ALLOCATE (ithblocksize(nproc))
1370 DO i = 1, nbins
1371 startoffset = sum(localblocksize(1:(i - 1)))
1372 endoffset = sum(localblocksize(1:i)) - 1
1373
1374 CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nproc, &
1375 permute, step + 1, &
1376 my_process_id, n_processes, nblocks, &
1377 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1378 particle_set, &
1379 coeffs_set, coeffs_kind, &
1380 is_assoc_atomic_block_global, do_periodic, &
1381 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1382 cell, x_data, para_env, pmax_block, &
1383 do_p_screening, map_atom_to_kind_atom, eval_type, &
1384 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1385 blocksize(((i - 1)*nproc + 1):(i*nproc)) = ithblocksize
1386 END DO
1387 DEALLOCATE (ithblocksize)
1388 ELSE
1389 DO i = 1, nbins
1390 blocksize(i) = localblocksize(i)
1391 END DO
1392 END IF
1393
1394 DEALLOCATE (localblocksize)
1395
1396 END SUBROUTINE hfx_recursive_permute
1397
1398! **************************************************************************************************
1399!> \brief small binning routine for the recursive load balancing
1400!>
1401!> \param nBins number of Bins (INPUT)
1402!> \param costvector vector of current row/column costs which have to be binned (INPUT)
1403!> \param maxbinsize upper bound for bin size (INPUT)
1404!> \param perm resulting permutation due to be binning routine (OUTPUT)
1405!> \param block_count vector of size(nbins) which contains the size of each bin (OUTPUT)
1406!> \par History
1407!> 03.2011 created [Michael Steinlechner]
1408!> \author Michael Steinlechner
1409! **************************************************************************************************
1410 SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
1411
1412 INTEGER, INTENT(IN) :: nbins, maxbinsize
1413 REAL(dp), DIMENSION(maxbinsize), INTENT(IN) :: costvector
1414 INTEGER, DIMENSION(maxbinsize), INTENT(OUT) :: perm
1415 INTEGER, DIMENSION(nBins), INTENT(OUT) :: block_count
1416
1417 INTEGER :: i, j, mod_idx, offset
1418 INTEGER, DIMENSION(nBins, maxbinsize) :: bin
1419 INTEGER, DIMENSION(nBins) :: bin_idx
1420 INTEGER, DIMENSION(maxbinsize) :: idx
1421 REAL(dp), DIMENSION(maxbinsize) :: vec
1422 REAL(dp), DIMENSION(nBins) :: bincosts
1423
1424! be careful not to change costvector (copy it!)
1425
1426 vec = costvector
1427 block_count = 0
1428 bincosts = 0
1429
1430 !sort the array (ascending)
1431 CALL sort(vec, maxbinsize, idx)
1432
1433 ! count the loop down to distribute the largest cols/rows first
1434 DO i = maxbinsize, 1, -1
1435 IF (vec(i) == 0) THEN
1436 ! spread zero-cost col/rows evenly among procs
1437 mod_idx = modulo(i, nbins) + 1 !(note the fortran offset by one!)
1438 block_count(mod_idx) = block_count(mod_idx) + 1
1439 bin(mod_idx, block_count(mod_idx)) = idx(i)
1440 ELSE
1441 ! sort the bins so that the one with the lowest cost is at the
1442 ! first place, where we then assign the current col/row
1443 CALL sort(bincosts, nbins, bin_idx)
1444 block_count = block_count(bin_idx)
1445 bin = bin(bin_idx, :)
1446
1447 bincosts(1) = bincosts(1) + vec(i)
1448 block_count(1) = block_count(1) + 1
1449 bin(1, block_count(1)) = idx(i)
1450 END IF
1451 END DO
1452
1453 ! construct permutation vector from the binning
1454 offset = 0
1455 DO i = 1, nbins
1456 DO j = 1, block_count(i)
1457 perm(offset + j) = bin(i, j)
1458 END DO
1459 offset = offset + block_count(i)
1460 END DO
1461
1462 END SUBROUTINE hfx_permute_binning
1463
1464! **************************************************************************************************
1465!> \brief Cheap way of redistributing the eri's
1466!> \param x_data Object that stores the indices array
1467!> \param para_env para_env
1468!> \param load_balance_parameter contains parmameter for Monte-Carlo routines
1469!> \param i_thread current thread ID
1470!> \param n_threads Total Number of threads
1471!> \param eval_type ...
1472!> \par History
1473!> 12.2007 created [Manuel Guidon]
1474!> 02.2009 optimize Memory Usage [Manuel Guidon]
1475!> \author Manuel Guidon
1476!> \note
1477!> The cost matrix is given by the walltime for each bin that is measured
1478!> during the calculation
1479! **************************************************************************************************
1480 SUBROUTINE hfx_update_load_balance(x_data, para_env, &
1481 load_balance_parameter, &
1482 i_thread, n_threads, eval_type)
1483
1484 TYPE(hfx_type), POINTER :: x_data
1485 TYPE(mp_para_env_type), INTENT(IN) :: para_env
1486 TYPE(hfx_load_balance_type) :: load_balance_parameter
1487 INTEGER, INTENT(IN) :: i_thread, n_threads, eval_type
1488
1489 CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_update_load_balance'
1490
1491 INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
1492 my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
1493 TYPE(mp_request_type), DIMENSION(2) :: req
1494 INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
1495 sendbuffer, swapbuffer
1496 INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
1497 INTEGER, ALLOCATABLE, DIMENSION(:) :: tmp_pos
1498 INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: bins_per_rank
1499 INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE :: bin_histogram
1500 INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
1501 INTEGER, SAVE :: max_bin_size
1502 TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
1503 TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
1504 SAVE :: full_dist
1505
1506!$OMP BARRIER
1507!$OMP MASTER
1508 CALL timeset(routinen, handle)
1509!$OMP END MASTER
1510!$OMP BARRIER
1511
1512 ncpu = para_env%num_pe
1513 n_processes = ncpu*n_threads
1514 !! If there is only 1 cpu skip the binning
1515 IF (n_processes == 1) THEN
1516 ALLOCATE (tmp_dist(1))
1517 tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
1518 tmp_dist(1)%istart = 0_int_8
1519 ptr_to_tmp_dist => tmp_dist(:)
1520 SELECT CASE (eval_type)
1521 CASE (hfx_do_eval_energy)
1522 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1523 CASE (hfx_do_eval_forces)
1524 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1525 END SELECT
1526 DEALLOCATE (tmp_dist)
1527 ELSE
1528 mepos = para_env%mepos
1529 my_process_id = para_env%mepos*n_threads + i_thread
1530 nbins = load_balance_parameter%nbins
1531!$OMP MASTER
1532 ALLOCATE (bin_histogram(n_processes, 2))
1533 bin_histogram = 0
1534!$OMP END MASTER
1535!$OMP BARRIER
1536 SELECT CASE (eval_type)
1537 CASE (hfx_do_eval_energy)
1538 my_bin_size = SIZE(x_data%distribution_energy)
1539 CASE (hfx_do_eval_forces)
1540 my_bin_size = SIZE(x_data%distribution_forces)
1541 END SELECT
1542 bin_histogram(my_process_id + 1, 1) = my_bin_size
1543!$OMP BARRIER
1544!$OMP MASTER
1545 CALL para_env%sum(bin_histogram(:, 1))
1546 bin_histogram(1, 2) = bin_histogram(1, 1)
1547 DO iprocess = 2, n_processes
1548 bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
1549 END DO
1550
1551 max_bin_size = maxval(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
1552 CALL para_env%max(max_bin_size)
1553!$OMP END MASTER
1554!$OMP BARRIER
1555 ALLOCATE (binned_dist(my_bin_size))
1556 !! Use old binned_dist, but with timings cost
1557 SELECT CASE (eval_type)
1558 CASE (hfx_do_eval_energy)
1559 binned_dist = x_data%distribution_energy
1560 CASE (hfx_do_eval_forces)
1561 binned_dist = x_data%distribution_forces
1562 END SELECT
1563
1564 DO ibin = 1, my_bin_size
1565 IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
1566 binned_dist(ibin)%cost = 0
1567 ELSE
1568 SELECT CASE (eval_type)
1569 CASE (hfx_do_eval_energy)
1570 IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
1571 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_first_scf + &
1572 binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1573 ELSE
1574 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1575 END IF
1576 CASE (hfx_do_eval_forces)
1577 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
1578 END SELECT
1579 END IF
1580 END DO
1581!$OMP BARRIER
1582!$OMP MASTER
1583 !! store all local results in a big cost matrix
1584 ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
1585 cost_matrix = 0
1586 ALLOCATE (sendbuffer(max_bin_size*n_threads))
1587 ALLOCATE (recbuffer(max_bin_size*n_threads))
1588!$OMP END MASTER
1589!$OMP BARRIER
1590 my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
1591 icpu = para_env%mepos + 1
1592 DO i = 1, my_bin_size
1593 cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
1594 END DO
1595
1596 mepos = para_env%mepos
1597!$OMP BARRIER
1598!$OMP MASTER
1599 ALLOCATE (bins_per_rank(ncpu))
1600 bins_per_rank = 0
1601 DO icpu = 1, ncpu
1602 bins_per_rank(icpu) = sum(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
1603 END DO
1604 sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
1605 cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
1606
1607 dest = modulo(mepos + 1, ncpu)
1608 source = modulo(mepos - 1, ncpu)
1609 ! sync before/after ring of isendrecv
1610 CALL para_env%sync()
1611 DO icpu = 0, ncpu - 1
1612 IF (icpu .NE. ncpu - 1) THEN
1613 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1614 req(1), req(2), 13)
1615 END IF
1616 data_from = modulo(mepos - icpu, ncpu)
1617 start_idx = sum(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
1618 end_idx = start_idx + bins_per_rank(data_from + 1) - 1
1619 cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
1620
1621 IF (icpu .NE. ncpu - 1) THEN
1622 CALL mp_waitall(req)
1623 END IF
1624 swapbuffer => sendbuffer
1625 sendbuffer => recbuffer
1626 recbuffer => swapbuffer
1627 END DO
1628 DEALLOCATE (recbuffer, sendbuffer)
1629 ! sync before/after ring of isendrecv
1630 CALL para_env%sync()
1631!$OMP END MASTER
1632!$OMP BARRIER
1633 ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
1634 local_cost_matrix = cost_matrix
1635!$OMP MASTER
1636 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
1637 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
1638 shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
1639
1640 ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
1641
1642 full_dist(:, :)%istart = 0_int_8
1643 full_dist(:, :)%number_of_atom_quartets = 0_int_8
1644 full_dist(:, :)%cost = 0_int_8
1645 full_dist(:, :)%time_first_scf = 0.0_dp
1646 full_dist(:, :)%time_other_scf = 0.0_dp
1647 full_dist(:, :)%time_forces = 0.0_dp
1648!$OMP END MASTER
1649
1650!$OMP BARRIER
1651 mepos = para_env%mepos + 1
1652 full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
1653!$OMP BARRIER
1654!$OMP MASTER
1655 ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
1656 ALLOCATE (recbuffer(3*max_bin_size*n_threads))
1657 mepos = para_env%mepos
1658 DO j = 1, n_threads
1659 DO i = 1, max_bin_size
1660 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
1661 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
1662 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
1663 END DO
1664 END DO
1665 dest = modulo(mepos + 1, ncpu)
1666 source = modulo(mepos - 1, ncpu)
1667 ! sync before/after ring of isendrecv
1668 CALL para_env%sync()
1669 DO icpu = 0, ncpu - 1
1670 IF (icpu .NE. ncpu - 1) THEN
1671 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1672 req(1), req(2), 13)
1673 END IF
1674 data_from = modulo(mepos - icpu, ncpu)
1675 DO j = 1, n_threads
1676 DO i = 1, max_bin_size
1677 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
1678 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
1679 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
1680 END DO
1681 END DO
1682
1683 IF (icpu .NE. ncpu - 1) THEN
1684 CALL mp_waitall(req)
1685 END IF
1686 swapbuffer => sendbuffer
1687 sendbuffer => recbuffer
1688 recbuffer => swapbuffer
1689 END DO
1690 ! sync before/after ring of isendrecv
1691 DEALLOCATE (recbuffer, sendbuffer)
1692 CALL para_env%sync()
1693!$OMP END MASTER
1694!$OMP BARRIER
1695 !! reorder the distribution according to the distribution vector
1696 ALLOCATE (tmp_pos(ncpu*n_threads))
1697 tmp_pos = 1
1698 ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
1699
1700 tmp_dist(:)%istart = 0_int_8
1701 tmp_dist(:)%number_of_atom_quartets = 0_int_8
1702 tmp_dist(:)%cost = 0_int_8
1703 tmp_dist(:)%time_first_scf = 0.0_dp
1704 tmp_dist(:)%time_other_scf = 0.0_dp
1705 tmp_dist(:)%time_forces = 0.0_dp
1706
1707 mepos = my_process_id + 1
1708 DO icpu = 1, n_processes
1709 DO i = 1, bin_histogram(icpu, 1)
1710 IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
1711 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
1712 tmp_pos(mepos) = tmp_pos(mepos) + 1
1713 END IF
1714 END DO
1715 END DO
1716
1717 !! Assign the load to each process
1718 NULLIFY (ptr_to_tmp_dist)
1719 mepos = my_process_id + 1
1720 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
1721 SELECT CASE (eval_type)
1722 CASE (hfx_do_eval_energy)
1723 CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1724 CASE (hfx_do_eval_forces)
1725 CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1726 END SELECT
1727
1728!$OMP BARRIER
1729!$OMP MASTER
1730 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
1731 DEALLOCATE (bins_per_rank, bin_histogram)
1732!$OMP END MASTER
1733!$OMP BARRIER
1734 DEALLOCATE (tmp_dist, tmp_pos)
1735 DEALLOCATE (binned_dist, local_cost_matrix)
1736 END IF
1737!$OMP BARRIER
1738!$OMP MASTER
1739 CALL timestop(handle)
1740!$OMP END MASTER
1741!$OMP BARRIER
1742
1743 END SUBROUTINE hfx_update_load_balance
1744
1745! **************************************************************************************************
1746!> \brief estimates the cost of a set quartet with info available at load balance time
1747!> i.e. without much info on the primitives primitives
1748!> \param nsa ...
1749!> \param nsb ...
1750!> \param nsc ...
1751!> \param nsd ...
1752!> \param npgfa ...
1753!> \param npgfb ...
1754!> \param npgfc ...
1755!> \param npgfd ...
1756!> \param ratio ...
1757!> \param p1 ...
1758!> \param p2 ...
1759!> \param p3 ...
1760!> \return ...
1761!> \par History
1762!> 08.2009 created Joost VandeVondele
1763!> \author Joost VandeVondele
1764! **************************************************************************************************
1765 FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
1766 IMPLICIT NONE
1767 REAL(kind=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
1768 INTEGER(KIND=int_8) :: res
1769 REAL(kind=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
1770
1771 INTEGER :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
1772
1773 estimate1 = estimate_basic(p1)
1774 estimate2 = estimate_basic(p2)
1775 mu = log(abs(1.0e6_dp*p3(1)) + 1)
1776 sigma = p3(2)*0.1_dp*mu
1777 switch = 1.0_dp/(1.0_dp + exp((log(estimate1) - mu)/sigma))
1778 estimate = estimate1*(1.0_dp - switch) + estimate2*switch
1779 res = int(estimate*0.001_dp, kind=int_8) + 1
1780
1781 CONTAINS
1782
1783! **************************************************************************************************
1784!> \brief ...
1785!> \param p ...
1786!> \return ...
1787! **************************************************************************************************
1788 REAL(kind=dp) FUNCTION estimate_basic(p) RESULT(res)
1789 REAL(kind=dp) :: p(12)
1790
1791 REAL(kind=dp) :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
1792 p7, p8, p9
1793
1794 p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
1795 p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
1796 p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
1797 res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
1798 poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
1799 poly2(npgfd, p4, p5, p6)*exp(-p7*ratio + p8*ratio**2) + &
1800 1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
1801 res = 1 + abs(res)
1802 END FUNCTION estimate_basic
1803
1804! **************************************************************************************************
1805!> \brief ...
1806!> \param x ...
1807!> \param a0 ...
1808!> \param a1 ...
1809!> \param a2 ...
1810!> \return ...
1811! **************************************************************************************************
1812 REAL(kind=dp) FUNCTION poly2(x, a0, a1, a2)
1813 INTEGER :: x
1814 REAL(kind=dp) :: a0, a1, a2
1815
1816 poly2 = a0 + a1*x + a2*x*x
1817 END FUNCTION poly2
1818
1819 END FUNCTION cost_model
1820! **************************************************************************************************
1821!> \brief Minimizes the maximum cost per cpu by shuffling around all bins
1822!> \param total_number_of_bins ...
1823!> \param number_of_processes ...
1824!> \param bin_costs costs per bin
1825!> \param distribution_vector will contain the final distribution
1826!> \param do_randomize ...
1827!> \par History
1828!> 03.2009 created from a hack by Joost [Manuel Guidon]
1829!> \author Manuel Guidon
1830! **************************************************************************************************
1831 SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
1832 distribution_vector, do_randomize)
1833 INTEGER :: total_number_of_bins, number_of_processes
1834 INTEGER(int_8), DIMENSION(:), POINTER :: bin_costs
1835 INTEGER, DIMENSION(:), POINTER :: distribution_vector
1836 LOGICAL, INTENT(IN) :: do_randomize
1837
1838 INTEGER :: i, itmp, j, nstep
1839 INTEGER(int_8), DIMENSION(:), POINTER :: my_cost_cpu, tmp_cost, tmp_cpu_cost
1840 INTEGER, DIMENSION(:), POINTER :: tmp_cpu_index, tmp_index
1841 TYPE(rng_stream_type), ALLOCATABLE :: rng_stream
1842
1843 nstep = max(1, int(number_of_processes)/2)
1844
1845 ALLOCATE (tmp_cost(total_number_of_bins))
1846 ALLOCATE (tmp_index(total_number_of_bins))
1847 ALLOCATE (tmp_cpu_cost(number_of_processes))
1848 ALLOCATE (tmp_cpu_index(number_of_processes))
1849 ALLOCATE (my_cost_cpu(number_of_processes))
1850 tmp_cost = bin_costs
1851
1852 CALL sort(tmp_cost, total_number_of_bins, tmp_index)
1853 my_cost_cpu = 0
1854 !
1855 ! assign the largest remaining bin to the CPU with the smallest load
1856 ! gives near perfect distributions for a sufficient number of bins ...
1857 ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
1858 ! each cpu a similar number of tasks.
1859 ! it also avoids degenerate cases where thousands of zero sized tasks
1860 ! are assigned to the same (least loaded) cpu
1861 !
1862 IF (do_randomize) &
1863 rng_stream = rng_stream_type(name="uniform_rng", &
1864 distribution_type=uniform)
1865
1866 DO i = total_number_of_bins, 1, -nstep
1867 tmp_cpu_cost = my_cost_cpu
1868 CALL sort(tmp_cpu_cost, int(number_of_processes), tmp_cpu_index)
1869 IF (do_randomize) THEN
1870 CALL rng_stream%shuffle(tmp_cpu_index(1:min(i, nstep)))
1871 END IF
1872 DO j = 1, min(i, nstep)
1873 itmp = tmp_cpu_index(j)
1874 distribution_vector(tmp_index(i - j + 1)) = itmp
1875 my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
1876 END DO
1877 END DO
1878
1879 DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
1880 DEALLOCATE (tmp_cpu_index, my_cost_cpu)
1881 END SUBROUTINE optimize_distribution
1882
1883! **************************************************************************************************
1884!> \brief Given a 2d index pair, this function returns a 1d index pair for
1885!> a symmetric upper triangle NxN matrix
1886!> The compiler should inline this function, therefore it appears in
1887!> several modules
1888!> \param i 2d index
1889!> \param j 2d index
1890!> \param N matrix size
1891!> \return ...
1892!> \par History
1893!> 03.2009 created [Manuel Guidon]
1894!> \author Manuel Guidon
1895! **************************************************************************************************
1896 PURE FUNCTION get_1d_idx(i, j, N)
1897 INTEGER, INTENT(IN) :: i, j
1898 INTEGER(int_8), INTENT(IN) :: n
1899 INTEGER(int_8) :: get_1d_idx
1900
1901 INTEGER(int_8) :: min_ij
1902
1903 min_ij = min(i, j)
1904 get_1d_idx = min_ij*n + max(i, j) - (min_ij - 1)*min_ij/2 - n
1905
1906 END FUNCTION get_1d_idx
1907
1908! **************************************************************************************************
1909!> \brief ...
1910!> \param natom ...
1911!> \param nkind ...
1912!> \param list_ij ...
1913!> \param list_kl ...
1914!> \param set_list_ij ...
1915!> \param set_list_kl ...
1916!> \param iatom_start ...
1917!> \param iatom_end ...
1918!> \param jatom_start ...
1919!> \param jatom_end ...
1920!> \param katom_start ...
1921!> \param katom_end ...
1922!> \param latom_start ...
1923!> \param latom_end ...
1924!> \param particle_set ...
1925!> \param coeffs_set ...
1926!> \param coeffs_kind ...
1927!> \param is_assoc_atomic_block_global ...
1928!> \param do_periodic ...
1929!> \param kind_of ...
1930!> \param basis_parameter ...
1931!> \param pmax_set ...
1932!> \param pmax_atom ...
1933!> \param pmax_blocks ...
1934!> \param cell ...
1935!> \param do_p_screening ...
1936!> \param map_atom_to_kind_atom ...
1937!> \param eval_type ...
1938!> \param log10_eps_schwarz ...
1939!> \param log_2 ...
1940!> \param coeffs_kind_max0 ...
1941!> \param use_virial ...
1942!> \param atomic_pair_list ...
1943!> \return ...
1944! **************************************************************************************************
1945 FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1946 iatom_start, iatom_end, jatom_start, jatom_end, &
1947 katom_start, katom_end, latom_start, latom_end, &
1948 particle_set, &
1949 coeffs_set, coeffs_kind, &
1950 is_assoc_atomic_block_global, do_periodic, &
1951 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1952 cell, &
1953 do_p_screening, map_atom_to_kind_atom, eval_type, &
1954 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
1955 atomic_pair_list)
1956
1957 INTEGER, INTENT(IN) :: natom, nkind
1958 TYPE(pair_list_type) :: list_ij, list_kl
1959 TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
1960 INTEGER, INTENT(IN) :: iatom_start, iatom_end, jatom_start, &
1961 jatom_end, katom_start, katom_end, &
1962 latom_start, latom_end
1963 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1964 TYPE(hfx_screen_coeff_type), &
1965 DIMENSION(:, :, :, :), POINTER :: coeffs_set
1966 TYPE(hfx_screen_coeff_type), &
1967 DIMENSION(nkind, nkind) :: coeffs_kind
1968 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1969 LOGICAL :: do_periodic
1970 INTEGER :: kind_of(*)
1971 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1972 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1973 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1974 REAL(dp) :: pmax_blocks
1975 TYPE(cell_type), POINTER :: cell
1976 LOGICAL, INTENT(IN) :: do_p_screening
1977 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1978 INTEGER, INTENT(IN) :: eval_type
1979 REAL(dp) :: log10_eps_schwarz, log_2, &
1980 coeffs_kind_max0
1981 LOGICAL, INTENT(IN) :: use_virial
1982 LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
1983 INTEGER(int_8) :: estimate_block_cost
1984
1985 INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
1986 i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
1987 jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
1988 INTEGER, DIMENSION(:), POINTER :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
1989 nsgfb, nsgfc, nsgfd
1990 REAL(dp) :: actual_pmax_atom, cost_tmp, max_val1, &
1991 max_val2, pmax_entry, rab2, rcd2, &
1992 screen_kind_ij, screen_kind_kl
1993 REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
1994
1995 estimate_block_cost = 0_int_8
1996
1997 CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
1998 kind_of, basis_parameter, particle_set, &
1999 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2000 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2001
2002 CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
2003 kind_of, basis_parameter, particle_set, &
2004 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2005 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2006
2007 DO i_list_ij = 1, list_ij%n_element
2008 iatom = list_ij%elements(i_list_ij)%pair(1)
2009 jatom = list_ij%elements(i_list_ij)%pair(2)
2010 i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
2011 i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
2012 ikind = list_ij%elements(i_list_ij)%kind_pair(1)
2013 jkind = list_ij%elements(i_list_ij)%kind_pair(2)
2014 rab2 = list_ij%elements(i_list_ij)%dist2
2015
2016 nsgfa => basis_parameter(ikind)%nsgf
2017 nsgfb => basis_parameter(jkind)%nsgf
2018 npgfa => basis_parameter(ikind)%npgf
2019 npgfb => basis_parameter(jkind)%npgf
2020
2021 DO i_list_kl = 1, list_kl%n_element
2022
2023 katom = list_kl%elements(i_list_kl)%pair(1)
2024 latom = list_kl%elements(i_list_kl)%pair(2)
2025
2026 IF (.NOT. (katom + latom <= iatom + jatom)) cycle
2027 IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) cycle
2028
2029 IF (eval_type == hfx_do_eval_forces) THEN
2030 IF (.NOT. use_virial) THEN
2031 IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) cycle
2032 END IF
2033 END IF
2034
2035 i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
2036 i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
2037 kkind = list_kl%elements(i_list_kl)%kind_pair(1)
2038 lkind = list_kl%elements(i_list_kl)%kind_pair(2)
2039 rcd2 = list_kl%elements(i_list_kl)%dist2
2040
2041 nsgfc => basis_parameter(kkind)%nsgf
2042 nsgfd => basis_parameter(lkind)%nsgf
2043 npgfc => basis_parameter(kkind)%npgf
2044 npgfd => basis_parameter(lkind)%npgf
2045
2046 IF (do_p_screening) THEN
2047 actual_pmax_atom = max(pmax_atom(katom, iatom), &
2048 pmax_atom(latom, jatom), &
2049 pmax_atom(latom, iatom), &
2050 pmax_atom(katom, jatom))
2051 ELSE
2052 actual_pmax_atom = 0.0_dp
2053 END IF
2054
2055 screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
2056 coeffs_kind(jkind, ikind)%x(2)
2057 screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
2058 coeffs_kind(lkind, kkind)%x(2)
2059 IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2060
2061 IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
2062 is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
2063 is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
2064 is_assoc_atomic_block_global(latom, jatom) >= 1)) cycle
2065
2066 IF (do_p_screening) THEN
2067 SELECT CASE (eval_type)
2068 CASE (hfx_do_eval_energy)
2069 swap_id = 0
2070 kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))
2071 IF (ikind >= kkind) THEN
2072 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2073 map_atom_to_kind_atom(katom), &
2074 map_atom_to_kind_atom(iatom))
2075 ELSE
2076 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2077 map_atom_to_kind_atom(iatom), &
2078 map_atom_to_kind_atom(katom))
2079 swap_id = swap_id + 1
2080 END IF
2081 kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))
2082 IF (jkind >= lkind) THEN
2083 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2084 map_atom_to_kind_atom(latom), &
2085 map_atom_to_kind_atom(jatom))
2086 ELSE
2087 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2088 map_atom_to_kind_atom(jatom), &
2089 map_atom_to_kind_atom(latom))
2090 swap_id = swap_id + 2
2091 END IF
2092 kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))
2093 IF (ikind >= lkind) THEN
2094 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2095 map_atom_to_kind_atom(latom), &
2096 map_atom_to_kind_atom(iatom))
2097 ELSE
2098 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2099 map_atom_to_kind_atom(iatom), &
2100 map_atom_to_kind_atom(latom))
2101 swap_id = swap_id + 4
2102 END IF
2103 kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))
2104 IF (jkind >= kkind) THEN
2105 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2106 map_atom_to_kind_atom(katom), &
2107 map_atom_to_kind_atom(jatom))
2108 ELSE
2109 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2110 map_atom_to_kind_atom(jatom), &
2111 map_atom_to_kind_atom(katom))
2112 swap_id = swap_id + 8
2113 END IF
2114 CASE (hfx_do_eval_forces)
2115 swap_id = 16
2116 kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))
2117 IF (ikind >= kkind) THEN
2118 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2119 map_atom_to_kind_atom(katom), &
2120 map_atom_to_kind_atom(iatom))
2121 ELSE
2122 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2123 map_atom_to_kind_atom(iatom), &
2124 map_atom_to_kind_atom(katom))
2125 swap_id = swap_id + 1
2126 END IF
2127 kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))
2128 IF (jkind >= lkind) THEN
2129 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2130 map_atom_to_kind_atom(latom), &
2131 map_atom_to_kind_atom(jatom))
2132 ELSE
2133 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2134 map_atom_to_kind_atom(jatom), &
2135 map_atom_to_kind_atom(latom))
2136 swap_id = swap_id + 2
2137 END IF
2138 kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))
2139 IF (ikind >= lkind) THEN
2140 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2141 map_atom_to_kind_atom(latom), &
2142 map_atom_to_kind_atom(iatom))
2143 ELSE
2144 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2145 map_atom_to_kind_atom(iatom), &
2146 map_atom_to_kind_atom(latom))
2147 swap_id = swap_id + 4
2148 END IF
2149 kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))
2150 IF (jkind >= kkind) THEN
2151 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2152 map_atom_to_kind_atom(katom), &
2153 map_atom_to_kind_atom(jatom))
2154 ELSE
2155 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2156 map_atom_to_kind_atom(jatom), &
2157 map_atom_to_kind_atom(katom))
2158 swap_id = swap_id + 8
2159 END IF
2160 END SELECT
2161 END IF
2162
2163 DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
2164 iset = set_list_ij(i_set_list_ij)%pair(1)
2165 jset = set_list_ij(i_set_list_ij)%pair(2)
2166
2167 max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
2168 coeffs_set(jset, iset, jkind, ikind)%x(2)
2169
2170 IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2171 DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
2172 kset = set_list_kl(i_set_list_kl)%pair(1)
2173 lset = set_list_kl(i_set_list_kl)%pair(2)
2174
2175 max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
2176 coeffs_set(lset, kset, lkind, kkind)%x(2))
2177
2178 IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) cycle
2179 IF (do_p_screening) THEN
2180 CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
2181 iset, jset, kset, lset, &
2182 pmax_entry, swap_id)
2183 IF (eval_type == hfx_do_eval_forces) THEN
2184 pmax_entry = log_2 + pmax_entry
2185 END IF
2186 ELSE
2187 pmax_entry = 0.0_dp
2188 END IF
2189 max_val2 = max_val2 + pmax_entry
2190 IF (max_val2 < log10_eps_schwarz) cycle
2191 SELECT CASE (eval_type)
2192 CASE (hfx_do_eval_energy)
2193 cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2194 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2195 max_val2/log10_eps_schwarz, &
2197 estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)
2198 CASE (hfx_do_eval_forces)
2199 cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2200 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2201 max_val2/log10_eps_schwarz, &
2202 p1_forces, p2_forces, p3_forces)
2203 estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)
2204 END SELECT
2205 END DO ! i_set_list_kl
2206 END DO ! i_set_list_ij
2207 END DO ! i_list_kl
2208 END DO ! i_list_ij
2209
2210 END FUNCTION estimate_block_cost
2211
2212! **************************************************************************************************
2213!> \brief ...
2214!> \param nkind ...
2215!> \param para_env ...
2216!> \param natom ...
2217!> \param block_size ...
2218!> \param nblock ...
2219!> \param blocks ...
2220!> \param list_ij ...
2221!> \param list_kl ...
2222!> \param set_list_ij ...
2223!> \param set_list_kl ...
2224!> \param particle_set ...
2225!> \param coeffs_set ...
2226!> \param coeffs_kind ...
2227!> \param is_assoc_atomic_block_global ...
2228!> \param do_periodic ...
2229!> \param kind_of ...
2230!> \param basis_parameter ...
2231!> \param pmax_set ...
2232!> \param pmax_atom ...
2233!> \param pmax_blocks ...
2234!> \param cell ...
2235!> \param do_p_screening ...
2236!> \param map_atom_to_kind_atom ...
2237!> \param eval_type ...
2238!> \param log10_eps_schwarz ...
2239!> \param log_2 ...
2240!> \param coeffs_kind_max0 ...
2241!> \param use_virial ...
2242!> \param atomic_pair_list ...
2243! **************************************************************************************************
2244 SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
2245 list_ij, list_kl, set_list_ij, set_list_kl, &
2246 particle_set, &
2247 coeffs_set, coeffs_kind, &
2248 is_assoc_atomic_block_global, do_periodic, &
2249 kind_of, basis_parameter, pmax_set, pmax_atom, &
2250 pmax_blocks, cell, &
2251 do_p_screening, map_atom_to_kind_atom, eval_type, &
2252 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
2253 atomic_pair_list)
2254
2255 INTEGER, INTENT(IN) :: nkind
2256 TYPE(mp_para_env_type), INTENT(IN) :: para_env
2257 INTEGER :: natom, block_size, nblock
2258 TYPE(hfx_block_range_type), DIMENSION(1:nblock) :: blocks
2259 TYPE(pair_list_type) :: list_ij, list_kl
2260 TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
2261 TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
2262 TYPE(hfx_screen_coeff_type), &
2263 DIMENSION(:, :, :, :), POINTER :: coeffs_set
2264 TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
2265 POINTER :: coeffs_kind
2266 INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
2267 LOGICAL :: do_periodic
2268 INTEGER :: kind_of(*)
2269 TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
2270 TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
2271 REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
2272 REAL(dp) :: pmax_blocks
2273 TYPE(cell_type), POINTER :: cell
2274 LOGICAL, INTENT(IN) :: do_p_screening
2275 INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
2276 INTEGER, INTENT(IN) :: eval_type
2277 REAL(dp) :: log10_eps_schwarz, log_2, &
2278 coeffs_kind_max0
2279 LOGICAL, INTENT(IN) :: use_virial
2280 LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
2281
2282 INTEGER :: atom_block, i, iatom_block, iatom_end, &
2283 iatom_start, my_cpu_rank, ncpus
2284
2285 DO atom_block = 0, nblock - 1
2286 iatom_block = modulo(atom_block, nblock) + 1
2287 iatom_start = (iatom_block - 1)*block_size + 1
2288 iatom_end = min(iatom_block*block_size, natom)
2289 blocks(atom_block + 1)%istart = iatom_start
2290 blocks(atom_block + 1)%iend = iatom_end
2291 blocks(atom_block + 1)%cost = 0_int_8
2292 END DO
2293
2294 ncpus = para_env%num_pe
2295 my_cpu_rank = para_env%mepos
2296 DO i = 1, nblock
2297 IF (modulo(i, ncpus) /= my_cpu_rank) THEN
2298 blocks(i)%istart = 0
2299 blocks(i)%iend = 0
2300 cycle
2301 END IF
2302 iatom_start = blocks(i)%istart
2303 iatom_end = blocks(i)%iend
2304 blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
2305 iatom_start, iatom_end, iatom_start, iatom_end, &
2306 iatom_start, iatom_end, iatom_start, iatom_end, &
2307 particle_set, &
2308 coeffs_set, coeffs_kind, &
2309 is_assoc_atomic_block_global, do_periodic, &
2310 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
2311 cell, &
2312 do_p_screening, map_atom_to_kind_atom, eval_type, &
2313 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
2314
2315 END DO
2316 END SUBROUTINE init_blocks
2317
2318! **************************************************************************************************
2319!> \brief ...
2320!> \param para_env ...
2321!> \param x_data ...
2322!> \param iw ...
2323!> \param n_threads ...
2324!> \param i_thread ...
2325!> \param eval_type ...
2326! **************************************************************************************************
2327 SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
2328 eval_type)
2329
2330 TYPE(mp_para_env_type), INTENT(IN) :: para_env
2331 TYPE(hfx_type), POINTER :: x_data
2332 INTEGER, INTENT(IN) :: iw, n_threads, i_thread, eval_type
2333
2334 INTEGER :: i, j, k, my_rank, nbins, nranks, &
2335 total_bins
2336 INTEGER(int_8) :: avg_bin, avg_rank, max_bin, max_rank, &
2337 min_bin, min_rank, sum_bin, sum_rank
2338 INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer, buffer_in, buffer_out, summary
2339 INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE :: shm_cost_vector
2340 INTEGER, ALLOCATABLE, DIMENSION(:) :: bins_per_rank, rdispl, sort_idx
2341 INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: shm_bins_per_rank, shm_displ
2342
2343 SELECT CASE (eval_type)
2344 CASE (hfx_do_eval_energy)
2345 nbins = SIZE(x_data%distribution_energy)
2346 CASE (hfx_do_eval_forces)
2347 nbins = SIZE(x_data%distribution_forces)
2348 END SELECT
2349
2350!$OMP MASTER
2351 ALLOCATE (shm_bins_per_rank(n_threads))
2352 ALLOCATE (shm_displ(n_threads + 1))
2353!$OMP END MASTER
2354!$OMP BARRIER
2355
2356 shm_bins_per_rank(i_thread + 1) = nbins
2357!$OMP BARRIER
2358 nbins = 0
2359 DO i = 1, n_threads
2360 nbins = nbins + shm_bins_per_rank(i)
2361 END DO
2362 my_rank = para_env%mepos
2363 nranks = para_env%num_pe
2364
2365!$OMP BARRIER
2366!$OMP MASTER
2367 ALLOCATE (bins_per_rank(nranks))
2368 bins_per_rank = 0
2369
2370 bins_per_rank(my_rank + 1) = nbins
2371
2372 CALL para_env%sum(bins_per_rank)
2373
2374 total_bins = 0
2375 DO i = 1, nranks
2376 total_bins = total_bins + bins_per_rank(i)
2377 END DO
2378
2379 ALLOCATE (shm_cost_vector(2*total_bins))
2380 shm_cost_vector = -1_int_8
2381 shm_displ(1) = 1
2382 DO i = 2, n_threads
2383 shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
2384 END DO
2385 shm_displ(n_threads + 1) = nbins + 1
2386!$OMP END MASTER
2387!$OMP BARRIER
2388 j = 0
2389 SELECT CASE (eval_type)
2390 CASE (hfx_do_eval_energy)
2391 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2392 j = j + 1
2393 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
2394 shm_cost_vector(2*i) = int(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, kind=int_8)
2395 END DO
2396 CASE (hfx_do_eval_forces)
2397 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2398 j = j + 1
2399 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
2400 shm_cost_vector(2*i) = int(x_data%distribution_forces(j)%time_forces*10000.0_dp, kind=int_8)
2401 END DO
2402 END SELECT
2403!$OMP BARRIER
2404!$OMP MASTER
2405 ! ** calculate offsets
2406 ALLOCATE (rdispl(nranks))
2407 bins_per_rank(:) = bins_per_rank(:)*2
2408 rdispl(1) = 0
2409 DO i = 2, nranks
2410 rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
2411 END DO
2412
2413 ALLOCATE (buffer_in(2*nbins))
2414 ALLOCATE (buffer_out(2*total_bins))
2415
2416 DO i = 1, nbins
2417 buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
2418 buffer_in(2*i) = shm_cost_vector(2*i)
2419 END DO
2420
2421 CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
2422
2423 IF (iw > 0) THEN
2424
2425 ALLOCATE (summary(2*nranks))
2426 summary = 0_int_8
2427
2428 WRITE (iw, '( /, 1X, 79("-") )')
2429 WRITE (iw, '( " -", 77X, "-" )')
2430 SELECT CASE (eval_type)
2431 CASE (hfx_do_eval_energy)
2432 WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
2433 CASE (hfx_do_eval_forces)
2434 WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
2435 END SELECT
2436 WRITE (iw, '( " -", 77X, "-" )')
2437 WRITE (iw, '( 1X, 79("-") )')
2438
2439 WRITE (iw, fmt="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
2440 WRITE (iw, '( 1X, 79("-"), / )')
2441 k = 0
2442 DO i = 1, nranks
2443 DO j = 1, bins_per_rank(i)/2
2444 k = k + 1
2445 WRITE (iw, fmt="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
2446 i - 1, j, buffer_out(2*(k - 1) + 1), real(buffer_out(2*k), dp)/10000.0_dp
2447 summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
2448 summary(2*i) = summary(2*i) + buffer_out(2*k)
2449 END DO
2450 END DO
2451
2452 !** Summary
2453 max_bin = 0_int_8
2454 min_bin = huge(min_bin)
2455 sum_bin = 0_int_8
2456 DO i = 1, total_bins
2457 sum_bin = sum_bin + buffer_out(2*i)
2458 max_bin = max(max_bin, buffer_out(2*i))
2459 min_bin = min(min_bin, buffer_out(2*i))
2460 END DO
2461 avg_bin = sum_bin/total_bins
2462
2463 max_rank = 0_int_8
2464 min_rank = huge(min_rank)
2465 sum_rank = 0_int_8
2466 DO i = 1, nranks
2467 sum_rank = sum_rank + summary(2*i)
2468 max_rank = max(max_rank, summary(2*i))
2469 min_rank = min(min_rank, summary(2*i))
2470 END DO
2471 avg_rank = sum_rank/nranks
2472
2473 WRITE (iw, fmt='(/,T3,A,/)') "SUMMARY:"
2474 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max bin", real(max_bin, dp)/10000.0_dp
2475 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min bin", real(min_bin, dp)/10000.0_dp
2476 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum bin", real(sum_bin, dp)/10000.0_dp
2477 WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg bin", real(avg_bin, dp)/10000.0_dp
2478 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max rank", real(max_rank, dp)/10000.0_dp
2479 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min rank", real(min_rank, dp)/10000.0_dp
2480 WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum rank", real(sum_rank, dp)/10000.0_dp
2481 WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg rank", real(avg_rank, dp)/10000.0_dp
2482
2483 ALLOCATE (buffer(nranks))
2484 ALLOCATE (sort_idx(nranks))
2485
2486 DO i = 1, nranks
2487 buffer(i) = summary(2*i)
2488 END DO
2489
2490 CALL sort(buffer, nranks, sort_idx)
2491
2492 WRITE (iw, fmt="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
2493 DO i = nranks, 1, -1
2494 WRITE (iw, fmt="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), real(buffer(i), dp)/10000.0_dp
2495 END DO
2496
2497 DEALLOCATE (summary, buffer, sort_idx)
2498
2499 END IF
2500
2501 DEALLOCATE (buffer_in, buffer_out, rdispl)
2502
2503 CALL para_env%sync()
2504
2505 DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
2506!$OMP END MASTER
2507!$OMP BARRIER
2508
2509 END SUBROUTINE collect_load_balance_info
2510
2511
2512! **************************************************************************************************
2513!> \brief This routine calculates the maximum density matrix element, when
2514!> screening on an initial density matrix is applied. Due to symmetry of
2515!> the ERI's, there are always 4 matrix elements to be considered.
2516!> CASE 0-15 belong to an energy calculation (linear screening)
2517!> CASE 16-31 belong to a force calculation (square screening)
2518!> \param ptr_p_1 Pointers to atomic density matrices
2519!> \param ptr_p_2 Pointers to atomic density matrices
2520!> \param ptr_p_3 Pointers to atomic density matrices
2521!> \param ptr_p_4 Pointers to atomic density matrices
2522!> \param iset Current set
2523!> \param jset Current set
2524!> \param kset Current set
2525!> \param lset Current set
2526!> \param pmax_val value to be calculated
2527!> \param swap_id Defines how the matrices are accessed
2528!> \par History
2529!> 06.2009 created [Manuel Guidon]
2530!> \author Manuel Guidon
2531! **************************************************************************************************
2532PURE SUBROUTINE get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, iset, jset, kset, lset, pmax_val, swap_id)
2533
2534 REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
2535 INTEGER, INTENT(IN) :: iset, jset, kset, lset
2536
2537 REAL(dp), INTENT(OUT) :: pmax_val
2538 INTEGER, INTENT(IN) :: swap_id
2539
2540 REAL(dp) :: pmax_1, pmax_2, pmax_3, pmax_4
2541
2542 SELECT CASE (swap_id)
2543 CASE (0)
2544 pmax_1 = ptr_p_1(kset, iset)
2545 pmax_2 = ptr_p_2(lset, jset)
2546 pmax_3 = ptr_p_3(lset, iset)
2547 pmax_4 = ptr_p_4(kset, jset)
2548 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2549 CASE (1)
2550 pmax_1 = ptr_p_1(iset, kset)
2551 pmax_2 = ptr_p_2(lset, jset)
2552 pmax_3 = ptr_p_3(lset, iset)
2553 pmax_4 = ptr_p_4(kset, jset)
2554 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2555 CASE (2)
2556 pmax_1 = ptr_p_1(kset, iset)
2557 pmax_2 = ptr_p_2(jset, lset)
2558 pmax_3 = ptr_p_3(lset, iset)
2559 pmax_4 = ptr_p_4(kset, jset)
2560 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2561 CASE (3)
2562 pmax_1 = ptr_p_1(iset, kset)
2563 pmax_2 = ptr_p_2(jset, lset)
2564 pmax_3 = ptr_p_3(lset, iset)
2565 pmax_4 = ptr_p_4(kset, jset)
2566 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2567 CASE (4)
2568 pmax_1 = ptr_p_1(kset, iset)
2569 pmax_2 = ptr_p_2(lset, jset)
2570 pmax_3 = ptr_p_3(iset, lset)
2571 pmax_4 = ptr_p_4(kset, jset)
2572 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2573 CASE (5)
2574 pmax_1 = ptr_p_1(iset, kset)
2575 pmax_2 = ptr_p_2(lset, jset)
2576 pmax_3 = ptr_p_3(iset, lset)
2577 pmax_4 = ptr_p_4(kset, jset)
2578 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2579 CASE (6)
2580 pmax_1 = ptr_p_1(kset, iset)
2581 pmax_2 = ptr_p_2(jset, lset)
2582 pmax_3 = ptr_p_3(iset, lset)
2583 pmax_4 = ptr_p_4(kset, jset)
2584 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2585 CASE (7)
2586 pmax_1 = ptr_p_1(iset, kset)
2587 pmax_2 = ptr_p_2(jset, lset)
2588 pmax_3 = ptr_p_3(iset, lset)
2589 pmax_4 = ptr_p_4(kset, jset)
2590 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2591 CASE (8)
2592 pmax_1 = ptr_p_1(kset, iset)
2593 pmax_2 = ptr_p_2(lset, jset)
2594 pmax_3 = ptr_p_3(lset, iset)
2595 pmax_4 = ptr_p_4(jset, kset)
2596 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2597 CASE (9)
2598 pmax_1 = ptr_p_1(iset, kset)
2599 pmax_2 = ptr_p_2(lset, jset)
2600 pmax_3 = ptr_p_3(lset, iset)
2601 pmax_4 = ptr_p_4(jset, kset)
2602 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2603 CASE (10)
2604 pmax_1 = ptr_p_1(kset, iset)
2605 pmax_2 = ptr_p_2(jset, lset)
2606 pmax_3 = ptr_p_3(lset, iset)
2607 pmax_4 = ptr_p_4(jset, kset)
2608 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2609 CASE (11)
2610 pmax_1 = ptr_p_1(iset, kset)
2611 pmax_2 = ptr_p_2(jset, lset)
2612 pmax_3 = ptr_p_3(lset, iset)
2613 pmax_4 = ptr_p_4(jset, kset)
2614 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2615 CASE (12)
2616 pmax_1 = ptr_p_1(kset, iset)
2617 pmax_2 = ptr_p_2(lset, jset)
2618 pmax_3 = ptr_p_3(iset, lset)
2619 pmax_4 = ptr_p_4(jset, kset)
2620 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2621 CASE (13)
2622 pmax_1 = ptr_p_1(iset, kset)
2623 pmax_2 = ptr_p_2(lset, jset)
2624 pmax_3 = ptr_p_3(iset, lset)
2625 pmax_4 = ptr_p_4(jset, kset)
2626 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2627 CASE (14)
2628 pmax_1 = ptr_p_1(kset, iset)
2629 pmax_2 = ptr_p_2(jset, lset)
2630 pmax_3 = ptr_p_3(iset, lset)
2631 pmax_4 = ptr_p_4(jset, kset)
2632 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2633 CASE (15)
2634 pmax_1 = ptr_p_1(iset, kset)
2635 pmax_2 = ptr_p_2(jset, lset)
2636 pmax_3 = ptr_p_3(iset, lset)
2637 pmax_4 = ptr_p_4(jset, kset)
2638 pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2639 CASE (16)
2640 pmax_1 = ptr_p_1(kset, iset)
2641 pmax_2 = ptr_p_2(lset, jset)
2642 pmax_3 = ptr_p_3(lset, iset)
2643 pmax_4 = ptr_p_4(kset, jset)
2644 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2645 CASE (17)
2646 pmax_1 = ptr_p_1(iset, kset)
2647 pmax_2 = ptr_p_2(lset, jset)
2648 pmax_3 = ptr_p_3(lset, iset)
2649 pmax_4 = ptr_p_4(kset, jset)
2650 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2651 CASE (18)
2652 pmax_1 = ptr_p_1(kset, iset)
2653 pmax_2 = ptr_p_2(jset, lset)
2654 pmax_3 = ptr_p_3(lset, iset)
2655 pmax_4 = ptr_p_4(kset, jset)
2656 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2657 CASE (19)
2658 pmax_1 = ptr_p_1(iset, kset)
2659 pmax_2 = ptr_p_2(jset, lset)
2660 pmax_3 = ptr_p_3(lset, iset)
2661 pmax_4 = ptr_p_4(kset, jset)
2662 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2663 CASE (20)
2664 pmax_1 = ptr_p_1(kset, iset)
2665 pmax_2 = ptr_p_2(lset, jset)
2666 pmax_3 = ptr_p_3(iset, lset)
2667 pmax_4 = ptr_p_4(kset, jset)
2668 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2669 CASE (21)
2670 pmax_1 = ptr_p_1(iset, kset)
2671 pmax_2 = ptr_p_2(lset, jset)
2672 pmax_3 = ptr_p_3(iset, lset)
2673 pmax_4 = ptr_p_4(kset, jset)
2674 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2675 CASE (22)
2676 pmax_1 = ptr_p_1(kset, iset)
2677 pmax_2 = ptr_p_2(jset, lset)
2678 pmax_3 = ptr_p_3(iset, lset)
2679 pmax_4 = ptr_p_4(kset, jset)
2680 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2681 CASE (23)
2682 pmax_1 = ptr_p_1(iset, kset)
2683 pmax_2 = ptr_p_2(jset, lset)
2684 pmax_3 = ptr_p_3(iset, lset)
2685 pmax_4 = ptr_p_4(kset, jset)
2686 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2687 CASE (24)
2688 pmax_1 = ptr_p_1(kset, iset)
2689 pmax_2 = ptr_p_2(lset, jset)
2690 pmax_3 = ptr_p_3(lset, iset)
2691 pmax_4 = ptr_p_4(jset, kset)
2692 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2693 CASE (25)
2694 pmax_1 = ptr_p_1(iset, kset)
2695 pmax_2 = ptr_p_2(lset, jset)
2696 pmax_3 = ptr_p_3(lset, iset)
2697 pmax_4 = ptr_p_4(jset, kset)
2698 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2699 CASE (26)
2700 pmax_1 = ptr_p_1(kset, iset)
2701 pmax_2 = ptr_p_2(jset, lset)
2702 pmax_3 = ptr_p_3(lset, iset)
2703 pmax_4 = ptr_p_4(jset, kset)
2704 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2705 CASE (27)
2706 pmax_1 = ptr_p_1(iset, kset)
2707 pmax_2 = ptr_p_2(jset, lset)
2708 pmax_3 = ptr_p_3(lset, iset)
2709 pmax_4 = ptr_p_4(jset, kset)
2710 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2711 CASE (28)
2712 pmax_1 = ptr_p_1(kset, iset)
2713 pmax_2 = ptr_p_2(lset, jset)
2714 pmax_3 = ptr_p_3(iset, lset)
2715 pmax_4 = ptr_p_4(jset, kset)
2716 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2717 CASE (29)
2718 pmax_1 = ptr_p_1(iset, kset)
2719 pmax_2 = ptr_p_2(lset, jset)
2720 pmax_3 = ptr_p_3(iset, lset)
2721 pmax_4 = ptr_p_4(jset, kset)
2722 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2723 CASE (30)
2724 pmax_1 = ptr_p_1(kset, iset)
2725 pmax_2 = ptr_p_2(jset, lset)
2726 pmax_3 = ptr_p_3(iset, lset)
2727 pmax_4 = ptr_p_4(jset, kset)
2728 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2729 CASE (31)
2730 pmax_1 = ptr_p_1(iset, kset)
2731 pmax_2 = ptr_p_2(jset, lset)
2732 pmax_3 = ptr_p_3(iset, lset)
2733 pmax_4 = ptr_p_4(jset, kset)
2734 pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2735 END SELECT
2736
2737END SUBROUTINE get_pmax_val
2738
2739
2740END MODULE hfx_load_balance_methods
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
Handles all functions related to the CELL.
Definition cell_types.F:15
Utility routines to open and close files. Tracking of preconnections.
Definition cp_files.F:16
subroutine, public open_file(file_name, file_status, file_form, file_action, file_position, file_pad, unit_number, debug, skip_get_unit_number, file_access)
Opens the requested file using a free unit number.
Definition cp_files.F:308
subroutine, public close_file(unit_number, file_status, keep_preconnection)
Close an open file given by its logical unit number. Optionally, keep the file and unit preconnected.
Definition cp_files.F:119
Routines for optimizing load balance between processes in HFX calculations.
real(kind=dp), dimension(12), parameter, public p1_energy
real(kind=dp), dimension(2), parameter, public p3_energy
real(kind=dp), dimension(12), parameter, public p2_energy
subroutine, public collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, eval_type)
...
subroutine, public hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, coeffs_set, coeffs_kind, is_assoc_atomic_block_global, do_periodic, load_balance_parameter, kind_of, basis_parameter, pmax_set, pmax_atom, i_thread, n_threads, cell, do_p_screening, map_atom_to_kind_atom, nkind, eval_type, pmax_block, use_virial)
Distributes the computation of eri's to all available processes.
integer(kind=int_8) function, public cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3)
estimates the cost of a set quartet with info available at load balance time i.e. without much info o...
subroutine, public hfx_update_load_balance(x_data, para_env, load_balance_parameter, i_thread, n_threads, eval_type)
Cheap way of redistributing the eri's.
Routines for optimizing load balance between processes in HFX calculations.
subroutine, public build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, blocks)
...
subroutine, public build_pair_list(natom, list, set_list, i_start, i_end, j_start, j_end, kind_of, basis_parameter, particle_set, do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
...
Types and set/get functions for HFX.
Definition hfx_types.F:15
subroutine, public hfx_set_distr_energy(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the energy
Definition hfx_types.F:2568
subroutine, public hfx_set_distr_forces(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the forces
Definition hfx_types.F:2588
collects all constants needed in input so that they can be used without circular dependencies
integer, parameter, public hfx_do_eval_energy
integer, parameter, public hfx_do_eval_forces
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
Interface to the message passing library MPI.
Parallel (pseudo)random number generator (RNG) for multiple streams and substreams of random numbers.
integer, parameter, public uniform
Define the data structure for the particle information.
All kind of helpful little routines.
Definition util.F:14
Type defining parameters related to the simulation cell.
Definition cell_types.F:55
stores some data used in construction of Kohn-Sham matrix
Definition hfx_types.F:509
stores all the informations relevant to an mpi environment