(git:ccc2433)
hfx_load_balance_methods.F
Go to the documentation of this file.
1 !--------------------------------------------------------------------------------------------------!
2 ! CP2K: A general program to perform molecular dynamics simulations !
3 ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 ! !
5 ! SPDX-License-Identifier: GPL-2.0-or-later !
6 !--------------------------------------------------------------------------------------------------!
7 
8 ! **************************************************************************************************
9 !> \brief Routines for optimizing load balance between processes in HFX calculations
10 !> \par History
11 !> 04.2008 created [Manuel Guidon]
12 !> \author Manuel Guidon
13 ! **************************************************************************************************
15  USE cell_types, ONLY: cell_type
16  USE cp_files, ONLY: close_file, &
17  open_file
18  USE message_passing, ONLY: mp_para_env_type
21  USE hfx_types, ONLY: &
22  hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, &
23  hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, &
24  pair_list_type, pair_set_list_type
27  USE kinds, ONLY: dp, &
28  int_8
29  USE message_passing, ONLY: mp_waitall, mp_request_type
30  USE parallel_rng_types, ONLY: uniform, &
31  rng_stream_type
32  USE particle_types, ONLY: particle_type
33  USE util, ONLY: sort
34 #include "./base/base_uses.f90"
35 
36  IMPLICIT NONE
37  PRIVATE
38 
39  PUBLIC :: hfx_load_balance, &
42 
43  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
44 
45  REAL(kind=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
46  -1.91570128356921242e-002_dp, -1.6668495454436603_dp, &
47  1.7512639006523709_dp, -9.76074323945336081e-002_dp, &
48  2.6230786127311889_dp, -0.31870737623014189_dp, &
49  7.9588203912690973_dp, 1.8331423413134813_dp, &
50  -0.15427618665346299_dp, 0.19749436090711650_dp/)
51  REAL(kind=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
52  -9.36564055598656797e-002_dp, 0.64284973765086939_dp, &
53  1.0137565430060556_dp, -6.80088178288954567e-003_dp, &
54  1.1692629207374552_dp, -2.6314710080507573_dp, &
55  19.237814781880786_dp, 1.0505934173661349_dp, &
56  0.80382371955699250_dp, 0.49903401991818103_dp/)
57  REAL(kind=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350e-002_dp, 0.38073304105744837_dp/)
58  REAL(kind=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
59  -9.41673106447732111e-002_dp, 0.94568006899317825_dp, &
60  -1.4511897117448544_dp, 0.59178934677316952_dp, &
61  2.7291149361757236_dp, -0.50555512044800210_dp, &
62  8.3508180969609871_dp, 1.6829982496141809_dp, &
63  -0.74895370472152600_dp, 0.43801726744197500_dp/)
64  REAL(kind=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
65  5.33216585432061581e-003_dp, 0.45572145697283628_dp, &
66  1.8119743851500618_dp, -0.12533918548421166_dp, &
67  -1.4040312084552751_dp, -4.5331650463917859_dp, &
68  12.593431549069477_dp, 1.1311978374487595_dp, &
69  1.4245996087624646_dp, 1.1425350529853495_dp/)
70  REAL(kind=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)
71 
72 !***
73 
74 CONTAINS
75 
76 ! **************************************************************************************************
77 !> \brief Distributes the computation of eri's to all available processes.
78 !> \param x_data Object that stores the indices array
79 !> \param eps_schwarz screening parameter
80 !> \param particle_set , atomic_kind_set, para_env ...
81 !> \param max_set Maximum number of set to be considered
82 !> \param para_env para_env
83 !> \param coeffs_set screening functions
84 !> \param coeffs_kind screening functions
85 !> \param is_assoc_atomic_block_global KS-matrix sparsity
86 !> \param do_periodic flag for periodicity
87 !> \param load_balance_parameter Parameters for Monte-Carlo routines
88 !> \param kind_of helper array for mapping
89 !> \param basis_parameter Basis set parameters
90 !> \param pmax_set Initial screening matrix
91 !> \param pmax_atom ...
92 !> \param i_thread Process ID of current Thread
93 !> \param n_threads Total Number of Threads
94 !> \param cell cell
95 !> \param do_p_screening Flag for initial p screening
96 !> \param map_atom_to_kind_atom ...
97 !> \param nkind ...
98 !> \param eval_type ...
99 !> \param pmax_block ...
100 !> \param use_virial ...
101 !> \par History
102 !> 06.2007 created [Manuel Guidon]
103 !> 08.2007 new parallel scheme [Manuel Guidon]
104 !> 09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
105 !> 11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
106 !> 02.2009 completely refactored [Manuel Guidon]
107 !> \author Manuel Guidon
108 !> \note
109 !> The optimization is done via a binning procedure followed by simple
110 !> Monte Carlo procedure:
111 !> In a first step the total amount of integrals in the system is calculated,
112 !> taking into account the sparsity of the KS-matrix , the screening based
113 !> on near/farfield approximations and if desired the screening on an initial
114 !> density matrix.
115 !> In a second step, bins are generate that contain approximately the same number
116 !> of integrals, and a cost for these bins is estimated (currently the number of integrals)
117 !> In a third step, a Monte Carlo procedure optimizes the assignment
118 !> of the different loads to each process
119 !> At the end each process owns an unique array of *atomic* indices-ranges
120 !> that are used to decide whether a process has to calculate a certain
121 !> bunch of integrals or not
122 ! **************************************************************************************************
123  SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
124  coeffs_set, coeffs_kind, &
125  is_assoc_atomic_block_global, do_periodic, &
126  load_balance_parameter, kind_of, basis_parameter, pmax_set, &
127  pmax_atom, i_thread, n_threads, cell, &
128  do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
129  pmax_block, use_virial)
130  TYPE(hfx_type), POINTER :: x_data
131  REAL(dp), INTENT(IN) :: eps_schwarz
132  TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
133  INTEGER, INTENT(IN) :: max_set
134  TYPE(mp_para_env_type), POINTER :: para_env
135  TYPE(hfx_screen_coeff_type), &
136  DIMENSION(:, :, :, :), POINTER :: coeffs_set
137  TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
138  POINTER :: coeffs_kind
139  INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
140  LOGICAL :: do_periodic
141  TYPE(hfx_load_balance_type), POINTER :: load_balance_parameter
142  INTEGER :: kind_of(*)
143  TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
144  TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
145  REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
146  INTEGER, INTENT(IN) :: i_thread, n_threads
147  TYPE(cell_type), POINTER :: cell
148  LOGICAL, INTENT(IN) :: do_p_screening
149  INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
150  INTEGER, INTENT(IN) :: nkind, eval_type
151  REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
152  LOGICAL, INTENT(IN) :: use_virial
153 
154  CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_load_balance'
155 
156  CHARACTER(LEN=512) :: error_msg
157  INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
158  handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
159  jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
160  latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
161  new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
162  objective_block_size, objective_nblocks, source, total_blocks
163  TYPE(mp_request_type), DIMENSION(2) :: req
164  INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
165  distribution_counter_end, distribution_counter_start, global_quartet_counter, &
166  local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
167  INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer_in, buffer_out
168  INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
169  sendbuffer, swapbuffer
170  INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
171  INTEGER(int_8), SAVE :: shm_global_quartet_counter, &
172  shm_local_quartet_counter
173  INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, tmp_index, tmp_pos, &
174  to_be_sorted
175  INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
176  INTEGER, SAVE :: shm_nblocks
177  LOGICAL :: changed, last_bin_needs_to_be_filled, &
178  optimized
179  LOGICAL, DIMENSION(:, :), POINTER, SAVE :: atomic_pair_list
180  REAL(dp) :: coeffs_kind_max0, log10_eps_schwarz, &
181  log_2, pmax_blocks
182  TYPE(hfx_block_range_type), DIMENSION(:), POINTER :: blocks_guess, tmp_blocks, tmp_blocks2
183  TYPE(hfx_block_range_type), DIMENSION(:), &
184  POINTER, SAVE :: shm_blocks
185  TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
186  TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
187  SAVE :: full_dist
188  TYPE(pair_list_type) :: list_ij, list_kl
189  TYPE(pair_set_list_type), ALLOCATABLE, &
190  DIMENSION(:) :: set_list_ij, set_list_kl
191 
192 !$OMP BARRIER
193 !$OMP MASTER
194  CALL timeset(routinen, handle)
195 !$OMP END MASTER
196 !$OMP BARRIER
197 
198  log10_eps_schwarz = log10(eps_schwarz)
199  log_2 = log10(2.0_dp)
200  coeffs_kind_max0 = maxval(coeffs_kind(:, :)%x(2))
201  ncpu = para_env%num_pe
202  n_processes = ncpu*n_threads
203  natom = SIZE(particle_set)
204 
205  block_size = load_balance_parameter%block_size
206  ALLOCATE (set_list_ij((max_set*block_size)**2))
207  ALLOCATE (set_list_kl((max_set*block_size)**2))
208 
209  IF (.NOT. load_balance_parameter%blocks_initialized) THEN
210 !$OMP BARRIER
211 !$OMP MASTER
212  CALL timeset(routinen//"_range", handle_range)
213 
214  nblocks = max((natom + block_size - 1)/block_size, 1)
215  ALLOCATE (blocks_guess(nblocks))
216  ALLOCATE (tmp_blocks(natom))
217  ALLOCATE (tmp_blocks2(natom))
218 
219  pmax_blocks = 0.0_dp
220  SELECT CASE (eval_type)
221  CASE (hfx_do_eval_energy)
222  atomic_pair_list => x_data%atomic_pair_list
223  CASE (hfx_do_eval_forces)
224  atomic_pair_list => x_data%atomic_pair_list_forces
225  END SELECT
226  atomic_pair_list = .true.
227  CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
228  list_ij, list_kl, set_list_ij, set_list_kl, &
229  particle_set, &
230  coeffs_set, coeffs_kind, &
231  is_assoc_atomic_block_global, do_periodic, &
232  kind_of, basis_parameter, pmax_set, pmax_atom, &
233  pmax_blocks, cell, &
234  do_p_screening, map_atom_to_kind_atom, eval_type, &
235  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
236 
237  total_block_self_cost = 0
238 
239  DO i = 1, nblocks
240  total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
241  END DO
242 
243  CALL para_env%sum(total_block_self_cost)
244 
245  objective_block_size = load_balance_parameter%block_size
246  objective_nblocks = max((natom + objective_block_size - 1)/objective_block_size, 1)
247 
248  self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
249 
250  DO i = 1, nblocks
251  tmp_blocks2(i) = blocks_guess(i)
252  END DO
253 
254  optimized = .false.
255  i = 0
256  DO WHILE (.NOT. optimized)
257  i = i + 1
258  current_block_id = 0
259  changed = .false.
260  DO atom_block = 1, nblocks
261  current_block_id = current_block_id + 1
262  iatom_start = tmp_blocks2(atom_block)%istart
263  iatom_end = tmp_blocks2(atom_block)%iend
264  IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
265  changed = .true.
266  new_iatom_start = iatom_start
267  new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
268  new_jatom_start = new_iatom_end + 1
269  new_jatom_end = iatom_end
270  tmp_blocks(current_block_id)%istart = new_iatom_start
271  tmp_blocks(current_block_id)%iend = new_iatom_end
272  tmp_blocks(current_block_id)%cost = estimate_block_cost( &
273  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
274  new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
275  new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
276  particle_set, &
277  coeffs_set, coeffs_kind, &
278  is_assoc_atomic_block_global, do_periodic, &
279  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
280  cell, &
281  do_p_screening, map_atom_to_kind_atom, eval_type, &
282  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
283  current_block_id = current_block_id + 1
284  tmp_blocks(current_block_id)%istart = new_jatom_start
285  tmp_blocks(current_block_id)%iend = new_jatom_end
286  tmp_blocks(current_block_id)%cost = estimate_block_cost( &
287  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
288  new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
289  new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
290  particle_set, &
291  coeffs_set, coeffs_kind, &
292  is_assoc_atomic_block_global, do_periodic, &
293  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
294  cell, &
295  do_p_screening, map_atom_to_kind_atom, eval_type, &
296  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
297  ELSE
298  tmp_blocks(current_block_id)%istart = iatom_start
299  tmp_blocks(current_block_id)%iend = iatom_end
300  tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
301  END IF
302  END DO
303  IF (.NOT. changed) optimized = .true.
304  IF (i > 20) optimized = .true.
305  nblocks = current_block_id
306  DO atom_block = 1, nblocks
307  tmp_blocks2(atom_block) = tmp_blocks(atom_block)
308  END DO
309  END DO
310 
311  DEALLOCATE (tmp_blocks2)
312 
313  ! ** count number of non empty blocks on each node
314  non_empty_blocks = 0
315  DO atom_block = 1, nblocks
316  IF (tmp_blocks(atom_block)%istart == 0) cycle
317  non_empty_blocks = non_empty_blocks + 1
318  END DO
319 
320  ALLOCATE (rcount(ncpu))
321  rcount = 0
322  rcount(para_env%mepos + 1) = non_empty_blocks
323  CALL para_env%sum(rcount)
324 
325  ! ** sum all non_empty_blocks
326  total_blocks = 0
327  DO i = 1, ncpu
328  total_blocks = total_blocks + rcount(i)
329  END DO
330 
331  ! ** calculate offsets
332  ALLOCATE (rdispl(ncpu))
333  rcount(:) = rcount(:)*3
334  rdispl(1) = 0
335  DO i = 2, ncpu
336  rdispl(i) = rdispl(i - 1) + rcount(i - 1)
337  END DO
338 
339  ALLOCATE (buffer_in(3*non_empty_blocks))
340 
341  non_empty_blocks = 0
342  DO atom_block = 1, nblocks
343  IF (tmp_blocks(atom_block)%istart == 0) cycle
344  buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
345  buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
346  buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
347  non_empty_blocks = non_empty_blocks + 1
348  END DO
349 
350  nblocks = total_blocks
351 
352  ALLOCATE (tmp_blocks2(nblocks))
353 
354  ALLOCATE (buffer_out(3*nblocks))
355 
356  ! ** Gather all three arrays
357  CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
358 
359  DO i = 1, nblocks
360  tmp_blocks2(i)%istart = int(buffer_out((i - 1)*3 + 1))
361  tmp_blocks2(i)%iend = int(buffer_out((i - 1)*3 + 2))
362  tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
363  END DO
364 
365  ! ** Now we sort the blocks
366  ALLOCATE (to_be_sorted(nblocks))
367  ALLOCATE (tmp_index(nblocks))
368 
369  DO atom_block = 1, nblocks
370  to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
371  END DO
372 
373  CALL sort(to_be_sorted, nblocks, tmp_index)
374 
375  ALLOCATE (x_data%blocks(nblocks))
376 
377  DO atom_block = 1, nblocks
378  x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
379  END DO
380 
381  shm_blocks => x_data%blocks
382  shm_nblocks = nblocks
383 
384  ! ** Set nblocks in structure
385  load_balance_parameter%nblocks = nblocks
386 
387  DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
388 
389  DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
390 
391  load_balance_parameter%blocks_initialized = .true.
392 
393  x_data%blocks = shm_blocks
394  load_balance_parameter%nblocks = shm_nblocks
395  load_balance_parameter%blocks_initialized = .true.
396 
397  ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
398  x_data%pmax_block = 0.0_dp
399  pmax_block => x_data%pmax_block
400  CALL timestop(handle_range)
401 !$OMP END MASTER
402 !$OMP BARRIER
403 
404  IF (.NOT. load_balance_parameter%blocks_initialized) THEN
405  ALLOCATE (x_data%blocks(shm_nblocks))
406  x_data%blocks = shm_blocks
407  load_balance_parameter%nblocks = shm_nblocks
408  load_balance_parameter%blocks_initialized = .true.
409  END IF
410  !! ** precalculate maximum density matrix elements in blocks
411 !$OMP BARRIER
412  END IF
413 
414 !$OMP BARRIER
415 !$OMP MASTER
416  pmax_block => x_data%pmax_block
417  pmax_block = 0.0_dp
418  IF (do_p_screening) THEN
419  DO iatom_block = 1, shm_nblocks
420  iatom_start = x_data%blocks(iatom_block)%istart
421  iatom_end = x_data%blocks(iatom_block)%iend
422  DO jatom_block = 1, shm_nblocks
423  jatom_start = x_data%blocks(jatom_block)%istart
424  jatom_end = x_data%blocks(jatom_block)%iend
425  pmax_block(iatom_block, jatom_block) = maxval(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
426  END DO
427  END DO
428  END IF
429 
430  SELECT CASE (eval_type)
431  CASE (hfx_do_eval_energy)
432  atomic_pair_list => x_data%atomic_pair_list
433  CASE (hfx_do_eval_forces)
434  atomic_pair_list => x_data%atomic_pair_list_forces
435  END SELECT
436  CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
437  do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
438  x_data%blocks)
439 
440 !$OMP END MASTER
441 !$OMP BARRIER
442 
443  !! If there is only 1 cpu skip the binning
444  IF (n_processes == 1) THEN
445  ALLOCATE (tmp_dist(1))
446  tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
447  tmp_dist(1)%istart = 0_int_8
448  ptr_to_tmp_dist => tmp_dist(:)
449  SELECT CASE (eval_type)
450  CASE (hfx_do_eval_energy)
451  CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
452  CASE (hfx_do_eval_forces)
453  CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
454  END SELECT
455  DEALLOCATE (tmp_dist)
456  ELSE
457  !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
458 !$OMP BARRIER
459 !$OMP MASTER
460  CALL timeset(routinen//"_count", handle_inner)
461 !$OMP END MASTER
462 !$OMP BARRIER
463 
464  cost_per_core = 0_int_8
465  my_process_id = para_env%mepos*n_threads + i_thread
466  nblocks = load_balance_parameter%nblocks
467 
468  DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
469 
470  latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
471  tmp_block = atom_block/nblocks
472  katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
473  IF (latom_block < katom_block) cycle
474  tmp_block = tmp_block/nblocks
475  jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
476  tmp_block = tmp_block/nblocks
477  iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
478  IF (jatom_block < iatom_block) cycle
479 
480  iatom_start = x_data%blocks(iatom_block)%istart
481  iatom_end = x_data%blocks(iatom_block)%iend
482  jatom_start = x_data%blocks(jatom_block)%istart
483  jatom_end = x_data%blocks(jatom_block)%iend
484  katom_start = x_data%blocks(katom_block)%istart
485  katom_end = x_data%blocks(katom_block)%iend
486  latom_start = x_data%blocks(latom_block)%istart
487  latom_end = x_data%blocks(latom_block)%iend
488 
489  SELECT CASE (eval_type)
490  CASE (hfx_do_eval_energy)
491  pmax_blocks = max(pmax_block(katom_block, iatom_block), &
492  pmax_block(latom_block, jatom_block), &
493  pmax_block(latom_block, iatom_block), &
494  pmax_block(katom_block, jatom_block))
495  CASE (hfx_do_eval_forces)
496  pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
497  pmax_block(latom_block, jatom_block), &
498  pmax_block(latom_block, iatom_block) + &
499  pmax_block(katom_block, jatom_block))
500  END SELECT
501 
502  IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
503 
504  cost_per_core = cost_per_core &
505  + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
506  iatom_start, iatom_end, jatom_start, jatom_end, &
507  katom_start, katom_end, latom_start, latom_end, &
508  particle_set, &
509  coeffs_set, coeffs_kind, &
510  is_assoc_atomic_block_global, do_periodic, &
511  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
512  cell, &
513  do_p_screening, map_atom_to_kind_atom, eval_type, &
514  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
515 
516  END DO ! atom_block
517 
518  nbins = load_balance_parameter%nbins
519  cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
520 
521 !$OMP BARRIER
522 !$OMP MASTER
523  CALL timestop(handle_inner)
524 !$OMP END MASTER
525 !$OMP BARRIER
526 
527 ! new load balancing test
528  IF (.false.) THEN
529  CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
530  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
531  particle_set, &
532  coeffs_set, coeffs_kind, &
533  is_assoc_atomic_block_global, do_periodic, &
534  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
535  cell, x_data, para_env, pmax_block, &
536  do_p_screening, map_atom_to_kind_atom, eval_type, &
537  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
538  END IF
539 
540 !$OMP BARRIER
541 !$OMP MASTER
542  CALL timeset(routinen//"_bin", handle_inner)
543 !$OMP END MASTER
544 !$OMP BARRIER
545 
546  ALLOCATE (binned_dist(nbins))
547  binned_dist(:)%istart = -1_int_8
548  binned_dist(:)%number_of_atom_quartets = 0_int_8
549  binned_dist(:)%cost = 0_int_8
550  binned_dist(:)%time_first_scf = 0.0_dp
551  binned_dist(:)%time_other_scf = 0.0_dp
552  binned_dist(:)%time_forces = 0.0_dp
553 
554  current_cost = 0
555  mepos = 1
556  distribution_counter_start = 1
557  distribution_counter_end = 0
558  ibin = 1
559 
560  global_quartet_counter = 0
561  local_quartet_counter = 0
562  last_bin_needs_to_be_filled = .false.
563  DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
564  latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
565  tmp_block = atom_block/nblocks
566  katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
567  IF (latom_block < katom_block) cycle
568  tmp_block = tmp_block/nblocks
569  jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
570  tmp_block = tmp_block/nblocks
571  iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
572  IF (jatom_block < iatom_block) cycle
573 
574  distribution_counter_end = distribution_counter_end + 1
575  global_quartet_counter = global_quartet_counter + 1
576  last_bin_needs_to_be_filled = .true.
577 
578  IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
579 
580  iatom_start = x_data%blocks(iatom_block)%istart
581  iatom_end = x_data%blocks(iatom_block)%iend
582  jatom_start = x_data%blocks(jatom_block)%istart
583  jatom_end = x_data%blocks(jatom_block)%iend
584  katom_start = x_data%blocks(katom_block)%istart
585  katom_end = x_data%blocks(katom_block)%iend
586  latom_start = x_data%blocks(latom_block)%istart
587  latom_end = x_data%blocks(latom_block)%iend
588 
589  SELECT CASE (eval_type)
590  CASE (hfx_do_eval_energy)
591  pmax_blocks = max(pmax_block(katom_block, iatom_block), &
592  pmax_block(latom_block, jatom_block), &
593  pmax_block(latom_block, iatom_block), &
594  pmax_block(katom_block, jatom_block))
595  CASE (hfx_do_eval_forces)
596  pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
597  pmax_block(latom_block, jatom_block), &
598  pmax_block(latom_block, iatom_block) + &
599  pmax_block(katom_block, jatom_block))
600  END SELECT
601 
602  IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
603 
604  current_cost = current_cost &
605  + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
606  iatom_start, iatom_end, jatom_start, jatom_end, &
607  katom_start, katom_end, latom_start, latom_end, &
608  particle_set, &
609  coeffs_set, coeffs_kind, &
610  is_assoc_atomic_block_global, do_periodic, &
611  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
612  cell, &
613  do_p_screening, map_atom_to_kind_atom, eval_type, &
614  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
615 
616  IF (current_cost >= cost_per_bin) THEN
617  IF (ibin == nbins) THEN
618  binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
619  distribution_counter_end - distribution_counter_start + 1
620  ELSE
621  binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
622  END IF
623  binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
624  ibin = min(ibin + 1, nbins)
625  distribution_counter_start = distribution_counter_end + 1
626  current_cost = 0
627  last_bin_needs_to_be_filled = .false.
628  END IF
629  END DO
630 
631 !$OMP BARRIER
632 !$OMP MASTER
633  CALL timestop(handle_inner)
634  CALL timeset(routinen//"_dist", handle_inner)
635 !$OMP END MASTER
636 !$OMP BARRIER
637  !! Fill the last bin if necessary
638  IF (last_bin_needs_to_be_filled) THEN
639  binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
640  IF (ibin == nbins) THEN
641  binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
642  distribution_counter_end - distribution_counter_start + 1
643  ELSE
644  binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
645  END IF
646  END IF
647 
648  !! Sanity-Check
649  DO ibin = 1, nbins
650  local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
651  END DO
652 !$OMP BARRIER
653 !$OMP MASTER
654  shm_local_quartet_counter = 0
655  shm_global_quartet_counter = 0
656 !$OMP END MASTER
657 !$OMP BARRIER
658 !$OMP ATOMIC
659  shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
660 !$OMP ATOMIC
661  shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
662 
663 !$OMP BARRIER
664 !$OMP MASTER
665  CALL para_env%sum(shm_local_quartet_counter)
666  CALL para_env%sum(shm_global_quartet_counter)
667  IF (para_env%is_source()) THEN
668  IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
669  WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
670  "Number of local quartets (", shm_local_quartet_counter, &
671  ") and number of global quartets (", shm_global_quartet_counter, &
672  ") are different. Please send in a bug report."
673  cpabort(error_msg)
674  END IF
675  END IF
676 !$OMP END MASTER
677 
678 !$OMP BARRIER
679 !$OMP MASTER
680  ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
681  cost_matrix = 0
682 !$OMP END MASTER
683 !$OMP BARRIER
684  icpu = para_env%mepos + 1
685  DO i = 1, nbins
686  cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
687  END DO
688  mepos = para_env%mepos
689 !$OMP BARRIER
690 
691 !$OMP MASTER
692  ! sync before/after ring of isendrecv
693  CALL para_env%sync()
694 
695  ALLOCATE (sendbuffer(nbins*n_threads))
696  ALLOCATE (recbuffer(nbins*n_threads))
697 
698  sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
699 
700  dest = modulo(mepos + 1, ncpu)
701  source = modulo(mepos - 1, ncpu)
702  DO icpu = 0, ncpu - 1
703  IF (icpu .NE. ncpu - 1) THEN
704  CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
705  req(1), req(2), 13)
706  END IF
707  data_from = modulo(mepos - icpu, ncpu)
708  cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
709  IF (icpu .NE. ncpu - 1) THEN
710  CALL mp_waitall(req)
711  END IF
712  swapbuffer => sendbuffer
713  sendbuffer => recbuffer
714  recbuffer => swapbuffer
715  END DO
716  DEALLOCATE (recbuffer, sendbuffer)
717 !$OMP END MASTER
718 !$OMP BARRIER
719 
720 !$OMP BARRIER
721 !$OMP MASTER
722  CALL timestop(handle_inner)
723  CALL timeset(routinen//"_opt", handle_inner)
724 !$OMP END MASTER
725 !$OMP BARRIER
726 
727  !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
728 !$OMP BARRIER
729  ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
730  local_cost_matrix = cost_matrix
731 !$OMP MASTER
732  ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
733 
734  CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
735  shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
736 
737  CALL timestop(handle_inner)
738  CALL timeset(routinen//"_redist", handle_inner)
739  !! Collect local data to global array
740  ALLOCATE (full_dist(ncpu*n_threads, nbins))
741 
742  full_dist(:, :)%istart = 0_int_8
743  full_dist(:, :)%number_of_atom_quartets = 0_int_8
744  full_dist(:, :)%cost = 0_int_8
745  full_dist(:, :)%time_first_scf = 0.0_dp
746  full_dist(:, :)%time_other_scf = 0.0_dp
747  full_dist(:, :)%time_forces = 0.0_dp
748 !$OMP END MASTER
749 !$OMP BARRIER
750  mepos = para_env%mepos + 1
751  full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
752 
753 !$OMP BARRIER
754 !$OMP MASTER
755  ALLOCATE (sendbuffer(3*nbins*n_threads))
756  ALLOCATE (recbuffer(3*nbins*n_threads))
757  mepos = para_env%mepos
758  DO j = 1, n_threads
759  DO i = 1, nbins
760  sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
761  sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
762  sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
763  END DO
764  END DO
765 
766  ! sync before/after ring of isendrecv
767  CALL para_env%sync()
768  dest = modulo(mepos + 1, ncpu)
769  source = modulo(mepos - 1, ncpu)
770  DO icpu = 0, ncpu - 1
771  IF (icpu .NE. ncpu - 1) THEN
772  CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
773  req(1), req(2), 13)
774  END IF
775  data_from = modulo(mepos - icpu, ncpu)
776  DO j = 1, n_threads
777  DO i = 1, nbins
778  full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
779  full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
780  full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
781  END DO
782  END DO
783 
784  IF (icpu .NE. ncpu - 1) THEN
785  CALL mp_waitall(req)
786  END IF
787  swapbuffer => sendbuffer
788  sendbuffer => recbuffer
789  recbuffer => swapbuffer
790  END DO
791  DEALLOCATE (recbuffer, sendbuffer)
792 
793  ! sync before/after ring of isendrecv
794  CALL para_env%sync()
795 !$OMP END MASTER
796 !$OMP BARRIER
797  !! reorder the distribution according to the distribution vector
798  ALLOCATE (tmp_pos(ncpu*n_threads))
799  tmp_pos = 1
800  ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
801 
802  tmp_dist(:)%istart = 0_int_8
803  tmp_dist(:)%number_of_atom_quartets = 0_int_8
804  tmp_dist(:)%cost = 0_int_8
805  tmp_dist(:)%time_first_scf = 0.0_dp
806  tmp_dist(:)%time_other_scf = 0.0_dp
807  tmp_dist(:)%time_forces = 0.0_dp
808 
809  DO icpu = 1, n_processes
810  DO i = 1, nbins
811  mepos = my_process_id + 1
812  IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
813  tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
814  tmp_pos(mepos) = tmp_pos(mepos) + 1
815  END IF
816  END DO
817  END DO
818 
819  !! Assign the load to each process
820  NULLIFY (ptr_to_tmp_dist)
821  mepos = my_process_id + 1
822  ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
823  SELECT CASE (eval_type)
824  CASE (hfx_do_eval_energy)
825  CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
826  CASE (hfx_do_eval_forces)
827  CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
828  END SELECT
829 
830 !$OMP BARRIER
831 !$OMP MASTER
832  DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
833 !$OMP END MASTER
834 !$OMP BARRIER
835  DEALLOCATE (tmp_dist, tmp_pos)
836  DEALLOCATE (binned_dist, local_cost_matrix)
837  DEALLOCATE (set_list_ij, set_list_kl)
838 
839 !$OMP BARRIER
840 !$OMP MASTER
841  CALL timestop(handle_inner)
842 !$OMP END MASTER
843 !$OMP BARRIER
844  END IF
845 !$OMP BARRIER
846 !$OMP MASTER
847  CALL timestop(handle)
848 !$OMP END MASTER
849 !$OMP BARRIER
850  END SUBROUTINE hfx_load_balance
851 
852 ! **************************************************************************************************
853 !> \brief Reference implementation of new recursive load balancing routine
854 !> Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
855 !> each process in a P-Q grid such that every process has more or less the
856 !> same amount of work. Has no output at the moment (not used) but writes
857 !> its computed load balance values into a file. Possible output is ready
858 !> to use in the two arrays p_atom_blocks & q_atom_blocks
859 !> \param n_processes ...
860 !> \param my_process_id ...
861 !> \param nblocks ...
862 !> \param natom ...
863 !> \param nkind ...
864 !> \param list_ij ...
865 !> \param list_kl ...
866 !> \param set_list_ij ...
867 !> \param set_list_kl ...
868 !> \param particle_set ...
869 !> \param coeffs_set ...
870 !> \param coeffs_kind ...
871 !> \param is_assoc_atomic_block_global ...
872 !> \param do_periodic ...
873 !> \param kind_of ...
874 !> \param basis_parameter ...
875 !> \param pmax_set ...
876 !> \param pmax_atom ...
877 !> \param pmax_blocks ...
878 !> \param cell ...
879 !> \param x_data ...
880 !> \param para_env ...
881 !> \param pmax_block ...
882 !> \param do_p_screening ...
883 !> \param map_atom_to_kind_atom ...
884 !> \param eval_type ...
885 !> \param log10_eps_schwarz ...
886 !> \param log_2 ...
887 !> \param coeffs_kind_max0 ...
888 !> \param use_virial ...
889 !> \param atomic_pair_list ...
890 !> \par History
891 !> 03.2011 created [Michael Steinlechner]
892 !> \author Michael Steinlechner
893 ! **************************************************************************************************
894 
895  SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
896  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
897  particle_set, &
898  coeffs_set, coeffs_kind, &
899  is_assoc_atomic_block_global, do_periodic, &
900  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
901  cell, x_data, para_env, pmax_block, &
902  do_p_screening, map_atom_to_kind_atom, eval_type, &
903  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
904 
905 ! input variables:
906  INTEGER, INTENT(IN) :: n_processes, my_process_id, nblocks, &
907  natom, nkind
908  TYPE(pair_list_type), INTENT(IN) :: list_ij, list_kl
909  TYPE(pair_set_list_type), ALLOCATABLE, &
910  DIMENSION(:), INTENT(IN) :: set_list_ij, set_list_kl
911  TYPE(particle_type), DIMENSION(:), INTENT(IN), &
912  POINTER :: particle_set
913  TYPE(hfx_screen_coeff_type), &
914  DIMENSION(:, :, :, :), INTENT(IN), POINTER :: coeffs_set
915  TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
916  INTENT(IN), POINTER :: coeffs_kind
917  INTEGER, DIMENSION(:, :), INTENT(IN) :: is_assoc_atomic_block_global
918  LOGICAL, INTENT(IN) :: do_periodic
919  INTEGER, INTENT(IN) :: kind_of(*)
920  TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
921  POINTER :: basis_parameter
922  TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
923  POINTER :: pmax_set
924  REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_atom
925  REAL(dp) :: pmax_blocks
926  TYPE(cell_type), INTENT(IN), POINTER :: cell
927  TYPE(hfx_type), INTENT(IN), POINTER :: x_data
928  TYPE(mp_para_env_type), INTENT(IN) :: para_env
929  REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_block
930  LOGICAL, INTENT(IN) :: do_p_screening
931  INTEGER, DIMENSION(:), INTENT(IN), POINTER :: map_atom_to_kind_atom
932  INTEGER, INTENT(IN) :: eval_type
933  REAL(dp), INTENT(IN) :: log10_eps_schwarz, log_2, &
934  coeffs_kind_max0
935  LOGICAL, INTENT(IN) :: use_virial
936  LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER :: atomic_pair_list
937 
938  CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_recursive_load_balance'
939 
940  INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
941  jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
942  np, nq, numbins, p, q, sizep, sizeq, unit_nr
943  INTEGER(int_8) :: local_cost, pidx, qidx, sump, sumq
944  INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: local_cost_vector
945  INTEGER, ALLOCATABLE, DIMENSION(:) :: blocksize, p_atom_blocks, permute, &
946  q_atom_blocks
947  REAL(dp) :: maximum, mean
948 
949 ! internal variables:
950 
951 !$OMP BARRIER
952 !$OMP MASTER
953  CALL timeset(routinen, handle)
954 !$OMP END MASTER
955 !$OMP BARRIER
956 
957  ! calculate best p/q distribution grid for the n_processes
958  CALL hfx_calculate_pq(p, q, numbins, n_processes)
959 
960  ALLOCATE (blocksize(numbins))
961  ALLOCATE (permute(nblocks**2))
962  DO i = 1, nblocks**2
963  permute(i) = i
964  END DO
965 
966  ! call the main recursive permutation routine.
967  ! Output:
968  ! blocksize :: vector (size numBins) with the sizes for each column/row block
969  ! permute :: permutation vector
970  CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numbins, &
971  permute, 1, &
972  my_process_id, n_processes, nblocks, &
973  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
974  particle_set, &
975  coeffs_set, coeffs_kind, &
976  is_assoc_atomic_block_global, do_periodic, &
977  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
978  cell, x_data, para_env, pmax_block, &
979  do_p_screening, map_atom_to_kind_atom, eval_type, &
980  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
981 
982  ! number of blocks per processor in p-direction (vertical)
983  np = numbins/p
984  ! number of blocks per processor in q-direction (horizontal)
985  nq = numbins/q
986 
987  ! calc own position in P-Q-processor grid (PQ-grid is column-major)
988  pidx = modulo(int(my_process_id), int(p)) + 1
989  qidx = my_process_id/p + 1
990 
991  sizep = sum(blocksize((np*(pidx - 1) + 1):(np*pidx)))
992  sizeq = sum(blocksize((nq*(qidx - 1) + 1):(nq*qidx)))
993 
994  sump = sum(blocksize(1:(np*(pidx - 1))))
995  sumq = sum(blocksize(1:(nq*(qidx - 1))))
996 
997  ALLOCATE (p_atom_blocks(sizep))
998  ALLOCATE (q_atom_blocks(sizeq))
999 
1000  p_atom_blocks(:) = permute((sump + 1):(sump + sizep))
1001  q_atom_blocks(:) = permute((sumq + 1):(sumq + sizeq))
1002 
1003  ! from here on, we are actually finished, each process has been
1004  ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
1005  ! what follows is just a small routine to calculate the local cost
1006  ! for each processor which is then written to a file.
1007 
1008  ! calculate local cost for each processor!
1009  ! ****************************************
1010  local_cost = 0
1011  DO i = 1, sizep
1012  DO j = 1, sizeq
1013 
1014  ! get corresponding 4D block indices out of our own P-Q-block
1015  latom_block = modulo(q_atom_blocks(j), nblocks)
1016  iatom_block = q_atom_blocks(j)/nblocks + 1
1017  jatom_block = modulo(p_atom_blocks(i), nblocks)
1018  katom_block = p_atom_blocks(i)/nblocks + 1
1019 
1020  ! symmetry checks.
1021  IF (latom_block < katom_block) cycle
1022  IF (jatom_block < iatom_block) cycle
1023 
1024  iatom_start = x_data%blocks(iatom_block)%istart
1025  iatom_end = x_data%blocks(iatom_block)%iend
1026  jatom_start = x_data%blocks(jatom_block)%istart
1027  jatom_end = x_data%blocks(jatom_block)%iend
1028  katom_start = x_data%blocks(katom_block)%istart
1029  katom_end = x_data%blocks(katom_block)%iend
1030  latom_start = x_data%blocks(latom_block)%istart
1031  latom_end = x_data%blocks(latom_block)%iend
1032 
1033  ! whatever.
1034  SELECT CASE (eval_type)
1035  CASE (hfx_do_eval_energy)
1036  pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1037  pmax_block(latom_block, jatom_block), &
1038  pmax_block(latom_block, iatom_block), &
1039  pmax_block(katom_block, jatom_block))
1040  CASE (hfx_do_eval_forces)
1041  pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1042  pmax_block(latom_block, jatom_block), &
1043  pmax_block(latom_block, iatom_block) + &
1044  pmax_block(katom_block, jatom_block))
1045  END SELECT
1046 
1047  ! screening.
1048  IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1049 
1050  ! estimate the cost of this atom_block.
1051  local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1052  set_list_kl, &
1053  iatom_start, iatom_end, jatom_start, jatom_end, &
1054  katom_start, katom_end, latom_start, latom_end, &
1055  particle_set, &
1056  coeffs_set, coeffs_kind, &
1057  is_assoc_atomic_block_global, do_periodic, &
1058  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1059  cell, &
1060  do_p_screening, map_atom_to_kind_atom, eval_type, &
1061  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1062  END DO
1063  END DO
1064 
1065  ALLOCATE (local_cost_vector(n_processes))
1066  local_cost_vector = 0
1067  local_cost_vector(my_process_id + 1) = local_cost
1068  CALL para_env%sum(local_cost_vector)
1069 
1070  mean = sum(local_cost_vector)/n_processes
1071  maximum = maxval(local_cost_vector)
1072 
1073 !$OMP BARRIER
1074 !$OMP MASTER
1075  ! only output once
1076  IF (my_process_id == 0) THEN
1077  CALL open_file(unit_number=unit_nr, file_name="loads.dat")
1078  WRITE (unit_nr, *) 'maximum cost:', maximum
1079  WRITE (unit_nr, *) 'mean cost:', mean
1080  WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
1081  WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
1082  DO i = 1, n_processes
1083  WRITE (unit_nr, *) local_cost_vector(i)
1084  END DO
1085  CALL close_file(unit_nr)
1086  END IF
1087 !$OMP END MASTER
1088 !$OMP BARRIER
1089 
1090  DEALLOCATE (local_cost_vector)
1091  DEALLOCATE (p_atom_blocks, q_atom_blocks)
1092  DEALLOCATE (blocksize, permute)
1093 
1094 !$OMP BARRIER
1095 !$OMP MASTER
1096  CALL timestop(handle)
1097 !$OMP END MASTER
1098 !$OMP BARRIER
1099 
1100  END SUBROUTINE hfx_recursive_load_balance
1101 
1102 ! **************************************************************************************************
1103 !> \brief Small routine to calculate the optimal P-Q-processor grid distribution
1104 !> for a given number of processors N
1105 !> and the corresponding number of Bins for the load balancing routine
1106 !> \param p number of rows on P-Q process grid (output)
1107 !> \param q number of columns on P-Q process grid (output)
1108 !> \param nBins number of Bins (output)
1109 !> \param N number of processes (input)
1110 !> \par History
1111 !> 03.2011 created [Michael Steinlechner]
1112 !> \author Michael Steinlechner
1113 ! **************************************************************************************************
1114  SUBROUTINE hfx_calculate_pq(p, q, nBins, N)
1115 
1116  INTEGER, INTENT(OUT) :: p, q, nbins
1117  INTEGER, INTENT(IN) :: n
1118 
1119  INTEGER :: a, b, k
1120  REAL(dp) :: sqn
1121 
1122  k = 2
1123  sqn = sqrt(real(n, kind=dp))
1124  p = 1
1125 
1126  DO WHILE (real(k, kind=dp) <= sqn)
1127  IF (modulo(n, k) == 0) THEN
1128  p = k
1129  END IF
1130  k = k + 1
1131  END DO
1132  q = n/p
1133 
1134  ! now compute the least common multiple of p & q to get the number of necessary bins
1135  ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
1136  ! and use euclid's algorithm for GCD computation.
1137  a = p
1138  b = q
1139 
1140  DO WHILE (b .NE. 0)
1141  IF (a > b) THEN
1142  a = a - b
1143  ELSE
1144  b = b - a
1145  END IF
1146  END DO
1147  ! gcd(p,q) is now saved in a
1148 
1149  nbins = p*q/a
1150 
1151  END SUBROUTINE
1152 
1153 ! **************************************************************************************************
1154 !> \brief Recursive permutation routine for the load balancing of the integral
1155 !> computation
1156 !> \param blocksize vector of blocksizes, size(nProc), which contains for
1157 !> each process the local blocksize (OUTPUT)
1158 !> \param blockstart starting row/column idx of the block which is to be examined
1159 !> at this point (INPUT)
1160 !> \param blockend ending row/column idx of the block which is to be examined
1161 !> (INPUT)
1162 !> \param nProc_in number of bins into which the current block has to be divided
1163 !> (INPUT)
1164 !> \param permute permutation vector which balances column/row cost
1165 !> size(nblocks^2). (OUTPUT)
1166 !> \param step ...
1167 !> \param my_process_id ...
1168 !> \param n_processes ...
1169 !> \param nblocks ...
1170 !> \param natom ...
1171 !> \param nkind ...
1172 !> \param list_ij ...
1173 !> \param list_kl ...
1174 !> \param set_list_ij ...
1175 !> \param set_list_kl ...
1176 !> \param particle_set ...
1177 !> \param coeffs_set ...
1178 !> \param coeffs_kind ...
1179 !> \param is_assoc_atomic_block_global ...
1180 !> \param do_periodic ...
1181 !> \param kind_of ...
1182 !> \param basis_parameter ...
1183 !> \param pmax_set ...
1184 !> \param pmax_atom ...
1185 !> \param pmax_blocks ...
1186 !> \param cell ...
1187 !> \param x_data ...
1188 !> \param para_env ...
1189 !> \param pmax_block ...
1190 !> \param do_p_screening ...
1191 !> \param map_atom_to_kind_atom ...
1192 !> \param eval_type ...
1193 !> \param log10_eps_schwarz ...
1194 !> \param log_2 ...
1195 !> \param coeffs_kind_max0 ...
1196 !> \param use_virial ...
1197 !> \param atomic_pair_list ...
1198 !> \par History
1199 !> 03.2011 created [Michael Steinlechner]
1200 !> \author Michael Steinlechner
1201 ! **************************************************************************************************
1202  RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
1203  permute, step, &
1204  my_process_id, n_processes, nblocks, &
1205  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1206  particle_set, &
1207  coeffs_set, coeffs_kind, &
1208  is_assoc_atomic_block_global, do_periodic, &
1209  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1210  cell, x_data, para_env, pmax_block, &
1211  do_p_screening, map_atom_to_kind_atom, eval_type, &
1212  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1213 
1214  INTEGER :: nproc_in, blockend, blockstart
1215  INTEGER, DIMENSION(nProc_in) :: blocksize
1216  INTEGER :: nblocks, n_processes, my_process_id
1217  INTEGER, INTENT(IN) :: step
1218  INTEGER, DIMENSION(nblocks*nblocks) :: permute
1219  INTEGER :: natom
1220  INTEGER, INTENT(IN) :: nkind
1221  TYPE(pair_list_type) :: list_ij, list_kl
1222  TYPE(pair_set_list_type), ALLOCATABLE, &
1223  DIMENSION(:) :: set_list_ij, set_list_kl
1224  TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1225  TYPE(hfx_screen_coeff_type), &
1226  DIMENSION(:, :, :, :), POINTER :: coeffs_set
1227  TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
1228  POINTER :: coeffs_kind
1229  INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1230  LOGICAL :: do_periodic
1231  INTEGER :: kind_of(*)
1232  TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1233  TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1234  REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1235  REAL(dp) :: pmax_blocks
1236  TYPE(cell_type), POINTER :: cell
1237  TYPE(hfx_type), POINTER :: x_data
1238  TYPE(mp_para_env_type), INTENT(IN) :: para_env
1239  REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
1240  LOGICAL, INTENT(IN) :: do_p_screening
1241  INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1242  INTEGER, INTENT(IN) :: eval_type
1243  REAL(dp) :: log10_eps_schwarz, log_2, &
1244  coeffs_kind_max0
1245  LOGICAL, INTENT(IN) :: use_virial
1246  LOGICAL, DIMENSION(:, :), POINTER :: atomic_pair_list
1247 
1248  INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
1249  jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
1250  latom_end, latom_start, nbins, nproc, row, startoffset
1251  INTEGER(int_8) :: atom_block, tmp_block
1252  INTEGER, ALLOCATABLE, DIMENSION(:) :: ithblocksize, localblocksize
1253  INTEGER, DIMENSION(blockend - blockstart + 1) :: bin_perm, tmp_perm
1254  REAL(dp) :: partialcost
1255  REAL(dp), DIMENSION(nblocks*nblocks) :: cost_vector
1256 
1257  nproc = nproc_in
1258  cost_vector = 0.0_dp
1259 
1260 ! loop over local atom_blocks.
1261  DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes
1262 
1263 ! get corresponding 4D block indices
1264  latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1
1265  tmp_block = atom_block/nblocks
1266  katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1267  IF (latom_block < katom_block) cycle
1268  tmp_block = tmp_block/nblocks
1269  jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1270  tmp_block = tmp_block/nblocks
1271  iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1
1272  IF (jatom_block < iatom_block) cycle
1273 
1274 ! get 2D indices of this atom_block (with permutation applied)
1275 ! for this, we need to invert the permutation, this means
1276 ! find position in permutation vector where value==idx
1277 
1278  row = (katom_block - 1)*nblocks + jatom_block
1279  inv_perm = 1
1280  DO WHILE (permute(inv_perm) .NE. row)
1281  inv_perm = inv_perm + 1
1282  END DO
1283  row = inv_perm
1284 
1285  col = (iatom_block - 1)*nblocks + latom_block
1286  inv_perm = 1
1287  DO WHILE (permute(inv_perm) .NE. col)
1288  inv_perm = inv_perm + 1
1289  END DO
1290  col = inv_perm
1291 
1292 ! if row/col outside our current diagonal block, skip calculation.
1293  IF (col < blockstart .OR. col > blockend) cycle
1294  IF (row < blockstart .OR. row > blockend) cycle
1295 
1296  iatom_start = x_data%blocks(iatom_block)%istart
1297  iatom_end = x_data%blocks(iatom_block)%iend
1298  jatom_start = x_data%blocks(jatom_block)%istart
1299  jatom_end = x_data%blocks(jatom_block)%iend
1300  katom_start = x_data%blocks(katom_block)%istart
1301  katom_end = x_data%blocks(katom_block)%iend
1302  latom_start = x_data%blocks(latom_block)%istart
1303  latom_end = x_data%blocks(latom_block)%iend
1304 
1305 ! whatever.
1306  SELECT CASE (eval_type)
1307  CASE (hfx_do_eval_energy)
1308  pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1309  pmax_block(latom_block, jatom_block), &
1310  pmax_block(latom_block, iatom_block), &
1311  pmax_block(katom_block, jatom_block))
1312  CASE (hfx_do_eval_forces)
1313  pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1314  pmax_block(latom_block, jatom_block), &
1315  pmax_block(latom_block, iatom_block) + &
1316  pmax_block(katom_block, jatom_block))
1317  END SELECT
1318 
1319 ! screening.
1320  IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1321 
1322 ! every second recursion step, compute row sum instead of column sum
1323 
1324  IF (modulo(step, 2) .EQ. 0) THEN
1325  idx = row
1326  ELSE
1327  idx = col
1328  END IF
1329 
1330 ! estimate the cost of this atom_block.
1331  partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1332  set_list_kl, &
1333  iatom_start, iatom_end, jatom_start, jatom_end, &
1334  katom_start, katom_end, latom_start, latom_end, &
1335  particle_set, &
1336  coeffs_set, coeffs_kind, &
1337  is_assoc_atomic_block_global, do_periodic, &
1338  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1339  cell, &
1340  do_p_screening, map_atom_to_kind_atom, eval_type, &
1341  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1342 
1343  cost_vector(idx) = cost_vector(idx) + partialcost
1344  END DO ! atom_block
1345 
1346 ! sum costvector over all processes
1347  CALL para_env%sum(cost_vector)
1348 
1349 ! calculate next prime factor of nProc
1350  nbins = 2
1351  DO WHILE (modulo(int(nproc), int(nbins)) .NE. 0)
1352  nbins = nbins + 1
1353  END DO
1354 
1355  nproc = nproc/nbins
1356 
1357 ! ... do the binning...
1358 
1359  ALLOCATE (localblocksize(nbins))
1360  CALL hfx_permute_binning(nbins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
1361 
1362 !... and update the permutation vector
1363 
1364  tmp_perm = permute(blockstart:blockend)
1365  permute(blockstart:blockend) = tmp_perm(bin_perm)
1366 
1367 ! split recursion into the nBins Bins
1368  IF (nproc > 1) THEN
1369  ALLOCATE (ithblocksize(nproc))
1370  DO i = 1, nbins
1371  startoffset = sum(localblocksize(1:(i - 1)))
1372  endoffset = sum(localblocksize(1:i)) - 1
1373 
1374  CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nproc, &
1375  permute, step + 1, &
1376  my_process_id, n_processes, nblocks, &
1377  natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1378  particle_set, &
1379  coeffs_set, coeffs_kind, &
1380  is_assoc_atomic_block_global, do_periodic, &
1381  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1382  cell, x_data, para_env, pmax_block, &
1383  do_p_screening, map_atom_to_kind_atom, eval_type, &
1384  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1385  blocksize(((i - 1)*nproc + 1):(i*nproc)) = ithblocksize
1386  END DO
1387  DEALLOCATE (ithblocksize)
1388  ELSE
1389  DO i = 1, nbins
1390  blocksize(i) = localblocksize(i)
1391  END DO
1392  END IF
1393 
1394  DEALLOCATE (localblocksize)
1395 
1396  END SUBROUTINE hfx_recursive_permute
1397 
1398 ! **************************************************************************************************
1399 !> \brief small binning routine for the recursive load balancing
1400 !>
1401 !> \param nBins number of Bins (INPUT)
1402 !> \param costvector vector of current row/column costs which have to be binned (INPUT)
1403 !> \param maxbinsize upper bound for bin size (INPUT)
1404 !> \param perm resulting permutation due to be binning routine (OUTPUT)
1405 !> \param block_count vector of size(nbins) which contains the size of each bin (OUTPUT)
1406 !> \par History
1407 !> 03.2011 created [Michael Steinlechner]
1408 !> \author Michael Steinlechner
1409 ! **************************************************************************************************
1410  SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
1411 
1412  INTEGER, INTENT(IN) :: nbins, maxbinsize
1413  REAL(dp), DIMENSION(maxbinsize), INTENT(IN) :: costvector
1414  INTEGER, DIMENSION(maxbinsize), INTENT(OUT) :: perm
1415  INTEGER, DIMENSION(nBins), INTENT(OUT) :: block_count
1416 
1417  INTEGER :: i, j, mod_idx, offset
1418  INTEGER, DIMENSION(nBins, maxbinsize) :: bin
1419  INTEGER, DIMENSION(nBins) :: bin_idx
1420  INTEGER, DIMENSION(maxbinsize) :: idx
1421  REAL(dp), DIMENSION(maxbinsize) :: vec
1422  REAL(dp), DIMENSION(nBins) :: bincosts
1423 
1424 ! be careful not to change costvector (copy it!)
1425 
1426  vec = costvector
1427  block_count = 0
1428  bincosts = 0
1429 
1430  !sort the array (ascending)
1431  CALL sort(vec, maxbinsize, idx)
1432 
1433  ! count the loop down to distribute the largest cols/rows first
1434  DO i = maxbinsize, 1, -1
1435  IF (vec(i) == 0) THEN
1436  ! spread zero-cost col/rows evenly among procs
1437  mod_idx = modulo(i, nbins) + 1 !(note the fortran offset by one!)
1438  block_count(mod_idx) = block_count(mod_idx) + 1
1439  bin(mod_idx, block_count(mod_idx)) = idx(i)
1440  ELSE
1441  ! sort the bins so that the one with the lowest cost is at the
1442  ! first place, where we then assign the current col/row
1443  CALL sort(bincosts, nbins, bin_idx)
1444  block_count = block_count(bin_idx)
1445  bin = bin(bin_idx, :)
1446 
1447  bincosts(1) = bincosts(1) + vec(i)
1448  block_count(1) = block_count(1) + 1
1449  bin(1, block_count(1)) = idx(i)
1450  END IF
1451  END DO
1452 
1453  ! construct permutation vector from the binning
1454  offset = 0
1455  DO i = 1, nbins
1456  DO j = 1, block_count(i)
1457  perm(offset + j) = bin(i, j)
1458  END DO
1459  offset = offset + block_count(i)
1460  END DO
1461 
1462  END SUBROUTINE hfx_permute_binning
1463 
1464 ! **************************************************************************************************
1465 !> \brief Cheap way of redistributing the eri's
1466 !> \param x_data Object that stores the indices array
1467 !> \param para_env para_env
1468 !> \param load_balance_parameter contains parmameter for Monte-Carlo routines
1469 !> \param i_thread current thread ID
1470 !> \param n_threads Total Number of threads
1471 !> \param eval_type ...
1472 !> \par History
1473 !> 12.2007 created [Manuel Guidon]
1474 !> 02.2009 optimize Memory Usage [Manuel Guidon]
1475 !> \author Manuel Guidon
1476 !> \note
1477 !> The cost matrix is given by the walltime for each bin that is measured
1478 !> during the calculation
1479 ! **************************************************************************************************
1480  SUBROUTINE hfx_update_load_balance(x_data, para_env, &
1481  load_balance_parameter, &
1482  i_thread, n_threads, eval_type)
1483 
1484  TYPE(hfx_type), POINTER :: x_data
1485  TYPE(mp_para_env_type), INTENT(IN) :: para_env
1486  TYPE(hfx_load_balance_type) :: load_balance_parameter
1487  INTEGER, INTENT(IN) :: i_thread, n_threads, eval_type
1488 
1489  CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_update_load_balance'
1490 
1491  INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
1492  my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
1493  TYPE(mp_request_type), DIMENSION(2) :: req
1494  INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
1495  sendbuffer, swapbuffer
1496  INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
1497  INTEGER, ALLOCATABLE, DIMENSION(:) :: tmp_pos
1498  INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: bins_per_rank
1499  INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE :: bin_histogram
1500  INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
1501  INTEGER, SAVE :: max_bin_size
1502  TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
1503  TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
1504  SAVE :: full_dist
1505 
1506 !$OMP BARRIER
1507 !$OMP MASTER
1508  CALL timeset(routinen, handle)
1509 !$OMP END MASTER
1510 !$OMP BARRIER
1511 
1512  ncpu = para_env%num_pe
1513  n_processes = ncpu*n_threads
1514  !! If there is only 1 cpu skip the binning
1515  IF (n_processes == 1) THEN
1516  ALLOCATE (tmp_dist(1))
1517  tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
1518  tmp_dist(1)%istart = 0_int_8
1519  ptr_to_tmp_dist => tmp_dist(:)
1520  SELECT CASE (eval_type)
1521  CASE (hfx_do_eval_energy)
1522  CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1523  CASE (hfx_do_eval_forces)
1524  CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1525  END SELECT
1526  DEALLOCATE (tmp_dist)
1527  ELSE
1528  mepos = para_env%mepos
1529  my_process_id = para_env%mepos*n_threads + i_thread
1530  nbins = load_balance_parameter%nbins
1531 !$OMP MASTER
1532  ALLOCATE (bin_histogram(n_processes, 2))
1533  bin_histogram = 0
1534 !$OMP END MASTER
1535 !$OMP BARRIER
1536  SELECT CASE (eval_type)
1537  CASE (hfx_do_eval_energy)
1538  my_bin_size = SIZE(x_data%distribution_energy)
1539  CASE (hfx_do_eval_forces)
1540  my_bin_size = SIZE(x_data%distribution_forces)
1541  END SELECT
1542  bin_histogram(my_process_id + 1, 1) = my_bin_size
1543 !$OMP BARRIER
1544 !$OMP MASTER
1545  CALL para_env%sum(bin_histogram(:, 1))
1546  bin_histogram(1, 2) = bin_histogram(1, 1)
1547  DO iprocess = 2, n_processes
1548  bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
1549  END DO
1550 
1551  max_bin_size = maxval(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
1552  CALL para_env%max(max_bin_size)
1553 !$OMP END MASTER
1554 !$OMP BARRIER
1555  ALLOCATE (binned_dist(my_bin_size))
1556  !! Use old binned_dist, but with timings cost
1557  SELECT CASE (eval_type)
1558  CASE (hfx_do_eval_energy)
1559  binned_dist = x_data%distribution_energy
1560  CASE (hfx_do_eval_forces)
1561  binned_dist = x_data%distribution_forces
1562  END SELECT
1563 
1564  DO ibin = 1, my_bin_size
1565  IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
1566  binned_dist(ibin)%cost = 0
1567  ELSE
1568  SELECT CASE (eval_type)
1569  CASE (hfx_do_eval_energy)
1570  IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
1571  binned_dist(ibin)%cost = int((binned_dist(ibin)%time_first_scf + &
1572  binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1573  ELSE
1574  binned_dist(ibin)%cost = int((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1575  END IF
1576  CASE (hfx_do_eval_forces)
1577  binned_dist(ibin)%cost = int((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
1578  END SELECT
1579  END IF
1580  END DO
1581 !$OMP BARRIER
1582 !$OMP MASTER
1583  !! store all local results in a big cost matrix
1584  ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
1585  cost_matrix = 0
1586  ALLOCATE (sendbuffer(max_bin_size*n_threads))
1587  ALLOCATE (recbuffer(max_bin_size*n_threads))
1588 !$OMP END MASTER
1589 !$OMP BARRIER
1590  my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
1591  icpu = para_env%mepos + 1
1592  DO i = 1, my_bin_size
1593  cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
1594  END DO
1595 
1596  mepos = para_env%mepos
1597 !$OMP BARRIER
1598 !$OMP MASTER
1599  ALLOCATE (bins_per_rank(ncpu))
1600  bins_per_rank = 0
1601  DO icpu = 1, ncpu
1602  bins_per_rank(icpu) = sum(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
1603  END DO
1604  sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
1605  cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
1606 
1607  dest = modulo(mepos + 1, ncpu)
1608  source = modulo(mepos - 1, ncpu)
1609  ! sync before/after ring of isendrecv
1610  CALL para_env%sync()
1611  DO icpu = 0, ncpu - 1
1612  IF (icpu .NE. ncpu - 1) THEN
1613  CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1614  req(1), req(2), 13)
1615  END IF
1616  data_from = modulo(mepos - icpu, ncpu)
1617  start_idx = sum(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
1618  end_idx = start_idx + bins_per_rank(data_from + 1) - 1
1619  cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
1620 
1621  IF (icpu .NE. ncpu - 1) THEN
1622  CALL mp_waitall(req)
1623  END IF
1624  swapbuffer => sendbuffer
1625  sendbuffer => recbuffer
1626  recbuffer => swapbuffer
1627  END DO
1628  DEALLOCATE (recbuffer, sendbuffer)
1629  ! sync before/after ring of isendrecv
1630  CALL para_env%sync()
1631 !$OMP END MASTER
1632 !$OMP BARRIER
1633  ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
1634  local_cost_matrix = cost_matrix
1635 !$OMP MASTER
1636  ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
1637  CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
1638  shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
1639 
1640  ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
1641 
1642  full_dist(:, :)%istart = 0_int_8
1643  full_dist(:, :)%number_of_atom_quartets = 0_int_8
1644  full_dist(:, :)%cost = 0_int_8
1645  full_dist(:, :)%time_first_scf = 0.0_dp
1646  full_dist(:, :)%time_other_scf = 0.0_dp
1647  full_dist(:, :)%time_forces = 0.0_dp
1648 !$OMP END MASTER
1649 
1650 !$OMP BARRIER
1651  mepos = para_env%mepos + 1
1652  full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
1653 !$OMP BARRIER
1654 !$OMP MASTER
1655  ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
1656  ALLOCATE (recbuffer(3*max_bin_size*n_threads))
1657  mepos = para_env%mepos
1658  DO j = 1, n_threads
1659  DO i = 1, max_bin_size
1660  sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
1661  sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
1662  sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
1663  END DO
1664  END DO
1665  dest = modulo(mepos + 1, ncpu)
1666  source = modulo(mepos - 1, ncpu)
1667  ! sync before/after ring of isendrecv
1668  CALL para_env%sync()
1669  DO icpu = 0, ncpu - 1
1670  IF (icpu .NE. ncpu - 1) THEN
1671  CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1672  req(1), req(2), 13)
1673  END IF
1674  data_from = modulo(mepos - icpu, ncpu)
1675  DO j = 1, n_threads
1676  DO i = 1, max_bin_size
1677  full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
1678  full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
1679  full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
1680  END DO
1681  END DO
1682 
1683  IF (icpu .NE. ncpu - 1) THEN
1684  CALL mp_waitall(req)
1685  END IF
1686  swapbuffer => sendbuffer
1687  sendbuffer => recbuffer
1688  recbuffer => swapbuffer
1689  END DO
1690  ! sync before/after ring of isendrecv
1691  DEALLOCATE (recbuffer, sendbuffer)
1692  CALL para_env%sync()
1693 !$OMP END MASTER
1694 !$OMP BARRIER
1695  !! reorder the distribution according to the distribution vector
1696  ALLOCATE (tmp_pos(ncpu*n_threads))
1697  tmp_pos = 1
1698  ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
1699 
1700  tmp_dist(:)%istart = 0_int_8
1701  tmp_dist(:)%number_of_atom_quartets = 0_int_8
1702  tmp_dist(:)%cost = 0_int_8
1703  tmp_dist(:)%time_first_scf = 0.0_dp
1704  tmp_dist(:)%time_other_scf = 0.0_dp
1705  tmp_dist(:)%time_forces = 0.0_dp
1706 
1707  mepos = my_process_id + 1
1708  DO icpu = 1, n_processes
1709  DO i = 1, bin_histogram(icpu, 1)
1710  IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
1711  tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
1712  tmp_pos(mepos) = tmp_pos(mepos) + 1
1713  END IF
1714  END DO
1715  END DO
1716 
1717  !! Assign the load to each process
1718  NULLIFY (ptr_to_tmp_dist)
1719  mepos = my_process_id + 1
1720  ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
1721  SELECT CASE (eval_type)
1722  CASE (hfx_do_eval_energy)
1723  CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1724  CASE (hfx_do_eval_forces)
1725  CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1726  END SELECT
1727 
1728 !$OMP BARRIER
1729 !$OMP MASTER
1730  DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
1731  DEALLOCATE (bins_per_rank, bin_histogram)
1732 !$OMP END MASTER
1733 !$OMP BARRIER
1734  DEALLOCATE (tmp_dist, tmp_pos)
1735  DEALLOCATE (binned_dist, local_cost_matrix)
1736  END IF
1737 !$OMP BARRIER
1738 !$OMP MASTER
1739  CALL timestop(handle)
1740 !$OMP END MASTER
1741 !$OMP BARRIER
1742 
1743  END SUBROUTINE hfx_update_load_balance
1744 
1745 ! **************************************************************************************************
1746 !> \brief estimates the cost of a set quartet with info available at load balance time
1747 !> i.e. without much info on the primitives primitives
1748 !> \param nsa ...
1749 !> \param nsb ...
1750 !> \param nsc ...
1751 !> \param nsd ...
1752 !> \param npgfa ...
1753 !> \param npgfb ...
1754 !> \param npgfc ...
1755 !> \param npgfd ...
1756 !> \param ratio ...
1757 !> \param p1 ...
1758 !> \param p2 ...
1759 !> \param p3 ...
1760 !> \return ...
1761 !> \par History
1762 !> 08.2009 created Joost VandeVondele
1763 !> \author Joost VandeVondele
1764 ! **************************************************************************************************
1765  FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
1766  IMPLICIT NONE
1767  REAL(kind=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
1768  INTEGER(KIND=int_8) :: res
1769  REAL(kind=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
1770 
1771  INTEGER :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
1772 
1773  estimate1 = estimate_basic(p1)
1774  estimate2 = estimate_basic(p2)
1775  mu = log(abs(1.0e6_dp*p3(1)) + 1)
1776  sigma = p3(2)*0.1_dp*mu
1777  switch = 1.0_dp/(1.0_dp + exp((log(estimate1) - mu)/sigma))
1778  estimate = estimate1*(1.0_dp - switch) + estimate2*switch
1779  res = int(estimate*0.001_dp, kind=int_8) + 1
1780 
1781  CONTAINS
1782 
1783 ! **************************************************************************************************
1784 !> \brief ...
1785 !> \param p ...
1786 !> \return ...
1787 ! **************************************************************************************************
1788  REAL(kind=dp) FUNCTION estimate_basic(p) RESULT(res)
1789  REAL(kind=dp) :: p(12)
1790 
1791  REAL(kind=dp) :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
1792  p7, p8, p9
1793 
1794  p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
1795  p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
1796  p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
1797  res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
1798  poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
1799  poly2(npgfd, p4, p5, p6)*exp(-p7*ratio + p8*ratio**2) + &
1800  1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
1801  res = 1 + abs(res)
1802  END FUNCTION estimate_basic
1803 
1804 ! **************************************************************************************************
1805 !> \brief ...
1806 !> \param x ...
1807 !> \param a0 ...
1808 !> \param a1 ...
1809 !> \param a2 ...
1810 !> \return ...
1811 ! **************************************************************************************************
1812  REAL(kind=dp) FUNCTION poly2(x, a0, a1, a2)
1813  INTEGER :: x
1814  REAL(kind=dp) :: a0, a1, a2
1815 
1816  poly2 = a0 + a1*x + a2*x*x
1817  END FUNCTION poly2
1818 
1819  END FUNCTION cost_model
1820 ! **************************************************************************************************
1821 !> \brief Minimizes the maximum cost per cpu by shuffling around all bins
1822 !> \param total_number_of_bins ...
1823 !> \param number_of_processes ...
1824 !> \param bin_costs costs per bin
1825 !> \param distribution_vector will contain the final distribution
1826 !> \param do_randomize ...
1827 !> \par History
1828 !> 03.2009 created from a hack by Joost [Manuel Guidon]
1829 !> \author Manuel Guidon
1830 ! **************************************************************************************************
1831  SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
1832  distribution_vector, do_randomize)
1833  INTEGER :: total_number_of_bins, number_of_processes
1834  INTEGER(int_8), DIMENSION(:), POINTER :: bin_costs
1835  INTEGER, DIMENSION(:), POINTER :: distribution_vector
1836  LOGICAL, INTENT(IN) :: do_randomize
1837 
1838  INTEGER :: i, itmp, j, nstep
1839  INTEGER(int_8), DIMENSION(:), POINTER :: my_cost_cpu, tmp_cost, tmp_cpu_cost
1840  INTEGER, DIMENSION(:), POINTER :: tmp_cpu_index, tmp_index
1841  TYPE(rng_stream_type), ALLOCATABLE :: rng_stream
1842 
1843  nstep = max(1, int(number_of_processes)/2)
1844 
1845  ALLOCATE (tmp_cost(total_number_of_bins))
1846  ALLOCATE (tmp_index(total_number_of_bins))
1847  ALLOCATE (tmp_cpu_cost(number_of_processes))
1848  ALLOCATE (tmp_cpu_index(number_of_processes))
1849  ALLOCATE (my_cost_cpu(number_of_processes))
1850  tmp_cost = bin_costs
1851 
1852  CALL sort(tmp_cost, total_number_of_bins, tmp_index)
1853  my_cost_cpu = 0
1854  !
1855  ! assign the largest remaining bin to the CPU with the smallest load
1856  ! gives near perfect distributions for a sufficient number of bins ...
1857  ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
1858  ! each cpu a similar number of tasks.
1859  ! it also avoids degenerate cases where thousands of zero sized tasks
1860  ! are assigned to the same (least loaded) cpu
1861  !
1862  IF (do_randomize) &
1863  rng_stream = rng_stream_type(name="uniform_rng", &
1864  distribution_type=uniform)
1865 
1866  DO i = total_number_of_bins, 1, -nstep
1867  tmp_cpu_cost = my_cost_cpu
1868  CALL sort(tmp_cpu_cost, int(number_of_processes), tmp_cpu_index)
1869  IF (do_randomize) THEN
1870  CALL rng_stream%shuffle(tmp_cpu_index(1:min(i, nstep)))
1871  END IF
1872  DO j = 1, min(i, nstep)
1873  itmp = tmp_cpu_index(j)
1874  distribution_vector(tmp_index(i - j + 1)) = itmp
1875  my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
1876  END DO
1877  END DO
1878 
1879  DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
1880  DEALLOCATE (tmp_cpu_index, my_cost_cpu)
1881  END SUBROUTINE optimize_distribution
1882 
1883 ! **************************************************************************************************
1884 !> \brief Given a 2d index pair, this function returns a 1d index pair for
1885 !> a symmetric upper triangle NxN matrix
1886 !> The compiler should inline this function, therefore it appears in
1887 !> several modules
1888 !> \param i 2d index
1889 !> \param j 2d index
1890 !> \param N matrix size
1891 !> \return ...
1892 !> \par History
1893 !> 03.2009 created [Manuel Guidon]
1894 !> \author Manuel Guidon
1895 ! **************************************************************************************************
1896  PURE FUNCTION get_1d_idx(i, j, N)
1897  INTEGER, INTENT(IN) :: i, j
1898  INTEGER(int_8), INTENT(IN) :: n
1899  INTEGER(int_8) :: get_1d_idx
1900 
1901  INTEGER(int_8) :: min_ij
1902 
1903  min_ij = min(i, j)
1904  get_1d_idx = min_ij*n + max(i, j) - (min_ij - 1)*min_ij/2 - n
1905 
1906  END FUNCTION get_1d_idx
1907 
1908 ! **************************************************************************************************
1909 !> \brief ...
1910 !> \param natom ...
1911 !> \param nkind ...
1912 !> \param list_ij ...
1913 !> \param list_kl ...
1914 !> \param set_list_ij ...
1915 !> \param set_list_kl ...
1916 !> \param iatom_start ...
1917 !> \param iatom_end ...
1918 !> \param jatom_start ...
1919 !> \param jatom_end ...
1920 !> \param katom_start ...
1921 !> \param katom_end ...
1922 !> \param latom_start ...
1923 !> \param latom_end ...
1924 !> \param particle_set ...
1925 !> \param coeffs_set ...
1926 !> \param coeffs_kind ...
1927 !> \param is_assoc_atomic_block_global ...
1928 !> \param do_periodic ...
1929 !> \param kind_of ...
1930 !> \param basis_parameter ...
1931 !> \param pmax_set ...
1932 !> \param pmax_atom ...
1933 !> \param pmax_blocks ...
1934 !> \param cell ...
1935 !> \param do_p_screening ...
1936 !> \param map_atom_to_kind_atom ...
1937 !> \param eval_type ...
1938 !> \param log10_eps_schwarz ...
1939 !> \param log_2 ...
1940 !> \param coeffs_kind_max0 ...
1941 !> \param use_virial ...
1942 !> \param atomic_pair_list ...
1943 !> \return ...
1944 ! **************************************************************************************************
1945  FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1946  iatom_start, iatom_end, jatom_start, jatom_end, &
1947  katom_start, katom_end, latom_start, latom_end, &
1948  particle_set, &
1949  coeffs_set, coeffs_kind, &
1950  is_assoc_atomic_block_global, do_periodic, &
1951  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1952  cell, &
1953  do_p_screening, map_atom_to_kind_atom, eval_type, &
1954  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
1955  atomic_pair_list)
1956 
1957  INTEGER, INTENT(IN) :: natom, nkind
1958  TYPE(pair_list_type) :: list_ij, list_kl
1959  TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
1960  INTEGER, INTENT(IN) :: iatom_start, iatom_end, jatom_start, &
1961  jatom_end, katom_start, katom_end, &
1962  latom_start, latom_end
1963  TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1964  TYPE(hfx_screen_coeff_type), &
1965  DIMENSION(:, :, :, :), POINTER :: coeffs_set
1966  TYPE(hfx_screen_coeff_type), &
1967  DIMENSION(nkind, nkind) :: coeffs_kind
1968  INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1969  LOGICAL :: do_periodic
1970  INTEGER :: kind_of(*)
1971  TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1972  TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1973  REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1974  REAL(dp) :: pmax_blocks
1975  TYPE(cell_type), POINTER :: cell
1976  LOGICAL, INTENT(IN) :: do_p_screening
1977  INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1978  INTEGER, INTENT(IN) :: eval_type
1979  REAL(dp) :: log10_eps_schwarz, log_2, &
1980  coeffs_kind_max0
1981  LOGICAL, INTENT(IN) :: use_virial
1982  LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
1983  INTEGER(int_8) :: estimate_block_cost
1984 
1985  INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
1986  i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
1987  jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
1988  INTEGER, DIMENSION(:), POINTER :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
1989  nsgfb, nsgfc, nsgfd
1990  REAL(dp) :: actual_pmax_atom, cost_tmp, max_val1, &
1991  max_val2, pmax_entry, rab2, rcd2, &
1992  screen_kind_ij, screen_kind_kl
1993  REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
1994 
1995  estimate_block_cost = 0_int_8
1996 
1997  CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
1998  kind_of, basis_parameter, particle_set, &
1999  do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2000  log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2001 
2002  CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
2003  kind_of, basis_parameter, particle_set, &
2004  do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2005  log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2006 
2007  DO i_list_ij = 1, list_ij%n_element
2008  iatom = list_ij%elements(i_list_ij)%pair(1)
2009  jatom = list_ij%elements(i_list_ij)%pair(2)
2010  i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
2011  i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
2012  ikind = list_ij%elements(i_list_ij)%kind_pair(1)
2013  jkind = list_ij%elements(i_list_ij)%kind_pair(2)
2014  rab2 = list_ij%elements(i_list_ij)%dist2
2015 
2016  nsgfa => basis_parameter(ikind)%nsgf
2017  nsgfb => basis_parameter(jkind)%nsgf
2018  npgfa => basis_parameter(ikind)%npgf
2019  npgfb => basis_parameter(jkind)%npgf
2020 
2021  DO i_list_kl = 1, list_kl%n_element
2022 
2023  katom = list_kl%elements(i_list_kl)%pair(1)
2024  latom = list_kl%elements(i_list_kl)%pair(2)
2025 
2026  IF (.NOT. (katom + latom <= iatom + jatom)) cycle
2027  IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) cycle
2028 
2029  IF (eval_type == hfx_do_eval_forces) THEN
2030  IF (.NOT. use_virial) THEN
2031  IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) cycle
2032  END IF
2033  END IF
2034 
2035  i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
2036  i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
2037  kkind = list_kl%elements(i_list_kl)%kind_pair(1)
2038  lkind = list_kl%elements(i_list_kl)%kind_pair(2)
2039  rcd2 = list_kl%elements(i_list_kl)%dist2
2040 
2041  nsgfc => basis_parameter(kkind)%nsgf
2042  nsgfd => basis_parameter(lkind)%nsgf
2043  npgfc => basis_parameter(kkind)%npgf
2044  npgfd => basis_parameter(lkind)%npgf
2045 
2046  IF (do_p_screening) THEN
2047  actual_pmax_atom = max(pmax_atom(katom, iatom), &
2048  pmax_atom(latom, jatom), &
2049  pmax_atom(latom, iatom), &
2050  pmax_atom(katom, jatom))
2051  ELSE
2052  actual_pmax_atom = 0.0_dp
2053  END IF
2054 
2055  screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
2056  coeffs_kind(jkind, ikind)%x(2)
2057  screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
2058  coeffs_kind(lkind, kkind)%x(2)
2059  IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2060 
2061  IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
2062  is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
2063  is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
2064  is_assoc_atomic_block_global(latom, jatom) >= 1)) cycle
2065 
2066  IF (do_p_screening) THEN
2067  SELECT CASE (eval_type)
2068  CASE (hfx_do_eval_energy)
2069  swap_id = 0
2070  kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))
2071  IF (ikind >= kkind) THEN
2072  ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2073  map_atom_to_kind_atom(katom), &
2074  map_atom_to_kind_atom(iatom))
2075  ELSE
2076  ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2077  map_atom_to_kind_atom(iatom), &
2078  map_atom_to_kind_atom(katom))
2079  swap_id = swap_id + 1
2080  END IF
2081  kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))
2082  IF (jkind >= lkind) THEN
2083  ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2084  map_atom_to_kind_atom(latom), &
2085  map_atom_to_kind_atom(jatom))
2086  ELSE
2087  ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2088  map_atom_to_kind_atom(jatom), &
2089  map_atom_to_kind_atom(latom))
2090  swap_id = swap_id + 2
2091  END IF
2092  kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))
2093  IF (ikind >= lkind) THEN
2094  ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2095  map_atom_to_kind_atom(latom), &
2096  map_atom_to_kind_atom(iatom))
2097  ELSE
2098  ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2099  map_atom_to_kind_atom(iatom), &
2100  map_atom_to_kind_atom(latom))
2101  swap_id = swap_id + 4
2102  END IF
2103  kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))
2104  IF (jkind >= kkind) THEN
2105  ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2106  map_atom_to_kind_atom(katom), &
2107  map_atom_to_kind_atom(jatom))
2108  ELSE
2109  ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2110  map_atom_to_kind_atom(jatom), &
2111  map_atom_to_kind_atom(katom))
2112  swap_id = swap_id + 8
2113  END IF
2114  CASE (hfx_do_eval_forces)
2115  swap_id = 16
2116  kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))
2117  IF (ikind >= kkind) THEN
2118  ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2119  map_atom_to_kind_atom(katom), &
2120  map_atom_to_kind_atom(iatom))
2121  ELSE
2122  ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2123  map_atom_to_kind_atom(iatom), &
2124  map_atom_to_kind_atom(katom))
2125  swap_id = swap_id + 1
2126  END IF
2127  kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))
2128  IF (jkind >= lkind) THEN
2129  ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2130  map_atom_to_kind_atom(latom), &
2131  map_atom_to_kind_atom(jatom))
2132  ELSE
2133  ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2134  map_atom_to_kind_atom(jatom), &
2135  map_atom_to_kind_atom(latom))
2136  swap_id = swap_id + 2
2137  END IF
2138  kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))
2139  IF (ikind >= lkind) THEN
2140  ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2141  map_atom_to_kind_atom(latom), &
2142  map_atom_to_kind_atom(iatom))
2143  ELSE
2144  ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2145  map_atom_to_kind_atom(iatom), &
2146  map_atom_to_kind_atom(latom))
2147  swap_id = swap_id + 4
2148  END IF
2149  kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))
2150  IF (jkind >= kkind) THEN
2151  ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2152  map_atom_to_kind_atom(katom), &
2153  map_atom_to_kind_atom(jatom))
2154  ELSE
2155  ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2156  map_atom_to_kind_atom(jatom), &
2157  map_atom_to_kind_atom(katom))
2158  swap_id = swap_id + 8
2159  END IF
2160  END SELECT
2161  END IF
2162 
2163  DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
2164  iset = set_list_ij(i_set_list_ij)%pair(1)
2165  jset = set_list_ij(i_set_list_ij)%pair(2)
2166 
2167  max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
2168  coeffs_set(jset, iset, jkind, ikind)%x(2)
2169 
2170  IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2171  DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
2172  kset = set_list_kl(i_set_list_kl)%pair(1)
2173  lset = set_list_kl(i_set_list_kl)%pair(2)
2174 
2175  max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
2176  coeffs_set(lset, kset, lkind, kkind)%x(2))
2177 
2178  IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) cycle
2179  IF (do_p_screening) THEN
2180  CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
2181  iset, jset, kset, lset, &
2182  pmax_entry, swap_id)
2183  IF (eval_type == hfx_do_eval_forces) THEN
2184  pmax_entry = log_2 + pmax_entry
2185  END IF
2186  ELSE
2187  pmax_entry = 0.0_dp
2188  END IF
2189  max_val2 = max_val2 + pmax_entry
2190  IF (max_val2 < log10_eps_schwarz) cycle
2191  SELECT CASE (eval_type)
2192  CASE (hfx_do_eval_energy)
2193  cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2194  npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2195  max_val2/log10_eps_schwarz, &
2197  estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)
2198  CASE (hfx_do_eval_forces)
2199  cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2200  npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2201  max_val2/log10_eps_schwarz, &
2202  p1_forces, p2_forces, p3_forces)
2203  estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)
2204  END SELECT
2205  END DO ! i_set_list_kl
2206  END DO ! i_set_list_ij
2207  END DO ! i_list_kl
2208  END DO ! i_list_ij
2209 
2210  END FUNCTION estimate_block_cost
2211 
2212 ! **************************************************************************************************
2213 !> \brief ...
2214 !> \param nkind ...
2215 !> \param para_env ...
2216 !> \param natom ...
2217 !> \param block_size ...
2218 !> \param nblock ...
2219 !> \param blocks ...
2220 !> \param list_ij ...
2221 !> \param list_kl ...
2222 !> \param set_list_ij ...
2223 !> \param set_list_kl ...
2224 !> \param particle_set ...
2225 !> \param coeffs_set ...
2226 !> \param coeffs_kind ...
2227 !> \param is_assoc_atomic_block_global ...
2228 !> \param do_periodic ...
2229 !> \param kind_of ...
2230 !> \param basis_parameter ...
2231 !> \param pmax_set ...
2232 !> \param pmax_atom ...
2233 !> \param pmax_blocks ...
2234 !> \param cell ...
2235 !> \param do_p_screening ...
2236 !> \param map_atom_to_kind_atom ...
2237 !> \param eval_type ...
2238 !> \param log10_eps_schwarz ...
2239 !> \param log_2 ...
2240 !> \param coeffs_kind_max0 ...
2241 !> \param use_virial ...
2242 !> \param atomic_pair_list ...
2243 ! **************************************************************************************************
2244  SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
2245  list_ij, list_kl, set_list_ij, set_list_kl, &
2246  particle_set, &
2247  coeffs_set, coeffs_kind, &
2248  is_assoc_atomic_block_global, do_periodic, &
2249  kind_of, basis_parameter, pmax_set, pmax_atom, &
2250  pmax_blocks, cell, &
2251  do_p_screening, map_atom_to_kind_atom, eval_type, &
2252  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
2253  atomic_pair_list)
2254 
2255  INTEGER, INTENT(IN) :: nkind
2256  TYPE(mp_para_env_type), INTENT(IN) :: para_env
2257  INTEGER :: natom, block_size, nblock
2258  TYPE(hfx_block_range_type), DIMENSION(1:nblock) :: blocks
2259  TYPE(pair_list_type) :: list_ij, list_kl
2260  TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
2261  TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
2262  TYPE(hfx_screen_coeff_type), &
2263  DIMENSION(:, :, :, :), POINTER :: coeffs_set
2264  TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
2265  POINTER :: coeffs_kind
2266  INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
2267  LOGICAL :: do_periodic
2268  INTEGER :: kind_of(*)
2269  TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
2270  TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
2271  REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
2272  REAL(dp) :: pmax_blocks
2273  TYPE(cell_type), POINTER :: cell
2274  LOGICAL, INTENT(IN) :: do_p_screening
2275  INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
2276  INTEGER, INTENT(IN) :: eval_type
2277  REAL(dp) :: log10_eps_schwarz, log_2, &
2278  coeffs_kind_max0
2279  LOGICAL, INTENT(IN) :: use_virial
2280  LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
2281 
2282  INTEGER :: atom_block, i, iatom_block, iatom_end, &
2283  iatom_start, my_cpu_rank, ncpus
2284 
2285  DO atom_block = 0, nblock - 1
2286  iatom_block = modulo(atom_block, nblock) + 1
2287  iatom_start = (iatom_block - 1)*block_size + 1
2288  iatom_end = min(iatom_block*block_size, natom)
2289  blocks(atom_block + 1)%istart = iatom_start
2290  blocks(atom_block + 1)%iend = iatom_end
2291  blocks(atom_block + 1)%cost = 0_int_8
2292  END DO
2293 
2294  ncpus = para_env%num_pe
2295  my_cpu_rank = para_env%mepos
2296  DO i = 1, nblock
2297  IF (modulo(i, ncpus) /= my_cpu_rank) THEN
2298  blocks(i)%istart = 0
2299  blocks(i)%iend = 0
2300  cycle
2301  END IF
2302  iatom_start = blocks(i)%istart
2303  iatom_end = blocks(i)%iend
2304  blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
2305  iatom_start, iatom_end, iatom_start, iatom_end, &
2306  iatom_start, iatom_end, iatom_start, iatom_end, &
2307  particle_set, &
2308  coeffs_set, coeffs_kind, &
2309  is_assoc_atomic_block_global, do_periodic, &
2310  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
2311  cell, &
2312  do_p_screening, map_atom_to_kind_atom, eval_type, &
2313  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
2314 
2315  END DO
2316  END SUBROUTINE init_blocks
2317 
2318 ! **************************************************************************************************
2319 !> \brief ...
2320 !> \param para_env ...
2321 !> \param x_data ...
2322 !> \param iw ...
2323 !> \param n_threads ...
2324 !> \param i_thread ...
2325 !> \param eval_type ...
2326 ! **************************************************************************************************
2327  SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
2328  eval_type)
2329 
2330  TYPE(mp_para_env_type), INTENT(IN) :: para_env
2331  TYPE(hfx_type), POINTER :: x_data
2332  INTEGER, INTENT(IN) :: iw, n_threads, i_thread, eval_type
2333 
2334  INTEGER :: i, j, k, my_rank, nbins, nranks, &
2335  total_bins
2336  INTEGER(int_8) :: avg_bin, avg_rank, max_bin, max_rank, &
2337  min_bin, min_rank, sum_bin, sum_rank
2338  INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer, buffer_in, buffer_out, summary
2339  INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE :: shm_cost_vector
2340  INTEGER, ALLOCATABLE, DIMENSION(:) :: bins_per_rank, rdispl, sort_idx
2341  INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: shm_bins_per_rank, shm_displ
2342 
2343  SELECT CASE (eval_type)
2344  CASE (hfx_do_eval_energy)
2345  nbins = SIZE(x_data%distribution_energy)
2346  CASE (hfx_do_eval_forces)
2347  nbins = SIZE(x_data%distribution_forces)
2348  END SELECT
2349 
2350 !$OMP MASTER
2351  ALLOCATE (shm_bins_per_rank(n_threads))
2352  ALLOCATE (shm_displ(n_threads + 1))
2353 !$OMP END MASTER
2354 !$OMP BARRIER
2355 
2356  shm_bins_per_rank(i_thread + 1) = nbins
2357 !$OMP BARRIER
2358  nbins = 0
2359  DO i = 1, n_threads
2360  nbins = nbins + shm_bins_per_rank(i)
2361  END DO
2362  my_rank = para_env%mepos
2363  nranks = para_env%num_pe
2364 
2365 !$OMP BARRIER
2366 !$OMP MASTER
2367  ALLOCATE (bins_per_rank(nranks))
2368  bins_per_rank = 0
2369 
2370  bins_per_rank(my_rank + 1) = nbins
2371 
2372  CALL para_env%sum(bins_per_rank)
2373 
2374  total_bins = 0
2375  DO i = 1, nranks
2376  total_bins = total_bins + bins_per_rank(i)
2377  END DO
2378 
2379  ALLOCATE (shm_cost_vector(2*total_bins))
2380  shm_cost_vector = -1_int_8
2381  shm_displ(1) = 1
2382  DO i = 2, n_threads
2383  shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
2384  END DO
2385  shm_displ(n_threads + 1) = nbins + 1
2386 !$OMP END MASTER
2387 !$OMP BARRIER
2388  j = 0
2389  SELECT CASE (eval_type)
2390  CASE (hfx_do_eval_energy)
2391  DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2392  j = j + 1
2393  shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
2394  shm_cost_vector(2*i) = int(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, kind=int_8)
2395  END DO
2396  CASE (hfx_do_eval_forces)
2397  DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2398  j = j + 1
2399  shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
2400  shm_cost_vector(2*i) = int(x_data%distribution_forces(j)%time_forces*10000.0_dp, kind=int_8)
2401  END DO
2402  END SELECT
2403 !$OMP BARRIER
2404 !$OMP MASTER
2405  ! ** calculate offsets
2406  ALLOCATE (rdispl(nranks))
2407  bins_per_rank(:) = bins_per_rank(:)*2
2408  rdispl(1) = 0
2409  DO i = 2, nranks
2410  rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
2411  END DO
2412 
2413  ALLOCATE (buffer_in(2*nbins))
2414  ALLOCATE (buffer_out(2*total_bins))
2415 
2416  DO i = 1, nbins
2417  buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
2418  buffer_in(2*i) = shm_cost_vector(2*i)
2419  END DO
2420 
2421  CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
2422 
2423  IF (iw > 0) THEN
2424 
2425  ALLOCATE (summary(2*nranks))
2426  summary = 0_int_8
2427 
2428  WRITE (iw, '( /, 1X, 79("-") )')
2429  WRITE (iw, '( " -", 77X, "-" )')
2430  SELECT CASE (eval_type)
2431  CASE (hfx_do_eval_energy)
2432  WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
2433  CASE (hfx_do_eval_forces)
2434  WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
2435  END SELECT
2436  WRITE (iw, '( " -", 77X, "-" )')
2437  WRITE (iw, '( 1X, 79("-") )')
2438 
2439  WRITE (iw, fmt="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
2440  WRITE (iw, '( 1X, 79("-"), / )')
2441  k = 0
2442  DO i = 1, nranks
2443  DO j = 1, bins_per_rank(i)/2
2444  k = k + 1
2445  WRITE (iw, fmt="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
2446  i - 1, j, buffer_out(2*(k - 1) + 1), real(buffer_out(2*k), dp)/10000.0_dp
2447  summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
2448  summary(2*i) = summary(2*i) + buffer_out(2*k)
2449  END DO
2450  END DO
2451 
2452  !** Summary
2453  max_bin = 0_int_8
2454  min_bin = huge(min_bin)
2455  sum_bin = 0_int_8
2456  DO i = 1, total_bins
2457  sum_bin = sum_bin + buffer_out(2*i)
2458  max_bin = max(max_bin, buffer_out(2*i))
2459  min_bin = min(min_bin, buffer_out(2*i))
2460  END DO
2461  avg_bin = sum_bin/total_bins
2462 
2463  max_rank = 0_int_8
2464  min_rank = huge(min_rank)
2465  sum_rank = 0_int_8
2466  DO i = 1, nranks
2467  sum_rank = sum_rank + summary(2*i)
2468  max_rank = max(max_rank, summary(2*i))
2469  min_rank = min(min_rank, summary(2*i))
2470  END DO
2471  avg_rank = sum_rank/nranks
2472 
2473  WRITE (iw, fmt='(/,T3,A,/)') "SUMMARY:"
2474  WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max bin", real(max_bin, dp)/10000.0_dp
2475  WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min bin", real(min_bin, dp)/10000.0_dp
2476  WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum bin", real(sum_bin, dp)/10000.0_dp
2477  WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg bin", real(avg_bin, dp)/10000.0_dp
2478  WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max rank", real(max_rank, dp)/10000.0_dp
2479  WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min rank", real(min_rank, dp)/10000.0_dp
2480  WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum rank", real(sum_rank, dp)/10000.0_dp
2481  WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg rank", real(avg_rank, dp)/10000.0_dp
2482 
2483  ALLOCATE (buffer(nranks))
2484  ALLOCATE (sort_idx(nranks))
2485 
2486  DO i = 1, nranks
2487  buffer(i) = summary(2*i)
2488  END DO
2489 
2490  CALL sort(buffer, nranks, sort_idx)
2491 
2492  WRITE (iw, fmt="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
2493  DO i = nranks, 1, -1
2494  WRITE (iw, fmt="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), real(buffer(i), dp)/10000.0_dp
2495  END DO
2496 
2497  DEALLOCATE (summary, buffer, sort_idx)
2498 
2499  END IF
2500 
2501  DEALLOCATE (buffer_in, buffer_out, rdispl)
2502 
2503  CALL para_env%sync()
2504 
2505  DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
2506 !$OMP END MASTER
2507 !$OMP BARRIER
2508 
2509  END SUBROUTINE collect_load_balance_info
2510 
2511 
2512 ! **************************************************************************************************
2513 !> \brief This routine calculates the maximum density matrix element, when
2514 !> screening on an initial density matrix is applied. Due to symmetry of
2515 !> the ERI's, there are always 4 matrix elements to be considered.
2516 !> CASE 0-15 belong to an energy calculation (linear screening)
2517 !> CASE 16-31 belong to a force calculation (square screening)
2518 !> \param ptr_p_1 Pointers to atomic density matrices
2519 !> \param ptr_p_2 Pointers to atomic density matrices
2520 !> \param ptr_p_3 Pointers to atomic density matrices
2521 !> \param ptr_p_4 Pointers to atomic density matrices
2522 !> \param iset Current set
2523 !> \param jset Current set
2524 !> \param kset Current set
2525 !> \param lset Current set
2526 !> \param pmax_val value to be calculated
2527 !> \param swap_id Defines how the matrices are accessed
2528 !> \par History
2529 !> 06.2009 created [Manuel Guidon]
2530 !> \author Manuel Guidon
2531 ! **************************************************************************************************
2532 PURE SUBROUTINE get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, iset, jset, kset, lset, pmax_val, swap_id)
2533 
2534  REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
2535  INTEGER, INTENT(IN) :: iset, jset, kset, lset
2536 
2537  REAL(dp), INTENT(OUT) :: pmax_val
2538  INTEGER, INTENT(IN) :: swap_id
2539 
2540  REAL(dp) :: pmax_1, pmax_2, pmax_3, pmax_4
2541 
2542  SELECT CASE (swap_id)
2543  CASE (0)
2544  pmax_1 = ptr_p_1(kset, iset)
2545  pmax_2 = ptr_p_2(lset, jset)
2546  pmax_3 = ptr_p_3(lset, iset)
2547  pmax_4 = ptr_p_4(kset, jset)
2548  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2549  CASE (1)
2550  pmax_1 = ptr_p_1(iset, kset)
2551  pmax_2 = ptr_p_2(lset, jset)
2552  pmax_3 = ptr_p_3(lset, iset)
2553  pmax_4 = ptr_p_4(kset, jset)
2554  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2555  CASE (2)
2556  pmax_1 = ptr_p_1(kset, iset)
2557  pmax_2 = ptr_p_2(jset, lset)
2558  pmax_3 = ptr_p_3(lset, iset)
2559  pmax_4 = ptr_p_4(kset, jset)
2560  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2561  CASE (3)
2562  pmax_1 = ptr_p_1(iset, kset)
2563  pmax_2 = ptr_p_2(jset, lset)
2564  pmax_3 = ptr_p_3(lset, iset)
2565  pmax_4 = ptr_p_4(kset, jset)
2566  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2567  CASE (4)
2568  pmax_1 = ptr_p_1(kset, iset)
2569  pmax_2 = ptr_p_2(lset, jset)
2570  pmax_3 = ptr_p_3(iset, lset)
2571  pmax_4 = ptr_p_4(kset, jset)
2572  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2573  CASE (5)
2574  pmax_1 = ptr_p_1(iset, kset)
2575  pmax_2 = ptr_p_2(lset, jset)
2576  pmax_3 = ptr_p_3(iset, lset)
2577  pmax_4 = ptr_p_4(kset, jset)
2578  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2579  CASE (6)
2580  pmax_1 = ptr_p_1(kset, iset)
2581  pmax_2 = ptr_p_2(jset, lset)
2582  pmax_3 = ptr_p_3(iset, lset)
2583  pmax_4 = ptr_p_4(kset, jset)
2584  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2585  CASE (7)
2586  pmax_1 = ptr_p_1(iset, kset)
2587  pmax_2 = ptr_p_2(jset, lset)
2588  pmax_3 = ptr_p_3(iset, lset)
2589  pmax_4 = ptr_p_4(kset, jset)
2590  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2591  CASE (8)
2592  pmax_1 = ptr_p_1(kset, iset)
2593  pmax_2 = ptr_p_2(lset, jset)
2594  pmax_3 = ptr_p_3(lset, iset)
2595  pmax_4 = ptr_p_4(jset, kset)
2596  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2597  CASE (9)
2598  pmax_1 = ptr_p_1(iset, kset)
2599  pmax_2 = ptr_p_2(lset, jset)
2600  pmax_3 = ptr_p_3(lset, iset)
2601  pmax_4 = ptr_p_4(jset, kset)
2602  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2603  CASE (10)
2604  pmax_1 = ptr_p_1(kset, iset)
2605  pmax_2 = ptr_p_2(jset, lset)
2606  pmax_3 = ptr_p_3(lset, iset)
2607  pmax_4 = ptr_p_4(jset, kset)
2608  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2609  CASE (11)
2610  pmax_1 = ptr_p_1(iset, kset)
2611  pmax_2 = ptr_p_2(jset, lset)
2612  pmax_3 = ptr_p_3(lset, iset)
2613  pmax_4 = ptr_p_4(jset, kset)
2614  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2615  CASE (12)
2616  pmax_1 = ptr_p_1(kset, iset)
2617  pmax_2 = ptr_p_2(lset, jset)
2618  pmax_3 = ptr_p_3(iset, lset)
2619  pmax_4 = ptr_p_4(jset, kset)
2620  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2621  CASE (13)
2622  pmax_1 = ptr_p_1(iset, kset)
2623  pmax_2 = ptr_p_2(lset, jset)
2624  pmax_3 = ptr_p_3(iset, lset)
2625  pmax_4 = ptr_p_4(jset, kset)
2626  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2627  CASE (14)
2628  pmax_1 = ptr_p_1(kset, iset)
2629  pmax_2 = ptr_p_2(jset, lset)
2630  pmax_3 = ptr_p_3(iset, lset)
2631  pmax_4 = ptr_p_4(jset, kset)
2632  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2633  CASE (15)
2634  pmax_1 = ptr_p_1(iset, kset)
2635  pmax_2 = ptr_p_2(jset, lset)
2636  pmax_3 = ptr_p_3(iset, lset)
2637  pmax_4 = ptr_p_4(jset, kset)
2638  pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
2639  CASE (16)
2640  pmax_1 = ptr_p_1(kset, iset)
2641  pmax_2 = ptr_p_2(lset, jset)
2642  pmax_3 = ptr_p_3(lset, iset)
2643  pmax_4 = ptr_p_4(kset, jset)
2644  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2645  CASE (17)
2646  pmax_1 = ptr_p_1(iset, kset)
2647  pmax_2 = ptr_p_2(lset, jset)
2648  pmax_3 = ptr_p_3(lset, iset)
2649  pmax_4 = ptr_p_4(kset, jset)
2650  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2651  CASE (18)
2652  pmax_1 = ptr_p_1(kset, iset)
2653  pmax_2 = ptr_p_2(jset, lset)
2654  pmax_3 = ptr_p_3(lset, iset)
2655  pmax_4 = ptr_p_4(kset, jset)
2656  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2657  CASE (19)
2658  pmax_1 = ptr_p_1(iset, kset)
2659  pmax_2 = ptr_p_2(jset, lset)
2660  pmax_3 = ptr_p_3(lset, iset)
2661  pmax_4 = ptr_p_4(kset, jset)
2662  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2663  CASE (20)
2664  pmax_1 = ptr_p_1(kset, iset)
2665  pmax_2 = ptr_p_2(lset, jset)
2666  pmax_3 = ptr_p_3(iset, lset)
2667  pmax_4 = ptr_p_4(kset, jset)
2668  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2669  CASE (21)
2670  pmax_1 = ptr_p_1(iset, kset)
2671  pmax_2 = ptr_p_2(lset, jset)
2672  pmax_3 = ptr_p_3(iset, lset)
2673  pmax_4 = ptr_p_4(kset, jset)
2674  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2675  CASE (22)
2676  pmax_1 = ptr_p_1(kset, iset)
2677  pmax_2 = ptr_p_2(jset, lset)
2678  pmax_3 = ptr_p_3(iset, lset)
2679  pmax_4 = ptr_p_4(kset, jset)
2680  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2681  CASE (23)
2682  pmax_1 = ptr_p_1(iset, kset)
2683  pmax_2 = ptr_p_2(jset, lset)
2684  pmax_3 = ptr_p_3(iset, lset)
2685  pmax_4 = ptr_p_4(kset, jset)
2686  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2687  CASE (24)
2688  pmax_1 = ptr_p_1(kset, iset)
2689  pmax_2 = ptr_p_2(lset, jset)
2690  pmax_3 = ptr_p_3(lset, iset)
2691  pmax_4 = ptr_p_4(jset, kset)
2692  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2693  CASE (25)
2694  pmax_1 = ptr_p_1(iset, kset)
2695  pmax_2 = ptr_p_2(lset, jset)
2696  pmax_3 = ptr_p_3(lset, iset)
2697  pmax_4 = ptr_p_4(jset, kset)
2698  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2699  CASE (26)
2700  pmax_1 = ptr_p_1(kset, iset)
2701  pmax_2 = ptr_p_2(jset, lset)
2702  pmax_3 = ptr_p_3(lset, iset)
2703  pmax_4 = ptr_p_4(jset, kset)
2704  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2705  CASE (27)
2706  pmax_1 = ptr_p_1(iset, kset)
2707  pmax_2 = ptr_p_2(jset, lset)
2708  pmax_3 = ptr_p_3(lset, iset)
2709  pmax_4 = ptr_p_4(jset, kset)
2710  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2711  CASE (28)
2712  pmax_1 = ptr_p_1(kset, iset)
2713  pmax_2 = ptr_p_2(lset, jset)
2714  pmax_3 = ptr_p_3(iset, lset)
2715  pmax_4 = ptr_p_4(jset, kset)
2716  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2717  CASE (29)
2718  pmax_1 = ptr_p_1(iset, kset)
2719  pmax_2 = ptr_p_2(lset, jset)
2720  pmax_3 = ptr_p_3(iset, lset)
2721  pmax_4 = ptr_p_4(jset, kset)
2722  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2723  CASE (30)
2724  pmax_1 = ptr_p_1(kset, iset)
2725  pmax_2 = ptr_p_2(jset, lset)
2726  pmax_3 = ptr_p_3(iset, lset)
2727  pmax_4 = ptr_p_4(jset, kset)
2728  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2729  CASE (31)
2730  pmax_1 = ptr_p_1(iset, kset)
2731  pmax_2 = ptr_p_2(jset, lset)
2732  pmax_3 = ptr_p_3(iset, lset)
2733  pmax_4 = ptr_p_4(jset, kset)
2734  pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
2735  END SELECT
2736 
2737 END SUBROUTINE get_pmax_val
2738 
2739 
2740 END MODULE hfx_load_balance_methods
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
Definition: grid_common.h:117
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
Definition: grid_common.h:153
Handles all functions related to the CELL.
Definition: cell_types.F:15
Utility routines to open and close files. Tracking of preconnections.
Definition: cp_files.F:16
subroutine, public open_file(file_name, file_status, file_form, file_action, file_position, file_pad, unit_number, debug, skip_get_unit_number, file_access)
Opens the requested file using a free unit number.
Definition: cp_files.F:308
subroutine, public close_file(unit_number, file_status, keep_preconnection)
Close an open file given by its logical unit number. Optionally, keep the file and unit preconnected.
Definition: cp_files.F:119
Routines for optimizing load balance between processes in HFX calculations.
real(kind=dp), dimension(12), parameter, public p1_energy
real(kind=dp), dimension(2), parameter, public p3_energy
real(kind=dp), dimension(12), parameter, public p2_energy
subroutine, public collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, eval_type)
...
subroutine, public hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, coeffs_set, coeffs_kind, is_assoc_atomic_block_global, do_periodic, load_balance_parameter, kind_of, basis_parameter, pmax_set, pmax_atom, i_thread, n_threads, cell, do_p_screening, map_atom_to_kind_atom, nkind, eval_type, pmax_block, use_virial)
Distributes the computation of eri's to all available processes.
integer(kind=int_8) function, public cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3)
estimates the cost of a set quartet with info available at load balance time i.e. without much info o...
subroutine, public hfx_update_load_balance(x_data, para_env, load_balance_parameter, i_thread, n_threads, eval_type)
Cheap way of redistributing the eri's.
Routines for optimizing load balance between processes in HFX calculations.
subroutine, public build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, blocks)
...
subroutine, public build_pair_list(natom, list, set_list, i_start, i_end, j_start, j_end, kind_of, basis_parameter, particle_set, do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
...
Types and set/get functions for HFX.
Definition: hfx_types.F:15
subroutine, public hfx_set_distr_energy(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the energy
Definition: hfx_types.F:2568
subroutine, public hfx_set_distr_forces(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the forces
Definition: hfx_types.F:2588
collects all constants needed in input so that they can be used without circular dependencies
integer, parameter, public hfx_do_eval_energy
integer, parameter, public hfx_do_eval_forces
Defines the basic variable types.
Definition: kinds.F:23
integer, parameter, public int_8
Definition: kinds.F:54
integer, parameter, public dp
Definition: kinds.F:34
Interface to the message passing library MPI.
Parallel (pseudo)random number generator (RNG) for multiple streams and substreams of random numbers.
integer, parameter, public uniform
Define the data structure for the particle information.
All kind of helpful little routines.
Definition: util.F:14