34#include "./base/base_uses.f90"
! Private string constant holding the name of this module.
43 CHARACTER(len=*),
PARAMETER,
PRIVATE :: moduleN =
'hfx_load_balance_methods'
! Hard-coded double-precision coefficient tables, "energy" variant:
! p1_energy and p2_energy carry 12 entries each, p3_energy carries 2.
! NOTE(review): presumably fitted parameters of the block cost model used by
! the cost estimator; their consumer is not visible here -- confirm.
45 REAL(kind=
dp),
PARAMETER ::
p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
46 -1.91570128356921242e-002_dp, -1.6668495454436603_dp, &
47 1.7512639006523709_dp, -9.76074323945336081e-002_dp, &
48 2.6230786127311889_dp, -0.31870737623014189_dp, &
49 7.9588203912690973_dp, 1.8331423413134813_dp, &
50 -0.15427618665346299_dp, 0.19749436090711650_dp/)
51 REAL(kind=
dp),
PARAMETER ::
p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
52 -9.36564055598656797e-002_dp, 0.64284973765086939_dp, &
53 1.0137565430060556_dp, -6.80088178288954567e-003_dp, &
54 1.1692629207374552_dp, -2.6314710080507573_dp, &
55 19.237814781880786_dp, 1.0505934173661349_dp, &
56 0.80382371955699250_dp, 0.49903401991818103_dp/)
57 REAL(kind=
dp),
PARAMETER ::
p3_energy(2) = (/7.82336287670072350e-002_dp, 0.38073304105744837_dp/)
! Same layout as the energy tables above (12, 12 and 2 entries), "forces" variant.
58 REAL(kind=
dp),
PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
59 -9.41673106447732111e-002_dp, 0.94568006899317825_dp, &
60 -1.4511897117448544_dp, 0.59178934677316952_dp, &
61 2.7291149361757236_dp, -0.50555512044800210_dp, &
62 8.3508180969609871_dp, 1.6829982496141809_dp, &
63 -0.74895370472152600_dp, 0.43801726744197500_dp/)
64 REAL(kind=
dp),
PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
65 5.33216585432061581e-003_dp, 0.45572145697283628_dp, &
66 1.8119743851500618_dp, -0.12533918548421166_dp, &
67 -1.4040312084552751_dp, -4.5331650463917859_dp, &
68 12.593431549069477_dp, 1.1311978374487595_dp, &
69 1.4245996087624646_dp, 1.1425350529853495_dp/)
70 REAL(kind=
dp),
PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)
124 coeffs_set, coeffs_kind, &
125 is_assoc_atomic_block_global, do_periodic, &
126 load_balance_parameter, kind_of, basis_parameter, pmax_set, &
127 pmax_atom, i_thread, n_threads, cell, &
128 do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
129 pmax_block, use_virial)
131 REAL(
dp),
INTENT(IN) :: eps_schwarz
133 INTEGER,
INTENT(IN) :: max_set
136 DIMENSION(:, :, :, :),
POINTER :: coeffs_set
138 POINTER :: coeffs_kind
139 INTEGER,
DIMENSION(:, :) :: is_assoc_atomic_block_global
140 LOGICAL :: do_periodic
142 INTEGER :: kind_of(*)
144 TYPE(
hfx_p_kind),
DIMENSION(:),
POINTER :: pmax_set
145 REAL(
dp),
DIMENSION(:, :),
POINTER :: pmax_atom
146 INTEGER,
INTENT(IN) :: i_thread, n_threads
148 LOGICAL,
INTENT(IN) :: do_p_screening
149 INTEGER,
DIMENSION(:),
POINTER :: map_atom_to_kind_atom
150 INTEGER,
INTENT(IN) :: nkind, eval_type
151 REAL(
dp),
DIMENSION(:, :),
POINTER :: pmax_block
152 LOGICAL,
INTENT(IN) :: use_virial
154 CHARACTER(LEN=*),
PARAMETER :: routinen =
'hfx_load_balance'
156 CHARACTER(LEN=512) :: error_msg
157 INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
158 handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
159 jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
160 latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
161 new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
162 objective_block_size, objective_nblocks, source, total_blocks
164 INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
165 distribution_counter_end, distribution_counter_start, global_quartet_counter, &
166 local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
167 INTEGER(int_8),
ALLOCATABLE,
DIMENSION(:) :: buffer_in, buffer_out
168 INTEGER(int_8),
DIMENSION(:),
POINTER :: local_cost_matrix, recbuffer, &
169 sendbuffer, swapbuffer
170 INTEGER(int_8),
DIMENSION(:),
POINTER,
SAVE :: cost_matrix
171 INTEGER(int_8),
SAVE :: shm_global_quartet_counter, &
172 shm_local_quartet_counter
173 INTEGER,
ALLOCATABLE,
DIMENSION(:) :: rcount, rdispl, tmp_index, tmp_pos, &
175 INTEGER,
DIMENSION(:),
POINTER,
SAVE :: shm_distribution_vector
176 INTEGER,
SAVE :: shm_nblocks
177 LOGICAL :: changed, last_bin_needs_to_be_filled, &
179 LOGICAL,
DIMENSION(:, :),
POINTER,
SAVE :: atomic_pair_list
180 REAL(
dp) :: coeffs_kind_max0, log10_eps_schwarz, &
184 POINTER,
SAVE :: shm_blocks
185 TYPE(
hfx_distribution),
DIMENSION(:),
POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
190 DIMENSION(:) :: set_list_ij, set_list_kl
194 CALL timeset(routinen, handle)
198 log10_eps_schwarz = log10(eps_schwarz)
199 log_2 = log10(2.0_dp)
200 coeffs_kind_max0 = maxval(coeffs_kind(:, :)%x(2))
201 ncpu = para_env%num_pe
202 n_processes = ncpu*n_threads
203 natom =
SIZE(particle_set)
205 block_size = load_balance_parameter%block_size
206 ALLOCATE (set_list_ij((max_set*block_size)**2))
207 ALLOCATE (set_list_kl((max_set*block_size)**2))
209 IF (.NOT. load_balance_parameter%blocks_initialized)
THEN
212 CALL timeset(routinen//
"_range", handle_range)
214 nblocks = max((natom + block_size - 1)/block_size, 1)
215 ALLOCATE (blocks_guess(nblocks))
216 ALLOCATE (tmp_blocks(natom))
217 ALLOCATE (tmp_blocks2(natom))
220 SELECT CASE (eval_type)
222 atomic_pair_list => x_data%atomic_pair_list
224 atomic_pair_list => x_data%atomic_pair_list_forces
226 atomic_pair_list = .true.
227 CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
228 list_ij, list_kl, set_list_ij, set_list_kl, &
230 coeffs_set, coeffs_kind, &
231 is_assoc_atomic_block_global, do_periodic, &
232 kind_of, basis_parameter, pmax_set, pmax_atom, &
234 do_p_screening, map_atom_to_kind_atom, eval_type, &
235 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
237 total_block_self_cost = 0
240 total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
243 CALL para_env%sum(total_block_self_cost)
245 objective_block_size = load_balance_parameter%block_size
246 objective_nblocks = max((natom + objective_block_size - 1)/objective_block_size, 1)
248 self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
251 tmp_blocks2(i) = blocks_guess(i)
256 DO WHILE (.NOT. optimized)
260 DO atom_block = 1, nblocks
261 current_block_id = current_block_id + 1
262 iatom_start = tmp_blocks2(atom_block)%istart
263 iatom_end = tmp_blocks2(atom_block)%iend
264 IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0)
THEN
266 new_iatom_start = iatom_start
267 new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
268 new_jatom_start = new_iatom_end + 1
269 new_jatom_end = iatom_end
270 tmp_blocks(current_block_id)%istart = new_iatom_start
271 tmp_blocks(current_block_id)%iend = new_iatom_end
272 tmp_blocks(current_block_id)%cost = estimate_block_cost( &
273 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
274 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
275 new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
277 coeffs_set, coeffs_kind, &
278 is_assoc_atomic_block_global, do_periodic, &
279 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
281 do_p_screening, map_atom_to_kind_atom, eval_type, &
282 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
283 current_block_id = current_block_id + 1
284 tmp_blocks(current_block_id)%istart = new_jatom_start
285 tmp_blocks(current_block_id)%iend = new_jatom_end
286 tmp_blocks(current_block_id)%cost = estimate_block_cost( &
287 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
288 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
289 new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
291 coeffs_set, coeffs_kind, &
292 is_assoc_atomic_block_global, do_periodic, &
293 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
295 do_p_screening, map_atom_to_kind_atom, eval_type, &
296 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
298 tmp_blocks(current_block_id)%istart = iatom_start
299 tmp_blocks(current_block_id)%iend = iatom_end
300 tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
303 IF (.NOT. changed) optimized = .true.
304 IF (i > 20) optimized = .true.
305 nblocks = current_block_id
306 DO atom_block = 1, nblocks
307 tmp_blocks2(atom_block) = tmp_blocks(atom_block)
311 DEALLOCATE (tmp_blocks2)
315 DO atom_block = 1, nblocks
316 IF (tmp_blocks(atom_block)%istart == 0) cycle
317 non_empty_blocks = non_empty_blocks + 1
320 ALLOCATE (rcount(ncpu))
322 rcount(para_env%mepos + 1) = non_empty_blocks
323 CALL para_env%sum(rcount)
328 total_blocks = total_blocks + rcount(i)
332 ALLOCATE (rdispl(ncpu))
333 rcount(:) = rcount(:)*3
336 rdispl(i) = rdispl(i - 1) + rcount(i - 1)
339 ALLOCATE (buffer_in(3*non_empty_blocks))
342 DO atom_block = 1, nblocks
343 IF (tmp_blocks(atom_block)%istart == 0) cycle
344 buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
345 buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
346 buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
347 non_empty_blocks = non_empty_blocks + 1
350 nblocks = total_blocks
352 ALLOCATE (tmp_blocks2(nblocks))
354 ALLOCATE (buffer_out(3*nblocks))
357 CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
360 tmp_blocks2(i)%istart = int(buffer_out((i - 1)*3 + 1))
361 tmp_blocks2(i)%iend = int(buffer_out((i - 1)*3 + 2))
362 tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
366 ALLOCATE (to_be_sorted(nblocks))
367 ALLOCATE (tmp_index(nblocks))
369 DO atom_block = 1, nblocks
370 to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
373 CALL sort(to_be_sorted, nblocks, tmp_index)
375 ALLOCATE (x_data%blocks(nblocks))
377 DO atom_block = 1, nblocks
378 x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
381 shm_blocks => x_data%blocks
382 shm_nblocks = nblocks
385 load_balance_parameter%nblocks = nblocks
387 DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
389 DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
391 load_balance_parameter%blocks_initialized = .true.
393 x_data%blocks = shm_blocks
394 load_balance_parameter%nblocks = shm_nblocks
395 load_balance_parameter%blocks_initialized = .true.
397 ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
398 x_data%pmax_block = 0.0_dp
399 pmax_block => x_data%pmax_block
400 CALL timestop(handle_range)
404 IF (.NOT. load_balance_parameter%blocks_initialized)
THEN
405 ALLOCATE (x_data%blocks(shm_nblocks))
406 x_data%blocks = shm_blocks
407 load_balance_parameter%nblocks = shm_nblocks
408 load_balance_parameter%blocks_initialized = .true.
416 pmax_block => x_data%pmax_block
418 IF (do_p_screening)
THEN
419 DO iatom_block = 1, shm_nblocks
420 iatom_start = x_data%blocks(iatom_block)%istart
421 iatom_end = x_data%blocks(iatom_block)%iend
422 DO jatom_block = 1, shm_nblocks
423 jatom_start = x_data%blocks(jatom_block)%istart
424 jatom_end = x_data%blocks(jatom_block)%iend
425 pmax_block(iatom_block, jatom_block) = maxval(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
430 SELECT CASE (eval_type)
432 atomic_pair_list => x_data%atomic_pair_list
434 atomic_pair_list => x_data%atomic_pair_list_forces
437 do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
444 IF (n_processes == 1)
THEN
445 ALLOCATE (tmp_dist(1))
446 tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
447 tmp_dist(1)%istart = 0_int_8
448 ptr_to_tmp_dist => tmp_dist(:)
449 SELECT CASE (eval_type)
455 DEALLOCATE (tmp_dist)
460 CALL timeset(routinen//
"_count", handle_inner)
464 cost_per_core = 0_int_8
465 my_process_id = para_env%mepos*n_threads + i_thread
466 nblocks = load_balance_parameter%nblocks
468 DO atom_block = my_process_id, int(nblocks, kind=
int_8)**4 - 1, n_processes
470 latom_block = int(
modulo(atom_block, int(nblocks, kind=
int_8))) + 1
471 tmp_block = atom_block/nblocks
472 katom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
473 IF (latom_block < katom_block) cycle
474 tmp_block = tmp_block/nblocks
475 jatom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
476 tmp_block = tmp_block/nblocks
477 iatom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
478 IF (jatom_block < iatom_block) cycle
480 iatom_start = x_data%blocks(iatom_block)%istart
481 iatom_end = x_data%blocks(iatom_block)%iend
482 jatom_start = x_data%blocks(jatom_block)%istart
483 jatom_end = x_data%blocks(jatom_block)%iend
484 katom_start = x_data%blocks(katom_block)%istart
485 katom_end = x_data%blocks(katom_block)%iend
486 latom_start = x_data%blocks(latom_block)%istart
487 latom_end = x_data%blocks(latom_block)%iend
489 SELECT CASE (eval_type)
491 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
492 pmax_block(latom_block, jatom_block), &
493 pmax_block(latom_block, iatom_block), &
494 pmax_block(katom_block, jatom_block))
496 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
497 pmax_block(latom_block, jatom_block), &
498 pmax_block(latom_block, iatom_block) + &
499 pmax_block(katom_block, jatom_block))
502 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
504 cost_per_core = cost_per_core &
505 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
506 iatom_start, iatom_end, jatom_start, jatom_end, &
507 katom_start, katom_end, latom_start, latom_end, &
509 coeffs_set, coeffs_kind, &
510 is_assoc_atomic_block_global, do_periodic, &
511 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
513 do_p_screening, map_atom_to_kind_atom, eval_type, &
514 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
518 nbins = load_balance_parameter%nbins
519 cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
523 CALL timestop(handle_inner)
529 CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
530 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
532 coeffs_set, coeffs_kind, &
533 is_assoc_atomic_block_global, do_periodic, &
534 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
535 cell, x_data, para_env, pmax_block, &
536 do_p_screening, map_atom_to_kind_atom, eval_type, &
537 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
542 CALL timeset(routinen//
"_bin", handle_inner)
546 ALLOCATE (binned_dist(nbins))
547 binned_dist(:)%istart = -1_int_8
548 binned_dist(:)%number_of_atom_quartets = 0_int_8
549 binned_dist(:)%cost = 0_int_8
550 binned_dist(:)%time_first_scf = 0.0_dp
551 binned_dist(:)%time_other_scf = 0.0_dp
552 binned_dist(:)%time_forces = 0.0_dp
556 distribution_counter_start = 1
557 distribution_counter_end = 0
560 global_quartet_counter = 0
561 local_quartet_counter = 0
562 last_bin_needs_to_be_filled = .false.
563 DO atom_block = my_process_id, int(nblocks, kind=
int_8)**4 - 1, n_processes
564 latom_block = int(
modulo(atom_block, int(nblocks, kind=
int_8))) + 1
565 tmp_block = atom_block/nblocks
566 katom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
567 IF (latom_block < katom_block) cycle
568 tmp_block = tmp_block/nblocks
569 jatom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
570 tmp_block = tmp_block/nblocks
571 iatom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
572 IF (jatom_block < iatom_block) cycle
574 distribution_counter_end = distribution_counter_end + 1
575 global_quartet_counter = global_quartet_counter + 1
576 last_bin_needs_to_be_filled = .true.
578 IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
580 iatom_start = x_data%blocks(iatom_block)%istart
581 iatom_end = x_data%blocks(iatom_block)%iend
582 jatom_start = x_data%blocks(jatom_block)%istart
583 jatom_end = x_data%blocks(jatom_block)%iend
584 katom_start = x_data%blocks(katom_block)%istart
585 katom_end = x_data%blocks(katom_block)%iend
586 latom_start = x_data%blocks(latom_block)%istart
587 latom_end = x_data%blocks(latom_block)%iend
589 SELECT CASE (eval_type)
591 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
592 pmax_block(latom_block, jatom_block), &
593 pmax_block(latom_block, iatom_block), &
594 pmax_block(katom_block, jatom_block))
596 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
597 pmax_block(latom_block, jatom_block), &
598 pmax_block(latom_block, iatom_block) + &
599 pmax_block(katom_block, jatom_block))
602 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
604 current_cost = current_cost &
605 + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
606 iatom_start, iatom_end, jatom_start, jatom_end, &
607 katom_start, katom_end, latom_start, latom_end, &
609 coeffs_set, coeffs_kind, &
610 is_assoc_atomic_block_global, do_periodic, &
611 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
613 do_p_screening, map_atom_to_kind_atom, eval_type, &
614 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
616 IF (current_cost >= cost_per_bin)
THEN
617 IF (ibin == nbins)
THEN
618 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
619 distribution_counter_end - distribution_counter_start + 1
621 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
623 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
624 ibin = min(ibin + 1, nbins)
625 distribution_counter_start = distribution_counter_end + 1
627 last_bin_needs_to_be_filled = .false.
633 CALL timestop(handle_inner)
634 CALL timeset(routinen//
"_dist", handle_inner)
638 IF (last_bin_needs_to_be_filled)
THEN
639 binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
640 IF (ibin == nbins)
THEN
641 binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
642 distribution_counter_end - distribution_counter_start + 1
644 binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
650 local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
654 shm_local_quartet_counter = 0
655 shm_global_quartet_counter = 0
659 shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
661 shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
665 CALL para_env%sum(shm_local_quartet_counter)
666 CALL para_env%sum(shm_global_quartet_counter)
667 IF (para_env%is_source())
THEN
668 IF (shm_local_quartet_counter /= shm_global_quartet_counter)
THEN
669 WRITE (error_msg,
'(A,I0,A,I0,A)')
"HFX Sanity check for parallel distribution failed. "// &
670 "Number of local quartets (", shm_local_quartet_counter, &
671 ") and number of global quartets (", shm_global_quartet_counter, &
672 ") are different. Please send in a bug report."
680 ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
684 icpu = para_env%mepos + 1
686 cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
688 mepos = para_env%mepos
695 ALLOCATE (sendbuffer(nbins*n_threads))
696 ALLOCATE (recbuffer(nbins*n_threads))
698 sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
700 dest =
modulo(mepos + 1, ncpu)
701 source =
modulo(mepos - 1, ncpu)
702 DO icpu = 0, ncpu - 1
703 IF (icpu .NE. ncpu - 1)
THEN
704 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
707 data_from =
modulo(mepos - icpu, ncpu)
708 cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
709 IF (icpu .NE. ncpu - 1)
THEN
712 swapbuffer => sendbuffer
713 sendbuffer => recbuffer
714 recbuffer => swapbuffer
716 DEALLOCATE (recbuffer, sendbuffer)
722 CALL timestop(handle_inner)
723 CALL timeset(routinen//
"_opt", handle_inner)
729 ALLOCATE (local_cost_matrix(
SIZE(cost_matrix, 1)))
730 local_cost_matrix = cost_matrix
732 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
734 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
735 shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
737 CALL timestop(handle_inner)
738 CALL timeset(routinen//
"_redist", handle_inner)
740 ALLOCATE (full_dist(ncpu*n_threads, nbins))
742 full_dist(:, :)%istart = 0_int_8
743 full_dist(:, :)%number_of_atom_quartets = 0_int_8
744 full_dist(:, :)%cost = 0_int_8
745 full_dist(:, :)%time_first_scf = 0.0_dp
746 full_dist(:, :)%time_other_scf = 0.0_dp
747 full_dist(:, :)%time_forces = 0.0_dp
750 mepos = para_env%mepos + 1
751 full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
755 ALLOCATE (sendbuffer(3*nbins*n_threads))
756 ALLOCATE (recbuffer(3*nbins*n_threads))
757 mepos = para_env%mepos
760 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
761 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
762 sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
768 dest =
modulo(mepos + 1, ncpu)
769 source =
modulo(mepos - 1, ncpu)
770 DO icpu = 0, ncpu - 1
771 IF (icpu .NE. ncpu - 1)
THEN
772 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
775 data_from =
modulo(mepos - icpu, ncpu)
778 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
779 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
780 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
784 IF (icpu .NE. ncpu - 1)
THEN
787 swapbuffer => sendbuffer
788 sendbuffer => recbuffer
789 recbuffer => swapbuffer
791 DEALLOCATE (recbuffer, sendbuffer)
798 ALLOCATE (tmp_pos(ncpu*n_threads))
800 ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
802 tmp_dist(:)%istart = 0_int_8
803 tmp_dist(:)%number_of_atom_quartets = 0_int_8
804 tmp_dist(:)%cost = 0_int_8
805 tmp_dist(:)%time_first_scf = 0.0_dp
806 tmp_dist(:)%time_other_scf = 0.0_dp
807 tmp_dist(:)%time_forces = 0.0_dp
809 DO icpu = 1, n_processes
811 mepos = my_process_id + 1
812 IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos)
THEN
813 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
814 tmp_pos(mepos) = tmp_pos(mepos) + 1
820 NULLIFY (ptr_to_tmp_dist)
821 mepos = my_process_id + 1
822 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
823 SELECT CASE (eval_type)
832 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
835 DEALLOCATE (tmp_dist, tmp_pos)
836 DEALLOCATE (binned_dist, local_cost_matrix)
837 DEALLOCATE (set_list_ij, set_list_kl)
841 CALL timestop(handle_inner)
847 CALL timestop(handle)
!> \brief Estimates how evenly a p x q partition of the permuted block-pair
!>        indices spreads the estimated integral cost over all processes.
!>
!> Visible steps: obtain p, q and numbins from hfx_calculate_pq; call
!> hfx_recursive_permute (which receives blocksize); pick this process' slices
!> of permute as p_atom_blocks / q_atom_blocks; accumulate estimate_block_cost
!> over the resulting block quartets into local_cost; combine the per-process
!> costs with para_env%sum; process 0 writes the maximum, the mean and their
!> ratio plus the individual costs to the file "loads.dat".
!> \note NOTE(review): np and nq are used below, but their assignment is not
!>       visible in this block -- confirm where they are set.
895 SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
896 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
898 coeffs_set, coeffs_kind, &
899 is_assoc_atomic_block_global, do_periodic, &
900 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
901 cell, x_data, para_env, pmax_block, &
902 do_p_screening, map_atom_to_kind_atom, eval_type, &
903 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
! Dummy arguments. Most are forwarded unchanged to hfx_recursive_permute and
! estimate_block_cost; pmax_blocks carries no INTENT and is overwritten below.
906 INTEGER,
INTENT(IN) :: n_processes, my_process_id, nblocks, &
910 DIMENSION(:),
INTENT(IN) :: set_list_ij, set_list_kl
912 POINTER :: particle_set
914 DIMENSION(:, :, :, :),
INTENT(IN),
POINTER :: coeffs_set
916 INTENT(IN),
POINTER :: coeffs_kind
917 INTEGER,
DIMENSION(:, :),
INTENT(IN) :: is_assoc_atomic_block_global
918 LOGICAL,
INTENT(IN) :: do_periodic
919 INTEGER,
INTENT(IN) :: kind_of(*)
921 POINTER :: basis_parameter
924 REAL(
dp),
DIMENSION(:, :),
INTENT(IN),
POINTER :: pmax_atom
925 REAL(
dp) :: pmax_blocks
926 TYPE(
cell_type),
INTENT(IN),
POINTER :: cell
927 TYPE(
hfx_type),
INTENT(IN),
POINTER :: x_data
929 REAL(
dp),
DIMENSION(:, :),
INTENT(IN),
POINTER :: pmax_block
930 LOGICAL,
INTENT(IN) :: do_p_screening
931 INTEGER,
DIMENSION(:),
INTENT(IN),
POINTER :: map_atom_to_kind_atom
932 INTEGER,
INTENT(IN) :: eval_type
933 REAL(
dp),
INTENT(IN) :: log10_eps_schwarz, log_2, &
935 LOGICAL,
INTENT(IN) :: use_virial
936 LOGICAL,
DIMENSION(:, :),
INTENT(IN),
POINTER :: atomic_pair_list
! Local constant (routine name passed to timeset) and work variables.
938 CHARACTER(LEN=*),
PARAMETER :: routinen =
'hfx_recursive_load_balance'
940 INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
941 jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
942 np, nq, numbins, p, q, sizep, sizeq, unit_nr
943 INTEGER(int_8) :: local_cost, pidx, qidx, sump, sumq
944 INTEGER(int_8),
ALLOCATABLE,
DIMENSION(:) :: local_cost_vector
945 INTEGER,
ALLOCATABLE,
DIMENSION(:) :: blocksize, p_atom_blocks, permute, &
947 REAL(
dp) :: maximum, mean
! Register entry into this routine; the matching timestop(handle) is at the end.
953 CALL timeset(routinen, handle)
! p, q and numbins are derived from the total number of processes.
958 CALL hfx_calculate_pq(p, q, numbins, n_processes)
! blocksize has one entry per bin; permute has one entry per block pair.
960 ALLOCATE (blocksize(numbins))
961 ALLOCATE (permute(nblocks**2))
! Recursive step working on the full index range 1..nblocks**2 with numbins bins.
970 CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numbins, &
972 my_process_id, n_processes, nblocks, &
973 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
975 coeffs_set, coeffs_kind, &
976 is_assoc_atomic_block_global, do_periodic, &
977 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
978 cell, x_data, para_env, pmax_block, &
979 do_p_screening, map_atom_to_kind_atom, eval_type, &
980 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
! 1-based coordinates of this process: pidx cycles fastest (period p),
! qidx advances once every p processes.
988 pidx =
modulo(int(my_process_id), int(p)) + 1
989 qidx = my_process_id/p + 1
! Number of permute entries owned along each direction: the sum of np
! (resp. nq) consecutive blocksize entries belonging to pidx (resp. qidx).
991 sizep = sum(blocksize((np*(pidx - 1) + 1):(np*pidx)))
992 sizeq = sum(blocksize((nq*(qidx - 1) + 1):(nq*qidx)))
! Offsets into permute: total size of all bins preceding this process' bins.
994 sump = sum(blocksize(1:(np*(pidx - 1))))
995 sumq = sum(blocksize(1:(nq*(qidx - 1))))
! Copy the owned slices of permute into private work arrays.
997 ALLOCATE (p_atom_blocks(sizep))
998 ALLOCATE (q_atom_blocks(sizeq))
1000 p_atom_blocks(:) = permute((sump + 1):(sump + sizep))
1001 q_atom_blocks(:) = permute((sumq + 1):(sumq + sizeq))
! Decode each stored pair index into two block indices: the q entry yields
! (iatom_block, latom_block), the p entry yields (katom_block, jatom_block).
! NOTE(review): modulo(..., nblocks) lies in 0..nblocks-1, so latom_block and
! jatom_block can become 0 and are then used to index x_data%blocks below.
! hfx_recursive_permute encodes pairs as (a - 1)*nblocks + b with b in
! 1..nblocks; if permute holds such values this decode looks off by one for
! b == nblocks -- confirm against the encoding.
1015 latom_block =
modulo(q_atom_blocks(j), nblocks)
1016 iatom_block = q_atom_blocks(j)/nblocks + 1
1017 jatom_block =
modulo(p_atom_blocks(i), nblocks)
1018 katom_block = p_atom_blocks(i)/nblocks + 1
! Keep only quartets with latom_block >= katom_block and jatom_block >= iatom_block.
1021 IF (latom_block < katom_block) cycle
1022 IF (jatom_block < iatom_block) cycle
! First and last atom (istart..iend) of each of the four blocks.
1024 iatom_start = x_data%blocks(iatom_block)%istart
1025 iatom_end = x_data%blocks(iatom_block)%iend
1026 jatom_start = x_data%blocks(jatom_block)%istart
1027 jatom_end = x_data%blocks(jatom_block)%iend
1028 katom_start = x_data%blocks(katom_block)%istart
1029 katom_end = x_data%blocks(katom_block)%iend
1030 latom_start = x_data%blocks(latom_block)%istart
1031 latom_end = x_data%blocks(latom_block)%iend
! pmax_blocks depends on eval_type: one branch takes the largest of the four
! pmax_block entries, the other the larger of the two pairwise sums.
1034 SELECT CASE (eval_type)
1036 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1037 pmax_block(latom_block, jatom_block), &
1038 pmax_block(latom_block, iatom_block), &
1039 pmax_block(katom_block, jatom_block))
1041 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1042 pmax_block(latom_block, jatom_block), &
1043 pmax_block(latom_block, iatom_block) + &
1044 pmax_block(katom_block, jatom_block))
! Screening: skip the quartet when this bound is below log10_eps_schwarz.
1048 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
! Add the estimated cost of this block quartet to the process-local total.
1051 local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1053 iatom_start, iatom_end, jatom_start, jatom_end, &
1054 katom_start, katom_end, latom_start, latom_end, &
1056 coeffs_set, coeffs_kind, &
1057 is_assoc_atomic_block_global, do_periodic, &
1058 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1060 do_p_screening, map_atom_to_kind_atom, eval_type, &
1061 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
! Each process fills only its own slot of a zeroed vector; para_env%sum then
! combines the vectors so that every slot holds that process' cost.
1065 ALLOCATE (local_cost_vector(n_processes))
1066 local_cost_vector = 0
1067 local_cost_vector(my_process_id + 1) = local_cost
1068 CALL para_env%sum(local_cost_vector)
! NOTE(review): sum(...) is INTEGER(int_8) and n_processes is INTEGER, so this
! is an integer division; the fractional part is dropped before the result is
! stored in the REAL variable mean.
1070 mean = sum(local_cost_vector)/n_processes
1071 maximum = maxval(local_cost_vector)
! Only process 0 writes the statistics and the per-process costs to "loads.dat".
1076 IF (my_process_id == 0)
THEN
1077 CALL open_file(unit_number=unit_nr, file_name=
"loads.dat")
1078 WRITE (unit_nr, *)
'maximum cost:', maximum
1079 WRITE (unit_nr, *)
'mean cost:', mean
1080 WRITE (unit_nr, *)
'load balance ratio max/mean: ', maximum/mean
1081 WRITE (unit_nr, *)
'-------- detailed per-process costs ---------'
1082 DO i = 1, n_processes
1083 WRITE (unit_nr, *) local_cost_vector(i)
! Release all work arrays allocated in this routine.
1090 DEALLOCATE (local_cost_vector)
1091 DEALLOCATE (p_atom_blocks, q_atom_blocks)
1092 DEALLOCATE (blocksize, permute)
! Close the region opened by timeset at the top.
1096 CALL timestop(handle)
1100 END SUBROUTINE hfx_recursive_load_balance
!> \brief Derives the three outputs p, q and nbins from the integer n.
!> \note NOTE(review): only the start of the search is visible in this block:
!>       candidates k are scanned while k <= sqrt(n) and tested for dividing n.
!>       What is done with a divisor, and how p, q and nbins are finally set,
!>       cannot be seen here -- confirm against the full routine.
1114 SUBROUTINE hfx_calculate_pq(p, q, nBins, N)
1116 INTEGER,
INTENT(OUT) :: p, q, nbins
1117 INTEGER,
INTENT(IN) :: n
! Upper bound of the divisor scan: the square root of n in double precision.
1123 sqn = sqrt(real(n, kind=
dp))
! Scan candidates k up to sqn; the branch below is taken when k divides n exactly.
1126 DO WHILE (real(k, kind=
dp) <= sqn)
1127 IF (
modulo(n, k) == 0)
THEN
1202 RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
1204 my_process_id, n_processes, nblocks, &
1205 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1207 coeffs_set, coeffs_kind, &
1208 is_assoc_atomic_block_global, do_periodic, &
1209 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1210 cell, x_data, para_env, pmax_block, &
1211 do_p_screening, map_atom_to_kind_atom, eval_type, &
1212 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1214 INTEGER :: nproc_in, blockend, blockstart
1215 INTEGER,
DIMENSION(nProc_in) :: blocksize
1216 INTEGER :: nblocks, n_processes, my_process_id
1217 INTEGER,
INTENT(IN) :: step
1218 INTEGER,
DIMENSION(nblocks*nblocks) :: permute
1220 INTEGER,
INTENT(IN) :: nkind
1223 DIMENSION(:) :: set_list_ij, set_list_kl
1226 DIMENSION(:, :, :, :),
POINTER :: coeffs_set
1228 POINTER :: coeffs_kind
1229 INTEGER,
DIMENSION(:, :) :: is_assoc_atomic_block_global
1230 LOGICAL :: do_periodic
1231 INTEGER :: kind_of(*)
1233 TYPE(
hfx_p_kind),
DIMENSION(:),
POINTER :: pmax_set
1234 REAL(
dp),
DIMENSION(:, :),
POINTER :: pmax_atom
1235 REAL(
dp) :: pmax_blocks
1239 REAL(
dp),
DIMENSION(:, :),
POINTER :: pmax_block
1240 LOGICAL,
INTENT(IN) :: do_p_screening
1241 INTEGER,
DIMENSION(:),
POINTER :: map_atom_to_kind_atom
1242 INTEGER,
INTENT(IN) :: eval_type
1243 REAL(
dp) :: log10_eps_schwarz, log_2, &
1245 LOGICAL,
INTENT(IN) :: use_virial
1246 LOGICAL,
DIMENSION(:, :),
POINTER :: atomic_pair_list
1248 INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start,
idx, inv_perm, &
1249 jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
1250 latom_end, latom_start, nbins, nproc, row, startoffset
1251 INTEGER(int_8) :: atom_block, tmp_block
1252 INTEGER,
ALLOCATABLE,
DIMENSION(:) :: ithblocksize, localblocksize
1253 INTEGER,
DIMENSION(blockend - blockstart + 1) :: bin_perm, tmp_perm
1254 REAL(
dp) :: partialcost
1255 REAL(
dp),
DIMENSION(nblocks*nblocks) :: cost_vector
1258 cost_vector = 0.0_dp
1261 DO atom_block = my_process_id, int(nblocks, kind=
int_8)**4 - 1, n_processes
1264 latom_block = int(
modulo(atom_block, int(nblocks, kind=
int_8))) + 1
1265 tmp_block = atom_block/nblocks
1266 katom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
1267 IF (latom_block < katom_block) cycle
1268 tmp_block = tmp_block/nblocks
1269 jatom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
1270 tmp_block = tmp_block/nblocks
1271 iatom_block = int(
modulo(tmp_block, int(nblocks, kind=
int_8))) + 1
1272 IF (jatom_block < iatom_block) cycle
1278 row = (katom_block - 1)*nblocks + jatom_block
1280 DO WHILE (permute(inv_perm) .NE. row)
1281 inv_perm = inv_perm + 1
1285 col = (iatom_block - 1)*nblocks + latom_block
1287 DO WHILE (permute(inv_perm) .NE. col)
1288 inv_perm = inv_perm + 1
1293 IF (col < blockstart .OR. col > blockend) cycle
1294 IF (row < blockstart .OR. row > blockend) cycle
1296 iatom_start = x_data%blocks(iatom_block)%istart
1297 iatom_end = x_data%blocks(iatom_block)%iend
1298 jatom_start = x_data%blocks(jatom_block)%istart
1299 jatom_end = x_data%blocks(jatom_block)%iend
1300 katom_start = x_data%blocks(katom_block)%istart
1301 katom_end = x_data%blocks(katom_block)%iend
1302 latom_start = x_data%blocks(latom_block)%istart
1303 latom_end = x_data%blocks(latom_block)%iend
1306 SELECT CASE (eval_type)
1308 pmax_blocks = max(pmax_block(katom_block, iatom_block), &
1309 pmax_block(latom_block, jatom_block), &
1310 pmax_block(latom_block, iatom_block), &
1311 pmax_block(katom_block, jatom_block))
1313 pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
1314 pmax_block(latom_block, jatom_block), &
1315 pmax_block(latom_block, iatom_block) + &
1316 pmax_block(katom_block, jatom_block))
1320 IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle
1324 IF (
modulo(step, 2) .EQ. 0)
THEN
1331 partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1333 iatom_start, iatom_end, jatom_start, jatom_end, &
1334 katom_start, katom_end, latom_start, latom_end, &
1336 coeffs_set, coeffs_kind, &
1337 is_assoc_atomic_block_global, do_periodic, &
1338 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1340 do_p_screening, map_atom_to_kind_atom, eval_type, &
1341 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1343 cost_vector(
idx) = cost_vector(
idx) + partialcost
1347 CALL para_env%sum(cost_vector)
1351 DO WHILE (
modulo(int(nproc), int(nbins)) .NE. 0)
1359 ALLOCATE (localblocksize(nbins))
1360 CALL hfx_permute_binning(nbins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
1364 tmp_perm = permute(blockstart:blockend)
1365 permute(blockstart:blockend) = tmp_perm(bin_perm)
1369 ALLOCATE (ithblocksize(nproc))
1371 startoffset = sum(localblocksize(1:(i - 1)))
1372 endoffset = sum(localblocksize(1:i)) - 1
1374 CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nproc, &
1375 permute, step + 1, &
1376 my_process_id, n_processes, nblocks, &
1377 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1379 coeffs_set, coeffs_kind, &
1380 is_assoc_atomic_block_global, do_periodic, &
1381 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1382 cell, x_data, para_env, pmax_block, &
1383 do_p_screening, map_atom_to_kind_atom, eval_type, &
1384 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1385 blocksize(((i - 1)*nproc + 1):(i*nproc)) = ithblocksize
1387 DEALLOCATE (ithblocksize)
1390 blocksize(i) = localblocksize(i)
1394 DEALLOCATE (localblocksize)
1396 END SUBROUTINE hfx_recursive_permute
! **************************************************************************************************
!> \brief Greedily distributes maxbinsize columns over nbins bins and returns the resulting
!>        column order as a permutation.
!> \param nbins number of bins to fill
!> \param costvector cost of every column; it is not modified, a local copy is sorted instead
!> \param maxbinsize number of columns (length of costvector and perm, and the largest number
!>        of columns a single bin can hold)
!> \param perm on exit: the column indices, listed bin after bin
!> \param block_count on exit: number of columns placed in each bin
!> \note sort() comes from elsewhere in the project; it is assumed to sort ascending and to
!>       return the applied permutation in its third argument - TODO confirm.
!> \note This version restores the accumulator initialisation, the ELSE branch and the
!>       loop/branch terminators that were absent, and zero-fills bin so that the row
!>       permutation never copies undefined entries.
! **************************************************************************************************
SUBROUTINE hfx_permute_binning(nbins, costvector, maxbinsize, perm, block_count)

   INTEGER, INTENT(IN)                         :: nbins, maxbinsize
   REAL(dp), DIMENSION(maxbinsize), INTENT(IN) :: costvector
   INTEGER, DIMENSION(maxbinsize), INTENT(OUT) :: perm
   INTEGER, DIMENSION(nbins), INTENT(OUT)      :: block_count

   INTEGER                               :: i, j, mod_idx, offset
   INTEGER, DIMENSION(nbins, maxbinsize) :: bin
   INTEGER, DIMENSION(nbins)             :: bin_idx
   INTEGER, DIMENSION(maxbinsize)        :: idx
   REAL(dp), DIMENSION(maxbinsize)       :: vec
   REAL(dp), DIMENSION(nbins)            :: bincosts

   ! Work on a copy: costvector is INTENT(IN) and sort() reorders its argument.
   vec = costvector
   block_count = 0
   bincosts = 0.0_dp
   bin = 0

   CALL sort(vec, maxbinsize, idx)

   ! Walk the sorted costs from the last entry to the first.
   DO i = maxbinsize, 1, -1
      IF (vec(i) == 0.0_dp) THEN
         ! Zero-cost columns are dealt out round-robin; "+ 1" converts the
         ! zero-based MODULO result into a Fortran index.
         mod_idx = MODULO(i, nbins) + 1
         block_count(mod_idx) = block_count(mod_idx) + 1
         bin(mod_idx, block_count(mod_idx)) = idx(i)
      ELSE
         ! Re-sort the bins by accumulated cost and keep the bookkeeping arrays
         ! aligned with that order, then hand the column to the bin in slot 1.
         CALL sort(bincosts, nbins, bin_idx)
         block_count = block_count(bin_idx)
         bin = bin(bin_idx, :)

         bincosts(1) = bincosts(1) + vec(i)
         block_count(1) = block_count(1) + 1
         bin(1, block_count(1)) = idx(i)
      END IF
   END DO

   ! Concatenate the bins to obtain the permutation.
   offset = 0
   DO i = 1, nbins
      DO j = 1, block_count(i)
         perm(offset + j) = bin(i, j)
      END DO
      offset = offset + block_count(i)
   END DO

END SUBROUTINE hfx_permute_binning
! Argument-list tail, declarations and body of the routine that registers itself with the
! timer as 'hfx_update_load_balance'. The SUBROUTINE statement, the END statement and a
! number of control statements are not part of this excerpt, so the notes below describe
! only what the visible statements do.
! Visible flow: (1) derive an integer cost per local bin from the recorded timings,
! (2) make all costs known on every rank via a ring exchange, (3) call
! optimize_distribution, (4) exchange the bin descriptors the same way and (5) collect
! the bins assigned to this worker into tmp_dist.
1481 load_balance_parameter, &
1482 i_thread, n_threads, eval_type)
1487 INTEGER,
INTENT(IN) :: i_thread, n_threads, eval_type
1489 CHARACTER(LEN=*),
PARAMETER :: routinen =
'hfx_update_load_balance'
1491 INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
1492 my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
1494 INTEGER(int_8),
DIMENSION(:),
POINTER :: local_cost_matrix, recbuffer, &
1495 sendbuffer, swapbuffer
! The SAVE entities below keep their value between calls.
! NOTE(review): presumably they are SAVEd so that all threads of a rank see the same
! object; the threading directives are not visible here - confirm.
1496 INTEGER(int_8),
DIMENSION(:),
POINTER,
SAVE :: cost_matrix
1497 INTEGER,
ALLOCATABLE,
DIMENSION(:) :: tmp_pos
1498 INTEGER,
ALLOCATABLE,
DIMENSION(:),
SAVE :: bins_per_rank
1499 INTEGER,
ALLOCATABLE,
DIMENSION(:, :),
SAVE :: bin_histogram
1500 INTEGER,
DIMENSION(:),
POINTER,
SAVE :: shm_distribution_vector
1501 INTEGER,
SAVE :: max_bin_size
1502 TYPE(
hfx_distribution),
DIMENSION(:),
POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
1508 CALL timeset(routinen, handle)
! Number of workers = number of ranks times threads per rank.
1512 ncpu = para_env%num_pe
1513 n_processes = ncpu*n_threads
! Single worker: one distribution entry starting at 0 with the largest representable
! quartet count.
1515 IF (n_processes == 1)
THEN
1516 ALLOCATE (tmp_dist(1))
1517 tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
1518 tmp_dist(1)%istart = 0_int_8
1519 ptr_to_tmp_dist => tmp_dist(:)
1520 SELECT CASE (eval_type)
1526 DEALLOCATE (tmp_dist)
1528 mepos = para_env%mepos
! Zero-based global worker id: rank-major, thread-minor.
1529 my_process_id = para_env%mepos*n_threads + i_thread
1530 nbins = load_balance_parameter%nbins
! Column 1: number of bins per worker; column 2: running total of column 1.
1532 ALLOCATE (bin_histogram(n_processes, 2))
1536 SELECT CASE (eval_type)
1538 my_bin_size =
SIZE(x_data%distribution_energy)
1540 my_bin_size =
SIZE(x_data%distribution_forces)
1542 bin_histogram(my_process_id + 1, 1) = my_bin_size
1545 CALL para_env%sum(bin_histogram(:, 1))
1546 bin_histogram(1, 2) = bin_histogram(1, 1)
1547 DO iprocess = 2, n_processes
1548 bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
! Largest bin count among this rank's threads, then the maximum over all ranks.
1551 max_bin_size = maxval(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
1552 CALL para_env%max(max_bin_size)
1555 ALLOCATE (binned_dist(my_bin_size))
1557 SELECT CASE (eval_type)
1559 binned_dist = x_data%distribution_energy
1561 binned_dist = x_data%distribution_forces
! Cost of a bin: 0 when it holds no atom quartets, otherwise the recorded time scaled by
! 10000 and truncated to INTEGER(int_8). Which timings enter depends on eval_type and on
! load_balance_parameter%rtp_redistribute.
1564 DO ibin = 1, my_bin_size
1565 IF (binned_dist(ibin)%number_of_atom_quartets == 0)
THEN
1566 binned_dist(ibin)%cost = 0
1568 SELECT CASE (eval_type)
1570 IF (.NOT. load_balance_parameter%rtp_redistribute)
THEN
1571 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_first_scf + &
1572 binned_dist(ibin)%time_other_scf)*10000.0_dp,
int_8)
1574 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_other_scf)*10000.0_dp,
int_8)
1577 binned_dist(ibin)%cost = int((binned_dist(ibin)%time_forces)*10000.0_dp,
int_8)
1584 ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
1586 ALLOCATE (sendbuffer(max_bin_size*n_threads))
1587 ALLOCATE (recbuffer(max_bin_size*n_threads))
! The running total includes this worker's own bins, so subtracting my_bin_size gives the
! zero-based offset of this worker's first bin in cost_matrix.
1590 my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
1591 icpu = para_env%mepos + 1
1592 DO i = 1, my_bin_size
1593 cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
1596 mepos = para_env%mepos
1599 ALLOCATE (bins_per_rank(ncpu))
! Bins owned by a rank = sum over that rank's threads.
1602 bins_per_rank(icpu) = sum(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
1604 sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
1605 cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
! Ring neighbours: send to the next rank, receive from the previous one.
1607 dest =
modulo(mepos + 1, ncpu)
1608 source =
modulo(mepos - 1, ncpu)
1610 CALL para_env%sync()
1611 DO icpu = 0, ncpu - 1
! No transfer is started in the last iteration.
1612 IF (icpu .NE. ncpu - 1)
THEN
1613 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
! In iteration icpu the send buffer is stored as the data of rank mepos - icpu.
1616 data_from =
modulo(mepos - icpu, ncpu)
! Position of that rank's first bin: bins of all preceding ranks, plus one.
1617 start_idx = sum(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
1618 end_idx = start_idx + bins_per_rank(data_from + 1) - 1
1619 cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
1621 IF (icpu .NE. ncpu - 1)
THEN
! Swap the buffers so that what was received is forwarded in the next iteration.
! NOTE(review): the call that completes the non-blocking transfer is not visible here.
1624 swapbuffer => sendbuffer
1625 sendbuffer => recbuffer
1626 recbuffer => swapbuffer
1628 DEALLOCATE (recbuffer, sendbuffer)
1630 CALL para_env%sync()
! optimize_distribution receives a private copy of the cost vector.
1633 ALLOCATE (local_cost_matrix(
SIZE(cost_matrix, 1)))
1634 local_cost_matrix = cost_matrix
1636 ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
1637 CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
1638 shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
! full_dist(worker, bin): descriptor of every bin of every worker, cleared first.
1640 ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
1642 full_dist(:, :)%istart = 0_int_8
1643 full_dist(:, :)%number_of_atom_quartets = 0_int_8
1644 full_dist(:, :)%cost = 0_int_8
1645 full_dist(:, :)%time_first_scf = 0.0_dp
1646 full_dist(:, :)%time_other_scf = 0.0_dp
1647 full_dist(:, :)%time_forces = 0.0_dp
1651 mepos = para_env%mepos + 1
1652 full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
! Three int_8 values are packed per bin: istart, number_of_atom_quartets, cost.
1655 ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
1656 ALLOCATE (recbuffer(3*max_bin_size*n_threads))
1657 mepos = para_env%mepos
1659 DO i = 1, max_bin_size
1660 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
1661 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
1662 sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
1665 dest =
modulo(mepos + 1, ncpu)
1666 source =
modulo(mepos - 1, ncpu)
1668 CALL para_env%sync()
! Second ring exchange, same scheme as for the costs, now for the packed descriptors.
1669 DO icpu = 0, ncpu - 1
1670 IF (icpu .NE. ncpu - 1)
THEN
1671 CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1674 data_from =
modulo(mepos - icpu, ncpu)
1676 DO i = 1, max_bin_size
1677 full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
1678 full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
1679 full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
1683 IF (icpu .NE. ncpu - 1)
THEN
1686 swapbuffer => sendbuffer
1687 sendbuffer => recbuffer
1688 recbuffer => swapbuffer
1691 DEALLOCATE (recbuffer, sendbuffer)
1692 CALL para_env%sync()
! NOTE(review): tmp_pos is used as a write cursor below; its initialisation is not
! visible in this excerpt - confirm it is set before the loop.
1696 ALLOCATE (tmp_pos(ncpu*n_threads))
1698 ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
1700 tmp_dist(:)%istart = 0_int_8
1701 tmp_dist(:)%number_of_atom_quartets = 0_int_8
1702 tmp_dist(:)%cost = 0_int_8
1703 tmp_dist(:)%time_first_scf = 0.0_dp
1704 tmp_dist(:)%time_other_scf = 0.0_dp
1705 tmp_dist(:)%time_forces = 0.0_dp
! From here on mepos is the one-based id of this worker.
1707 mepos = my_process_id + 1
! Visit every bin of every worker and keep those the distribution vector assigns to this
! worker.
1708 DO icpu = 1, n_processes
1709 DO i = 1, bin_histogram(icpu, 1)
1710 IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos)
THEN
1711 tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
1712 tmp_pos(mepos) = tmp_pos(mepos) + 1
1718 NULLIFY (ptr_to_tmp_dist)
1719 mepos = my_process_id + 1
! The cursor points one past the last stored entry.
1720 ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
1721 SELECT CASE (eval_type)
1730 DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
1731 DEALLOCATE (bins_per_rank, bin_histogram)
1734 DEALLOCATE (tmp_dist, tmp_pos)
1735 DEALLOCATE (binned_dist, local_cost_matrix)
1739 CALL timestop(handle)
! **************************************************************************************************
!> \brief Evaluates a fitted cost estimate for one quartet of basis-function sets.
!>        Two parameter sets (p1, p2) are evaluated and blended with a logistic switch in
!>        log(estimate1) whose centre and width are derived from p3.
!> \param nsa first size argument of the model (callers in this file pass nsgf entries)
!> \param nsb same as nsa, for the second set
!> \param nsc same as nsa, for the third set
!> \param nsd same as nsa, for the fourth set
!> \param npgfa second size argument of the model (callers in this file pass npgf entries)
!> \param npgfb same as npgfa, for the second set
!> \param npgfc same as npgfa, for the third set
!> \param npgfd same as npgfa, for the fourth set
!> \param ratio argument of the exponential factor; callers in this file pass
!>        max_val2/log10_eps_schwarz
!> \param p1 the 12 coefficients of the first estimate
!> \param p2 the 12 coefficients of the second estimate
!> \param p3 p3(1) sets the switch centre mu = LOG(ABS(1.0E6*p3(1)) + 1),
!>        p3(2) sets its width sigma = 0.1*p3(2)*mu
!> \return the blended estimate scaled by 0.001, truncated, plus one
!> \note LOG(estimate1) requires estimate1 > 0 and sigma must be non-zero; neither is
!>       checked here, exactly as before.
!> \note This version restores the lost declarations (p7, p8, p9 and r), CONTAINS and the
!>       END FUNCTION statements, and gives every dummy argument INTENT(IN).
! **************************************************************************************************
FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)

   INTEGER, INTENT(IN)       :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
   REAL(KIND=dp), INTENT(IN) :: ratio
   REAL(KIND=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
   INTEGER(KIND=int_8)       :: res

   REAL(KIND=dp) :: estimate, estimate1, estimate2, mu, sigma, switch

   estimate1 = estimate_basic(p1)
   estimate2 = estimate_basic(p2)

   ! Logistic weight: close to 1 while LOG(estimate1) is well below mu, close to 0 above.
   mu = LOG(ABS(1.0E6_dp*p3(1)) + 1)
   sigma = p3(2)*0.1_dp*mu
   switch = 1.0_dp/(1.0_dp + EXP((LOG(estimate1) - mu)/sigma))

   estimate = estimate1*(1.0_dp - switch) + estimate2*switch

   ! "+ 1" keeps the returned cost strictly positive for non-negative estimates.
   res = INT(estimate*0.001_dp, KIND=int_8) + 1

CONTAINS

! **************************************************************************************************
!> \brief Evaluates the model for one coefficient set; the size arguments and ratio are
!>        taken from the host function.
!> \param p the 12 coefficients
!> \return product of quadratic polynomials in the size arguments times an exponential in
!>         ratio, plus 1000*p(9), plus a second product of quadratics
! **************************************************************************************************
   REAL(KIND=dp) FUNCTION estimate_basic(p) RESULT(res)
      REAL(KIND=dp), INTENT(IN) :: p(12)

      REAL(KIND=dp) :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
                       p7, p8, p9

      ! These locals deliberately shadow the host's p1/p2/p3 arrays inside this function.
      p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
      p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
      p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
      res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
            poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
            poly2(npgfd, p4, p5, p6)*EXP(-p7*ratio + p8*ratio**2) + &
            1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)

   END FUNCTION estimate_basic

! **************************************************************************************************
!> \brief Quadratic polynomial a0 + a1*x + a2*x**2, evaluated in Horner form.
!> \param x integer argument (converted to REAL(dp))
!> \param a0 constant coefficient
!> \param a1 linear coefficient
!> \param a2 quadratic coefficient
!> \return value of the polynomial
! **************************************************************************************************
   REAL(KIND=dp) FUNCTION poly2(x, a0, a1, a2)
      INTEGER, INTENT(IN)       :: x
      REAL(KIND=dp), INTENT(IN) :: a0, a1, a2

      REAL(KIND=dp) :: r

      r = REAL(x, KIND=dp)
      poly2 = a0 + (a1 + a2*r)*r

   END FUNCTION poly2

END FUNCTION cost_model
! Assigns every bin to a process: distribution_vector(b) receives the one-based index of
! the process chosen for bin b. Bins are taken from the end of the cost-sorted order in
! chunks of nstep; within a chunk the j-th bin goes to the j-th entry of the process
! index returned by sorting the accumulated per-process costs.
! NOTE(review): sort() is defined elsewhere and is assumed to sort ascending and to return
! the permutation in its last argument, which would make this "largest bins first, to the
! least loaded processes" - confirm.
! NOTE(review): the declaration and set-up of rng_stream and the initialisation of
! my_cost_cpu are not visible in this excerpt.
1833 SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
1834 distribution_vector, do_randomize)
1835 INTEGER :: total_number_of_bins, number_of_processes
1836 INTEGER(int_8),
DIMENSION(:),
POINTER :: bin_costs
1837 INTEGER,
DIMENSION(:),
POINTER :: distribution_vector
1838 LOGICAL,
INTENT(IN) :: do_randomize
1840 INTEGER :: i, itmp, j, nstep
1841 INTEGER(int_8),
DIMENSION(:),
POINTER :: my_cost_cpu, tmp_cost, tmp_cpu_cost
1842 INTEGER,
DIMENSION(:),
POINTER :: tmp_cpu_index, tmp_index
! Chunk size: half the number of processes (integer division), but at least 1.
1845 nstep = max(1, int(number_of_processes)/2)
1847 ALLOCATE (tmp_cost(total_number_of_bins))
1848 ALLOCATE (tmp_index(total_number_of_bins))
1849 ALLOCATE (tmp_cpu_cost(number_of_processes))
1850 ALLOCATE (tmp_cpu_index(number_of_processes))
1851 ALLOCATE (my_cost_cpu(number_of_processes))
! Sort a copy so that bin_costs keeps its original order; tmp_index maps sorted position
! to original bin number.
1852 tmp_cost = bin_costs
1854 CALL sort(tmp_cost, total_number_of_bins, tmp_index)
1868 DO i = total_number_of_bins, 1, -nstep
! Re-rank the processes by their accumulated cost before every chunk.
1869 tmp_cpu_cost = my_cost_cpu
1870 CALL sort(tmp_cpu_cost, int(number_of_processes), tmp_cpu_index)
! Optionally shuffle the candidate processes of this chunk.
1871 IF (do_randomize)
THEN
1872 CALL rng_stream%shuffle(tmp_cpu_index(1:min(i, nstep)))
! min(i, nstep) limits the final chunk to the bins that are left.
1874 DO j = 1, min(i, nstep)
1875 itmp = tmp_cpu_index(j)
1876 distribution_vector(tmp_index(i - j + 1)) = itmp
1877 my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
1881 DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
1882 DEALLOCATE (tmp_cpu_index, my_cost_cpu)
1883 END SUBROUTINE optimize_distribution
! **************************************************************************************************
!> \brief Maps an unordered pair of one-based indices onto a single one-based index of a
!>        packed triangular layout.
!> \param i first index
!> \param j second index; the result is symmetric in i and j
!> \param N edge length of the square index range
!> \return packed index; for N = 3 the pairs (1,1), (1,2), (1,3), (2,2), (2,3), (3,3)
!>         map to 1, 2, 3, 4, 5, 6
!> \note min_ij was read without ever being assigned; it is now set to MIN(i, j).
! **************************************************************************************************
PURE FUNCTION get_1d_idx(i, j, N)
   INTEGER, INTENT(IN)        :: i, j
   INTEGER(int_8), INTENT(IN) :: N
   INTEGER(int_8)             :: get_1d_idx

   INTEGER(int_8) :: min_ij

   ! The smaller index picks the row of the triangle, the larger one the position in it.
   min_ij = INT(MIN(i, j), KIND=int_8)

   ! (min_ij - 1)*min_ij is a product of consecutive integers, so the division by 2 is exact.
   get_1d_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N

END FUNCTION get_1d_idx
! Returns an INTEGER(int_8) cost estimate for the atom quartets spanned by four atom
! ranges (i, j, k, l). Pair lists are built for the (i,j) and (k,l) ranges, every
! surviving pair of pairs is screened at kind level and at set level, and cost_model is
! summed over the set quartets that pass.
! NOTE(review): part of the argument list, several declarations and most CASE / ELSE /
! END statements are not present in this excerpt; the notes describe visible lines only.
1947 FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1948 iatom_start, iatom_end, jatom_start, jatom_end, &
1949 katom_start, katom_end, latom_start, latom_end, &
1951 coeffs_set, coeffs_kind, &
1952 is_assoc_atomic_block_global, do_periodic, &
1953 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1955 do_p_screening, map_atom_to_kind_atom, eval_type, &
1956 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
1959 INTEGER,
INTENT(IN) :: natom, nkind
1962 INTEGER,
INTENT(IN) :: iatom_start, iatom_end, jatom_start, &
1963 jatom_end, katom_start, katom_end, &
1964 latom_start, latom_end
1967 DIMENSION(:, :, :, :),
POINTER :: coeffs_set
1969 DIMENSION(nkind, nkind) :: coeffs_kind
1970 INTEGER,
DIMENSION(:, :) :: is_assoc_atomic_block_global
1971 LOGICAL :: do_periodic
1972 INTEGER :: kind_of(*)
1974 TYPE(
hfx_p_kind),
DIMENSION(:),
POINTER :: pmax_set
1975 REAL(
dp),
DIMENSION(:, :),
POINTER :: pmax_atom
1976 REAL(
dp) :: pmax_blocks
1978 LOGICAL,
INTENT(IN) :: do_p_screening
1979 INTEGER,
DIMENSION(:),
POINTER :: map_atom_to_kind_atom
1980 INTEGER,
INTENT(IN) :: eval_type
1981 REAL(
dp) :: log10_eps_schwarz, log_2, &
1983 LOGICAL,
INTENT(IN) :: use_virial
1984 LOGICAL,
DIMENSION(natom, natom) :: atomic_pair_list
1985 INTEGER(int_8) :: estimate_block_cost
1987 INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
1988 i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
1989 jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
1990 INTEGER,
DIMENSION(:),
POINTER :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
1992 REAL(
dp) :: actual_pmax_atom, cost_tmp, max_val1, &
1993 max_val2, pmax_entry, rab2, rcd2, &
1994 screen_kind_ij, screen_kind_kl
1995 REAL(
dp),
DIMENSION(:, :),
POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
1997 estimate_block_cost = 0_int_8
! Pair lists for the (i,j) range and for the (k,l) range; build_pair_list is defined
! elsewhere.
1999 CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
2000 kind_of, basis_parameter, particle_set, &
2001 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2002 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2004 CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
2005 kind_of, basis_parameter, particle_set, &
2006 do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2007 log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
! Outer loop: atom pairs (iatom, jatom) with their kinds, squared distance and the range
! of set pairs that belongs to them.
2009 DO i_list_ij = 1, list_ij%n_element
2010 iatom = list_ij%elements(i_list_ij)%pair(1)
2011 jatom = list_ij%elements(i_list_ij)%pair(2)
2012 i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
2013 i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
2014 ikind = list_ij%elements(i_list_ij)%kind_pair(1)
2015 jkind = list_ij%elements(i_list_ij)%kind_pair(2)
2016 rab2 = list_ij%elements(i_list_ij)%dist2
2018 nsgfa => basis_parameter(ikind)%nsgf
2019 nsgfb => basis_parameter(jkind)%nsgf
2020 npgfa => basis_parameter(ikind)%npgf
2021 npgfb => basis_parameter(jkind)%npgf
2023 DO i_list_kl = 1, list_kl%n_element
2025 katom = list_kl%elements(i_list_kl)%pair(1)
2026 latom = list_kl%elements(i_list_kl)%pair(2)
! Keep only quartets with katom + latom <= iatom + jatom; when the sums are equal,
! additionally require katom >= iatom.
2028 IF (.NOT. (katom + latom <= iatom + jatom)) cycle
2029 IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) cycle
! Without virial, quartets whose four atoms coincide are skipped.
2032 IF (.NOT. use_virial)
THEN
2033 IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) cycle
2037 i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
2038 i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
2039 kkind = list_kl%elements(i_list_kl)%kind_pair(1)
2040 lkind = list_kl%elements(i_list_kl)%kind_pair(2)
2041 rcd2 = list_kl%elements(i_list_kl)%dist2
2043 nsgfc => basis_parameter(kkind)%nsgf
2044 nsgfd => basis_parameter(lkind)%nsgf
2045 npgfc => basis_parameter(kkind)%npgf
2046 npgfd => basis_parameter(lkind)%npgf
! With P screening the largest pmax_atom entry of the four cross pairs is used,
! otherwise 0.
2048 IF (do_p_screening)
THEN
2049 actual_pmax_atom = max(pmax_atom(katom, iatom), &
2050 pmax_atom(latom, jatom), &
2051 pmax_atom(latom, iatom), &
2052 pmax_atom(katom, jatom))
2054 actual_pmax_atom = 0.0_dp
! Kind-level estimate for each pair: x(1)*(squared distance) + x(2). The quartet is
! skipped when the sum of both estimates and actual_pmax_atom is below log10_eps_schwarz.
2057 screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
2058 coeffs_kind(jkind, ikind)%x(2)
2059 screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
2060 coeffs_kind(lkind, kkind)%x(2)
2061 IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
! All four cross pairs must have an is_assoc_atomic_block_global entry >= 1.
2063 IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
2064 is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
2065 is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
2066 is_assoc_atomic_block_global(latom, jatom) >= 1)) cycle
! Select the four pmax_set sub-blocks for this atom quartet. For each of them the atom
! order depends on the comparison of the two kinds; the second form adds 1, 2, 4 or 8 to
! swap_id, which is later handed to get_pmax_val.
! NOTE(review): the statement that resets swap_id and the CASE / ELSE lines are not
! visible in this excerpt.
2068 IF (do_p_screening)
THEN
2069 SELECT CASE (eval_type)
2072 kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind,
int_8)))
2073 IF (ikind >= kkind)
THEN
2074 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2075 map_atom_to_kind_atom(katom), &
2076 map_atom_to_kind_atom(iatom))
2078 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2079 map_atom_to_kind_atom(iatom), &
2080 map_atom_to_kind_atom(katom))
2081 swap_id = swap_id + 1
2083 kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind,
int_8)))
2084 IF (jkind >= lkind)
THEN
2085 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2086 map_atom_to_kind_atom(latom), &
2087 map_atom_to_kind_atom(jatom))
2089 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2090 map_atom_to_kind_atom(jatom), &
2091 map_atom_to_kind_atom(latom))
2092 swap_id = swap_id + 2
2094 kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind,
int_8)))
2095 IF (ikind >= lkind)
THEN
2096 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2097 map_atom_to_kind_atom(latom), &
2098 map_atom_to_kind_atom(iatom))
2100 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2101 map_atom_to_kind_atom(iatom), &
2102 map_atom_to_kind_atom(latom))
2103 swap_id = swap_id + 4
2105 kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind,
int_8)))
2106 IF (jkind >= kkind)
THEN
2107 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2108 map_atom_to_kind_atom(katom), &
2109 map_atom_to_kind_atom(jatom))
2111 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2112 map_atom_to_kind_atom(jatom), &
2113 map_atom_to_kind_atom(katom))
2114 swap_id = swap_id + 8
! Second group: the same four pointer selections appear again for another eval_type case.
2118 kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind,
int_8)))
2119 IF (ikind >= kkind)
THEN
2120 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2121 map_atom_to_kind_atom(katom), &
2122 map_atom_to_kind_atom(iatom))
2124 ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2125 map_atom_to_kind_atom(iatom), &
2126 map_atom_to_kind_atom(katom))
2127 swap_id = swap_id + 1
2129 kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind,
int_8)))
2130 IF (jkind >= lkind)
THEN
2131 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2132 map_atom_to_kind_atom(latom), &
2133 map_atom_to_kind_atom(jatom))
2135 ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2136 map_atom_to_kind_atom(jatom), &
2137 map_atom_to_kind_atom(latom))
2138 swap_id = swap_id + 2
2140 kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind,
int_8)))
2141 IF (ikind >= lkind)
THEN
2142 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2143 map_atom_to_kind_atom(latom), &
2144 map_atom_to_kind_atom(iatom))
2146 ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2147 map_atom_to_kind_atom(iatom), &
2148 map_atom_to_kind_atom(latom))
2149 swap_id = swap_id + 4
2151 kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind,
int_8)))
2152 IF (jkind >= kkind)
THEN
2153 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2154 map_atom_to_kind_atom(katom), &
2155 map_atom_to_kind_atom(jatom))
2157 ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2158 map_atom_to_kind_atom(jatom), &
2159 map_atom_to_kind_atom(katom))
2160 swap_id = swap_id + 8
! Set level: the same linear-in-squared-distance estimate, now per pair of sets.
2165 DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
2166 iset = set_list_ij(i_set_list_ij)%pair(1)
2167 jset = set_list_ij(i_set_list_ij)%pair(2)
2169 max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
2170 coeffs_set(jset, iset, jkind, ikind)%x(2)
2172 IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle
2173 DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
2174 kset = set_list_kl(i_set_list_kl)%pair(1)
2175 lset = set_list_kl(i_set_list_kl)%pair(2)
2177 max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
2178 coeffs_set(lset, kset, lkind, kkind)%x(2))
2180 IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) cycle
! With P screening the set-resolved maximum comes from get_pmax_val; the final test then
! uses max_val2 + pmax_entry.
2181 IF (do_p_screening)
THEN
2182 CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
2183 iset, jset, kset, lset, &
2184 pmax_entry, swap_id)
2186 pmax_entry = log_2 + pmax_entry
2191 max_val2 = max_val2 + pmax_entry
2192 IF (max_val2 < log10_eps_schwarz) cycle
! Accumulate the model cost of this set quartet; cost_model receives
! max_val2/log10_eps_schwarz as its ratio argument.
2193 SELECT CASE (eval_type)
2195 cost_tmp =
cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2196 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2197 max_val2/log10_eps_schwarz, &
2199 estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=
int_8)
2201 cost_tmp =
cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2202 npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2203 max_val2/log10_eps_schwarz, &
2204 p1_forces, p2_forces, p3_forces)
2205 estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=
int_8)
2212 END FUNCTION estimate_block_cost
! Fills blocks(1:nblock) with consecutive atom ranges of length block_size and gives
! blocks a cost obtained from estimate_block_cost evaluated with the same atom range in
! all four index positions.
! NOTE(review): part of the argument list, some declarations, the header of the second
! loop and the END DO / ELSE / END IF statements are not present in this excerpt.
2246 SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
2247 list_ij, list_kl, set_list_ij, set_list_kl, &
2249 coeffs_set, coeffs_kind, &
2250 is_assoc_atomic_block_global, do_periodic, &
2251 kind_of, basis_parameter, pmax_set, pmax_atom, &
2252 pmax_blocks, cell, &
2253 do_p_screening, map_atom_to_kind_atom, eval_type, &
2254 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
2257 INTEGER,
INTENT(IN) :: nkind
2259 INTEGER :: natom, block_size, nblock
2265 DIMENSION(:, :, :, :),
POINTER :: coeffs_set
2267 POINTER :: coeffs_kind
2268 INTEGER,
DIMENSION(:, :) :: is_assoc_atomic_block_global
2269 LOGICAL :: do_periodic
2270 INTEGER :: kind_of(*)
2272 TYPE(
hfx_p_kind),
DIMENSION(:),
POINTER :: pmax_set
2273 REAL(
dp),
DIMENSION(:, :),
POINTER :: pmax_atom
2274 REAL(
dp) :: pmax_blocks
2276 LOGICAL,
INTENT(IN) :: do_p_screening
2277 INTEGER,
DIMENSION(:),
POINTER :: map_atom_to_kind_atom
2278 INTEGER,
INTENT(IN) :: eval_type
2279 REAL(
dp) :: log10_eps_schwarz, log_2, &
2281 LOGICAL,
INTENT(IN) :: use_virial
2282 LOGICAL,
DIMENSION(natom, natom) :: atomic_pair_list
2284 INTEGER :: atom_block, i, iatom_block, iatom_end, &
2285 iatom_start, my_cpu_rank, ncpus
! Block b covers atoms (b - 1)*block_size + 1 .. b*block_size, with the upper end clipped
! to natom. For 0 <= atom_block < nblock the modulo leaves atom_block unchanged, so
! iatom_block equals atom_block + 1.
2287 DO atom_block = 0, nblock - 1
2288 iatom_block =
modulo(atom_block, nblock) + 1
2289 iatom_start = (iatom_block - 1)*block_size + 1
2290 iatom_end = min(iatom_block*block_size, natom)
2291 blocks(atom_block + 1)%istart = iatom_start
2292 blocks(atom_block + 1)%iend = iatom_end
2293 blocks(atom_block + 1)%cost = 0_int_8
2296 ncpus = para_env%num_pe
2297 my_cpu_rank = para_env%mepos
! Blocks whose index modulo ncpus differs from this rank get istart = 0 here.
! NOTE(review): presumably the remaining fields are zeroed as well and the results are
! combined across ranks afterwards; neither is visible in this excerpt - confirm.
2299 IF (
modulo(i, ncpus) /= my_cpu_rank)
THEN
2300 blocks(i)%istart = 0
! Cost of block i: the block's own atom range is passed for all four index ranges.
2304 iatom_start = blocks(i)%istart
2305 iatom_end = blocks(i)%iend
2306 blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
2307 iatom_start, iatom_end, iatom_start, iatom_end, &
2308 iatom_start, iatom_end, iatom_start, iatom_end, &
2310 coeffs_set, coeffs_kind, &
2311 is_assoc_atomic_block_global, do_periodic, &
2312 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
2314 do_p_screening, map_atom_to_kind_atom, eval_type, &
2315 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
2318 END SUBROUTINE init_blocks
! Declarations and body of a reporting routine whose SUBROUTINE statement is not part of
! this excerpt. Visible flow: every thread stores, for each of its bins, the estimated
! cost and a recorded time scaled by 10000 in a buffer; the buffers of all ranks are
! gathered; a table per bin, summary statistics and a table per rank are written to
! unit iw.
! NOTE(review): most loop headers, IF conditions, the zero-initialisation of the
! accumulators and the END statements are not present in this excerpt.
2334 INTEGER,
INTENT(IN) :: iw, n_threads, i_thread, eval_type
2336 INTEGER :: i, j, k, my_rank, nbins, nranks, &
2338 INTEGER(int_8) :: avg_bin, avg_rank, max_bin, max_rank, &
2339 min_bin, min_rank, sum_bin, sum_rank
2340 INTEGER(int_8),
ALLOCATABLE,
DIMENSION(:) :: buffer, buffer_in, buffer_out, summary
! The SAVE arrays keep their value between calls.
! NOTE(review): presumably SAVEd so that the threads of a rank operate on one shared
! object; the threading directives are not visible here - confirm.
2341 INTEGER(int_8),
ALLOCATABLE,
DIMENSION(:),
SAVE :: shm_cost_vector
2342 INTEGER,
ALLOCATABLE,
DIMENSION(:) :: bins_per_rank, rdispl, sort_idx
2343 INTEGER,
ALLOCATABLE,
DIMENSION(:),
SAVE :: shm_bins_per_rank, shm_displ
! Number of bins of this thread, taken from the distribution selected by eval_type.
2345 SELECT CASE (eval_type)
2347 nbins =
SIZE(x_data%distribution_energy)
2349 nbins =
SIZE(x_data%distribution_forces)
2353 ALLOCATE (shm_bins_per_rank(n_threads))
2354 ALLOCATE (shm_displ(n_threads + 1))
2358 shm_bins_per_rank(i_thread + 1) = nbins
! nbins is accumulated over the per-thread counts.
2362 nbins = nbins + shm_bins_per_rank(i)
2364 my_rank = para_env%mepos
2365 nranks = para_env%num_pe
2369 ALLOCATE (bins_per_rank(nranks))
2372 bins_per_rank(my_rank + 1) = nbins
2374 CALL para_env%sum(bins_per_rank)
2378 total_bins = total_bins + bins_per_rank(i)
! Two int_8 slots per bin: odd slot = cost, even slot = time*10000. Unset slots stay -1.
2381 ALLOCATE (shm_cost_vector(2*total_bins))
2382 shm_cost_vector = -1_int_8
! Per-thread displacements as a running sum of the per-thread bin counts.
2385 shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
2387 shm_displ(n_threads + 1) = nbins + 1
2391 SELECT CASE (eval_type)
2393 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2395 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
2396 shm_cost_vector(2*i) = int(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, kind=
int_8)
2399 DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2401 shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
2402 shm_cost_vector(2*i) = int(x_data%distribution_forces(j)%time_forces*10000.0_dp, kind=
int_8)
! Counts are doubled because two values per bin are gathered; rdispl is the matching
! running sum.
2408 ALLOCATE (rdispl(nranks))
2409 bins_per_rank(:) = bins_per_rank(:)*2
2412 rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
2415 ALLOCATE (buffer_in(2*nbins))
2416 ALLOCATE (buffer_out(2*total_bins))
2419 buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
2420 buffer_in(2*i) = shm_cost_vector(2*i)
2423 CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
! summary holds two slots per rank: odd = summed cost, even = summed time*10000.
2427 ALLOCATE (summary(2*nranks))
2430 WRITE (iw,
'( /, 1X, 79("-") )')
2431 WRITE (iw,
'( " -", 77X, "-" )')
2432 SELECT CASE (eval_type)
2434 WRITE (iw,
'( " -", 20X, A, 19X, "-" )')
' HFX LOAD BALANCE INFORMATION - ENERGY '
2436 WRITE (iw,
'( " -", 20X, A, 19X, "-" )')
' HFX LOAD BALANCE INFORMATION - FORCES '
2438 WRITE (iw,
'( " -", 77X, "-" )')
2439 WRITE (iw,
'( 1X, 79("-") )')
2441 WRITE (iw, fmt=
"(T3,A,T15,A,T35,A,T55,A)")
"MPI RANK",
"BIN #",
"EST cost",
"Processing time [s]"
2442 WRITE (iw,
'( 1X, 79("-"), / )')
! bins_per_rank was doubled above, hence the division by 2. Times are converted back to
! seconds by dividing by 10000.
2445 DO j = 1, bins_per_rank(i)/2
2447 WRITE (iw, fmt=
"(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
2448 i - 1, j, buffer_out(2*(k - 1) + 1), real(buffer_out(2*k),
dp)/10000.0_dp
2449 summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
2450 summary(2*i) = summary(2*i) + buffer_out(2*k)
! Statistics over the even slots (times) of all bins.
2456 min_bin = huge(min_bin)
2458 DO i = 1, total_bins
2459 sum_bin = sum_bin + buffer_out(2*i)
2460 max_bin = max(max_bin, buffer_out(2*i))
2461 min_bin = min(min_bin, buffer_out(2*i))
! Integer division: the average is truncated.
2463 avg_bin = sum_bin/total_bins
! The same statistics over the per-rank time sums.
2466 min_rank = huge(min_rank)
2469 sum_rank = sum_rank + summary(2*i)
2470 max_rank = max(max_rank, summary(2*i))
2471 min_rank = min(min_rank, summary(2*i))
2473 avg_rank = sum_rank/nranks
2475 WRITE (iw, fmt=
'(/,T3,A,/)')
"SUMMARY:"
2476 WRITE (iw, fmt=
"(T3,A,T35,F19.8)")
"Max bin", real(max_bin,
dp)/10000.0_dp
2477 WRITE (iw, fmt=
"(T3,A,T35,F19.8)")
"Min bin", real(min_bin,
dp)/10000.0_dp
2478 WRITE (iw, fmt=
"(T3,A,T35,F19.8)")
"Sum bin", real(sum_bin,
dp)/10000.0_dp
2479 WRITE (iw, fmt=
"(T3,A,T35,F19.8,/)")
"Avg bin", real(avg_bin,
dp)/10000.0_dp
2480 WRITE (iw, fmt=
"(T3,A,T35,F19.8)")
"Max rank", real(max_rank,
dp)/10000.0_dp
2481 WRITE (iw, fmt=
"(T3,A,T35,F19.8)")
"Min rank", real(min_rank,
dp)/10000.0_dp
2482 WRITE (iw, fmt=
"(T3,A,T35,F19.8)")
"Sum rank", real(sum_rank,
dp)/10000.0_dp
2483 WRITE (iw, fmt=
"(T3,A,T35,F19.8,/)")
"Avg rank", real(avg_rank,
dp)/10000.0_dp
! Per-rank table ordered by time: buffer receives the time sums, sort() reorders them
! and returns the original rank positions in sort_idx.
! NOTE(review): sort() is assumed to sort ascending, so the downward loop would list the
! slowest rank first - confirm.
2485 ALLOCATE (buffer(nranks))
2486 ALLOCATE (sort_idx(nranks))
2489 buffer(i) = summary(2*i)
2492 CALL sort(buffer, nranks, sort_idx)
2494 WRITE (iw, fmt=
"(T3,A,T35,A,T55,A,/)")
"MPI RANK",
"EST cost",
"Processing time [s]"
2495 DO i = nranks, 1, -1
2496 WRITE (iw, fmt=
"(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), real(buffer(i),
dp)/10000.0_dp
2499 DEALLOCATE (summary, buffer, sort_idx)
2503 DEALLOCATE (buffer_in, buffer_out, rdispl)
2505 CALL para_env%sync()
2507 DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
! **************************************************************************************************
!> \brief Reads one entry from each of the four tables ptr_p_1..ptr_p_4 and reduces the four
!>        entries to the single value pmax_val. swap_id is decoded bit by bit:
!>        bits 0..3 choose, per table, which of the two index orders is used for the lookup,
!>        bit 4 chooses how the four entries are combined.
!> \param ptr_p_1 table read at (kset, iset), or at (iset, kset) when bit 0 of swap_id is set
!> \param ptr_p_2 table read at (lset, jset), or at (jset, lset) when bit 1 of swap_id is set
!> \param ptr_p_3 table read at (lset, iset), or at (iset, lset) when bit 2 of swap_id is set
!> \param ptr_p_4 table read at (kset, jset), or at (jset, kset) when bit 3 of swap_id is set
!> \param iset index used in the lookups of ptr_p_1 and ptr_p_3
!> \param jset index used in the lookups of ptr_p_2 and ptr_p_4
!> \param kset index used in the lookups of ptr_p_1 and ptr_p_4
!> \param lset index used in the lookups of ptr_p_2 and ptr_p_3
!> \param pmax_val for swap_id in 0..15: MAX(p1, p2, p3, p4);
!>                 for swap_id in 16..31: MAX(p1 + p2, p3 + p4);
!>                 not assigned for any other swap_id
!> \param swap_id selector in the range 0..31
!> \note NOTE(review): the numbering of the 32 variants as swap_id = 0..31 is assumed from their
!>       order in the reviewed listing, where the CASE labels were not visible - confirm against
!>       the code that computes swap_id.
! **************************************************************************************************
PURE SUBROUTINE get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, iset, jset, kset, lset, pmax_val, swap_id)

   REAL(dp), DIMENSION(:, :), POINTER                 :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
   INTEGER, INTENT(IN)                                :: iset, jset, kset, lset
   REAL(dp), INTENT(OUT)                              :: pmax_val
   INTEGER, INTENT(IN)                                :: swap_id

   REAL(dp)                                           :: pmax_1, pmax_2, pmax_3, pmax_4

   ! Only the 32 known selector values do any work; anything else leaves pmax_val unassigned.
   SELECT CASE (swap_id)
   CASE (0:31)
      ! IF/ELSE (rather than MERGE) so that only the selected element is ever read:
      ! the transposed index pair need not be a valid position in the table.
      IF (BTEST(swap_id, 0)) THEN
         pmax_1 = ptr_p_1(iset, kset)
      ELSE
         pmax_1 = ptr_p_1(kset, iset)
      END IF

      IF (BTEST(swap_id, 1)) THEN
         pmax_2 = ptr_p_2(jset, lset)
      ELSE
         pmax_2 = ptr_p_2(lset, jset)
      END IF

      IF (BTEST(swap_id, 2)) THEN
         pmax_3 = ptr_p_3(iset, lset)
      ELSE
         pmax_3 = ptr_p_3(lset, iset)
      END IF

      IF (BTEST(swap_id, 3)) THEN
         pmax_4 = ptr_p_4(jset, kset)
      ELSE
         pmax_4 = ptr_p_4(kset, jset)
      END IF

      ! Bit 4 switches from the plain maximum to the maximum of the pairwise sums.
      IF (BTEST(swap_id, 4)) THEN
         pmax_val = MAX(pmax_1 + pmax_2, pmax_3 + pmax_4)
      ELSE
         pmax_val = MAX(pmax_1, pmax_2, pmax_3, pmax_4)
      END IF
   END SELECT

END SUBROUTINE get_pmax_val
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always returns a non-negative number for a positive modulus. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
Handles all functions related to the CELL.
Utility routines to open and close files. Tracking of preconnections.
subroutine, public open_file(file_name, file_status, file_form, file_action, file_position, file_pad, unit_number, debug, skip_get_unit_number, file_access)
Opens the requested file using a free unit number.
subroutine, public close_file(unit_number, file_status, keep_preconnection)
Close an open file given by its logical unit number. Optionally, keep the file and unit preconnected.
Routines for optimizing load balance between processes in HFX calculations.
real(kind=dp), dimension(12), parameter, public p1_energy
real(kind=dp), dimension(2), parameter, public p3_energy
real(kind=dp), dimension(12), parameter, public p2_energy
subroutine, public collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, eval_type)
...
subroutine, public hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, coeffs_set, coeffs_kind, is_assoc_atomic_block_global, do_periodic, load_balance_parameter, kind_of, basis_parameter, pmax_set, pmax_atom, i_thread, n_threads, cell, do_p_screening, map_atom_to_kind_atom, nkind, eval_type, pmax_block, use_virial)
Distributes the computation of eri's to all available processes.
integer(kind=int_8) function, public cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3)
estimates the cost of a set quartet with info available at load balance time i.e. without much info o...
subroutine, public hfx_update_load_balance(x_data, para_env, load_balance_parameter, i_thread, n_threads, eval_type)
Cheap way of redistributing the eri's.
Routines for optimizing load balance between processes in HFX calculations.
subroutine, public build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, blocks)
...
subroutine, public build_pair_list(natom, list, set_list, i_start, i_end, j_start, j_end, kind_of, basis_parameter, particle_set, do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
...
Types and set/get functions for HFX.
subroutine, public hfx_set_distr_energy(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the energy
subroutine, public hfx_set_distr_forces(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the forces
Defines the basic variable types.
integer, parameter, public int_8
integer, parameter, public dp
Interface to the message passing library MPI.
Parallel (pseudo)random number generator (RNG) for multiple streams and substreams of random numbers.
integer, parameter, public uniform
Define the data structure for the particle information.
All kinds of helpful little routines.
Type defining parameters related to the simulation cell.
stores some data used in construction of Kohn-Sham matrix
stores all the information relevant to an MPI environment