d2/d0b/hfx__load__balance__methods_8F_source.html

!--------------------------------------------------------------------------------------------------!

!   CP2K: A general program to perform molecular dynamics simulations                              !

!   Copyright 2000-2026 CP2K developers group <https://cp2k.org>                                   !

!                                                                                                  !

!   SPDX-License-Identifier: GPL-2.0-or-later                                                      !

!--------------------------------------------------------------------------------------------------!


! **************************************************************************************************

!> \brief Routines for optimizing load balance between processes in HFX calculations

!> \par History

!>      04.2008 created [Manuel Guidon]

!> \author Manuel Guidon

! **************************************************************************************************

MODULE hfx_load_balance_methods

   USE cell_types, ONLY: cell_type

   USE cp_files, ONLY: close_file, &

                       open_file

   USE message_passing, ONLY: mp_para_env_type

   USE hfx_pair_list_methods, ONLY: build_atomic_pair_list, &

                                    build_pair_list

   USE hfx_types, ONLY: &

      hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, &

      hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, &

      pair_list_type, pair_set_list_type

   USE input_constants, ONLY: hfx_do_eval_energy, &

                              hfx_do_eval_forces

   USE kinds, ONLY: dp, &

                    int_8

   USE message_passing, ONLY: mp_waitall, mp_request_type

   USE parallel_rng_types, ONLY: uniform, &

                                 rng_stream_type

   USE particle_types, ONLY: particle_type

   USE util, ONLY: sort

#include "./base/base_uses.f90"


   IMPLICIT NONE

   PRIVATE


   PUBLIC :: hfx_load_balance, &

             hfx_update_load_balance, &

             collect_load_balance_info, cost_model, p1_energy, p2_energy, p3_energy


   ! Name of this module as a character constant; explicitly PRIVATE.
   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'


   ! Fixed double-precision coefficient tables.
   ! The three *_energy arrays appear in the module's PUBLIC statement; the three
   ! *_forces arrays do not and therefore stay private under the module-wide
   ! PRIVATE default.
   ! NOTE(review): presumably these are fitted parameters consumed by cost_model,
   ! with one set for energy and one for force evaluation -- confirm against
   ! cost_model, which is not visible in this part of the file.

   ! 12-element coefficient set "p1", energy variant (public).
   REAL(kind=dp), PARAMETER :: p1_energy(12) = [2.9461408209700424_dp, 1.0624718662999657_dp, &

                                                -1.91570128356921242e-002_dp, -1.6668495454436603_dp, &

                                                1.7512639006523709_dp, -9.76074323945336081e-002_dp, &

                                                2.6230786127311889_dp, -0.31870737623014189_dp, &

                                                7.9588203912690973_dp, 1.8331423413134813_dp, &

                                                -0.15427618665346299_dp, 0.19749436090711650_dp]


   ! 12-element coefficient set "p2", energy variant (public).
   REAL(kind=dp), PARAMETER :: p2_energy(12) = [2.3104682960662593_dp, 1.8744052737304417_dp, &

                                                -9.36564055598656797e-002_dp, 0.64284973765086939_dp, &

                                                1.0137565430060556_dp, -6.80088178288954567e-003_dp, &

                                                1.1692629207374552_dp, -2.6314710080507573_dp, &

                                                19.237814781880786_dp, 1.0505934173661349_dp, &

                                                0.80382371955699250_dp, 0.49903401991818103_dp]


   ! 2-element coefficient set "p3", energy variant (public).
   REAL(kind=dp), PARAMETER :: p3_energy(2) = [7.82336287670072350e-002_dp, 0.38073304105744837_dp]

   ! 12-element coefficient set "p1", forces variant (module-private).
   REAL(kind=dp), PARAMETER :: p1_forces(12) = [2.5746279948798874_dp, 1.3420575378609276_dp, &

                                                -9.41673106447732111e-002_dp, 0.94568006899317825_dp, &

                                                -1.4511897117448544_dp, 0.59178934677316952_dp, &

                                                2.7291149361757236_dp, -0.50555512044800210_dp, &

                                                8.3508180969609871_dp, 1.6829982496141809_dp, &

                                                -0.74895370472152600_dp, 0.43801726744197500_dp]

   ! 12-element coefficient set "p2", forces variant (module-private).
   REAL(kind=dp), PARAMETER :: p2_forces(12) = [2.6398568961569020_dp, 2.3024918834564101_dp, &

                                                5.33216585432061581e-003_dp, 0.45572145697283628_dp, &

                                                1.8119743851500618_dp, -0.12533918548421166_dp, &

                                                -1.4040312084552751_dp, -4.5331650463917859_dp, &

                                                12.593431549069477_dp, 1.1311978374487595_dp, &

                                                1.4245996087624646_dp, 1.1425350529853495_dp]

   ! 2-element coefficient set "p3", forces variant (module-private).
   REAL(kind=dp), PARAMETER :: p3_forces(2) = [0.12051930516830946_dp, 1.3828051586144336_dp]


!***


CONTAINS


! **************************************************************************************************

!> \brief Distributes the computation of eri's to all available processes.

!> \param x_data Object that stores the indices array

!> \param eps_schwarz screening parameter

!> \param particle_set Array of particles; its size is used as the number of atoms

!> \param max_set Maximum number of sets to be considered

!> \param para_env para_env

!> \param coeffs_set screening functions

!> \param coeffs_kind screening functions

!> \param is_assoc_atomic_block_global KS-matrix sparsity

!> \param do_periodic flag for periodicity

!> \param load_balance_parameter Parameters for Monte-Carlo routines

!> \param kind_of helper array for mapping

!> \param basis_parameter Basis set parameters

!> \param pmax_set Initial screening matrix

!> \param pmax_atom ...

!> \param i_thread Index of the calling thread; combined with para_env%mepos and n_threads to form the global worker id

!> \param n_threads Total Number of Threads

!> \param cell cell

!> \param do_p_screening Flag for initial p screening

!> \param map_atom_to_kind_atom ...

!> \param nkind ...

!> \param eval_type ...

!> \param pmax_block ...

!> \param use_virial ...

!> \par History

!>      06.2007 created [Manuel Guidon]

!>      08.2007 new parallel scheme [Manuel Guidon]

!>      09.2007 new 'modulo' parallel scheme and Monte Carlo step [Manuel Guidon]

!>      11.2007 parallelize load balance on box_idx1 [Manuel Guidon]

!>      02.2009 completely refactored [Manuel Guidon]

!> \author Manuel Guidon

!> \note

!>      The optimization is done via a binning procedure followed by simple

!>      Monte Carlo procedure:

!>      In a first step the total amount of integrals in the system is calculated,

!>      taking into account the sparsity of the KS-matrix, the screening based

!>      on near/farfield approximations and if desired the screening on an initial

!>      density matrix.

!>      In a second step, bins are generated that contain approximately the same number

!>      of integrals, and a cost for these bins is estimated (currently the number of integrals)

!>      In a third step, a Monte Carlo procedure optimizes the assignment

!>      of the different loads to each process

!>      At the end each process owns a unique array of *atomic* indices-ranges

!>      that are used to decide whether a process has to calculate a certain

!>      bunch of integrals or not

! **************************************************************************************************


   SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &

                               coeffs_set, coeffs_kind, &

                               is_assoc_atomic_block_global, do_periodic, &

                               load_balance_parameter, kind_of, basis_parameter, pmax_set, &

                               pmax_atom, i_thread, n_threads, cell, &

                               do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &

                               pmax_block, use_virial)

      TYPE(hfx_type), POINTER                            :: x_data

      REAL(dp), INTENT(IN)                               :: eps_schwarz

      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set

      INTEGER, INTENT(IN)                                :: max_set

      TYPE(mp_para_env_type), POINTER                    :: para_env

      TYPE(hfx_screen_coeff_type), &

         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set

      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &

         POINTER                                         :: coeffs_kind

      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global

      LOGICAL                                            :: do_periodic

      TYPE(hfx_load_balance_type), POINTER               :: load_balance_parameter

      INTEGER                                            :: kind_of(*)

      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter

      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set

      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom

      INTEGER, INTENT(IN)                                :: i_thread, n_threads

      TYPE(cell_type), POINTER                           :: cell

      LOGICAL, INTENT(IN)                                :: do_p_screening

      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom

      INTEGER, INTENT(IN)                                :: nkind, eval_type

      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block

      LOGICAL, INTENT(IN)                                :: use_virial


      CHARACTER(LEN=*), PARAMETER                        :: routinen = 'hfx_load_balance'


      CHARACTER(LEN=512)                                 :: error_msg

      INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &

                 handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &

                 jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &

                 latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &

                 new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &

                 objective_block_size, objective_nblocks, source, total_blocks

      TYPE(mp_request_type), DIMENSION(2) :: req

      INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &

                        distribution_counter_end, distribution_counter_start, global_quartet_counter, &

                        local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost

      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer_in, buffer_out

      INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &

                                                            sendbuffer, swapbuffer

      INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix

      INTEGER(int_8), SAVE                               :: shm_global_quartet_counter, &

                                                            shm_local_quartet_counter

      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount, rdispl, tmp_index, tmp_pos, &

                                                            to_be_sorted

      INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector

      INTEGER, SAVE                                      :: shm_nblocks

      LOGICAL                                            :: changed, last_bin_needs_to_be_filled, &

                                                            optimized

      LOGICAL, DIMENSION(:, :), POINTER, SAVE            :: atomic_pair_list

      REAL(dp)                                           :: coeffs_kind_max0, log10_eps_schwarz, &

                                                            log_2, pmax_blocks

      TYPE(hfx_block_range_type), DIMENSION(:), POINTER  :: blocks_guess, tmp_blocks, tmp_blocks2

      TYPE(hfx_block_range_type), DIMENSION(:), &

         POINTER, SAVE                                   :: shm_blocks

      TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist

      TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &

         SAVE                                            :: full_dist

      TYPE(pair_list_type)                               :: list_ij, list_kl

      TYPE(pair_set_list_type), ALLOCATABLE, &

         DIMENSION(:)                                    :: set_list_ij, set_list_kl


!$OMP BARRIER

!$OMP MASTER

      CALL timeset(routinen, handle)

!$OMP END MASTER

!$OMP BARRIER


      log10_eps_schwarz = log10(eps_schwarz)

      log_2 = log10(2.0_dp)

      coeffs_kind_max0 = maxval(coeffs_kind(:, :)%x(2))

      ncpu = para_env%num_pe

      n_processes = ncpu*n_threads

      natom = SIZE(particle_set)


      block_size = load_balance_parameter%block_size

      ALLOCATE (set_list_ij((max_set*block_size)**2))

      ALLOCATE (set_list_kl((max_set*block_size)**2))


      IF (.NOT. load_balance_parameter%blocks_initialized) THEN

!$OMP BARRIER

!$OMP MASTER

         CALL timeset(routinen//"_range", handle_range)


         nblocks = max((natom + block_size - 1)/block_size, 1)

         ALLOCATE (blocks_guess(nblocks))

         ALLOCATE (tmp_blocks(natom))

         ALLOCATE (tmp_blocks2(natom))


         pmax_blocks = 0.0_dp

         SELECT CASE (eval_type)

         CASE (hfx_do_eval_energy)

            atomic_pair_list => x_data%atomic_pair_list

         CASE (hfx_do_eval_forces)

            atomic_pair_list => x_data%atomic_pair_list_forces

         END SELECT

         atomic_pair_list = .true.

         CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &

                          list_ij, list_kl, set_list_ij, set_list_kl, &

                          particle_set, &

                          coeffs_set, coeffs_kind, &

                          is_assoc_atomic_block_global, do_periodic, &

                          kind_of, basis_parameter, pmax_set, pmax_atom, &

                          pmax_blocks, cell, &

                          do_p_screening, map_atom_to_kind_atom, eval_type, &

                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)


         total_block_self_cost = 0


         DO i = 1, nblocks

            total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost

         END DO


         CALL para_env%sum(total_block_self_cost)


         objective_block_size = load_balance_parameter%block_size

         objective_nblocks = max((natom + objective_block_size - 1)/objective_block_size, 1)


         self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)


         DO i = 1, nblocks

            tmp_blocks2(i) = blocks_guess(i)

         END DO


         optimized = .false.

         i = 0

         DO WHILE (.NOT. optimized)

            i = i + 1

            current_block_id = 0

            changed = .false.

            DO atom_block = 1, nblocks

               current_block_id = current_block_id + 1

               iatom_start = tmp_blocks2(atom_block)%istart

               iatom_end = tmp_blocks2(atom_block)%iend

               IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN

                  changed = .true.

                  new_iatom_start = iatom_start

                  new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1

                  new_jatom_start = new_iatom_end + 1

                  new_jatom_end = iatom_end

                  tmp_blocks(current_block_id)%istart = new_iatom_start

                  tmp_blocks(current_block_id)%iend = new_iatom_end

                  tmp_blocks(current_block_id)%cost = estimate_block_cost( &

                                                      natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &

                                                      new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &

                                                      new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &

                                                      particle_set, &

                                                      coeffs_set, coeffs_kind, &

                                                      is_assoc_atomic_block_global, do_periodic, &

                                                      kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                                      cell, &

                                                      do_p_screening, map_atom_to_kind_atom, eval_type, &

                                                      log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

                  current_block_id = current_block_id + 1

                  tmp_blocks(current_block_id)%istart = new_jatom_start

                  tmp_blocks(current_block_id)%iend = new_jatom_end

                  tmp_blocks(current_block_id)%cost = estimate_block_cost( &

                                                      natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &

                                                      new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &

                                                      new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &

                                                      particle_set, &

                                                      coeffs_set, coeffs_kind, &

                                                      is_assoc_atomic_block_global, do_periodic, &

                                                      kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                                      cell, &

                                                      do_p_screening, map_atom_to_kind_atom, eval_type, &

                                                      log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

               ELSE

                  tmp_blocks(current_block_id)%istart = iatom_start

                  tmp_blocks(current_block_id)%iend = iatom_end

                  tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost

               END IF

            END DO

            IF (.NOT. changed) optimized = .true.

            IF (i > 20) optimized = .true.

            nblocks = current_block_id

            DO atom_block = 1, nblocks

               tmp_blocks2(atom_block) = tmp_blocks(atom_block)

            END DO

         END DO


         DEALLOCATE (tmp_blocks2)


         ! ** count number of non empty blocks on each node

         non_empty_blocks = 0

         DO atom_block = 1, nblocks

            IF (tmp_blocks(atom_block)%istart == 0) cycle

            non_empty_blocks = non_empty_blocks + 1

         END DO


         ALLOCATE (rcount(ncpu))

         rcount = 0

         rcount(para_env%mepos + 1) = non_empty_blocks

         CALL para_env%sum(rcount)


         ! ** sum all non_empty_blocks

         total_blocks = 0

         DO i = 1, ncpu

            total_blocks = total_blocks + rcount(i)

         END DO


         ! ** calculate offsets

         ALLOCATE (rdispl(ncpu))

         rcount(:) = rcount(:)*3

         rdispl(1) = 0

         DO i = 2, ncpu

            rdispl(i) = rdispl(i - 1) + rcount(i - 1)

         END DO


         ALLOCATE (buffer_in(3*non_empty_blocks))


         non_empty_blocks = 0

         DO atom_block = 1, nblocks

            IF (tmp_blocks(atom_block)%istart == 0) cycle

            buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart

            buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend

            buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost

            non_empty_blocks = non_empty_blocks + 1

         END DO


         nblocks = total_blocks


         ALLOCATE (tmp_blocks2(nblocks))


         ALLOCATE (buffer_out(3*nblocks))


         ! ** Gather all three arrays

         CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)


         DO i = 1, nblocks

            tmp_blocks2(i)%istart = int(buffer_out((i - 1)*3 + 1))

            tmp_blocks2(i)%iend = int(buffer_out((i - 1)*3 + 2))

            tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)

         END DO


         ! ** Now we sort the blocks

         ALLOCATE (to_be_sorted(nblocks))

         ALLOCATE (tmp_index(nblocks))


         DO atom_block = 1, nblocks

            to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart

         END DO


         CALL sort(to_be_sorted, nblocks, tmp_index)


         ALLOCATE (x_data%blocks(nblocks))


         DO atom_block = 1, nblocks

            x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))

         END DO


         shm_blocks => x_data%blocks

         shm_nblocks = nblocks


         ! ** Set nblocks in structure

         load_balance_parameter%nblocks = nblocks


         DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)


         DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)


         load_balance_parameter%blocks_initialized = .true.


         x_data%blocks = shm_blocks

         load_balance_parameter%nblocks = shm_nblocks

         load_balance_parameter%blocks_initialized = .true.


         ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))

         x_data%pmax_block = 0.0_dp

         pmax_block => x_data%pmax_block

         CALL timestop(handle_range)

!$OMP END MASTER

!$OMP BARRIER


         IF (.NOT. load_balance_parameter%blocks_initialized) THEN

            ALLOCATE (x_data%blocks(shm_nblocks))

            x_data%blocks = shm_blocks

            load_balance_parameter%nblocks = shm_nblocks

            load_balance_parameter%blocks_initialized = .true.

         END IF

         !! ** precalculate maximum density matrix elements in blocks

!$OMP BARRIER

      END IF


!$OMP BARRIER

!$OMP MASTER

      pmax_block => x_data%pmax_block

      pmax_block = 0.0_dp

      IF (do_p_screening) THEN

         DO iatom_block = 1, shm_nblocks

            iatom_start = x_data%blocks(iatom_block)%istart

            iatom_end = x_data%blocks(iatom_block)%iend

            DO jatom_block = 1, shm_nblocks

               jatom_start = x_data%blocks(jatom_block)%istart

               jatom_end = x_data%blocks(jatom_block)%iend

               pmax_block(iatom_block, jatom_block) = maxval(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))

            END DO

         END DO

      END IF


      SELECT CASE (eval_type)

      CASE (hfx_do_eval_energy)

         atomic_pair_list => x_data%atomic_pair_list

      CASE (hfx_do_eval_forces)

         atomic_pair_list => x_data%atomic_pair_list_forces

      END SELECT

      CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &

                                  do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &

                                  x_data%blocks)


!$OMP END MASTER

!$OMP BARRIER


      !! If there is only 1 cpu skip the binning

      IF (n_processes == 1) THEN

         ALLOCATE (tmp_dist(1))

         tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)

         tmp_dist(1)%istart = 0_int_8

         ptr_to_tmp_dist => tmp_dist(:)

         SELECT CASE (eval_type)

         CASE (hfx_do_eval_energy)

            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)

         CASE (hfx_do_eval_forces)

            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)

         END SELECT

         DEALLOCATE (tmp_dist)

      ELSE

         !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)

!$OMP BARRIER

!$OMP MASTER

         CALL timeset(routinen//"_count", handle_inner)

!$OMP END MASTER

!$OMP BARRIER


         cost_per_core = 0_int_8

         my_process_id = para_env%mepos*n_threads + i_thread

         nblocks = load_balance_parameter%nblocks


         DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes


            latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1

            tmp_block = atom_block/nblocks

            katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

            IF (latom_block < katom_block) cycle

            tmp_block = tmp_block/nblocks

            jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

            tmp_block = tmp_block/nblocks

            iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

            IF (jatom_block < iatom_block) cycle


            iatom_start = x_data%blocks(iatom_block)%istart

            iatom_end = x_data%blocks(iatom_block)%iend

            jatom_start = x_data%blocks(jatom_block)%istart

            jatom_end = x_data%blocks(jatom_block)%iend

            katom_start = x_data%blocks(katom_block)%istart

            katom_end = x_data%blocks(katom_block)%iend

            latom_start = x_data%blocks(latom_block)%istart

            latom_end = x_data%blocks(latom_block)%iend


            SELECT CASE (eval_type)

            CASE (hfx_do_eval_energy)

               pmax_blocks = max(pmax_block(katom_block, iatom_block), &

                                 pmax_block(latom_block, jatom_block), &

                                 pmax_block(latom_block, iatom_block), &

                                 pmax_block(katom_block, jatom_block))

            CASE (hfx_do_eval_forces)

               pmax_blocks = max(pmax_block(katom_block, iatom_block) + &

                                 pmax_block(latom_block, jatom_block), &

                                 pmax_block(latom_block, iatom_block) + &

                                 pmax_block(katom_block, jatom_block))

            END SELECT


            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle


            cost_per_core = cost_per_core &

                            + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &

                                                  iatom_start, iatom_end, jatom_start, jatom_end, &

                                                  katom_start, katom_end, latom_start, latom_end, &

                                                  particle_set, &

                                                  coeffs_set, coeffs_kind, &

                                                  is_assoc_atomic_block_global, do_periodic, &

                                                  kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                                  cell, &

                                                  do_p_screening, map_atom_to_kind_atom, eval_type, &

                                                  log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)


         END DO ! atom_block


         nbins = load_balance_parameter%nbins

         cost_per_bin = (cost_per_core + nbins - 1)/(nbins)


!$OMP BARRIER

!$OMP MASTER

         CALL timestop(handle_inner)

!$OMP END MASTER

!$OMP BARRIER


! new load balancing test

         IF (.false.) THEN

            CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &

                                            natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &

                                            particle_set, &

                                            coeffs_set, coeffs_kind, &

                                            is_assoc_atomic_block_global, do_periodic, &

                                            kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                            cell, x_data, para_env, pmax_block, &

                                            do_p_screening, map_atom_to_kind_atom, eval_type, &

                                            log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

         END IF


!$OMP BARRIER

!$OMP MASTER

         CALL timeset(routinen//"_bin", handle_inner)

!$OMP END MASTER

!$OMP BARRIER


         ALLOCATE (binned_dist(nbins))

         binned_dist(:)%istart = -1_int_8

         binned_dist(:)%number_of_atom_quartets = 0_int_8

         binned_dist(:)%cost = 0_int_8

         binned_dist(:)%time_first_scf = 0.0_dp

         binned_dist(:)%time_other_scf = 0.0_dp

         binned_dist(:)%time_forces = 0.0_dp


         current_cost = 0

         mepos = 1

         distribution_counter_start = 1

         distribution_counter_end = 0

         ibin = 1


         global_quartet_counter = 0

         local_quartet_counter = 0

         last_bin_needs_to_be_filled = .false.

         DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes

            latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1

            tmp_block = atom_block/nblocks

            katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

            IF (latom_block < katom_block) cycle

            tmp_block = tmp_block/nblocks

            jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

            tmp_block = tmp_block/nblocks

            iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

            IF (jatom_block < iatom_block) cycle


            distribution_counter_end = distribution_counter_end + 1

            global_quartet_counter = global_quartet_counter + 1

            last_bin_needs_to_be_filled = .true.


            IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block


            iatom_start = x_data%blocks(iatom_block)%istart

            iatom_end = x_data%blocks(iatom_block)%iend

            jatom_start = x_data%blocks(jatom_block)%istart

            jatom_end = x_data%blocks(jatom_block)%iend

            katom_start = x_data%blocks(katom_block)%istart

            katom_end = x_data%blocks(katom_block)%iend

            latom_start = x_data%blocks(latom_block)%istart

            latom_end = x_data%blocks(latom_block)%iend


            SELECT CASE (eval_type)

            CASE (hfx_do_eval_energy)

               pmax_blocks = max(pmax_block(katom_block, iatom_block), &

                                 pmax_block(latom_block, jatom_block), &

                                 pmax_block(latom_block, iatom_block), &

                                 pmax_block(katom_block, jatom_block))

            CASE (hfx_do_eval_forces)

               pmax_blocks = max(pmax_block(katom_block, iatom_block) + &

                                 pmax_block(latom_block, jatom_block), &

                                 pmax_block(latom_block, iatom_block) + &

                                 pmax_block(katom_block, jatom_block))

            END SELECT


            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle


            current_cost = current_cost &

                           + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &

                                                 iatom_start, iatom_end, jatom_start, jatom_end, &

                                                 katom_start, katom_end, latom_start, latom_end, &

                                                 particle_set, &

                                                 coeffs_set, coeffs_kind, &

                                                 is_assoc_atomic_block_global, do_periodic, &

                                                 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                                 cell, &

                                                 do_p_screening, map_atom_to_kind_atom, eval_type, &

                                                 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)


            IF (current_cost >= cost_per_bin) THEN

               IF (ibin == nbins) THEN

                  binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &

                                                              distribution_counter_end - distribution_counter_start + 1

               ELSE

                  binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1

               END IF

               binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost

               ibin = min(ibin + 1, nbins)

               distribution_counter_start = distribution_counter_end + 1

               current_cost = 0

               last_bin_needs_to_be_filled = .false.

            END IF

         END DO


!$OMP BARRIER

!$OMP MASTER

         CALL timestop(handle_inner)

         CALL timeset(routinen//"_dist", handle_inner)

!$OMP END MASTER

!$OMP BARRIER

         !! Fill the last bin if necessary

         IF (last_bin_needs_to_be_filled) THEN

            binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost

            IF (ibin == nbins) THEN

               binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &

                                                           distribution_counter_end - distribution_counter_start + 1

            ELSE

               binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1

            END IF

         END IF


         !! Sanity-Check

         DO ibin = 1, nbins

            local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets

         END DO

!$OMP BARRIER

!$OMP MASTER

         shm_local_quartet_counter = 0

         shm_global_quartet_counter = 0

!$OMP END MASTER

!$OMP BARRIER

!$OMP ATOMIC

         shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter

!$OMP ATOMIC

         shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter


!$OMP BARRIER

!$OMP MASTER

         CALL para_env%sum(shm_local_quartet_counter)

         CALL para_env%sum(shm_global_quartet_counter)

         IF (para_env%is_source()) THEN

            IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN

               WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &

                  "Number of local quartets (", shm_local_quartet_counter, &

                  ") and number of global quartets (", shm_global_quartet_counter, &

                  ") are different. Please send in a bug report."

               cpabort(error_msg)

            END IF

         END IF

!$OMP END MASTER


!$OMP BARRIER

!$OMP MASTER

         ALLOCATE (cost_matrix(ncpu*nbins*n_threads))

         cost_matrix = 0

!$OMP END MASTER

!$OMP BARRIER

         icpu = para_env%mepos + 1

         DO i = 1, nbins

            cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost

         END DO

         mepos = para_env%mepos

!$OMP BARRIER


!$OMP MASTER

         ! sync before/after ring of isendrecv

         CALL para_env%sync()


         ALLOCATE (sendbuffer(nbins*n_threads))

         ALLOCATE (recbuffer(nbins*n_threads))


         sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)


         dest = modulo(mepos + 1, ncpu)

         source = modulo(mepos - 1, ncpu)

         DO icpu = 0, ncpu - 1

            IF (icpu /= ncpu - 1) THEN

               CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &

                                       req(1), req(2), 13)

            END IF

            data_from = modulo(mepos - icpu, ncpu)

            cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer

            IF (icpu /= ncpu - 1) THEN

               CALL mp_waitall(req)

            END IF

            swapbuffer => sendbuffer

            sendbuffer => recbuffer

            recbuffer => swapbuffer

         END DO

         DEALLOCATE (recbuffer, sendbuffer)

!$OMP END MASTER

!$OMP BARRIER


!$OMP BARRIER

!$OMP MASTER

         CALL timestop(handle_inner)

         CALL timeset(routinen//"_opt", handle_inner)

!$OMP END MASTER

!$OMP BARRIER


         !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process

!$OMP BARRIER

         ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))

         local_cost_matrix = cost_matrix

!$OMP MASTER

         ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))


         CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &

                                    shm_distribution_vector, x_data%load_balance_parameter%do_randomize)


         CALL timestop(handle_inner)

         CALL timeset(routinen//"_redist", handle_inner)

         !! Collect local data to global array

         ALLOCATE (full_dist(ncpu*n_threads, nbins))


         full_dist(:, :)%istart = 0_int_8

         full_dist(:, :)%number_of_atom_quartets = 0_int_8

         full_dist(:, :)%cost = 0_int_8

         full_dist(:, :)%time_first_scf = 0.0_dp

         full_dist(:, :)%time_other_scf = 0.0_dp

         full_dist(:, :)%time_forces = 0.0_dp

!$OMP END MASTER

!$OMP BARRIER

         mepos = para_env%mepos + 1

         full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)


!$OMP BARRIER

!$OMP MASTER

         ALLOCATE (sendbuffer(3*nbins*n_threads))

         ALLOCATE (recbuffer(3*nbins*n_threads))

         mepos = para_env%mepos

         DO j = 1, n_threads

            DO i = 1, nbins

               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart

               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets

               sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost

            END DO

         END DO


         ! sync before/after ring of isendrecv

         CALL para_env%sync()

         dest = modulo(mepos + 1, ncpu)

         source = modulo(mepos - 1, ncpu)

         DO icpu = 0, ncpu - 1

            IF (icpu /= ncpu - 1) THEN

               CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &

                                       req(1), req(2), 13)

            END IF

            data_from = modulo(mepos - icpu, ncpu)

            DO j = 1, n_threads

               DO i = 1, nbins

                  full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)

                  full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)

                  full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)

               END DO

            END DO


            IF (icpu /= ncpu - 1) THEN

               CALL mp_waitall(req)

            END IF

            swapbuffer => sendbuffer

            sendbuffer => recbuffer

            recbuffer => swapbuffer

         END DO

         DEALLOCATE (recbuffer, sendbuffer)


         ! sync before/after ring of isendrecv

         CALL para_env%sync()

!$OMP END MASTER

!$OMP BARRIER

         !! reorder the distribution according to the distribution vector

         ALLOCATE (tmp_pos(ncpu*n_threads))

         tmp_pos = 1

         ALLOCATE (tmp_dist(nbins*ncpu*n_threads))


         tmp_dist(:)%istart = 0_int_8

         tmp_dist(:)%number_of_atom_quartets = 0_int_8

         tmp_dist(:)%cost = 0_int_8

         tmp_dist(:)%time_first_scf = 0.0_dp

         tmp_dist(:)%time_other_scf = 0.0_dp

         tmp_dist(:)%time_forces = 0.0_dp


         DO icpu = 1, n_processes

            DO i = 1, nbins

               mepos = my_process_id + 1

               IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN

                  tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)

                  tmp_pos(mepos) = tmp_pos(mepos) + 1

               END IF

            END DO

         END DO


         !! Assign the load to each process

         NULLIFY (ptr_to_tmp_dist)

         mepos = my_process_id + 1

         ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)

         SELECT CASE (eval_type)

         CASE (hfx_do_eval_energy)

            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)

         CASE (hfx_do_eval_forces)

            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)

         END SELECT


!$OMP BARRIER

!$OMP MASTER

         DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)

!$OMP END MASTER

!$OMP BARRIER

         DEALLOCATE (tmp_dist, tmp_pos)

         DEALLOCATE (binned_dist, local_cost_matrix)

         DEALLOCATE (set_list_ij, set_list_kl)


!$OMP BARRIER

!$OMP MASTER

         CALL timestop(handle_inner)

!$OMP END MASTER

!$OMP BARRIER

      END IF

!$OMP BARRIER

!$OMP MASTER

      CALL timestop(handle)

!$OMP END MASTER

!$OMP BARRIER


   END SUBROUTINE hfx_load_balance


! **************************************************************************************************
!> \brief Reference implementation of new recursive load balancing routine
!>        Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
!>        each process in a P-Q grid such that every process has more or less the
!>        same amount of work. Has no output at the moment (not used) but writes
!>        its computed load balance values into a file. Possible output is ready
!>        to use in the two arrays p_atom_blocks & q_atom_blocks
!> \param n_processes number of processes; determines the P-Q grid and the length of the
!>        gathered per-process cost vector
!> \param my_process_id zero-based id of the calling process (process 0 writes the file)
!> \param nblocks number of atom blocks per index; the permutation vector has nblocks**2 entries
!> \param natom forwarded to hfx_recursive_permute and estimate_block_cost
!> \param nkind forwarded to hfx_recursive_permute and estimate_block_cost
!> \param list_ij forwarded to hfx_recursive_permute and estimate_block_cost
!> \param list_kl forwarded to hfx_recursive_permute and estimate_block_cost
!> \param set_list_ij forwarded to hfx_recursive_permute and estimate_block_cost
!> \param set_list_kl forwarded to hfx_recursive_permute and estimate_block_cost
!> \param particle_set forwarded to hfx_recursive_permute and estimate_block_cost
!> \param coeffs_set forwarded to hfx_recursive_permute and estimate_block_cost
!> \param coeffs_kind forwarded to hfx_recursive_permute and estimate_block_cost
!> \param is_assoc_atomic_block_global forwarded to hfx_recursive_permute and estimate_block_cost
!> \param do_periodic forwarded to hfx_recursive_permute and estimate_block_cost
!> \param kind_of forwarded to hfx_recursive_permute and estimate_block_cost
!> \param basis_parameter forwarded to hfx_recursive_permute and estimate_block_cost
!> \param pmax_set forwarded to hfx_recursive_permute and estimate_block_cost
!> \param pmax_atom forwarded to hfx_recursive_permute and estimate_block_cost
!> \param pmax_blocks work variable: overwritten for every block quartet with the value
!>        derived from pmax_block that is used for screening (modified on exit)
!> \param cell forwarded to hfx_recursive_permute and estimate_block_cost
!> \param x_data supplies blocks(:)%istart and blocks(:)%iend, the atom range of each atom block
!> \param para_env parallel environment used to sum the per-process costs
!> \param pmax_block per block-pair values combined into pmax_blocks according to eval_type
!> \param do_p_screening forwarded to hfx_recursive_permute and estimate_block_cost
!> \param map_atom_to_kind_atom forwarded to hfx_recursive_permute and estimate_block_cost
!> \param eval_type hfx_do_eval_energy or hfx_do_eval_forces; selects how pmax_blocks is formed
!> \param log10_eps_schwarz screening threshold: a quartet is skipped when
!>        2*coeffs_kind_max0 + pmax_blocks is below this value
!> \param log_2 forwarded to hfx_recursive_permute and estimate_block_cost
!> \param coeffs_kind_max0 enters the screening test (see log10_eps_schwarz)
!> \param use_virial forwarded to hfx_recursive_permute and estimate_block_cost
!> \param atomic_pair_list forwarded to hfx_recursive_permute and estimate_block_cost
!> \par History
!>      03.2011 created [Michael Steinlechner]
!> \author Michael Steinlechner
! **************************************************************************************************

   SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
                                         natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                         particle_set, &
                                         coeffs_set, coeffs_kind, &
                                         is_assoc_atomic_block_global, do_periodic, &
                                         kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                         cell, x_data, para_env, pmax_block, &
                                         do_p_screening, map_atom_to_kind_atom, eval_type, &
                                         log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

! input variables:
      INTEGER, INTENT(IN)                                :: n_processes, my_process_id, nblocks, &
                                                            natom, nkind
      TYPE(pair_list_type), INTENT(IN)                   :: list_ij, list_kl
      TYPE(pair_set_list_type), ALLOCATABLE, &
         DIMENSION(:), INTENT(IN)                        :: set_list_ij, set_list_kl
      TYPE(particle_type), DIMENSION(:), INTENT(IN), &
         POINTER                                         :: particle_set
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :), INTENT(IN), POINTER      :: coeffs_set
      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
         INTENT(IN), POINTER                             :: coeffs_kind
      INTEGER, DIMENSION(:, :), INTENT(IN)               :: is_assoc_atomic_block_global
      LOGICAL, INTENT(IN)                                :: do_periodic
      INTEGER, INTENT(IN)                                :: kind_of(*)
      TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
         POINTER                                         :: basis_parameter
      TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
         POINTER                                         :: pmax_set
      REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_atom
      REAL(dp)                                           :: pmax_blocks
      TYPE(cell_type), INTENT(IN), POINTER               :: cell
      TYPE(hfx_type), INTENT(IN), POINTER                :: x_data
      TYPE(mp_para_env_type), INTENT(IN)        :: para_env
      REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER     :: pmax_block
      LOGICAL, INTENT(IN)                                :: do_p_screening
      INTEGER, DIMENSION(:), INTENT(IN), POINTER         :: map_atom_to_kind_atom
      INTEGER, INTENT(IN)                                :: eval_type
      REAL(dp), INTENT(IN)                               :: log10_eps_schwarz, log_2, &
                                                            coeffs_kind_max0
      LOGICAL, INTENT(IN)                                :: use_virial
      LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER      :: atomic_pair_list

      CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_recursive_load_balance'

! internal variables:
      INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
                 jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
                 np, nq, numbins, p, q, sizep, sizeq, unit_nr
      INTEGER(int_8)                                     :: local_cost, pidx, qidx, sump, sumq
      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: local_cost_vector
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: blocksize, p_atom_blocks, permute, &
                                                            q_atom_blocks
      REAL(dp)                                           :: maximum, mean

!$OMP BARRIER
!$OMP MASTER
      CALL timeset(routinen, handle)
!$OMP END MASTER
!$OMP BARRIER

      ! calculate best p/q distribution grid for the n_processes
      CALL hfx_calculate_pq(p, q, numbins, n_processes)

      ! start from the identity permutation; entries are 1-based
      ALLOCATE (blocksize(numbins))
      ALLOCATE (permute(nblocks**2))
      DO i = 1, nblocks**2
         permute(i) = i
      END DO

      ! call the main recursive permutation routine.
      ! Output:
      !   blocksize :: vector (size numBins) with the sizes for each column/row block
      !   permute   :: permutation vector
      CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numbins, &
                                 permute, 1, &
                                 my_process_id, n_processes, nblocks, &
                                 natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                 particle_set, &
                                 coeffs_set, coeffs_kind, &
                                 is_assoc_atomic_block_global, do_periodic, &
                                 kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                 cell, x_data, para_env, pmax_block, &
                                 do_p_screening, map_atom_to_kind_atom, eval_type, &
                                 log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

      ! number of blocks per processor in p-direction (vertical)
      np = numbins/p
      ! number of blocks per processor in q-direction (horizontal)
      nq = numbins/q

      ! calc own position in P-Q-processor grid (PQ-grid is column-major)
      pidx = modulo(my_process_id, p) + 1
      qidx = my_process_id/p + 1

      ! number of permutation entries owned by this process in each direction ...
      sizep = sum(blocksize((np*(pidx - 1) + 1):(np*pidx)))
      sizeq = sum(blocksize((nq*(qidx - 1) + 1):(nq*qidx)))

      ! ... and the number of entries that precede them in the permutation vector
      sump = sum(blocksize(1:(np*(pidx - 1))))
      sumq = sum(blocksize(1:(nq*(qidx - 1))))

      ALLOCATE (p_atom_blocks(sizep))
      ALLOCATE (q_atom_blocks(sizeq))

      p_atom_blocks(:) = permute((sump + 1):(sump + sizep))
      q_atom_blocks(:) = permute((sumq + 1):(sumq + sizeq))

      ! from here on, we are actually finished, each process has been
      ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
      ! what follows is just a small routine to calculate the local cost
      ! for each processor which is then written to a file.

      ! calculate local cost for each processor!
      ! ****************************************
      local_cost = 0
      DO i = 1, sizep

         ! A row entry of the permutation is the 1-based value
         ! (katom_block - 1)*nblocks + jatom_block (see hfx_recursive_permute),
         ! so one has to be subtracted before splitting it into its two indices.
         jatom_block = modulo(p_atom_blocks(i) - 1, nblocks) + 1
         katom_block = (p_atom_blocks(i) - 1)/nblocks + 1

         DO j = 1, sizeq

            ! A column entry is the 1-based value (iatom_block - 1)*nblocks + latom_block.
            latom_block = modulo(q_atom_blocks(j) - 1, nblocks) + 1
            iatom_block = (q_atom_blocks(j) - 1)/nblocks + 1

            ! symmetry checks: only one representative of each symmetric quartet is counted
            IF (latom_block < katom_block) CYCLE
            IF (jatom_block < iatom_block) CYCLE

            ! atom ranges covered by the four atom blocks
            iatom_start = x_data%blocks(iatom_block)%istart
            iatom_end = x_data%blocks(iatom_block)%iend
            jatom_start = x_data%blocks(jatom_block)%istart
            jatom_end = x_data%blocks(jatom_block)%iend
            katom_start = x_data%blocks(katom_block)%istart
            katom_end = x_data%blocks(katom_block)%iend
            latom_start = x_data%blocks(latom_block)%istart
            latom_end = x_data%blocks(latom_block)%iend

            ! screening value of this quartet: largest single pair entry for energies,
            ! largest sum of two pair entries for forces
            SELECT CASE (eval_type)
            CASE (hfx_do_eval_energy)
               pmax_blocks = max(pmax_block(katom_block, iatom_block), &
                                 pmax_block(latom_block, jatom_block), &
                                 pmax_block(latom_block, iatom_block), &
                                 pmax_block(katom_block, jatom_block))
            CASE (hfx_do_eval_forces)
               pmax_blocks = max(pmax_block(katom_block, iatom_block) + &
                                 pmax_block(latom_block, jatom_block), &
                                 pmax_block(latom_block, iatom_block) + &
                                 pmax_block(katom_block, jatom_block))
            END SELECT

            ! screening.
            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE

            ! estimate the cost of this atom_block.
            local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
                                                          set_list_kl, &
                                                          iatom_start, iatom_end, jatom_start, jatom_end, &
                                                          katom_start, katom_end, latom_start, latom_end, &
                                                          particle_set, &
                                                          coeffs_set, coeffs_kind, &
                                                          is_assoc_atomic_block_global, do_periodic, &
                                                          kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
                                                          cell, &
                                                          do_p_screening, map_atom_to_kind_atom, eval_type, &
                                                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
         END DO
      END DO

      ! every process contributes its own entry; the sum makes all costs known everywhere
      ALLOCATE (local_cost_vector(n_processes))
      local_cost_vector = 0
      local_cost_vector(my_process_id + 1) = local_cost
      CALL para_env%sum(local_cost_vector)

      ! divide in real arithmetic so that the mean is not truncated to an integer
      mean = real(sum(local_cost_vector), kind=dp)/real(n_processes, kind=dp)
      maximum = real(maxval(local_cost_vector), kind=dp)

!$OMP     BARRIER
!$OMP     MASTER
      ! only output once
      IF (my_process_id == 0) THEN
         ! NOTE(review): open_file is called with its default status/action; confirm that
         ! these defaults allow "loads.dat" to be created and written.
         CALL open_file(unit_number=unit_nr, file_name="loads.dat")
         WRITE (unit_nr, *) 'maximum cost:', maximum
         WRITE (unit_nr, *) 'mean cost:', mean
         WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
         WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
         DO i = 1, n_processes
            WRITE (unit_nr, *) local_cost_vector(i)
         END DO
         CALL close_file(unit_nr)
      END IF
!$OMP     END MASTER
!$OMP     BARRIER

      DEALLOCATE (local_cost_vector)
      DEALLOCATE (p_atom_blocks, q_atom_blocks)
      DEALLOCATE (blocksize, permute)

!$OMP BARRIER
!$OMP MASTER
      CALL timestop(handle)
!$OMP END MASTER
!$OMP BARRIER

   END SUBROUTINE hfx_recursive_load_balance


! **************************************************************************************************
!> \brief Small routine to calculate the optimal P-Q-processor grid distribution
!>        for a given number of processors N
!>        and the corresponding number of Bins for the load balancing routine
!> \param p     number of rows on P-Q process grid (output);
!>              the largest divisor of N that does not exceed sqrt(N)
!> \param q     number of columns on P-Q process grid (output); equals N/p
!> \param nBins number of Bins (output); the least common multiple of p and q
!> \param N     number of processes (input)
!> \par History
!>      03.2011 created [Michael Steinlechner]
!> \author Michael Steinlechner
! **************************************************************************************************
   SUBROUTINE hfx_calculate_pq(p, q, nBins, N)

      INTEGER, INTENT(OUT)                               :: p, q, nbins
      INTEGER, INTENT(IN)                                :: n

      INTEGER                                            :: a, b, k, r

      ! Search the largest divisor k of n with k*k <= n. The bound is tested as
      ! k <= n/k, which is the same condition in integer arithmetic but can
      ! neither overflow nor suffer from floating point rounding.
      p = 1
      k = 2
      DO WHILE (k <= n/k)
         IF (modulo(n, k) == 0) p = k
         k = k + 1
      END DO
      q = n/p

      ! now compute the least common multiple of p & q to get the number of necessary bins
      ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
      ! and use euclid's algorithm (remainder form, logarithmic number of steps)
      ! for the GCD computation.
      a = p
      b = q
      DO WHILE (b /= 0)
         r = modulo(a, b)
         a = b
         b = r
      END DO
      ! gcd(p,q) is now saved in a

      nbins = p*q/a

   END SUBROUTINE hfx_calculate_pq


! **************************************************************************************************

!> \brief Recursive permutation routine for the load balancing of the integral

!>       computation

!> \param blocksize     vector of blocksizes, size(nProc), which contains for

!>                      each process the local blocksize (OUTPUT)

!> \param blockstart    starting row/column idx of the block which is to be examined

!>                      at this point (INPUT)

!> \param blockend      ending row/column idx of the block which is to be examined

!>                      (INPUT)

!> \param nProc_in      number of bins into which the current block has to be divided

!>                      (INPUT)

!> \param permute       permutation vector which balances column/row cost

!>                      size(nblocks^2). (OUTPUT)

!> \param step ...

!> \param my_process_id ...

!> \param n_processes ...

!> \param nblocks ...

!> \param natom ...

!> \param nkind ...

!> \param list_ij ...

!> \param list_kl ...

!> \param set_list_ij ...

!> \param set_list_kl ...

!> \param particle_set ...

!> \param coeffs_set ...

!> \param coeffs_kind ...

!> \param is_assoc_atomic_block_global ...

!> \param do_periodic ...

!> \param kind_of ...

!> \param basis_parameter ...

!> \param pmax_set ...

!> \param pmax_atom ...

!> \param pmax_blocks ...

!> \param cell ...

!> \param x_data ...

!> \param para_env ...

!> \param pmax_block ...

!> \param do_p_screening ...

!> \param map_atom_to_kind_atom ...

!> \param eval_type ...

!> \param log10_eps_schwarz ...

!> \param log_2 ...

!> \param coeffs_kind_max0 ...

!> \param use_virial ...

!> \param atomic_pair_list ...

!> \par History

!>      03.2011 created [Michael Steinlechner]

!> \author Michael Steinlechner

! **************************************************************************************************

   RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &

                                              permute, step, &

                                              my_process_id, n_processes, nblocks, &

                                              natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &

                                              particle_set, &

                                              coeffs_set, coeffs_kind, &

                                              is_assoc_atomic_block_global, do_periodic, &

                                              kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                              cell, x_data, para_env, pmax_block, &

                                              do_p_screening, map_atom_to_kind_atom, eval_type, &

                                              log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)


      INTEGER                                            :: nproc_in, blockend, blockstart

      INTEGER, DIMENSION(nProc_in)                       :: blocksize

      INTEGER                                            :: nblocks, n_processes, my_process_id

      INTEGER, INTENT(IN)                                :: step

      INTEGER, DIMENSION(nblocks*nblocks)                :: permute

      INTEGER                                            :: natom

      INTEGER, INTENT(IN)                                :: nkind

      TYPE(pair_list_type)                               :: list_ij, list_kl

      TYPE(pair_set_list_type), ALLOCATABLE, &

         DIMENSION(:)                                    :: set_list_ij, set_list_kl

      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set

      TYPE(hfx_screen_coeff_type), &

         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set

      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &

         POINTER                                         :: coeffs_kind

      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global

      LOGICAL                                            :: do_periodic

      INTEGER                                            :: kind_of(*)

      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter

      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set

      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom

      REAL(dp)                                           :: pmax_blocks

      TYPE(cell_type), POINTER                           :: cell

      TYPE(hfx_type), POINTER                            :: x_data

      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env

      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_block

      LOGICAL, INTENT(IN)                                :: do_p_screening

      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom

      INTEGER, INTENT(IN)                                :: eval_type

      REAL(dp)                                           :: log10_eps_schwarz, log_2, &

                                                            coeffs_kind_max0

      LOGICAL, INTENT(IN)                                :: use_virial

      LOGICAL, DIMENSION(:, :), POINTER                  :: atomic_pair_list


      INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &

                 jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &

                 latom_end, latom_start, nbins, nproc, row, startoffset

      INTEGER(int_8)                                     :: atom_block, tmp_block

      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: ithblocksize, localblocksize

      INTEGER, DIMENSION(blockend - blockstart + 1)          :: bin_perm, tmp_perm

      REAL(dp)                                           :: partialcost

      REAL(dp), DIMENSION(nblocks*nblocks)               :: cost_vector


      nproc = nproc_in

      cost_vector = 0.0_dp


!   loop over local atom_blocks.

      DO atom_block = my_process_id, int(nblocks, kind=int_8)**4 - 1, n_processes


!       get corresponding 4D block indices

         latom_block = int(modulo(atom_block, int(nblocks, kind=int_8))) + 1

         tmp_block = atom_block/nblocks

         katom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

         IF (latom_block < katom_block) cycle

         tmp_block = tmp_block/nblocks

         jatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

         tmp_block = tmp_block/nblocks

         iatom_block = int(modulo(tmp_block, int(nblocks, kind=int_8))) + 1

         IF (jatom_block < iatom_block) cycle


!       get 2D indices of this atom_block (with permutation applied)

!       for this, we need to invert the permutation, this means

!       find position in permutation vector where value==idx


         row = (katom_block - 1)*nblocks + jatom_block

         inv_perm = 1

         DO WHILE (permute(inv_perm) /= row)

            inv_perm = inv_perm + 1

         END DO

         row = inv_perm


         col = (iatom_block - 1)*nblocks + latom_block

         inv_perm = 1

         DO WHILE (permute(inv_perm) /= col)

            inv_perm = inv_perm + 1

         END DO

         col = inv_perm


!       if row/col outside our current diagonal block, skip calculation.

         IF (col < blockstart .OR. col > blockend) cycle

         IF (row < blockstart .OR. row > blockend) cycle


         iatom_start = x_data%blocks(iatom_block)%istart

         iatom_end = x_data%blocks(iatom_block)%iend

         jatom_start = x_data%blocks(jatom_block)%istart

         jatom_end = x_data%blocks(jatom_block)%iend

         katom_start = x_data%blocks(katom_block)%istart

         katom_end = x_data%blocks(katom_block)%iend

         latom_start = x_data%blocks(latom_block)%istart

         latom_end = x_data%blocks(latom_block)%iend


!       whatever.

         SELECT CASE (eval_type)

         CASE (hfx_do_eval_energy)

            pmax_blocks = max(pmax_block(katom_block, iatom_block), &

                              pmax_block(latom_block, jatom_block), &

                              pmax_block(latom_block, iatom_block), &

                              pmax_block(katom_block, jatom_block))

         CASE (hfx_do_eval_forces)

            pmax_blocks = max(pmax_block(katom_block, iatom_block) + &

                              pmax_block(latom_block, jatom_block), &

                              pmax_block(latom_block, iatom_block) + &

                              pmax_block(katom_block, jatom_block))

         END SELECT


!       screening.

         IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) cycle


!       every second recursion step, compute row sum instead of column sum


         IF (modulo(step, 2) == 0) THEN

            idx = row

         ELSE

            idx = col

         END IF


!       estimate the cost of this atom_block.

         partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &

                                           set_list_kl, &

                                           iatom_start, iatom_end, jatom_start, jatom_end, &

                                           katom_start, katom_end, latom_start, latom_end, &

                                           particle_set, &

                                           coeffs_set, coeffs_kind, &

                                           is_assoc_atomic_block_global, do_periodic, &

                                           kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                           cell, &

                                           do_p_screening, map_atom_to_kind_atom, eval_type, &

                                           log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)


         cost_vector(idx) = cost_vector(idx) + partialcost

      END DO ! atom_block


!   sum costvector over all processes

      CALL para_env%sum(cost_vector)


!   calculate next prime factor of nProc

      nbins = 2

      DO WHILE (modulo(int(nproc), int(nbins)) /= 0)

         nbins = nbins + 1

      END DO


      nproc = nproc/nbins


! ... do the binning...


      ALLOCATE (localblocksize(nbins))

      CALL hfx_permute_binning(nbins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)


!... and update the permutation vector


      tmp_perm = permute(blockstart:blockend)

      permute(blockstart:blockend) = tmp_perm(bin_perm)


!   split recursion into the nBins Bins

      IF (nproc > 1) THEN

         ALLOCATE (ithblocksize(nproc))

         DO i = 1, nbins

            startoffset = sum(localblocksize(1:(i - 1)))

            endoffset = sum(localblocksize(1:i)) - 1


            CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nproc, &

                                       permute, step + 1, &

                                       my_process_id, n_processes, nblocks, &

                                       natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &

                                       particle_set, &

                                       coeffs_set, coeffs_kind, &

                                       is_assoc_atomic_block_global, do_periodic, &

                                       kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                       cell, x_data, para_env, pmax_block, &

                                       do_p_screening, map_atom_to_kind_atom, eval_type, &

                                       log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)

            blocksize(((i - 1)*nproc + 1):(i*nproc)) = ithblocksize

         END DO

         DEALLOCATE (ithblocksize)

      ELSE

         DO i = 1, nbins

            blocksize(i) = localblocksize(i)

         END DO

      END IF


      DEALLOCATE (localblocksize)


   END SUBROUTINE hfx_recursive_permute


! **************************************************************************************************
!> \brief small binning routine for the recursive load balancing
!>
!>        The entries of costvector are handed out in order of decreasing cost:
!>        every non-zero entry goes to the bin with the lowest accumulated cost
!>        so far, zero-cost entries are spread evenly over the bins.
!> \param nBins         number of Bins (INPUT)
!> \param costvector    vector of current row/column costs which have to be binned (INPUT)
!> \param maxbinsize    length of costvector, hence also the upper bound for a bin size (INPUT)
!> \param perm          resulting permutation due to the binning routine: the entries of
!>                      bin 1 come first, followed by those of bin 2, and so on (OUTPUT)
!> \param block_count   vector of size(nbins) which contains the size of each bin (OUTPUT)
!> \par History
!>      03.2011 created [Michael Steinlechner]
!> \author Michael Steinlechner
! **************************************************************************************************
   SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)

      INTEGER, INTENT(IN)                                :: nbins, maxbinsize
      REAL(dp), DIMENSION(maxbinsize), INTENT(IN)        :: costvector
      INTEGER, DIMENSION(maxbinsize), INTENT(OUT)        :: perm
      INTEGER, DIMENSION(nBins), INTENT(OUT)             :: block_count

      INTEGER                                            :: i, j, mod_idx, offset
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: bin
      INTEGER, DIMENSION(nBins)                          :: bin_idx
      INTEGER, DIMENSION(maxbinsize)                     :: idx
      REAL(dp), DIMENSION(maxbinsize)                    :: vec
      REAL(dp), DIMENSION(nBins)                         :: bincosts

      ! be careful not to change costvector (copy it!)
      vec = costvector
      block_count = 0
      bincosts = 0.0_dp

      ! bin(ibin, :) lists the entries assigned to bin ibin; only the first
      ! block_count(ibin) slots of a row are meaningful. The array lives on the
      ! heap (it has nbins*maxbinsize elements) and is zeroed because the row
      ! permutation below copies complete rows, including slots not yet filled.
      ALLOCATE (bin(nbins, maxbinsize))
      bin = 0

      ! sort the array (ascending), idx holds the original positions
      CALL sort(vec, maxbinsize, idx)

      ! count the loop down to distribute the largest cols/rows first
      DO i = maxbinsize, 1, -1
         IF (vec(i) == 0.0_dp) THEN
            ! spread zero-cost col/rows evenly among procs
            mod_idx = modulo(i, nbins) + 1 !(note the fortran offset by one!)
            block_count(mod_idx) = block_count(mod_idx) + 1
            bin(mod_idx, block_count(mod_idx)) = idx(i)
         ELSE
            ! sort the bins so that the one with the lowest cost is at the
            ! first place, where we then assign the current col/row;
            ! block_count and the rows of bin follow the same reordering
            CALL sort(bincosts, nbins, bin_idx)
            block_count = block_count(bin_idx)
            bin(:, :) = bin(bin_idx, :)

            bincosts(1) = bincosts(1) + vec(i)
            block_count(1) = block_count(1) + 1
            bin(1, block_count(1)) = idx(i)
         END IF
      END DO

      ! construct permutation vector from the binning
      offset = 0
      DO i = 1, nbins
         DO j = 1, block_count(i)
            perm(offset + j) = bin(i, j)
         END DO
         offset = offset + block_count(i)
      END DO

      DEALLOCATE (bin)

   END SUBROUTINE hfx_permute_binning


! **************************************************************************************************
!> \brief Cheap way of redistributing the eri's
!>
!>        The measured timings of the current bins are converted to integer costs,
!>        gathered on every MPI rank, redistributed with optimize_distribution and
!>        the bins assigned to the calling process are stored in x_data.
!> \param x_data Object that stores the indices array
!> \param para_env para_env
!> \param load_balance_parameter contains parameter for Monte-Carlo routines
!> \param i_thread current thread ID
!> \param n_threads Total Number of threads
!> \param eval_type selects the distribution that is updated: hfx_do_eval_energy
!>        (distribution_energy) or hfx_do_eval_forces (distribution_forces)
!> \par History
!>      12.2007 created [Manuel Guidon]
!>      02.2009 optimize Memory Usage [Manuel Guidon]
!> \author Manuel Guidon
!> \note
!>      The cost matrix is given by the walltime for each bin that is measured
!>      during the calculation
!>
!>      The routine contains orphaned OMP BARRIER/MASTER directives, i.e. it is
!>      written to be called by all threads of a parallel region together.
! **************************************************************************************************

   SUBROUTINE hfx_update_load_balance(x_data, para_env, &
                                      load_balance_parameter, &
                                      i_thread, n_threads, eval_type)

      TYPE(hfx_type), POINTER                            :: x_data
      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
      TYPE(hfx_load_balance_type)                        :: load_balance_parameter
      INTEGER, INTENT(IN)                                :: i_thread, n_threads, eval_type

      CHARACTER(LEN=*), PARAMETER :: routinen = 'hfx_update_load_balance'

      INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
                 my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
      TYPE(mp_request_type), DIMENSION(2) :: req
      INTEGER(int_8), DIMENSION(:), POINTER              :: local_cost_matrix, recbuffer, &
                                                            sendbuffer, swapbuffer
      ! The SAVE'd variables below are allocated/set inside MASTER sections and
      ! read or written by all threads after a BARRIER, i.e. they serve as the
      ! data shared between the threads of one MPI rank.
      INTEGER(int_8), DIMENSION(:), POINTER, SAVE        :: cost_matrix
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: tmp_pos
      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: bins_per_rank
      INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE        :: bin_histogram
      INTEGER, DIMENSION(:), POINTER, SAVE               :: shm_distribution_vector
      INTEGER, SAVE                                      :: max_bin_size
      TYPE(hfx_distribution), DIMENSION(:), POINTER      :: binned_dist, ptr_to_tmp_dist, tmp_dist
      TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
         SAVE                                            :: full_dist

!$OMP BARRIER
!$OMP MASTER
      CALL timeset(routinen, handle)
!$OMP END MASTER
!$OMP BARRIER

      ! a "process" is one thread of one MPI rank
      ncpu = para_env%num_pe
      n_processes = ncpu*n_threads
      !! If there is only 1 cpu skip the binning
      IF (n_processes == 1) THEN
         ! a single bin starting at 0 with the largest representable quartet count;
         ! the remaining components of tmp_dist(1) are not set here
         ALLOCATE (tmp_dist(1))
         tmp_dist(1)%number_of_atom_quartets = huge(tmp_dist(1)%number_of_atom_quartets)
         tmp_dist(1)%istart = 0_int_8
         ptr_to_tmp_dist => tmp_dist(:)
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
         CASE (hfx_do_eval_forces)
            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
         END SELECT
         DEALLOCATE (tmp_dist)
      ELSE
         mepos = para_env%mepos
         ! global 0-based id of this thread among all threads of all ranks
         my_process_id = para_env%mepos*n_threads + i_thread
         nbins = load_balance_parameter%nbins
!$OMP MASTER
         ! bin_histogram(p, 1): number of bins currently held by process p
         ! bin_histogram(p, 2): inclusive prefix sum of column 1
         ALLOCATE (bin_histogram(n_processes, 2))
         bin_histogram = 0
!$OMP END MASTER
!$OMP BARRIER
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            my_bin_size = SIZE(x_data%distribution_energy)
         CASE (hfx_do_eval_forces)
            my_bin_size = SIZE(x_data%distribution_forces)
         END SELECT
         ! every thread enters its own count, the sum over ranks completes the histogram
         bin_histogram(my_process_id + 1, 1) = my_bin_size
!$OMP BARRIER
!$OMP MASTER
         CALL para_env%sum(bin_histogram(:, 1))
         bin_histogram(1, 2) = bin_histogram(1, 1)
         DO iprocess = 2, n_processes
            bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
         END DO

         ! largest number of bins held by any single process (first over the
         ! threads of this rank, then over all ranks)
         max_bin_size = maxval(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
         CALL para_env%max(max_bin_size)
!$OMP END MASTER
!$OMP BARRIER
         ALLOCATE (binned_dist(my_bin_size))
         !! Use old binned_dist, but with timings cost
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            binned_dist = x_data%distribution_energy
         CASE (hfx_do_eval_forces)
            binned_dist = x_data%distribution_forces
         END SELECT

         ! cost of a bin = measured time scaled by 10000 and converted to int_8;
         ! bins without atom quartets get cost 0
         DO ibin = 1, my_bin_size
            IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
               binned_dist(ibin)%cost = 0
            ELSE
               SELECT CASE (eval_type)
               CASE (hfx_do_eval_energy)
                  ! with rtp_redistribute only time_other_scf is counted,
                  ! otherwise time_first_scf is added as well
                  IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
                     binned_dist(ibin)%cost = int((binned_dist(ibin)%time_first_scf + &
                                                   binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
                  ELSE
                     binned_dist(ibin)%cost = int((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
                  END IF
               CASE (hfx_do_eval_forces)
                  binned_dist(ibin)%cost = int((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
               END SELECT
            END IF
         END DO
!$OMP BARRIER
!$OMP MASTER
         !! store all local results in a big cost matrix
         ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
         cost_matrix = 0
         ! send/receive buffers for the ring exchange, sized for n_threads
         ! processes with max_bin_size bins each
         ALLOCATE (sendbuffer(max_bin_size*n_threads))
         ALLOCATE (recbuffer(max_bin_size*n_threads))
!$OMP END MASTER
!$OMP BARRIER
         ! number of bins held by all processes with a smaller id (0-based offset
         ! of this process' first bin in cost_matrix)
         my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
         icpu = para_env%mepos + 1
         DO i = 1, my_bin_size
            cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
         END DO

         mepos = para_env%mepos
!$OMP BARRIER
!$OMP MASTER
         ! total number of bins per MPI rank (summed over its threads)
         ALLOCATE (bins_per_rank(ncpu))
         bins_per_rank = 0
         DO icpu = 1, ncpu
            bins_per_rank(icpu) = sum(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
         END DO
         ! on the master thread my_global_start_idx is the offset of this rank's
         ! first bin, so this copies the costs of all local threads
         sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
            cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))

         ! ring exchange: send to the next rank, receive from the previous one
         dest = modulo(mepos + 1, ncpu)
         source = modulo(mepos - 1, ncpu)
         ! sync before/after ring of isendrecv
         CALL para_env%sync()
         DO icpu = 0, ncpu - 1
            ! no communication is needed in the last step
            IF (icpu /= ncpu - 1) THEN
               CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
                                       req(1), req(2), 13)
            END IF
            ! in step icpu sendbuffer holds the costs that originate from rank data_from;
            ! store them at that rank's position in cost_matrix
            data_from = modulo(mepos - icpu, ncpu)
            start_idx = sum(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
            end_idx = start_idx + bins_per_rank(data_from + 1) - 1
            cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)

            IF (icpu /= ncpu - 1) THEN
               CALL mp_waitall(req)
            END IF
            ! the data just received is forwarded in the next step
            swapbuffer => sendbuffer
            sendbuffer => recbuffer
            recbuffer => swapbuffer
         END DO
         DEALLOCATE (recbuffer, sendbuffer)
         ! sync before/after ring of isendrecv
         CALL para_env%sync()
!$OMP END MASTER
!$OMP BARRIER
         ! every thread takes a private copy of the complete cost matrix
         ! (only the master's copy is passed to optimize_distribution below)
         ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
         local_cost_matrix = cost_matrix
!$OMP MASTER
         ! shm_distribution_vector(ibin) receives the 1-based id of the process
         ! that global bin ibin is assigned to
         ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
         CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
                                    shm_distribution_vector, x_data%load_balance_parameter%do_randomize)

         ! full_dist(p, i): i-th bin currently held by process p
         ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))

         full_dist(:, :)%istart = 0_int_8
         full_dist(:, :)%number_of_atom_quartets = 0_int_8
         full_dist(:, :)%cost = 0_int_8
         full_dist(:, :)%time_first_scf = 0.0_dp
         full_dist(:, :)%time_other_scf = 0.0_dp
         full_dist(:, :)%time_forces = 0.0_dp
!$OMP END MASTER

!$OMP BARRIER
         ! each thread enters its own bins into its row of full_dist
         mepos = para_env%mepos + 1
         full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
!$OMP BARRIER
!$OMP MASTER
         ! pack three int_8 values per bin (istart, number_of_atom_quartets, cost)
         ! for all threads of this rank; the timings are not communicated
         ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
         ALLOCATE (recbuffer(3*max_bin_size*n_threads))
         mepos = para_env%mepos
         DO j = 1, n_threads
            DO i = 1, max_bin_size
               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
               sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
            END DO
         END DO
         ! same ring exchange as for the costs above
         dest = modulo(mepos + 1, ncpu)
         source = modulo(mepos - 1, ncpu)
         ! sync before/after ring of isendrecv
         CALL para_env%sync()
         DO icpu = 0, ncpu - 1
            IF (icpu /= ncpu - 1) THEN
               CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
                                       req(1), req(2), 13)
            END IF
            ! unpack the bins of rank data_from into the corresponding rows of full_dist
            data_from = modulo(mepos - icpu, ncpu)
            DO j = 1, n_threads
               DO i = 1, max_bin_size
                  full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
                  full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
                  full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
               END DO
            END DO

            IF (icpu /= ncpu - 1) THEN
               CALL mp_waitall(req)
            END IF
            swapbuffer => sendbuffer
            sendbuffer => recbuffer
            recbuffer => swapbuffer
         END DO
         DEALLOCATE (recbuffer, sendbuffer)
         ! sync before/after ring of isendrecv
         CALL para_env%sync()
!$OMP END MASTER
!$OMP BARRIER
         !! reorder the distribution according to the distribution vector
         ! tmp_pos(p) is the next free slot in tmp_dist; only entry my_process_id + 1 is used
         ALLOCATE (tmp_pos(ncpu*n_threads))
         tmp_pos = 1
         ALLOCATE (tmp_dist(nbins*ncpu*n_threads))

         tmp_dist(:)%istart = 0_int_8
         tmp_dist(:)%number_of_atom_quartets = 0_int_8
         tmp_dist(:)%cost = 0_int_8
         tmp_dist(:)%time_first_scf = 0.0_dp
         tmp_dist(:)%time_other_scf = 0.0_dp
         tmp_dist(:)%time_forces = 0.0_dp

         ! walk over all bins of all processes and collect those that the
         ! distribution vector assigns to this process (1-based id mepos)
         mepos = my_process_id + 1
         DO icpu = 1, n_processes
            DO i = 1, bin_histogram(icpu, 1)
               IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
                  tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
                  tmp_pos(mepos) = tmp_pos(mepos) + 1
               END IF
            END DO
         END DO

         !! Assign the load to each process
         NULLIFY (ptr_to_tmp_dist)
         mepos = my_process_id + 1
         ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
         ! NOTE(review): tmp_dist is deallocated below, so the setters presumably
         ! copy the distribution into x_data - confirm in hfx_types
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
         CASE (hfx_do_eval_forces)
            CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
         END SELECT

         ! the shared (SAVE'd) arrays are released by the master once all threads are done
!$OMP BARRIER
!$OMP MASTER
         DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
         DEALLOCATE (bins_per_rank, bin_histogram)
!$OMP END MASTER
!$OMP BARRIER
         DEALLOCATE (tmp_dist, tmp_pos)
         DEALLOCATE (binned_dist, local_cost_matrix)
      END IF
!$OMP BARRIER
!$OMP MASTER
      CALL timestop(handle)
!$OMP END MASTER
!$OMP BARRIER

   END SUBROUTINE hfx_update_load_balance


! **************************************************************************************************
!> \brief estimates the cost of a set quartet with info available at load balance time
!>        i.e. without much info on the primitives
!>
!>        Two estimates of the same functional form (see estimate_basic) are
!>        evaluated with the parameter sets p1 and p2 and blended with a smooth
!>        switching function of log(estimate1):
!>           switch   = 1/(1 + exp((log(estimate1) - mu)/sigma))
!>           estimate = estimate1*(1 - switch) + estimate2*switch
!>        with mu = log(|1.0e6*p3(1)| + 1) and sigma = 0.1*p3(2)*mu.
!> \param nsa integer argument of the quadratic polynomials with coefficients p(1:3) and p(10:12)
!> \param nsb see nsa
!> \param nsc see nsa
!> \param nsd see nsa
!> \param npgfa integer argument of the quadratic polynomial with coefficients p(4:6)
!> \param npgfb see npgfa
!> \param npgfc see npgfa
!> \param npgfd see npgfa
!> \param ratio enters the estimates through the factor exp(-p(7)*ratio + p(8)*ratio**2)
!> \param p1 the 12 parameters of the first estimate
!> \param p2 the 12 parameters of the second estimate
!> \param p3 the 2 parameters of the switching function (position and relative width)
!> \return int(estimate*0.001) + 1
!> \par History
!>      08.2009 created Joost VandeVondele
!> \author Joost VandeVondele
!> \note
!>      sigma must be non-zero, i.e. p3(2) /= 0 and p3(1) /= 0; this is not checked here.
! **************************************************************************************************

   FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
      INTEGER, INTENT(IN)                                :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
      REAL(kind=dp), INTENT(IN)                          :: ratio
      REAL(kind=dp), INTENT(IN)                          :: p1(12), p2(12), p3(2)
      INTEGER(KIND=int_8)                                :: res

      REAL(kind=dp)                                      :: estimate, estimate1, estimate2, mu, sigma, &
                                                            switch

      estimate1 = estimate_basic(p1)
      estimate2 = estimate_basic(p2)
      ! position (mu) and width (sigma) of the switch on the log(estimate1) axis
      mu = log(abs(1.0e6_dp*p3(1)) + 1)
      sigma = p3(2)*0.1_dp*mu
      switch = 1.0_dp/(1.0_dp + exp((log(estimate1) - mu)/sigma))
      estimate = estimate1*(1.0_dp - switch) + estimate2*switch
      ! scale down and make sure the returned cost is never smaller than 1
      res = int(estimate*0.001_dp, kind=int_8) + 1

   CONTAINS

! **************************************************************************************************
!> \brief evaluates one cost estimate for the host's arguments (nsa, ..., npgfd, ratio)
!>        est = 1 + | prod_x poly2(nsx; p(1:3)) * prod_x poly2(npgfx; p(4:6))
!>                    * exp(-p(7)*ratio + p(8)*ratio**2)
!>                    + 1000*p(9) + prod_x poly2(nsx; p(10:12)) |,   x = a, b, c, d
!> \param p the 12 parameters of the estimate
!> \return the estimate, always >= 1
! **************************************************************************************************
      REAL(kind=dp) FUNCTION estimate_basic(p) RESULT(est)
         REAL(kind=dp), INTENT(IN)                          :: p(12)

         est = poly2(nsa, p(1), p(2), p(3))*poly2(nsb, p(1), p(2), p(3))* &
               poly2(nsc, p(1), p(2), p(3))*poly2(nsd, p(1), p(2), p(3))* &
               poly2(npgfa, p(4), p(5), p(6))*poly2(npgfb, p(4), p(5), p(6))* &
               poly2(npgfc, p(4), p(5), p(6))*poly2(npgfd, p(4), p(5), p(6))* &
               exp(-p(7)*ratio + p(8)*ratio**2) + &
               1000.0_dp*p(9) + &
               poly2(nsa, p(10), p(11), p(12))*poly2(nsb, p(10), p(11), p(12))* &
               poly2(nsc, p(10), p(11), p(12))*poly2(nsd, p(10), p(11), p(12))
         ! log(est) is taken by the host, hence keep the estimate >= 1
         est = 1 + abs(est)
      END FUNCTION estimate_basic

! **************************************************************************************************
!> \brief second order polynomial a0 + a1*x + a2*x**2 (Horner form)
!> \param x integer argument
!> \param a0 constant coefficient
!> \param a1 linear coefficient
!> \param a2 quadratic coefficient
!> \return value of the polynomial at x
! **************************************************************************************************
      REAL(kind=dp) FUNCTION poly2(x, a0, a1, a2)
         INTEGER, INTENT(IN)                                :: x
         REAL(kind=dp), INTENT(IN)                          :: a0, a1, a2

         REAL(kind=dp)                                      :: r

         r = real(x, kind=dp)
         poly2 = a0 + (a1 + a2*r)*r
      END FUNCTION poly2

   END FUNCTION cost_model

! **************************************************************************************************
!> \brief Minimizes the maximum cost per cpu by shuffling around all bins
!> \param total_number_of_bins number of bins that have to be distributed
!> \param number_of_processes number of processes the bins are distributed over
!> \param bin_costs costs per bin
!> \param distribution_vector will contain the final distribution, i.e. for
!>        every bin the (1-based) index of the process it is assigned to
!> \param do_randomize if set, the least loaded processes picked for a chunk of
!>        bins are shuffled randomly before the bins are handed out
!> \par History
!>      03.2009 created from a hack by Joost [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
   SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
                                    distribution_vector, do_randomize)
      INTEGER                                            :: total_number_of_bins, number_of_processes
      INTEGER(int_8), DIMENSION(:), POINTER              :: bin_costs
      INTEGER, DIMENSION(:), POINTER                     :: distribution_vector
      LOGICAL, INTENT(IN)                                :: do_randomize

      INTEGER                                            :: chunk, chunk_len, ibin, icpu, k, top
      INTEGER(int_8), DIMENSION(:), POINTER              :: cpu_load, sorted_cost, sorted_load
      INTEGER, DIMENSION(:), POINTER                     :: bin_order, cpu_order
      TYPE(rng_stream_type), ALLOCATABLE                 :: rng_stream

      ! number of bins handed out per round
      chunk = max(1, number_of_processes/2)

      ALLOCATE (sorted_cost(total_number_of_bins), bin_order(total_number_of_bins))
      ALLOCATE (sorted_load(number_of_processes), cpu_order(number_of_processes))
      ALLOCATE (cpu_load(number_of_processes))

      ! bin_order lists the bins by increasing cost
      sorted_cost = bin_costs
      CALL sort(sorted_cost, total_number_of_bins, bin_order)
      cpu_load = 0

      IF (do_randomize) THEN
         rng_stream = rng_stream_type(name="uniform_rng", &
                                      distribution_type=uniform)
      END IF

      ! Greedy scheme: the most expensive bins that are still unassigned go to
      ! the processes carrying the smallest load. Working in rounds of `chunk`
      ! bins (chunk ~ number_of_processes) needs only one sort of the loads per
      ! round (n log n overall), gives every process a similar number of bins
      ! and avoids that thousands of zero sized bins end up on the same (least
      ! loaded) process.
      top = total_number_of_bins
      DO WHILE (top >= 1)
         chunk_len = min(top, chunk)

         ! cpu_order lists the processes by increasing current load
         sorted_load = cpu_load
         CALL sort(sorted_load, number_of_processes, cpu_order)
         IF (do_randomize) CALL rng_stream%shuffle(cpu_order(1:chunk_len))

         ! k-th most expensive remaining bin -> k-th selected process
         DO k = 1, chunk_len
            ibin = bin_order(top - k + 1)
            icpu = cpu_order(k)
            distribution_vector(ibin) = icpu
            cpu_load(icpu) = cpu_load(icpu) + bin_costs(ibin)
         END DO

         top = top - chunk
      END DO

      DEALLOCATE (sorted_cost, bin_order, sorted_load)
      DEALLOCATE (cpu_order, cpu_load)
   END SUBROUTINE optimize_distribution


! **************************************************************************************************
!> \brief Maps a 2d index pair onto the 1d index of the corresponding element
!>        of a symmetric NxN matrix of which only the upper triangle is stored
!>        (row after row). The result does not depend on the order of i and j.
!>        The compiler should inline this function, therefore it appears in
!>        several modules
!> \param i 2d index
!> \param j 2d index
!> \param N matrix size
!> \return 1d index, running from 1 for (1, 1) to N*(N+1)/2 for (N, N)
!> \par History
!>      03.2009 created [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
   PURE FUNCTION get_1d_idx(i, j, N) RESULT(idx_1d)
      INTEGER, INTENT(IN)                                :: i, j
      INTEGER(int_8), INTENT(IN)                         :: n
      INTEGER(int_8)                                     :: idx_1d

      INTEGER(int_8)                                     :: col, row

      ! (row, col) is the pair ordered such that row <= col
      row = int(min(i, j), kind=int_8)
      col = int(max(i, j), kind=int_8)

      ! the (row - 1) preceding rows hold (row - 1)*n - (row - 1)*row/2 packed
      ! elements, element (row, col) is then found at offset col in its row
      idx_1d = (row - 1_int_8)*n + col - ((row - 1_int_8)*row)/2_int_8

   END FUNCTION get_1d_idx


! **************************************************************************************************

!> \brief ...

!> \param natom ...

!> \param nkind ...

!> \param list_ij ...

!> \param list_kl ...

!> \param set_list_ij ...

!> \param set_list_kl ...

!> \param iatom_start ...

!> \param iatom_end ...

!> \param jatom_start ...

!> \param jatom_end ...

!> \param katom_start ...

!> \param katom_end ...

!> \param latom_start ...

!> \param latom_end ...

!> \param particle_set ...

!> \param coeffs_set ...

!> \param coeffs_kind ...

!> \param is_assoc_atomic_block_global ...

!> \param do_periodic ...

!> \param kind_of ...

!> \param basis_parameter ...

!> \param pmax_set ...

!> \param pmax_atom ...

!> \param pmax_blocks ...

!> \param cell ...

!> \param do_p_screening ...

!> \param map_atom_to_kind_atom ...

!> \param eval_type ...

!> \param log10_eps_schwarz ...

!> \param log_2 ...

!> \param coeffs_kind_max0 ...

!> \param use_virial ...

!> \param atomic_pair_list ...

!> \return ...

! **************************************************************************************************

   FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &

                                iatom_start, iatom_end, jatom_start, jatom_end, &

                                katom_start, katom_end, latom_start, latom_end, &

                                particle_set, &

                                coeffs_set, coeffs_kind, &

                                is_assoc_atomic_block_global, do_periodic, &

                                kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &

                                cell, &

                                do_p_screening, map_atom_to_kind_atom, eval_type, &

                                log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &

                                atomic_pair_list)


      INTEGER, INTENT(IN)                                :: natom, nkind

      TYPE(pair_list_type)                               :: list_ij, list_kl

      TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl

      INTEGER, INTENT(IN)                                :: iatom_start, iatom_end, jatom_start, &

                                                            jatom_end, katom_start, katom_end, &

                                                            latom_start, latom_end

      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set

      TYPE(hfx_screen_coeff_type), &

         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set

      TYPE(hfx_screen_coeff_type), &

         DIMENSION(nkind, nkind)                         :: coeffs_kind

      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global

      LOGICAL                                            :: do_periodic

      INTEGER                                            :: kind_of(*)

      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter

      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set

      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom

      REAL(dp)                                           :: pmax_blocks

      TYPE(cell_type), POINTER                           :: cell

      LOGICAL, INTENT(IN)                                :: do_p_screening

      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom

      INTEGER, INTENT(IN)                                :: eval_type

      REAL(dp)                                           :: log10_eps_schwarz, log_2, &

                                                            coeffs_kind_max0

      LOGICAL, INTENT(IN)                                :: use_virial

      LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list

      INTEGER(int_8)                                     :: estimate_block_cost


      INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &

                 i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &

                 jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id

      INTEGER, DIMENSION(:), POINTER                     :: npgfa, npgfb, npgfc, npgfd, nsgfa, &

                                                            nsgfb, nsgfc, nsgfd

      REAL(dp)                                           :: actual_pmax_atom, cost_tmp, max_val1, &

                                                            max_val2, pmax_entry, rab2, rcd2, &

                                                            screen_kind_ij, screen_kind_kl

      REAL(dp), DIMENSION(:, :), POINTER                 :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4


      estimate_block_cost = 0_int_8


      CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &

                           kind_of, basis_parameter, particle_set, &

                           do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &

                           log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)


      CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &

                           kind_of, basis_parameter, particle_set, &

                           do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &

                           log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)


      DO i_list_ij = 1, list_ij%n_element

         iatom = list_ij%elements(i_list_ij)%pair(1)

         jatom = list_ij%elements(i_list_ij)%pair(2)

         i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)

         i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)

         ikind = list_ij%elements(i_list_ij)%kind_pair(1)

         jkind = list_ij%elements(i_list_ij)%kind_pair(2)

         rab2 = list_ij%elements(i_list_ij)%dist2


         nsgfa => basis_parameter(ikind)%nsgf

         nsgfb => basis_parameter(jkind)%nsgf

         npgfa => basis_parameter(ikind)%npgf

         npgfb => basis_parameter(jkind)%npgf


         DO i_list_kl = 1, list_kl%n_element


            katom = list_kl%elements(i_list_kl)%pair(1)

            latom = list_kl%elements(i_list_kl)%pair(2)


            IF (.NOT. (katom + latom <= iatom + jatom)) cycle

            IF (((iatom + jatom) == (katom + latom)) .AND. (katom < iatom)) cycle


            IF (eval_type == hfx_do_eval_forces) THEN

               IF (.NOT. use_virial) THEN

                  IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) cycle

               END IF

            END IF


            i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)

            i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)

            kkind = list_kl%elements(i_list_kl)%kind_pair(1)

            lkind = list_kl%elements(i_list_kl)%kind_pair(2)

            rcd2 = list_kl%elements(i_list_kl)%dist2


            nsgfc => basis_parameter(kkind)%nsgf

            nsgfd => basis_parameter(lkind)%nsgf

            npgfc => basis_parameter(kkind)%npgf

            npgfd => basis_parameter(lkind)%npgf


            IF (do_p_screening) THEN

               actual_pmax_atom = max(pmax_atom(katom, iatom), &

                                      pmax_atom(latom, jatom), &

                                      pmax_atom(latom, iatom), &

                                      pmax_atom(katom, jatom))

            ELSE

               actual_pmax_atom = 0.0_dp

            END IF


            screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &

                             coeffs_kind(jkind, ikind)%x(2)

            screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &

                             coeffs_kind(lkind, kkind)%x(2)

            IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle


            IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &

                       is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &

                       is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &

                       is_assoc_atomic_block_global(latom, jatom) >= 1)) cycle


            IF (do_p_screening) THEN

               SELECT CASE (eval_type)

               CASE (hfx_do_eval_energy)

                  swap_id = 0

                  kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))

                  IF (ikind >= kkind) THEN

                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(katom), &

                                                               map_atom_to_kind_atom(iatom))

                  ELSE

                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(iatom), &

                                                               map_atom_to_kind_atom(katom))

                     swap_id = swap_id + 1

                  END IF

                  kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))

                  IF (jkind >= lkind) THEN

                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(latom), &

                                                               map_atom_to_kind_atom(jatom))

                  ELSE

                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(jatom), &

                                                               map_atom_to_kind_atom(latom))

                     swap_id = swap_id + 2

                  END IF

                  kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))

                  IF (ikind >= lkind) THEN

                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(latom), &

                                                               map_atom_to_kind_atom(iatom))

                  ELSE

                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(iatom), &

                                                               map_atom_to_kind_atom(latom))

                     swap_id = swap_id + 4

                  END IF

                  kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))

                  IF (jkind >= kkind) THEN

                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(katom), &

                                                               map_atom_to_kind_atom(jatom))

                  ELSE

                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(jatom), &

                                                               map_atom_to_kind_atom(katom))

                     swap_id = swap_id + 8

                  END IF

               CASE (hfx_do_eval_forces)

                  swap_id = 16

                  kind_kind_idx = int(get_1d_idx(kkind, ikind, int(nkind, int_8)))

                  IF (ikind >= kkind) THEN

                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(katom), &

                                                               map_atom_to_kind_atom(iatom))

                  ELSE

                     ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(iatom), &

                                                               map_atom_to_kind_atom(katom))

                     swap_id = swap_id + 1

                  END IF

                  kind_kind_idx = int(get_1d_idx(lkind, jkind, int(nkind, int_8)))

                  IF (jkind >= lkind) THEN

                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(latom), &

                                                               map_atom_to_kind_atom(jatom))

                  ELSE

                     ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(jatom), &

                                                               map_atom_to_kind_atom(latom))

                     swap_id = swap_id + 2

                  END IF

                  kind_kind_idx = int(get_1d_idx(lkind, ikind, int(nkind, int_8)))

                  IF (ikind >= lkind) THEN

                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(latom), &

                                                               map_atom_to_kind_atom(iatom))

                  ELSE

                     ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(iatom), &

                                                               map_atom_to_kind_atom(latom))

                     swap_id = swap_id + 4

                  END IF

                  kind_kind_idx = int(get_1d_idx(kkind, jkind, int(nkind, int_8)))

                  IF (jkind >= kkind) THEN

                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(katom), &

                                                               map_atom_to_kind_atom(jatom))

                  ELSE

                     ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &

                                                               map_atom_to_kind_atom(jatom), &

                                                               map_atom_to_kind_atom(katom))

                     swap_id = swap_id + 8

                  END IF

               END SELECT

            END IF


            DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop

               iset = set_list_ij(i_set_list_ij)%pair(1)

               jset = set_list_ij(i_set_list_ij)%pair(2)


               max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &

                          coeffs_set(jset, iset, jkind, ikind)%x(2)


               IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) cycle

               DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop

                  kset = set_list_kl(i_set_list_kl)%pair(1)

                  lset = set_list_kl(i_set_list_kl)%pair(2)


                  max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &

                                         coeffs_set(lset, kset, lkind, kkind)%x(2))


                  IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) cycle

                  IF (do_p_screening) THEN

                     CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &

                                       iset, jset, kset, lset, &

                                       pmax_entry, swap_id)

                     IF (eval_type == hfx_do_eval_forces) THEN

                        pmax_entry = log_2 + pmax_entry

                     END IF

                  ELSE

                     pmax_entry = 0.0_dp

                  END IF

                  max_val2 = max_val2 + pmax_entry

                  IF (max_val2 < log10_eps_schwarz) cycle

                  SELECT CASE (eval_type)

                  CASE (hfx_do_eval_energy)

                     cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &

                                           npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &

                                           max_val2/log10_eps_schwarz, &

                                           p1_energy, p2_energy, p3_energy)

                     estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)

                  CASE (hfx_do_eval_forces)

                     cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &

                                           npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &

                                           max_val2/log10_eps_schwarz, &

                                           p1_forces, p2_forces, p3_forces)

                     estimate_block_cost = estimate_block_cost + int(cost_tmp, kind=int_8)

                  END SELECT

               END DO ! i_set_list_kl

            END DO ! i_set_list_ij

         END DO ! i_list_kl

      END DO ! i_list_ij


   END FUNCTION estimate_block_cost


! **************************************************************************************************

!> \brief Sets up the atom block ranges and estimates the cost of the blocks assigned to the calling MPI rank

!> \param nkind ...

!> \param para_env ...

!> \param natom ...

!> \param block_size ...

!> \param nblock ...

!> \param blocks ...

!> \param list_ij ...

!> \param list_kl ...

!> \param set_list_ij ...

!> \param set_list_kl ...

!> \param particle_set ...

!> \param coeffs_set ...

!> \param coeffs_kind ...

!> \param is_assoc_atomic_block_global ...

!> \param do_periodic ...

!> \param kind_of ...

!> \param basis_parameter ...

!> \param pmax_set ...

!> \param pmax_atom ...

!> \param pmax_blocks ...

!> \param cell ...

!> \param do_p_screening ...

!> \param map_atom_to_kind_atom ...

!> \param eval_type ...

!> \param log10_eps_schwarz ...

!> \param log_2 ...

!> \param coeffs_kind_max0 ...

!> \param use_virial ...

!> \param atomic_pair_list ...

! **************************************************************************************************

   SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
                          list_ij, list_kl, set_list_ij, set_list_kl, &
                          particle_set, &
                          coeffs_set, coeffs_kind, &
                          is_assoc_atomic_block_global, do_periodic, &
                          kind_of, basis_parameter, pmax_set, pmax_atom, &
                          pmax_blocks, cell, &
                          do_p_screening, map_atom_to_kind_atom, eval_type, &
                          log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
                          atomic_pair_list)

      ! Splits the atoms 1..natom into nblock consecutive ranges of at most
      ! block_size atoms and fills blocks(:) accordingly.
      !
      ! Block i is handled by the rank for which modulo(i, num_pe) == mepos:
      !   - handled blocks get their atom range [istart, iend] and the cost
      !     returned by estimate_block_cost for the diagonal quartet
      !     (the same atom range is passed for all four index ranges),
      !   - all other blocks are returned with istart = iend = 0 and cost = 0.
      !
      ! Every argument other than para_env, natom, block_size, nblock and blocks
      ! is only forwarded unchanged to estimate_block_cost.

      INTEGER, INTENT(IN)                                :: nkind
      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
      INTEGER                                            :: natom, block_size, nblock
      TYPE(hfx_block_range_type), DIMENSION(1:nblock)    :: blocks
      TYPE(pair_list_type)                               :: list_ij, list_kl
      TYPE(pair_set_list_type), DIMENSION(:)             :: set_list_ij, set_list_kl
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :), POINTER                  :: coeffs_set
      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
         POINTER                                         :: coeffs_kind
      INTEGER, DIMENSION(:, :)                           :: is_assoc_atomic_block_global
      LOGICAL                                            :: do_periodic
      INTEGER                                            :: kind_of(*)
      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: pmax_set
      REAL(dp), DIMENSION(:, :), POINTER                 :: pmax_atom
      REAL(dp)                                           :: pmax_blocks
      TYPE(cell_type), POINTER                           :: cell
      LOGICAL, INTENT(IN)                                :: do_p_screening
      INTEGER, DIMENSION(:), POINTER                     :: map_atom_to_kind_atom
      INTEGER, INTENT(IN)                                :: eval_type
      REAL(dp)                                           :: log10_eps_schwarz, log_2, &
                                                            coeffs_kind_max0
      LOGICAL, INTENT(IN)                                :: use_virial
      LOGICAL, DIMENSION(natom, natom)                   :: atomic_pair_list

      INTEGER                                            :: first_atom, iblock, last_atom, &
                                                            n_ranks, this_rank

      n_ranks = para_env%num_pe
      this_rank = para_env%mepos

      DO iblock = 1, nblock
         IF (modulo(iblock, n_ranks) == this_rank) THEN
            ! Block handled here: contiguous atom range, last block clipped to natom
            first_atom = (iblock - 1)*block_size + 1
            last_atom = min(iblock*block_size, natom)
            blocks(iblock)%istart = first_atom
            blocks(iblock)%iend = last_atom
            blocks(iblock)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
                                                      first_atom, last_atom, first_atom, last_atom, &
                                                      first_atom, last_atom, first_atom, last_atom, &
                                                      particle_set, &
                                                      coeffs_set, coeffs_kind, &
                                                      is_assoc_atomic_block_global, do_periodic, &
                                                      kind_of, basis_parameter, pmax_set, pmax_atom, &
                                                      pmax_blocks, cell, &
                                                      do_p_screening, map_atom_to_kind_atom, eval_type, &
                                                      log10_eps_schwarz, log_2, coeffs_kind_max0, &
                                                      use_virial, atomic_pair_list)
         ELSE
            ! Block handled by another rank: leave an empty placeholder entry
            blocks(iblock)%istart = 0
            blocks(iblock)%iend = 0
            blocks(iblock)%cost = 0_int_8
         END IF
      END DO

   END SUBROUTINE init_blocks


! **************************************************************************************************

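!> \brief Gathers the estimated cost and measured time of every bin from all threads and MPI ranks and, if iw > 0, prints a load balance report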

!> \param para_env ...

!> \param x_data ...

!> \param iw ...

!> \param n_threads ...

!> \param i_thread ...

!> \param eval_type ...

! **************************************************************************************************


   SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
                                        eval_type)

      ! Collects, for every bin of the energy or force distribution (selected by
      ! eval_type), the estimated cost and the measured time, first across the
      ! threads of this process and then across all MPI ranks, and writes a
      ! report to unit iw when iw > 0.
      !
      ! Times are transported as 8-byte integers scaled by 10000 and converted
      ! back to seconds when printed.
      !
      ! NOTE(review): the orphaned OpenMP directives and the SAVEd shm_* arrays
      ! indexed by i_thread + 1 suggest that every thread of a parallel region
      ! is expected to call this routine with a zero-based i_thread - confirm
      ! against the callers.

      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
      TYPE(hfx_type), POINTER                            :: x_data
      INTEGER, INTENT(IN)                                :: iw, n_threads, i_thread, eval_type

      INTEGER                                            :: first_slot, ibin, ientry, irank, islot, &
                                                            ithread, my_rank, n_my_bins, nbins, &
                                                            nranks, total_bins
      INTEGER(int_8)                                     :: avg_bin, avg_rank, max_bin, max_rank, &
                                                            min_bin, min_rank, sum_bin, sum_rank
      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: buffer_in, buffer_out, rank_time, summary
      INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE    :: shm_cost_vector
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: bins_per_rank, rdispl, sort_idx
      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: shm_bins_per_rank, shm_displ

      ! Number of bins owned by the calling thread
      SELECT CASE (eval_type)
      CASE (hfx_do_eval_energy)
         nbins = SIZE(x_data%distribution_energy)
      CASE (hfx_do_eval_forces)
         nbins = SIZE(x_data%distribution_forces)
      END SELECT

!$OMP MASTER
      ALLOCATE (shm_bins_per_rank(n_threads))
      ALLOCATE (shm_displ(n_threads + 1))
!$OMP END MASTER
!$OMP BARRIER

      shm_bins_per_rank(i_thread + 1) = nbins
!$OMP BARRIER
      ! From here on nbins is the bin count of the whole process
      nbins = SUM(shm_bins_per_rank(1:n_threads))
      my_rank = para_env%mepos
      nranks = para_env%num_pe

!$OMP BARRIER
!$OMP MASTER
      ALLOCATE (bins_per_rank(nranks))
      bins_per_rank = 0
      bins_per_rank(my_rank + 1) = nbins
      CALL para_env%sum(bins_per_rank)

      total_bins = SUM(bins_per_rank(1:nranks))

      ! Two entries per bin: (cost, scaled time)
      ALLOCATE (shm_cost_vector(2*total_bins))
      shm_cost_vector = -1_int_8

      ! One-based start slot of every thread inside the shared vector
      shm_displ(1) = 1
      DO ithread = 2, n_threads
         shm_displ(ithread) = shm_displ(ithread - 1) + shm_bins_per_rank(ithread - 1)
      END DO
      shm_displ(n_threads + 1) = nbins + 1
!$OMP END MASTER
!$OMP BARRIER

      ! Every thread copies its own bins into its slice of the shared vector
      first_slot = shm_displ(i_thread + 1)
      n_my_bins = shm_displ(i_thread + 2) - first_slot
      SELECT CASE (eval_type)
      CASE (hfx_do_eval_energy)
         DO ibin = 1, n_my_bins
            islot = first_slot + ibin - 1
            shm_cost_vector(2*islot - 1) = x_data%distribution_energy(ibin)%cost
            shm_cost_vector(2*islot) = int(x_data%distribution_energy(ibin)%time_first_scf*10000.0_dp, kind=int_8)
         END DO
      CASE (hfx_do_eval_forces)
         DO ibin = 1, n_my_bins
            islot = first_slot + ibin - 1
            shm_cost_vector(2*islot - 1) = x_data%distribution_forces(ibin)%cost
            shm_cost_vector(2*islot) = int(x_data%distribution_forces(ibin)%time_forces*10000.0_dp, kind=int_8)
         END DO
      END SELECT
!$OMP BARRIER
!$OMP MASTER
      ! Receive counts and displacements in units of vector entries (2 per bin)
      ALLOCATE (rdispl(nranks))
      bins_per_rank(:) = 2*bins_per_rank(:)
      rdispl(1) = 0
      DO irank = 2, nranks
         rdispl(irank) = rdispl(irank - 1) + bins_per_rank(irank - 1)
      END DO

      ALLOCATE (buffer_in(2*nbins))
      ALLOCATE (buffer_out(2*total_bins))

      buffer_in(1:2*nbins) = shm_cost_vector(1:2*nbins)

      CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)

      IF (iw > 0) THEN

         ! summary(2*r - 1) / summary(2*r): accumulated cost / scaled time of rank r - 1
         ALLOCATE (summary(2*nranks))
         summary = 0_int_8

         WRITE (iw, '( /, 1X, 79("-") )')
         WRITE (iw, '( " -", 77X, "-" )')
         SELECT CASE (eval_type)
         CASE (hfx_do_eval_energy)
            WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
         CASE (hfx_do_eval_forces)
            WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
         END SELECT
         WRITE (iw, '( " -", 77X, "-" )')
         WRITE (iw, '( 1X, 79("-") )')

         WRITE (iw, fmt="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
         WRITE (iw, '( 1X, 79("-"), / )')

         ! Per-bin listing; ientry runs over all gathered bins in rank order
         ientry = 0
         DO irank = 1, nranks
            DO ibin = 1, bins_per_rank(irank)/2
               ientry = ientry + 1
               WRITE (iw, fmt="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
                  irank - 1, ibin, buffer_out(2*ientry - 1), real(buffer_out(2*ientry), dp)/10000.0_dp
               summary(2*irank - 1) = summary(2*irank - 1) + buffer_out(2*ientry - 1)
               summary(2*irank) = summary(2*irank) + buffer_out(2*ientry)
            END DO
         END DO

         !** Summary
         ! Statistics over the time entries (even positions) of all bins
         sum_bin = SUM(buffer_out(2:2*total_bins:2))
         max_bin = max(0_int_8, MAXVAL(buffer_out(2:2*total_bins:2)))
         min_bin = min(huge(min_bin), MINVAL(buffer_out(2:2*total_bins:2)))
         avg_bin = sum_bin/total_bins

         ! Same statistics over the accumulated time of every rank
         sum_rank = SUM(summary(2:2*nranks:2))
         max_rank = max(0_int_8, MAXVAL(summary(2:2*nranks:2)))
         min_rank = min(huge(min_rank), MINVAL(summary(2:2*nranks:2)))
         avg_rank = sum_rank/nranks

         WRITE (iw, fmt='(/,T3,A,/)') "SUMMARY:"
         WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max bin", real(max_bin, dp)/10000.0_dp
         WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min bin", real(min_bin, dp)/10000.0_dp
         WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum bin", real(sum_bin, dp)/10000.0_dp
         WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg bin", real(avg_bin, dp)/10000.0_dp
         WRITE (iw, fmt="(T3,A,T35,F19.8)") "Max rank", real(max_rank, dp)/10000.0_dp
         WRITE (iw, fmt="(T3,A,T35,F19.8)") "Min rank", real(min_rank, dp)/10000.0_dp
         WRITE (iw, fmt="(T3,A,T35,F19.8)") "Sum rank", real(sum_rank, dp)/10000.0_dp
         WRITE (iw, fmt="(T3,A,T35,F19.8,/)") "Avg rank", real(avg_rank, dp)/10000.0_dp

         ! Rank table ordered by accumulated time, printed from the end of the
         ! sorted list towards its beginning
         ALLOCATE (rank_time(nranks))
         ALLOCATE (sort_idx(nranks))

         rank_time(1:nranks) = summary(2:2*nranks:2)

         CALL sort(rank_time, nranks, sort_idx)

         WRITE (iw, fmt="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
         DO irank = nranks, 1, -1
            WRITE (iw, fmt="(T6,I5,T27,I16,T55,F19.8)") &
               sort_idx(irank) - 1, summary(2*sort_idx(irank) - 1), real(rank_time(irank), dp)/10000.0_dp
         END DO

         DEALLOCATE (summary, rank_time, sort_idx)

      END IF

      DEALLOCATE (buffer_in, buffer_out, rdispl)

      CALL para_env%sync()

      ! Release the thread-shared scratch arrays allocated above
      DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
!$OMP END MASTER
!$OMP BARRIER

   END SUBROUTINE collect_load_balance_info


! **************************************************************************************************

!> \brief This routine calculates the maximum density matrix element, when

!>        screening on an initial density matrix is applied. Due to symmetry of

!>        the ERI's, there are always 4 matrix elements to be considered.

!>        CASE 0-15 belong to an energy calculation (linear screening)

!>        CASE 16-31 belong to a force calculation (square screening)

!> \param ptr_p_1 Pointers to atomic density matrices

!> \param ptr_p_2 Pointers to atomic density matrices

!> \param ptr_p_3 Pointers to atomic density matrices

!> \param ptr_p_4 Pointers to atomic density matrices

!> \param iset Current set

!> \param jset Current set

!> \param kset Current set

!> \param lset Current set

!> \param pmax_val value to be calculated

!> \param swap_id Defines how the matrices are accessed

!> \par History

!>      06.2009 created [Manuel Guidon]

!> \author Manuel Guidon

! **************************************************************************************************

PURE SUBROUTINE get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, iset, jset, kset, lset, pmax_val, swap_id)

   ! Returns in pmax_val the screening value built from the four density
   ! matrix blocks that belong to the set quartet (iset, jset, kset, lset).
   !
   ! swap_id is a bit field:
   !   bit 0 (value  1): read ptr_p_1 as (iset, kset) instead of (kset, iset)
   !   bit 1 (value  2): read ptr_p_2 as (jset, lset) instead of (lset, jset)
   !   bit 2 (value  4): read ptr_p_3 as (iset, lset) instead of (lset, iset)
   !   bit 3 (value  8): read ptr_p_4 as (jset, kset) instead of (kset, jset)
   !   bit 4 (value 16): combine as max(p1 + p2, p3 + p4) (force calculation)
   !                     instead of max(p1, p2, p3, p4) (energy calculation)
   !
   ! For swap_id in 0..31 the result is identical to the former explicit
   ! 32-way SELECT CASE. Decoding the bits additionally guarantees that the
   ! INTENT(OUT) argument pmax_val is always defined.

   REAL(dp), DIMENSION(:, :), POINTER       :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
   INTEGER, INTENT(IN)                      :: iset, jset, kset, lset

   REAL(dp), INTENT(OUT)                    :: pmax_val
   INTEGER, INTENT(IN)                      :: swap_id

   REAL(dp)                                 :: pmax_1, pmax_2, pmax_3, pmax_4

   ! IF/ELSE is used on purpose instead of MERGE: MERGE would evaluate both
   ! index orders, and the transposed one may lie outside the block bounds.
   IF (BTEST(swap_id, 0)) THEN
      pmax_1 = ptr_p_1(iset, kset)
   ELSE
      pmax_1 = ptr_p_1(kset, iset)
   END IF

   IF (BTEST(swap_id, 1)) THEN
      pmax_2 = ptr_p_2(jset, lset)
   ELSE
      pmax_2 = ptr_p_2(lset, jset)
   END IF

   IF (BTEST(swap_id, 2)) THEN
      pmax_3 = ptr_p_3(iset, lset)
   ELSE
      pmax_3 = ptr_p_3(lset, iset)
   END IF

   IF (BTEST(swap_id, 3)) THEN
      pmax_4 = ptr_p_4(jset, kset)
   ELSE
      pmax_4 = ptr_p_4(kset, jset)
   END IF

   IF (BTEST(swap_id, 4)) THEN
      ! Force calculation: pairwise sums of the two exchange-type pairs
      pmax_val = max(pmax_1 + pmax_2, pmax_3 + pmax_4)
   ELSE
      ! Energy calculation: plain maximum of the four entries
      pmax_val = max(pmax_1, pmax_2, pmax_3, pmax_4)
   END IF

END SUBROUTINE get_pmax_val


END MODULE hfx_load_balance_methods

modulo
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always returns a positive number. https://gcc....
Definition grid_common.h:125

idx
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
Definition grid_common.h:161

message_passing::mp_waitall
Definition message_passing.F:850

util::sort
Definition util.F:31

cell_types
Handles all functions related to the CELL.
Definition cell_types.F:15

cp_files
Utility routines to open and close files. Tracking of preconnections.
Definition cp_files.F:16

cp_files::open_file
subroutine, public open_file(file_name, file_status, file_form, file_action, file_position, file_pad, unit_number, debug, skip_get_unit_number, file_access)
Opens the requested file using a free unit number.
Definition cp_files.F:311

cp_files::close_file
subroutine, public close_file(unit_number, file_status, keep_preconnection)
Close an open file given by its logical unit number. Optionally, keep the file and unit preconnected.
Definition cp_files.F:122

hfx_load_balance_methods
Routines for optimizing load balance between processes in HFX calculations.
Definition hfx_load_balance_methods.F:14

hfx_load_balance_methods::p1_energy
real(kind=dp), dimension(12), parameter, public p1_energy
Definition hfx_load_balance_methods.F:45

hfx_load_balance_methods::p3_energy
real(kind=dp), dimension(2), parameter, public p3_energy
Definition hfx_load_balance_methods.F:57

hfx_load_balance_methods::p2_energy
real(kind=dp), dimension(12), parameter, public p2_energy
Definition hfx_load_balance_methods.F:51

hfx_load_balance_methods::collect_load_balance_info
subroutine, public collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, eval_type)
...
Definition hfx_load_balance_methods.F:2330

hfx_load_balance_methods::hfx_load_balance
subroutine, public hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, coeffs_set, coeffs_kind, is_assoc_atomic_block_global, do_periodic, load_balance_parameter, kind_of, basis_parameter, pmax_set, pmax_atom, i_thread, n_threads, cell, do_p_screening, map_atom_to_kind_atom, nkind, eval_type, pmax_block, use_virial)
Distributes the computation of ERIs to all available processes.
Definition hfx_load_balance_methods.F:130

hfx_load_balance_methods::cost_model
integer(kind=int_8) function, public cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3)
Estimates the cost of a set quartet with info available at load balance time, i.e. without much info o...
Definition hfx_load_balance_methods.F:1766

hfx_load_balance_methods::hfx_update_load_balance
subroutine, public hfx_update_load_balance(x_data, para_env, load_balance_parameter, i_thread, n_threads, eval_type)
Cheap way of redistributing the ERIs.
Definition hfx_load_balance_methods.F:1483

hfx_pair_list_methods
Routines for optimizing load balance between processes in HFX calculations.
Definition hfx_pair_list_methods.F:15

hfx_pair_list_methods::build_atomic_pair_list
subroutine, public build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, blocks)
...
Definition hfx_pair_list_methods.F:686

hfx_pair_list_methods::build_pair_list
subroutine, public build_pair_list(natom, list, set_list, i_start, i_end, j_start, j_end, kind_of, basis_parameter, particle_set, do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
...
Definition hfx_pair_list_methods.F:591

hfx_types
Types and set/get functions for HFX.
Definition hfx_types.F:16

hfx_types::hfx_set_distr_energy
subroutine, public hfx_set_distr_energy(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the energy
Definition hfx_types.F:2639

hfx_types::hfx_set_distr_forces
subroutine, public hfx_set_distr_forces(ptr_to_distr, x_data)
This routine stores the data obtained from the load balance routine for the forces
Definition hfx_types.F:2659

input_constants
collects all constants needed in input so that they can be used without circular dependencies
Definition input_constants.F:18

input_constants::hfx_do_eval_energy
integer, parameter, public hfx_do_eval_energy
Definition input_constants.F:856

input_constants::hfx_do_eval_forces
integer, parameter, public hfx_do_eval_forces
Definition input_constants.F:856

kinds
Defines the basic variable types.
Definition kinds.F:23

kinds::int_8
integer, parameter, public int_8
Definition kinds.F:54

kinds::dp
integer, parameter, public dp
Definition kinds.F:34

message_passing
Interface to the message passing library MPI.
Definition message_passing.F:23

parallel_rng_types
Parallel (pseudo)random number generator (RNG) for multiple streams and substreams of random numbers.
Definition parallel_rng_types.F:51

parallel_rng_types::uniform
integer, parameter, public uniform
Definition parallel_rng_types.F:73

particle_types
Define the data structure for the particle information.
Definition particle_types.F:19

util
All kinds of helpful little routines.
Definition util.F:14

cell_types::cell_type
Type defining parameters related to the simulation cell.
Definition cell_types.F:60

hfx_types::hfx_basis_type
Definition hfx_types.F:267

hfx_types::hfx_block_range_type
Definition hfx_types.F:344

hfx_types::hfx_distribution
Definition hfx_types.F:214

hfx_types::hfx_load_balance_type
Definition hfx_types.F:191

hfx_types::hfx_p_kind
Definition hfx_types.F:301

hfx_types::hfx_screen_coeff_type
Definition hfx_types.F:296

hfx_types::hfx_type
Stores some data used in the construction of the Kohn-Sham matrix.
Definition hfx_types.F:514

hfx_types::pair_list_type
Definition hfx_types.F:239

hfx_types::pair_set_list_type
Definition hfx_types.F:234

message_passing::mp_para_env_type
Stores all the information relevant to an MPI environment.
Definition message_passing.F:743

message_passing::mp_request_type
Definition message_passing.F:603

parallel_rng_types::rng_stream_type
Definition parallel_rng_types.F:157

particle_types::particle_type
Definition particle_types.F:35