db/dff/xas__tdp__correction_8F_source.html

!--------------------------------------------------------------------------------------------------!

!   CP2K: A general program to perform molecular dynamics simulations                              !

!   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !

!                                                                                                  !

!   SPDX-License-Identifier: GPL-2.0-or-later                                                      !

!--------------------------------------------------------------------------------------------------!


! **************************************************************************************************

!> \brief Second order perturbation correction to XAS_TDP spectra (i.e. shift)

!> \author A. Bussy (01.2020)

! **************************************************************************************************


MODULE xas_tdp_correction

   USE admm_types,                      ONLY: admm_type

   USE admm_utils,                      ONLY: admm_correct_for_eigenvalues

   USE bibliography,                    ONLY: bussy2021b,&

                                              shigeta2001,&

                                              cite_reference

   USE cp_array_utils,                  ONLY: cp_1d_i_p_type,&

                                              cp_1d_r_p_type

   USE cp_blacs_env,                    ONLY: cp_blacs_env_type

   USE cp_cfm_types,                    ONLY: cp_cfm_create,&

                                              cp_cfm_get_submatrix,&

                                              cp_cfm_release,&

                                              cp_cfm_type,&

                                              cp_fm_to_cfm

   USE cp_control_types,                ONLY: dft_control_type

   USE cp_dbcsr_operations,             ONLY: copy_fm_to_dbcsr,&

                                              cp_dbcsr_sm_fm_multiply,&

                                              dbcsr_deallocate_matrix_set

   USE cp_fm_diag,                      ONLY: choose_eigv_solver

   USE cp_fm_struct,                    ONLY: cp_fm_struct_create,&

                                              cp_fm_struct_p_type,&

                                              cp_fm_struct_release,&

                                              cp_fm_struct_type

   USE cp_fm_types,                     ONLY: cp_fm_create,&

                                              cp_fm_get_diag,&

                                              cp_fm_get_submatrix,&

                                              cp_fm_release,&

                                              cp_fm_to_fm,&

                                              cp_fm_to_fm_submat,&

                                              cp_fm_type

   USE cp_log_handling,                 ONLY: cp_logger_get_default_io_unit

   USE dbcsr_api,                       ONLY: &

        dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &

        dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_info, dbcsr_p_type, &

        dbcsr_release, dbcsr_type

   USE dbt_api,                         ONLY: &

        dbt_contract, dbt_copy, dbt_copy_matrix_to_tensor, dbt_create, dbt_default_distvec, &

        dbt_destroy, dbt_distribution_destroy, dbt_distribution_new, dbt_distribution_type, &

        dbt_finalize, dbt_get_block, dbt_get_info, dbt_iterator_blocks_left, &

        dbt_iterator_next_block, dbt_iterator_start, dbt_iterator_stop, dbt_iterator_type, &

        dbt_pgrid_create, dbt_pgrid_destroy, dbt_pgrid_type, dbt_put_block, dbt_type

   USE hfx_admm_utils,                  ONLY: create_admm_xc_section

   USE input_section_types,             ONLY: section_vals_create,&

                                              section_vals_get_subs_vals,&

                                              section_vals_release,&

                                              section_vals_retain,&

                                              section_vals_set_subs_vals,&

                                              section_vals_type

   USE kinds,                           ONLY: dp

   USE machine,                         ONLY: m_flush

   USE mathlib,                         ONLY: complex_diag

   USE message_passing,                 ONLY: mp_para_env_type

   USE parallel_gemm_api,               ONLY: parallel_gemm

   USE physcon,                         ONLY: evolt

   USE qs_environment_types,            ONLY: get_qs_env,&

                                              qs_environment_type

   USE qs_ks_methods,                   ONLY: qs_ks_build_kohn_sham_matrix

   USE qs_mo_types,                     ONLY: deallocate_mo_set,&

                                              duplicate_mo_set,&

                                              get_mo_set,&

                                              mo_set_type,&

                                              reassign_allocated_mos

   USE util,                            ONLY: get_limit

   USE xas_tdp_kernel,                  ONLY: contract2_ao_to_domo,&

                                              ri_all_blocks_mm

   USE xas_tdp_types,                   ONLY: donor_state_type,&

                                              xas_tdp_control_type,&

                                              xas_tdp_env_type


!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num

#include "./base/base_uses.f90"


   IMPLICIT NONE

   PRIVATE


   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'xas_tdp_correction'


   PUBLIC :: gw2x_shift, get_soc_splitting


CONTAINS


! **************************************************************************************************

!> \brief Computes the ionization potential using the GW2X method of Shigeta et. al. The result cam

!>        be used for XAS correction (shift) or XPS directly.

!> \param donor_state ...

!> \param xas_tdp_env ...

!> \param xas_tdp_control ...

!> \param qs_env ...

! **************************************************************************************************


   SUBROUTINE gw2x_shift(donor_state, xas_tdp_env, xas_tdp_control, qs_env)


      TYPE(donor_state_type), POINTER                    :: donor_state

      TYPE(xas_tdp_env_type), POINTER                    :: xas_tdp_env

      TYPE(xas_tdp_control_type), POINTER                :: xas_tdp_control

      TYPE(qs_environment_type), POINTER                 :: qs_env


      CHARACTER(len=*), PARAMETER                        :: routinen = 'GW2X_shift'


      INTEGER :: ex_idx, exat, first_domo(2), handle, i, ido_mo, iloc, ilocat, ispin, jspin, &

         locat, nao, natom, ndo_mo, nhomo(2), nlumo(2), nonloc, nspins, start_sgf

      INTEGER, DIMENSION(:), POINTER                     :: nsgf_blk

      LOGICAL                                            :: pseudo_canonical

      REAL(dp)                                           :: og_hfx_frac

      REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: contract_coeffs_backup

      TYPE(admm_type), POINTER                           :: admm_env

      TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: homo_evals, lumo_evals

      TYPE(cp_blacs_env_type), POINTER                   :: blacs_env

      TYPE(cp_fm_struct_p_type), ALLOCATABLE, &

         DIMENSION(:)                                    :: all_struct, homo_struct, lumo_struct

      TYPE(cp_fm_struct_type), POINTER                   :: hoho_struct, lulu_struct

      TYPE(cp_fm_type)                                   :: hoho_fock, hoho_work, homo_work, &

                                                            lulu_fock, lulu_work, lumo_work

      TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: all_coeffs, homo_coeffs, lumo_coeffs

      TYPE(cp_fm_type), POINTER                          :: mo_coeff

      TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: dbcsr_work, fock_matrix, matrix_ks

      TYPE(dbt_type), ALLOCATABLE, DIMENSION(:)          :: ja_x, oi_y

      TYPE(dbt_type), ALLOCATABLE, DIMENSION(:, :)       :: mo_template

      TYPE(dft_control_type), POINTER                    :: dft_control

      TYPE(mo_set_type), DIMENSION(:), POINTER           :: mos

      TYPE(mp_para_env_type), POINTER                    :: para_env

      TYPE(section_vals_type), POINTER                   :: xc_fun_empty, xc_fun_original, xc_section


      NULLIFY (xc_fun_empty, xc_fun_original, xc_section, mos, dft_control, dbcsr_work, &

               fock_matrix, matrix_ks, para_env, mo_coeff, blacs_env, nsgf_blk)


      CALL cite_reference(shigeta2001)

      CALL cite_reference(bussy2021b)


      CALL timeset(routinen, handle)


      !The GW2X correction we want to compute goes like this, where omega is the corrected epsilon_I:

      !omega = eps_I + 0.5 * sum_ajk |<Ia||jk>|^2/(omega + eps_a - eps_j - eps_k)

      !              + 0.5 * sum_jab |<Ij||ab>|^2/(omega + eps_j - eps_a - eps_b)

      ! j,k denote occupied spin-orbitals and a,b denote virtual spin orbitals


      !The strategy is the following (we assume restricted closed-shell):

      !1) Get the LUMOs from xas_tdp_env

      !2) Get the HOMOs from qs_env

      !3) Compute or fetch the generalize Fock matric

      !4) Diagonalize it in the subspace of HOMOs and LUMOs (or just take diagonal matrix elements)

      !5) Build the full HOMO-LUMO basis that we will use and compute eigenvalues

      !6) Iterate over GW2X steps to compute the self energy


      !We implement 2 approaches => diagonal elements of Fock matrix with original MOs and

      !pseudo-canonical MOs

      pseudo_canonical = xas_tdp_control%pseudo_canonical


      !Get donor state info

      ndo_mo = donor_state%ndo_mo

      nspins = 1; IF (xas_tdp_control%do_uks .OR. xas_tdp_control%do_roks) nspins = 2


      !1) Get the LUMO coefficients from the xas_tdp_env, that have been precomputed


      CALL get_qs_env(qs_env, matrix_ks=matrix_ks, mos=mos, para_env=para_env, &

                      blacs_env=blacs_env, natom=natom)


      ALLOCATE (lumo_struct(nspins), lumo_coeffs(nspins))


      DO ispin = 1, nspins

         CALL get_mo_set(mos(ispin), homo=nhomo(ispin), nao=nao)

         nlumo(ispin) = nao - nhomo(ispin)


         CALL cp_fm_struct_create(lumo_struct(ispin)%struct, para_env=para_env, context=blacs_env, &

                                  ncol_global=nlumo(ispin), nrow_global=nao)


         CALL cp_fm_create(lumo_coeffs(ispin), lumo_struct(ispin)%struct)

         CALL cp_fm_to_fm(xas_tdp_env%lumo_evecs(ispin), lumo_coeffs(ispin))

      END DO


      !2) get the HOMO coeffs. Reminder: keep all non-localized MOs + those localized on core atom

      !   For this to work, it is assumed that the LOCALIZE keyword is used

      ALLOCATE (homo_struct(nspins), homo_coeffs(nspins))


      DO ispin = 1, nspins

         nonloc = nhomo(ispin) - xas_tdp_control%n_search

         exat = donor_state%at_index

         ex_idx = minloc(abs(xas_tdp_env%ex_atom_indices - exat), 1)

         locat = count(xas_tdp_env%mos_of_ex_atoms(:, ex_idx, ispin) == 1)


         CALL cp_fm_struct_create(homo_struct(ispin)%struct, para_env=para_env, context=blacs_env, &

                                  ncol_global=locat + nonloc, nrow_global=nao)

         CALL cp_fm_create(homo_coeffs(ispin), homo_struct(ispin)%struct)


         CALL get_mo_set(mos(ispin), mo_coeff=mo_coeff)

         CALL cp_fm_to_fm_submat(mo_coeff, homo_coeffs(ispin), nrow=nao, ncol=nonloc, s_firstrow=1, &

                                 s_firstcol=xas_tdp_control%n_search + 1, t_firstrow=1, t_firstcol=locat + 1)


         !this bit is taken from xas_tdp_methods

         ilocat = 1

         DO iloc = 1, xas_tdp_control%n_search

            IF (xas_tdp_env%mos_of_ex_atoms(iloc, ex_idx, ispin) == -1) cycle

            CALL cp_fm_to_fm_submat(mo_coeff, homo_coeffs(ispin), nrow=nao, ncol=1, s_firstrow=1, &

                                    s_firstcol=iloc, t_firstrow=1, t_firstcol=ilocat)

            !keep track of donor MO index

            IF (iloc == donor_state%mo_indices(1, ispin)) first_domo(ispin) = ilocat !first donor MO


            ilocat = ilocat + 1

         END DO

         nhomo(ispin) = locat + nonloc

      END DO


      !3) Computing the generalized Fock Matrix, if not there already

      IF (ASSOCIATED(xas_tdp_env%fock_matrix)) THEN

         fock_matrix => xas_tdp_env%fock_matrix

      ELSE

         block

            TYPE(mo_set_type), DIMENSION(:), ALLOCATABLE :: backup_mos


            ALLOCATE (xas_tdp_env%fock_matrix(nspins))

            fock_matrix => xas_tdp_env%fock_matrix


            ! remove the xc_functionals and set HF fraction to 1

            xc_section => section_vals_get_subs_vals(qs_env%input, "DFT%XC")

            xc_fun_original => section_vals_get_subs_vals(xc_section, "XC_FUNCTIONAL")

            CALL section_vals_retain(xc_fun_original)

            CALL section_vals_create(xc_fun_empty, xc_fun_original%section)

            CALL section_vals_set_subs_vals(xc_section, "XC_FUNCTIONAL", xc_fun_empty)

            CALL section_vals_release(xc_fun_empty)

            og_hfx_frac = qs_env%x_data(1, 1)%general_parameter%fraction

            qs_env%x_data(:, :)%general_parameter%fraction = 1.0_dp


            !In case of ADMM, we need to re-create the admm XC section for the new hfx_fraction

            !We also need to make a backup of the MOs as theiy are modified

            CALL get_qs_env(qs_env, dft_control=dft_control, admm_env=admm_env)

            IF (dft_control%do_admm) THEN

               IF (ASSOCIATED(admm_env%xc_section_primary)) CALL section_vals_release(admm_env%xc_section_primary)

               IF (ASSOCIATED(admm_env%xc_section_aux)) CALL section_vals_release(admm_env%xc_section_aux)

               CALL create_admm_xc_section(qs_env%x_data, xc_section, admm_env)


               ALLOCATE (backup_mos(SIZE(mos)))

               DO i = 1, SIZE(mos)

                  CALL duplicate_mo_set(backup_mos(i), mos(i))

               END DO

            END IF


            ALLOCATE (dbcsr_work(nspins))

            DO ispin = 1, nspins

               ALLOCATE (dbcsr_work(ispin)%matrix)

               CALL dbcsr_copy(dbcsr_work(ispin)%matrix, matrix_ks(ispin)%matrix)

            END DO


            !both spins treated internally

            CALL qs_ks_build_kohn_sham_matrix(qs_env, calculate_forces=.false., just_energy=.false.)


            DO ispin = 1, nspins

               ALLOCATE (fock_matrix(ispin)%matrix)

               CALL dbcsr_copy(fock_matrix(ispin)%matrix, matrix_ks(ispin)%matrix, name="FOCK MATRIX")

               CALL dbcsr_release(matrix_ks(ispin)%matrix)

               CALL dbcsr_copy(matrix_ks(ispin)%matrix, dbcsr_work(ispin)%matrix)

            END DO

            CALL dbcsr_deallocate_matrix_set(dbcsr_work)


            !In case of ADMM, we want to correct for eigenvalues

            IF (dft_control%do_admm) THEN

               DO ispin = 1, nspins

                  CALL admm_correct_for_eigenvalues(ispin, admm_env, fock_matrix(ispin)%matrix)

               END DO

            END IF


            !restore xc and HF fraction

            CALL section_vals_set_subs_vals(xc_section, "XC_FUNCTIONAL", xc_fun_original)

            CALL section_vals_release(xc_fun_original)

            qs_env%x_data(:, :)%general_parameter%fraction = og_hfx_frac


            IF (dft_control%do_admm) THEN

               IF (ASSOCIATED(admm_env%xc_section_primary)) CALL section_vals_release(admm_env%xc_section_primary)

               IF (ASSOCIATED(admm_env%xc_section_aux)) CALL section_vals_release(admm_env%xc_section_aux)

               CALL create_admm_xc_section(qs_env%x_data, xc_section, admm_env)


               DO i = 1, SIZE(mos)

                  CALL reassign_allocated_mos(mos(i), backup_mos(i))

                  CALL deallocate_mo_set(backup_mos(i))

               END DO

               DEALLOCATE (backup_mos)

            END IF

         END block

      END IF


      !4,5) Build pseudo-canonical MOs if needed + get related Fock matrix elements

      ALLOCATE (all_struct(nspins), all_coeffs(nspins))

      ALLOCATE (homo_evals(nspins), lumo_evals(nspins))

      CALL dbcsr_get_info(matrix_ks(1)%matrix, row_blk_size=nsgf_blk)

      ALLOCATE (contract_coeffs_backup(nsgf_blk(exat), nspins*ndo_mo))


      DO ispin = 1, nspins

         CALL cp_fm_struct_create(hoho_struct, para_env=para_env, context=blacs_env, &

                                  ncol_global=nhomo(ispin), nrow_global=nhomo(ispin))

         CALL cp_fm_struct_create(lulu_struct, para_env=para_env, context=blacs_env, &

                                  ncol_global=nlumo(ispin), nrow_global=nlumo(ispin))


         CALL cp_fm_create(hoho_work, hoho_struct)

         CALL cp_fm_create(lulu_work, lulu_struct)

         CALL cp_fm_create(homo_work, homo_struct(ispin)%struct)

         CALL cp_fm_create(lumo_work, lumo_struct(ispin)%struct)


         IF (pseudo_canonical) THEN

            !That is where we rotate the MOs to make them pseudo canonical

            !The eigenvalues we get from the diagonalization


            !The Fock matrix in the HOMO subspace

            CALL cp_fm_create(hoho_fock, hoho_struct)

            NULLIFY (homo_evals(ispin)%array)

            ALLOCATE (homo_evals(ispin)%array(nhomo(ispin)))

            CALL cp_dbcsr_sm_fm_multiply(fock_matrix(ispin)%matrix, homo_coeffs(ispin), &

                                         homo_work, ncol=nhomo(ispin))

            CALL parallel_gemm('T', 'N', nhomo(ispin), nhomo(ispin), nao, 1.0_dp, homo_coeffs(ispin), &

                               homo_work, 0.0_dp, hoho_fock)


            !diagonalize and get pseudo-canonical MOs

            CALL choose_eigv_solver(hoho_fock, hoho_work, homo_evals(ispin)%array)

            CALL parallel_gemm('N', 'N', nao, nhomo(ispin), nhomo(ispin), 1.0_dp, homo_coeffs(ispin), &

                               hoho_work, 0.0_dp, homo_work)

            CALL cp_fm_to_fm(homo_work, homo_coeffs(ispin))


            !overwrite the donor_state's contract coeffs with those

            contract_coeffs_backup(:, (ispin - 1)*ndo_mo + 1:ispin*ndo_mo) = &

               donor_state%contract_coeffs(:, (ispin - 1)*ndo_mo + 1:ispin*ndo_mo)

            start_sgf = sum(nsgf_blk(1:exat - 1)) + 1

            CALL cp_fm_get_submatrix(homo_coeffs(ispin), &

                                     donor_state%contract_coeffs(:, (ispin - 1)*ndo_mo + 1:ispin*ndo_mo), &

                                     start_row=start_sgf, start_col=first_domo(ispin), &

                                     n_rows=nsgf_blk(exat), n_cols=ndo_mo)


            !do the same for the pseudo-LUMOs

            CALL cp_fm_create(lulu_fock, lulu_struct)

            NULLIFY (lumo_evals(ispin)%array)

            ALLOCATE (lumo_evals(ispin)%array(nlumo(ispin)))

            CALL cp_dbcsr_sm_fm_multiply(fock_matrix(ispin)%matrix, lumo_coeffs(ispin), &

                                         lumo_work, ncol=nlumo(ispin))

            CALL parallel_gemm('T', 'N', nlumo(ispin), nlumo(ispin), nao, 1.0_dp, lumo_coeffs(ispin), &

                               lumo_work, 0.0_dp, lulu_fock)


            !diagonalize and get pseudo-canonical MOs

            CALL choose_eigv_solver(lulu_fock, lulu_work, lumo_evals(ispin)%array)

            CALL parallel_gemm('N', 'N', nao, nlumo(ispin), nlumo(ispin), 1.0_dp, lumo_coeffs(ispin), &

                               lulu_work, 0.0_dp, lumo_work)

            CALL cp_fm_to_fm(lumo_work, lumo_coeffs(ispin))


            CALL cp_fm_release(lulu_fock)

            CALL cp_fm_release(hoho_fock)


         ELSE !using the generalized Fock matrix diagonal elements


            !Compute their Fock matrix diagonal

            ALLOCATE (homo_evals(ispin)%array(nhomo(ispin)))

            CALL cp_dbcsr_sm_fm_multiply(fock_matrix(ispin)%matrix, homo_coeffs(ispin), &

                                         homo_work, ncol=nhomo(ispin))

            CALL parallel_gemm('T', 'N', nhomo(ispin), nhomo(ispin), nao, 1.0_dp, homo_coeffs(ispin), &

                               homo_work, 0.0_dp, hoho_work)

            CALL cp_fm_get_diag(hoho_work, homo_evals(ispin)%array)


            ALLOCATE (lumo_evals(ispin)%array(nlumo(ispin)))

            CALL cp_dbcsr_sm_fm_multiply(fock_matrix(ispin)%matrix, lumo_coeffs(ispin), &

                                         lumo_work, ncol=nlumo(ispin))

            CALL parallel_gemm('T', 'N', nlumo(ispin), nlumo(ispin), nao, 1.0_dp, lumo_coeffs(ispin), &

                               lumo_work, 0.0_dp, lulu_work)

            CALL cp_fm_get_diag(lulu_work, lumo_evals(ispin)%array)


         END IF

         CALL cp_fm_release(homo_work)

         CALL cp_fm_release(hoho_work)

         CALL cp_fm_struct_release(hoho_struct)

         CALL cp_fm_release(lumo_work)

         CALL cp_fm_release(lulu_work)

         CALL cp_fm_struct_release(lulu_struct)


         !Put back homo and lumo coeffs together, to fit tensor structure

         CALL cp_fm_struct_create(all_struct(ispin)%struct, para_env=para_env, context=blacs_env, &

                                  ncol_global=nhomo(ispin) + nlumo(ispin), nrow_global=nao)

         CALL cp_fm_create(all_coeffs(ispin), all_struct(ispin)%struct)

         CALL cp_fm_to_fm(homo_coeffs(ispin), all_coeffs(ispin), ncol=nhomo(ispin), &

                          source_start=1, target_start=1)

         CALL cp_fm_to_fm(lumo_coeffs(ispin), all_coeffs(ispin), ncol=nlumo(ispin), &

                          source_start=1, target_start=nhomo(ispin) + 1)


      END DO !ispin


      !get semi-contracted tensor (AOs to MOs, keep RI uncontracted)

      CALL contract_aos_to_mos(ja_x, oi_y, mo_template, all_coeffs, nhomo, nlumo, &

                               donor_state, xas_tdp_env, xas_tdp_control, qs_env)


      !intermediate clean-up

      DO ispin = 1, nspins

         CALL cp_fm_release(all_coeffs(ispin))

         CALL cp_fm_release(homo_coeffs(ispin))

         CALL cp_fm_release(lumo_coeffs(ispin))

         CALL cp_fm_struct_release(all_struct(ispin)%struct)

         CALL cp_fm_struct_release(lumo_struct(ispin)%struct)

         CALL cp_fm_struct_release(homo_struct(ispin)%struct)

      END DO


      !6) GW2X iterations


      IF (nspins == 1) THEN

         !restricted-closed shell: only alpha spin

         CALL gw2x_rcs_iterations(first_domo(1), ja_x(1), oi_y, mo_template(1, 1), homo_evals(1)%array, &

                                  lumo_evals(1)%array, donor_state, xas_tdp_control, qs_env)

      ELSE

         !open-shell, need both spins

         CALL gw2x_os_iterations(first_domo, ja_x, oi_y, mo_template, homo_evals, lumo_evals, &

                                 donor_state, xas_tdp_control, qs_env)

      END IF


      !restore proper contract_coeffs

      IF (pseudo_canonical) THEN

         donor_state%contract_coeffs(:, :) = contract_coeffs_backup(:, :)

      END IF


      !Final clean-up

      DO ido_mo = 1, nspins*ndo_mo

         CALL dbt_destroy(oi_y(ido_mo))

      END DO

      DO ispin = 1, nspins

         CALL dbt_destroy(ja_x(ispin))

         DEALLOCATE (homo_evals(ispin)%array)

         DEALLOCATE (lumo_evals(ispin)%array)

         DO jspin = 1, nspins

            CALL dbt_destroy(mo_template(ispin, jspin))

         END DO

      END DO

      DEALLOCATE (oi_y, homo_evals, lumo_evals)


      CALL timestop(handle)


   END SUBROUTINE gw2x_shift


! **************************************************************************************************

!> \brief Preforms the GW2X iterations in the restricted-closed shell formalism according to the

!>        Newton-Raphson method

!> \param first_domo index of the first core donor MO to consider

!> \param ja_X semi-contracted tensor with j: occupied MO, a: virtual MO, X: RI basis element

!> \param oI_Y semi-contracted tensors with o: all MOs, I donor core MO, Y: RI basis element

!> \param mo_template tensor template for fully MO contracted tensor

!> \param homo_evals ...

!> \param lumo_evals ...

!> \param donor_state ...

!> \param xas_tdp_control ...

!> \param qs_env ...

! **************************************************************************************************

   SUBROUTINE gw2x_rcs_iterations(first_domo, ja_X, oI_Y, mo_template, homo_evals, lumo_evals, &

                                  donor_state, xas_tdp_control, qs_env)


      INTEGER, INTENT(IN)                                :: first_domo

      TYPE(dbt_type), INTENT(inout)                      :: ja_x

      TYPE(dbt_type), DIMENSION(:), INTENT(inout)        :: oi_y

      TYPE(dbt_type), INTENT(inout)                      :: mo_template

      REAL(dp), DIMENSION(:), INTENT(IN)                 :: homo_evals, lumo_evals

      TYPE(donor_state_type), POINTER                    :: donor_state

      TYPE(xas_tdp_control_type), POINTER                :: xas_tdp_control

      TYPE(qs_environment_type), POINTER                 :: qs_env


      CHARACTER(len=*), PARAMETER :: routinen = 'GW2X_rcs_iterations'


      INTEGER :: batch_size, bounds_1d(2), bounds_2d(2, 2), handle, i, ibatch, ido_mo, iloop, &

         max_iter, nbatch_occ, nbatch_virt, nblk_occ, nblk_virt, nblks(3), ndo_mo, nhomo, nlumo, &

         occ_bo(2), output_unit, tmp_sum, virt_bo(2)

      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: mo_blk_size

      REAL(dp)                                           :: c_os, c_ss, dg, diff, ds1, ds2, eps_i, &

                                                            eps_iter, g, omega_k, parts(4), s1, s2

      TYPE(dbt_type)                                     :: aj_ib, aj_ib_diff, aj_x, ja_ik, &

                                                            ja_ik_diff

      TYPE(mp_para_env_type), POINTER                    :: para_env


      CALL timeset(routinen, handle)


      eps_iter = xas_tdp_control%gw2x_eps

      max_iter = xas_tdp_control%max_gw2x_iter

      c_os = xas_tdp_control%c_os

      c_ss = xas_tdp_control%c_ss

      batch_size = xas_tdp_control%batch_size


      ndo_mo = donor_state%ndo_mo

      output_unit = cp_logger_get_default_io_unit()


      nhomo = SIZE(homo_evals)

      nlumo = SIZE(lumo_evals)


      CALL get_qs_env(qs_env, para_env=para_env)


      !We use the Newton-Raphson method to find the zero of the function:

      !g(omega) = eps_I - omega + mp2 terms, dg(omega) = -1 + d/d_omega (mp2 terms)

      !We simply compute at each iteration: omega_k+1 = omega_k - g(omega_k)/dg(omega_k)


      !need transposed tensor of (ja|X) for optimal contraction scheme (s.t. (aj|X) block is on same

      !processor as (ja|X))

      CALL dbt_create(ja_x, aj_x)

      CALL dbt_copy(ja_x, aj_x, order=[2, 1, 3])


      !split the MO blocks into batches for memory friendly batched contraction

      !huge dense tensors never need to be stored

      CALL dbt_get_info(ja_x, nblks_total=nblks)

      ALLOCATE (mo_blk_size(nblks(1)))

      CALL dbt_get_info(ja_x, blk_size_1=mo_blk_size)


      tmp_sum = 0

      DO i = 1, nblks(1)

         tmp_sum = tmp_sum + mo_blk_size(i)

         IF (tmp_sum == nhomo) THEN

            nblk_occ = i

            nblk_virt = nblks(1) - i

            EXIT

         END IF

      END DO

      nbatch_occ = max(1, nblk_occ/batch_size)

      nbatch_virt = max(1, nblk_virt/batch_size)


      !Loop over donor_states

      DO ido_mo = 1, ndo_mo

         IF (output_unit > 0) THEN

            WRITE (unit=output_unit, fmt="(/,T5,A,I2,A,I4,A,/,T5,A)") &

               "- GW2X correction for donor MO with spin ", 1, &

               " and MO index ", donor_state%mo_indices(ido_mo, 1), ":", &

               "                         iteration                convergence (eV)"

            CALL m_flush(output_unit)

         END IF


         !starting values

         eps_i = homo_evals(first_domo + ido_mo - 1)

         omega_k = eps_i

         iloop = 0

         diff = 2.0_dp*eps_iter


         DO WHILE (abs(diff) > eps_iter)

            iloop = iloop + 1


            !Compute the mp2 terms and their first derivative

            parts = 0.0_dp


            !We do batched contraction for (ja|Ik) and (ja|Ib) to never have to carry the full tensor

            DO ibatch = 1, nbatch_occ


               occ_bo = get_limit(nblk_occ, nbatch_occ, ibatch - 1)

               bounds_1d = [sum(mo_blk_size(1:occ_bo(1) - 1)) + 1, sum(mo_blk_size(1:occ_bo(2)))]


               CALL dbt_create(mo_template, ja_ik)

               CALL dbt_contract(alpha=1.0_dp, tensor_1=ja_x, tensor_2=oi_y(ido_mo), &

                                 beta=0.0_dp, tensor_3=ja_ik, contract_1=[3], &

                                 notcontract_1=[1, 2], contract_2=[2], notcontract_2=[1], &

                                 map_1=[1, 2], map_2=[3], bounds_3=bounds_1d)


               !opposite-spin contribution

               CALL calc_os_oov_contrib(parts(1), parts(2), ja_ik, homo_evals, lumo_evals, homo_evals, &

                                        omega_k, c_os, nhomo)


               bounds_2d(:, 2) = bounds_1d

               bounds_2d(1, 1) = nhomo + 1

               bounds_2d(2, 1) = nhomo + nlumo


               !same-spin contribution. Contraction only neede if c_ss != 0

               !directly compute the difference (ja|Ik) - (ka|Ij)

               IF (abs(c_ss) > epsilon(1.0_dp)) THEN


                  CALL dbt_create(ja_ik, ja_ik_diff, map1_2d=[1], map2_2d=[2, 3])

                  CALL dbt_copy(ja_ik, ja_ik_diff, move_data=.true.)


                  CALL dbt_contract(alpha=-1.0_dp, tensor_1=oi_y(ido_mo), tensor_2=aj_x, &

                                    beta=1.0_dp, tensor_3=ja_ik_diff, contract_1=[2], &

                                    notcontract_1=[1], contract_2=[3], notcontract_2=[1, 2], &

                                    map_1=[1], map_2=[2, 3], bounds_2=[1, nhomo], bounds_3=bounds_2d)


                  CALL calc_ss_oov_contrib(parts(1), parts(2), ja_ik_diff, homo_evals, lumo_evals, omega_k, c_ss)


                  CALL dbt_destroy(ja_ik_diff)

               END IF !c_ss != 0


               CALL dbt_destroy(ja_ik)

            END DO


            DO ibatch = 1, nbatch_virt


               virt_bo = get_limit(nblk_virt, nbatch_virt, ibatch - 1)

               bounds_1d = [sum(mo_blk_size(1:nblk_occ + virt_bo(1) - 1)) + 1, &

                            sum(mo_blk_size(1:nblk_occ + virt_bo(2)))]


               CALL dbt_create(mo_template, aj_ib)

               CALL dbt_contract(alpha=1.0_dp, tensor_1=aj_x, tensor_2=oi_y(ido_mo), &

                                 beta=0.0_dp, tensor_3=aj_ib, contract_1=[3], &

                                 notcontract_1=[1, 2], contract_2=[2], notcontract_2=[1], &

                                 map_1=[1, 2], map_2=[3], bounds_3=bounds_1d)


               !opposite-spin contribution

               CALL calc_os_ovv_contrib(parts(3), parts(4), aj_ib, lumo_evals, homo_evals, lumo_evals, &

                                        omega_k, c_os, nhomo, nhomo)


               !same-spin contribution, only if c_ss is not 0

               !directly compute the difference (aj|Ib) - (bj|Ia)

               IF (abs(c_ss) > epsilon(1.0_dp)) THEN

                  bounds_2d(1, 1) = 1

                  bounds_2d(2, 1) = nhomo

                  bounds_2d(:, 2) = bounds_1d


                  CALL dbt_create(aj_ib, aj_ib_diff, map1_2d=[1], map2_2d=[2, 3])

                  CALL dbt_copy(aj_ib, aj_ib_diff, move_data=.true.)


                  CALL dbt_contract(alpha=-1.0_dp, tensor_1=oi_y(ido_mo), tensor_2=ja_x, &

                                    beta=1.0_dp, tensor_3=aj_ib_diff, contract_1=[2], &

                                    notcontract_1=[1], contract_2=[3], notcontract_2=[1, 2], &

                                    map_1=[1], map_2=[2, 3], &

                                    bounds_2=[nhomo + 1, nhomo + nlumo], bounds_3=bounds_2d)


                  CALL calc_ss_ovv_contrib(parts(3), parts(4), aj_ib_diff, homo_evals, lumo_evals, omega_k, c_ss)


                  CALL dbt_destroy(aj_ib_diff)

               END IF ! c_ss not 0


               CALL dbt_destroy(aj_ib)

            END DO


            CALL para_env%sum(parts)

            s1 = parts(1); ds1 = parts(2)

            s2 = parts(3); ds2 = parts(4)


            !evaluate g and its derivative

            g = eps_i - omega_k + s1 + s2

            dg = -1.0_dp + ds1 + ds2


            !compute the diff to the new step

            diff = -g/dg


            !and the new omega

            omega_k = omega_k + diff

            diff = diff*evolt


            IF (output_unit > 0) THEN

               WRITE (unit=output_unit, fmt="(T21,I18,F32.6)") &

                  iloop, diff

               CALL m_flush(output_unit)

            END IF


            IF (iloop > max_iter) THEN

               cpwarn("GW2X iteration not converged.")

               EXIT

            END IF

         END DO !while loop on eps_iter


         !compute the shift and update donor_state

         donor_state%gw2x_evals(ido_mo, 1) = omega_k


         IF (output_unit > 0) THEN

            WRITE (unit=output_unit, fmt="(/T7,A,F11.6,/,T5,A,F11.6)") &

               "Final GW2X shift for this donor MO (eV):", &

               (donor_state%energy_evals(ido_mo, 1) - omega_k)*evolt

         END IF


      END DO !ido_mo


      CALL dbt_destroy(aj_x)


      CALL timestop(handle)


   END SUBROUTINE gw2x_rcs_iterations


! **************************************************************************************************

!> \brief Preforms the GW2X iterations in the open-shell shell formalism according to the

!>        Newton-Raphson method

!> \param first_domo index of the first core donor MO to consider, for each spin

!> \param ja_X semi-contracted tensors with j: occupied MO, a: virtual MO, X: RI basis element

!> \param oI_Y semi-contracted tensors with o: all MOs, I donor core MO, Y: RI basis element

!> \param mo_template tensor template for fully MO contracted tensor, for each spin combination

!> \param homo_evals ...

!> \param lumo_evals ...

!> \param donor_state ...

!> \param xas_tdp_control ...

!> \param qs_env ...

! **************************************************************************************************

   SUBROUTINE gw2x_os_iterations(first_domo, ja_X, oI_Y, mo_template, homo_evals, lumo_evals, &

                                 donor_state, xas_tdp_control, qs_env)


      INTEGER, INTENT(IN)                                :: first_domo(2)

      TYPE(dbt_type), DIMENSION(:), INTENT(inout)        :: ja_x, oi_y

      TYPE(dbt_type), DIMENSION(:, :), INTENT(inout)     :: mo_template

      TYPE(cp_1d_r_p_type), DIMENSION(:), INTENT(in)     :: homo_evals, lumo_evals

      TYPE(donor_state_type), POINTER                    :: donor_state

      TYPE(xas_tdp_control_type), POINTER                :: xas_tdp_control

      TYPE(qs_environment_type), POINTER                 :: qs_env


      CHARACTER(len=*), PARAMETER :: routinen = 'GW2X_os_iterations'


      INTEGER :: batch_size, bounds_1d(2), bounds_2d(2, 2), handle, i, ibatch, ido_mo, iloop, &

         ispin, max_iter, nbatch_occ, nbatch_virt, nblk_occ, nblk_virt, nblks(3), ndo_mo, &

         nhomo(2), nlumo(2), nspins, occ_bo(2), other_spin, output_unit, tmp_sum, virt_bo(2)

      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: mo_blk_size

      REAL(dp)                                           :: c_os, c_ss, dg, diff, ds1, ds2, eps_i, &

                                                            eps_iter, g, omega_k, parts(4), s1, s2

      TYPE(dbt_type)                                     :: aj_ib, aj_ib_diff, ja_ik, ja_ik_diff

      TYPE(dbt_type), ALLOCATABLE, DIMENSION(:)          :: aj_x

      TYPE(mp_para_env_type), POINTER                    :: para_env


      CALL timeset(routinen, handle)


      eps_iter = xas_tdp_control%gw2x_eps

      max_iter = xas_tdp_control%max_gw2x_iter

      c_os = xas_tdp_control%c_os

      c_ss = xas_tdp_control%c_ss

      batch_size = xas_tdp_control%batch_size


      nspins = 2

      ndo_mo = donor_state%ndo_mo

      output_unit = cp_logger_get_default_io_unit()


      DO ispin = 1, nspins

         nhomo(ispin) = SIZE(homo_evals(ispin)%array)

         nlumo(ispin) = SIZE(lumo_evals(ispin)%array)

      END DO


      CALL get_qs_env(qs_env, para_env=para_env)


      !We use the Newton-Raphson method to find the zero of the function:

      !g(omega) = eps_I - omega + mp2 terms, dg(omega) = -1 + d/d_omega (mp2 terms)

      !We simply compute at each iteration: omega_k+1 = omega_k - g(omega_k)/dg(omega_k)


      ALLOCATE (aj_x(2))

      DO ispin = 1, nspins


         !need transposed tensor of (ja|X) for optimal contraction scheme,

         !s.t. (aj|X) block is on same processor as (ja|X)) and differences can be taken

         CALL dbt_create(ja_x(ispin), aj_x(ispin))

         CALL dbt_copy(ja_x(ispin), aj_x(ispin), order=[2, 1, 3])


      END DO ! ispin

      DO ispin = 1, nspins


         other_spin = 3 - ispin


         !split the MO blocks into batches for memory friendly batched contraction

         !huge dense tensors never need to be stored. Split MOs for the current spin

         CALL dbt_get_info(ja_x(ispin), nblks_total=nblks)

         ALLOCATE (mo_blk_size(nblks(1)))

         CALL dbt_get_info(ja_x(ispin), blk_size_1=mo_blk_size)


         tmp_sum = 0

         DO i = 1, nblks(1)

            tmp_sum = tmp_sum + mo_blk_size(i)

            IF (tmp_sum == nhomo(ispin)) THEN

               nblk_occ = i

               nblk_virt = nblks(1) - i

               EXIT

            END IF

         END DO

         nbatch_occ = max(1, nblk_occ/batch_size)

         nbatch_virt = max(1, nblk_virt/batch_size)


         !Loop over donor_states of the current spin

         DO ido_mo = 1, ndo_mo

            IF (output_unit > 0) THEN

               WRITE (unit=output_unit, fmt="(/,T5,A,I2,A,I4,A,/,T5,A)") &

                  "- GW2X correction for donor MO with spin ", ispin, &

                  " and MO index ", donor_state%mo_indices(ido_mo, ispin), ":", &

                  "                         iteration                convergence (eV)"

               CALL m_flush(output_unit)

            END IF


            !starting values

            eps_i = homo_evals(ispin)%array(first_domo(ispin) + ido_mo - 1)

            omega_k = eps_i

            iloop = 0

            diff = 2.0_dp*eps_iter


            DO WHILE (abs(diff) > eps_iter)

               iloop = iloop + 1


               !Compute the mp2 terms and their first derivative

               parts = 0.0_dp


               !We do batched contraction for (ja|Ik) and (ja|Ib) to never have to carry the full tensor

               DO ibatch = 1, nbatch_occ


                  !opposite-spin contribution, i.e. (j_beta a_beta| I_alpha k_alpha) and vice-versa

                  !do the batching along k because same spin as donor MO

                  occ_bo = get_limit(nblk_occ, nbatch_occ, ibatch - 1)

                  bounds_1d = [sum(mo_blk_size(1:occ_bo(1) - 1)) + 1, sum(mo_blk_size(1:occ_bo(2)))]


                  CALL dbt_create(mo_template(other_spin, ispin), ja_ik)

                  CALL dbt_contract(alpha=1.0_dp, tensor_1=ja_x(other_spin), &

                                    tensor_2=oi_y((ispin - 1)*ndo_mo + ido_mo), &

                                    beta=0.0_dp, tensor_3=ja_ik, contract_1=[3], &

                                    notcontract_1=[1, 2], contract_2=[2], notcontract_2=[1], &

                                    map_1=[1, 2], map_2=[3], bounds_3=bounds_1d)


                  CALL calc_os_oov_contrib(parts(1), parts(2), ja_ik, homo_evals(other_spin)%array, &

                                           lumo_evals(other_spin)%array, homo_evals(ispin)%array, &

                                           omega_k, c_os, nhomo(other_spin))


                  CALL dbt_destroy(ja_ik)


                  !same-spin contribution, need to compute (ja|Ik) - (ka|Ij), all with the current spin

                  !skip if c_ss == 0

                  IF (abs(c_ss) > epsilon(1.0_dp)) THEN


                     !same batching as opposite spin

                     CALL dbt_create(mo_template(ispin, ispin), ja_ik)

                     CALL dbt_contract(alpha=1.0_dp, tensor_1=ja_x(ispin), &

                                       tensor_2=oi_y((ispin - 1)*ndo_mo + ido_mo), &

                                       beta=0.0_dp, tensor_3=ja_ik, contract_1=[3], &

                                       notcontract_1=[1, 2], contract_2=[2], notcontract_2=[1], &

                                       map_1=[1, 2], map_2=[3], bounds_3=bounds_1d)


                     bounds_2d(:, 2) = bounds_1d

                     bounds_2d(1, 1) = nhomo(ispin) + 1

                     bounds_2d(2, 1) = nhomo(ispin) + nlumo(ispin)


                     !the tensor difference is directly taken here

                     CALL dbt_create(ja_ik, ja_ik_diff, map1_2d=[1], map2_2d=[2, 3])

                     CALL dbt_copy(ja_ik, ja_ik_diff, move_data=.true.)


                     CALL dbt_contract(alpha=-1.0_dp, tensor_1=oi_y((ispin - 1)*ndo_mo + ido_mo), &

                                       tensor_2=aj_x(ispin), beta=1.0_dp, tensor_3=ja_ik_diff, &

                                       contract_1=[2], notcontract_1=[1], contract_2=[3], notcontract_2=[1, 2], &

                                       map_1=[1], map_2=[2, 3], bounds_2=[1, nhomo(ispin)], bounds_3=bounds_2d)


                     CALL calc_ss_oov_contrib(parts(1), parts(2), ja_ik_diff, homo_evals(ispin)%array, &

                                              lumo_evals(ispin)%array, omega_k, c_ss)


                     CALL dbt_destroy(ja_ik_diff)

                     CALL dbt_destroy(ja_ik)

                  END IF !c_ss !!= 0


               END DO


               DO ibatch = 1, nbatch_virt


                  !opposite-spin contribution, i.e. (a_beta j_beta| I_alpha b_alpha) and vice-versa

                  !do the batching along b because same spin as donor MO

                  virt_bo = get_limit(nblk_virt, nbatch_virt, ibatch - 1)

                  bounds_1d = [sum(mo_blk_size(1:nblk_occ + virt_bo(1) - 1)) + 1, &

                               sum(mo_blk_size(1:nblk_occ + virt_bo(2)))]


                  CALL dbt_create(mo_template(other_spin, ispin), aj_ib)

                  CALL dbt_contract(alpha=1.0_dp, tensor_1=aj_x(other_spin), &

                                    tensor_2=oi_y((ispin - 1)*ndo_mo + ido_mo), &

                                    beta=0.0_dp, tensor_3=aj_ib, contract_1=[3], &

                                    notcontract_1=[1, 2], contract_2=[2], notcontract_2=[1], &

                                    map_1=[1, 2], map_2=[3], bounds_3=bounds_1d)


                  CALL calc_os_ovv_contrib(parts(3), parts(4), aj_ib, lumo_evals(other_spin)%array, &

                                           homo_evals(other_spin)%array, lumo_evals(ispin)%array, &

                                           omega_k, c_os, nhomo(other_spin), nhomo(ispin))


                  CALL dbt_destroy(aj_ib)


                  !same-spin contribution, need to compute (aj|Ib) - (bj|Ia), all with the current spin

                  !skip if c_ss == 0

                  IF (abs(c_ss) > epsilon(1.0_dp)) THEN


                     !same batching as opposite spin

                     CALL dbt_create(mo_template(ispin, ispin), aj_ib)

                     CALL dbt_contract(alpha=1.0_dp, tensor_1=aj_x(ispin), &

                                       tensor_2=oi_y((ispin - 1)*ndo_mo + ido_mo), &

                                       beta=0.0_dp, tensor_3=aj_ib, contract_1=[3], &

                                       notcontract_1=[1, 2], contract_2=[2], notcontract_2=[1], &

                                       map_1=[1, 2], map_2=[3], bounds_3=bounds_1d)


                     bounds_2d(1, 1) = 1

                     bounds_2d(2, 1) = nhomo(ispin)

                     bounds_2d(:, 2) = bounds_1d


                     CALL dbt_create(aj_ib, aj_ib_diff, map1_2d=[1], map2_2d=[2, 3])

                     CALL dbt_copy(aj_ib, aj_ib_diff, move_data=.true.)


                     CALL dbt_contract(alpha=-1.0_dp, tensor_1=oi_y((ispin - 1)*ndo_mo + ido_mo), &

                                       tensor_2=ja_x(ispin), beta=1.0_dp, tensor_3=aj_ib_diff, &

                                       contract_1=[2], notcontract_1=[1], contract_2=[3], &

                                       notcontract_2=[1, 2], map_1=[1], map_2=[2, 3], &

                                       bounds_2=[nhomo(ispin) + 1, nhomo(ispin) + nlumo(ispin)], &

                                       bounds_3=bounds_2d)


                     CALL calc_ss_ovv_contrib(parts(3), parts(4), aj_ib_diff, homo_evals(ispin)%array, &

                                              lumo_evals(ispin)%array, omega_k, c_ss)


                     CALL dbt_destroy(aj_ib_diff)

                     CALL dbt_destroy(aj_ib)

                  END IF ! c_ss not 0


               END DO


               CALL para_env%sum(parts)

               s1 = parts(1); ds1 = parts(2)

               s2 = parts(3); ds2 = parts(4)


               !evaluate g and its derivative

               g = eps_i - omega_k + s1 + s2

               dg = -1.0_dp + ds1 + ds2


               !compute the diff to the new step

               diff = -g/dg


               !and the new omega

               omega_k = omega_k + diff

               diff = diff*evolt


               IF (output_unit > 0) THEN

                  WRITE (unit=output_unit, fmt="(T21,I18,F32.6)") &

                     iloop, diff

                  CALL m_flush(output_unit)

               END IF


               IF (iloop > max_iter) THEN

                  cpwarn("GW2X iteration not converged.")

                  EXIT

               END IF

            END DO !while loop on eps_iter


            !compute the shift and update donor_state

            donor_state%gw2x_evals(ido_mo, ispin) = omega_k


            IF (output_unit > 0) THEN

               WRITE (unit=output_unit, fmt="(/T7,A,F11.6,/,T5,A,F11.6)") &

                  "Final GW2X shift for this donor MO (eV):", &

                  (donor_state%energy_evals(ido_mo, ispin) - omega_k)*evolt

            END IF


         END DO !ido_mo


         DEALLOCATE (mo_blk_size)

      END DO ! ispin


      DO ispin = 1, nspins

         CALL dbt_destroy(aj_x(ispin))

      END DO


      CALL timestop(handle)


   END SUBROUTINE gw2x_os_iterations


! **************************************************************************************************

!> \brief Takes the 3-center integrals from the ri_ex_3c tensor and returns a full tensor. Since

!>        ri_ex_3c is only half filled because of symmetry, we have to add the transpose

!>        and scale the diagonal blocks by 0.5

!> \param pq_X the full (desymmetrized) tensor containing the (pq|X) exchange integrals, in a new

!>        3d distribution and optimized block sizes

!> \param exat index of current excited atom

!> \param xas_tdp_env ...

!> \param qs_env ...

! **************************************************************************************************

   SUBROUTINE get_full_pqx_from_3c_ex(pq_X, exat, xas_tdp_env, qs_env)


      TYPE(dbt_type), INTENT(INOUT)                      :: pq_x

      INTEGER, INTENT(IN)                                :: exat

      TYPE(xas_tdp_env_type), POINTER                    :: xas_tdp_env

      TYPE(qs_environment_type), POINTER                 :: qs_env


      INTEGER                                            :: i, ind(3), natom, nblk_ri, nsgf_x

      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: orb_blk_size, proc_dist_1, proc_dist_2, &

                                                            proc_dist_3

      INTEGER, DIMENSION(3)                              :: pdims

      LOGICAL                                            :: found

      REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: pblock

      TYPE(dbt_distribution_type)                        :: t_dist

      TYPE(dbt_iterator_type)                            :: iter

      TYPE(dbt_pgrid_type)                               :: t_pgrid

      TYPE(dbt_type)                                     :: pq_x_tmp, work

      TYPE(mp_para_env_type), POINTER                    :: para_env


      NULLIFY (para_env)


      !create work tensor with same 2D dist as pq_X, but only keep excited atom along RI direction

      CALL get_qs_env(qs_env, para_env=para_env, natom=natom)

      CALL dbt_get_info(xas_tdp_env%ri_3c_ex, pdims=pdims)

      nsgf_x = SIZE(xas_tdp_env%ri_inv_ex, 1)

      nblk_ri = 1


      CALL dbt_pgrid_create(para_env, pdims, t_pgrid)

      ALLOCATE (proc_dist_1(natom), proc_dist_2(natom), orb_blk_size(natom))

      CALL dbt_get_info(xas_tdp_env%ri_3c_ex, proc_dist_1=proc_dist_1, proc_dist_2=proc_dist_2, &

                        blk_size_1=orb_blk_size)

      CALL dbt_distribution_new(t_dist, t_pgrid, nd_dist_1=proc_dist_1, nd_dist_2=proc_dist_2, &

                                nd_dist_3=[(0, i=1, nblk_ri)])


      CALL dbt_create(work, name="(pq|X)", dist=t_dist, map1_2d=[1], map2_2d=[2, 3], &

                      blk_size_1=orb_blk_size, blk_size_2=orb_blk_size, blk_size_3=[nsgf_x])

      CALL dbt_distribution_destroy(t_dist)


      !dist of 3c_ex and work match, can simply copy blocks over. Diagonal with factor 0.5


!$OMP PARALLEL DEFAULT(NONE) SHARED(xas_tdp_env,exat,work,orb_blk_size,nsgf_x) &

!$OMP PRIVATE(iter,ind,pblock,found)

      CALL dbt_iterator_start(iter, xas_tdp_env%ri_3c_ex)

      DO WHILE (dbt_iterator_blocks_left(iter))

         CALL dbt_iterator_next_block(iter, ind)

         CALL dbt_get_block(xas_tdp_env%ri_3c_ex, ind, pblock, found)


         IF (ind(1) == ind(2)) pblock = 0.5_dp*pblock

         IF (ind(3) /= exat) cycle


         CALL dbt_put_block(work, [ind(1), ind(2), 1], &

                            [orb_blk_size(ind(1)), orb_blk_size(ind(2)), nsgf_x], pblock)


         DEALLOCATE (pblock)

      END DO

      CALL dbt_iterator_stop(iter)

!$OMP END PARALLEL

      CALL dbt_finalize(work)


      !create (pq|X) based on work and copy over

      CALL dbt_create(work, pq_x_tmp)

      CALL dbt_copy(work, pq_x_tmp)

      CALL dbt_copy(work, pq_x_tmp, order=[2, 1, 3], summation=.true., move_data=.true.)


      CALL dbt_destroy(work)


      !create the pgrid, based on the 2D dbcsr grid

      CALL dbt_pgrid_destroy(t_pgrid)

      pdims = 0

      CALL dbt_pgrid_create(para_env, pdims, t_pgrid, tensor_dims=[natom, natom, 1])


      !cyclic distribution accross all directions.

      ALLOCATE (proc_dist_3(nblk_ri))

      CALL dbt_default_distvec(natom, pdims(1), orb_blk_size, proc_dist_1)

      CALL dbt_default_distvec(natom, pdims(2), orb_blk_size, proc_dist_2)

      CALL dbt_default_distvec(nblk_ri, pdims(3), [nsgf_x], proc_dist_3)

      CALL dbt_distribution_new(t_dist, t_pgrid, nd_dist_1=proc_dist_1, nd_dist_2=proc_dist_2, &

                                nd_dist_3=proc_dist_3)


      CALL dbt_create(pq_x, name="(pq|X)", dist=t_dist, map1_2d=[2, 3], map2_2d=[1], &

                      blk_size_1=orb_blk_size, blk_size_2=orb_blk_size, blk_size_3=[nsgf_x])

      CALL dbt_copy(pq_x_tmp, pq_x, move_data=.true.)


      CALL dbt_distribution_destroy(t_dist)

      CALL dbt_pgrid_destroy(t_pgrid)

      CALL dbt_destroy(pq_x_tmp)


   END SUBROUTINE get_full_pqx_from_3c_ex


! **************************************************************************************************

!> \brief Contracts (pq|X) and (rI|Y) from AOs to MOs to (ja|X) and (oI|Y) respectively, where

!>        j is a occupied MO, a is a virtual MO and o is a general MO

!> \param ja_X partial contraction over occupied MOs j, virtual MOs a: (ja|X), for both spins (alpha-alpha or beta-beta)

!> \param oI_Y partial contraction over all MOs o and donor MOs I (can be more than 1 if 2p or open-shell)

!> \param ja_Io_template template to be able to build tensors after calling this routine, for each spin combination

!> \param mo_coeffs ...

!> \param nocc ...

!> \param nvirt ...

!> \param donor_state ...

!> \param xas_tdp_env ...

!> \param xas_tdp_control ...

!> \param qs_env ...

!> \note the multiplication by (X|Y)^-1 is included in the final (oI|Y) tensor. Only integrals with the

!>       same spin on one center are non-zero, i.e. (oI|Y) is non zero only if both o and Y have the same spin

! **************************************************************************************************

   SUBROUTINE contract_aos_to_mos(ja_X, oI_Y, ja_Io_template, mo_coeffs, nocc, nvirt, &

                                  donor_state, xas_tdp_env, xas_tdp_control, qs_env)


      TYPE(dbt_type), ALLOCATABLE, DIMENSION(:), &

         INTENT(INOUT)                                   :: ja_x, oi_y

      TYPE(dbt_type), ALLOCATABLE, DIMENSION(:, :), &

         INTENT(INOUT)                                   :: ja_io_template

      TYPE(cp_fm_type), DIMENSION(:), INTENT(INOUT)      :: mo_coeffs

      INTEGER, INTENT(IN)                                :: nocc(2), nvirt(2)

      TYPE(donor_state_type), POINTER                    :: donor_state

      TYPE(xas_tdp_env_type), POINTER                    :: xas_tdp_env

      TYPE(xas_tdp_control_type), POINTER                :: xas_tdp_control

      TYPE(qs_environment_type), POINTER                 :: qs_env


      CHARACTER(len=*), PARAMETER :: routinen = 'contract_AOs_to_MOs'


      INTEGER                                            :: bo(2), handle, i, ispin, jspin, &

                                                            nblk_aos, nblk_mos(2), nblk_occ(2), &

                                                            nblk_pqx(3), nblk_ri, nblk_virt(2), &

                                                            nspins

      INTEGER, DIMENSION(3)                              :: pdims

      INTEGER, DIMENSION(:), POINTER                     :: ao_blk_size, ao_col_dist, ao_row_dist, &

                                                            mo_dist_3, ri_blk_size, ri_dist_3

      INTEGER, DIMENSION(:, :), POINTER                  :: mat_pgrid

      TYPE(cp_1d_i_p_type), ALLOCATABLE, DIMENSION(:)    :: mo_blk_size, mo_col_dist, mo_row_dist

      TYPE(dbcsr_distribution_type)                      :: mat_dist

      TYPE(dbcsr_distribution_type), POINTER             :: std_mat_dist

      TYPE(dbcsr_type)                                   :: dbcsr_mo_coeffs

      TYPE(dbt_distribution_type)                        :: t_dist

      TYPE(dbt_pgrid_type)                               :: t_pgrid

      TYPE(dbt_type)                                     :: jq_x, pq_x, t_mo_coeffs

      TYPE(mp_para_env_type), POINTER                    :: para_env


      NULLIFY (ao_blk_size, ao_col_dist, ao_row_dist, mo_dist_3, ri_blk_size, ri_dist_3, mat_pgrid, &

               para_env, std_mat_dist)


      CALL timeset(routinen, handle)


      nspins = 1; IF (xas_tdp_control%do_uks .OR. xas_tdp_control%do_roks) nspins = 2


      !There are 2 contractions to do for the first tensor: (pq|X) --> (jq|X) --> (ja|X)

      !Because memory is the main concern, we move_data everytime at the cost of extra copies


      !Some quantities need to be stored for both spins, because they are later combined

      CALL get_qs_env(qs_env, para_env=para_env)

      ALLOCATE (mo_blk_size(nspins), mo_row_dist(nspins), mo_col_dist(nspins))

      ALLOCATE (ja_x(nspins))

      ALLOCATE (oi_y(nspins*donor_state%ndo_mo))


      DO ispin = 1, nspins


         !First, we need a fully populated pq_X (spin-independent)

         CALL get_full_pqx_from_3c_ex(pq_x, donor_state%at_index, xas_tdp_env, qs_env)


         !Create the tensor pgrid. AOs and RI independent from spin

         IF (ispin == 1) THEN

            CALL dbt_get_info(pq_x, pdims=pdims, nblks_total=nblk_pqx)

            CALL dbt_pgrid_create(para_env, pdims, t_pgrid)

            nblk_aos = nblk_pqx(1)

            nblk_ri = nblk_pqx(3)

         END IF


         !Define MO block sizes, at worst, take one block per proc

         nblk_occ(ispin) = max(pdims(1), nocc(ispin)/16)

         nblk_virt(ispin) = max(pdims(2), nvirt(ispin)/16)

         nblk_mos(ispin) = nblk_occ(ispin) + nblk_virt(ispin)

         ALLOCATE (mo_blk_size(ispin)%array(nblk_mos(ispin)))

         DO i = 1, nblk_occ(ispin)

            bo = get_limit(nocc(ispin), nblk_occ(ispin), i - 1)

            mo_blk_size(ispin)%array(i) = bo(2) - bo(1) + 1

         END DO

         DO i = 1, nblk_virt(ispin)

            bo = get_limit(nvirt(ispin), nblk_virt(ispin), i - 1)

            mo_blk_size(ispin)%array(nblk_occ(ispin) + i) = bo(2) - bo(1) + 1

         END DO


         !Convert the fm mo_coeffs into a dbcsr matrix and then a tensor

         CALL get_qs_env(qs_env, dbcsr_dist=std_mat_dist)

         CALL dbcsr_distribution_get(std_mat_dist, pgrid=mat_pgrid)

         ALLOCATE (ao_blk_size(nblk_aos), ri_blk_size(nblk_ri))

         CALL dbt_get_info(pq_x, blk_size_1=ao_blk_size, blk_size_3=ri_blk_size)


         !we opt for a cyclic dist for the MOs (since they should be rather dense anyways)

         ALLOCATE (ao_row_dist(nblk_aos), mo_col_dist(ispin)%array(nblk_mos(ispin)))

         CALL dbt_default_distvec(nblk_aos, SIZE(mat_pgrid, 1), ao_blk_size, ao_row_dist)

         CALL dbt_default_distvec(nblk_mos(ispin), SIZE(mat_pgrid, 2), mo_blk_size(ispin)%array, &

                                  mo_col_dist(ispin)%array)

         CALL dbcsr_distribution_new(mat_dist, group=para_env%get_handle(), pgrid=mat_pgrid, &

                                     row_dist=ao_row_dist, col_dist=mo_col_dist(ispin)%array)


         CALL dbcsr_create(dbcsr_mo_coeffs, name="MO coeffs", matrix_type="N", dist=mat_dist, &

                           row_blk_size=ao_blk_size, col_blk_size=mo_blk_size(ispin)%array)

         CALL copy_fm_to_dbcsr(mo_coeffs(ispin), dbcsr_mo_coeffs)


         CALL dbt_create(dbcsr_mo_coeffs, t_mo_coeffs)

         CALL dbt_copy_matrix_to_tensor(dbcsr_mo_coeffs, t_mo_coeffs)


         !prepare the (jq|X) tensor for the first contraction (over occupied MOs)

         ALLOCATE (mo_row_dist(ispin)%array(nblk_mos(ispin)), ao_col_dist(nblk_aos), ri_dist_3(nblk_ri))

         CALL dbt_default_distvec(nblk_mos(ispin), pdims(1), mo_blk_size(ispin)%array, mo_row_dist(ispin)%array)

         CALL dbt_default_distvec(nblk_aos, pdims(2), ao_blk_size, ao_col_dist)

         CALL dbt_default_distvec(nblk_ri, pdims(3), ri_blk_size, ri_dist_3)

         CALL dbt_distribution_new(t_dist, t_pgrid, nd_dist_1=mo_row_dist(ispin)%array, &

                                   nd_dist_2=ao_col_dist, nd_dist_3=ri_dist_3)


         CALL dbt_create(jq_x, name="(jq|X)", dist=t_dist, map1_2d=[1, 3], map2_2d=[2], &

                         blk_size_1=mo_blk_size(ispin)%array, blk_size_2=ao_blk_size, blk_size_3=ri_blk_size)

         CALL dbt_distribution_destroy(t_dist)


         !contract (pq|X) into (jq|X)

         CALL dbt_contract(alpha=1.0_dp, tensor_1=pq_x, tensor_2=t_mo_coeffs, &

                           beta=0.0_dp, tensor_3=jq_x, contract_1=[1], &

                           notcontract_1=[2, 3], contract_2=[1], notcontract_2=[2], &

                           map_1=[2, 3], map_2=[1], bounds_3=[1, nocc(ispin)], &!only want occupied MOs for j

                           move_data=.true.)


         CALL dbt_destroy(pq_x)

         CALL dbt_copy_matrix_to_tensor(dbcsr_mo_coeffs, t_mo_coeffs)


         !prepare (ja|X) tensor for the second contraction (over virtual MOs)

         !only virtual-occupied bit of the first 2 indices is occupied + it should be dense

         !take blk dist such that blocks are evenly distributed

         CALL dbt_default_distvec(nblk_occ(ispin), pdims(1), mo_blk_size(ispin)%array(1:nblk_occ(ispin)), &

                                  mo_row_dist(ispin)%array(1:nblk_occ(ispin)))

         CALL dbt_default_distvec(nblk_virt(ispin), pdims(1), &

                                  mo_blk_size(ispin)%array(nblk_occ(ispin) + 1:nblk_mos(ispin)), &

                                  mo_row_dist(ispin)%array(nblk_occ(ispin) + 1:nblk_mos(ispin)))

         CALL dbt_default_distvec(nblk_occ(ispin), pdims(2), mo_blk_size(ispin)%array(1:nblk_occ(ispin)), &

                                  mo_col_dist(ispin)%array(1:nblk_occ(ispin)))

         CALL dbt_default_distvec(nblk_virt(ispin), pdims(2), &

                                  mo_blk_size(ispin)%array(nblk_occ(ispin) + 1:nblk_mos(ispin)), &

                                  mo_col_dist(ispin)%array(nblk_occ(ispin) + 1:nblk_mos(ispin)))

         CALL dbt_distribution_new(t_dist, t_pgrid, nd_dist_1=mo_row_dist(ispin)%array, &

                                   nd_dist_2=mo_col_dist(ispin)%array, nd_dist_3=ri_dist_3)


         CALL dbt_create(ja_x(ispin), name="(ja|X)", dist=t_dist, map1_2d=[1, 2], map2_2d=[3], &

                         blk_size_1=mo_blk_size(ispin)%array, blk_size_2=mo_blk_size(ispin)%array, &

                         blk_size_3=ri_blk_size)

         CALL dbt_distribution_destroy(t_dist)


         !contract (jq|X) into (ja|X)

         CALL dbt_contract(alpha=1.0_dp, tensor_1=jq_x, tensor_2=t_mo_coeffs, &

                           beta=0.0_dp, tensor_3=ja_x(ispin), contract_1=[2], &

                           notcontract_1=[1, 3], contract_2=[1], notcontract_2=[2], &

                           map_1=[1, 3], map_2=[2], move_data=.true., &

                           bounds_3=[nocc(ispin) + 1, nocc(ispin) + nvirt(ispin)])


         CALL dbt_destroy(jq_x)

         CALL dbt_copy_matrix_to_tensor(dbcsr_mo_coeffs, t_mo_coeffs)


         !Finally, get the oI_Y tensors

         CALL get_oiy_tensors(oi_y, ispin, ao_blk_size, mo_blk_size(ispin)%array, ri_blk_size, &

                              t_mo_coeffs, donor_state, xas_tdp_env, xas_tdp_control, qs_env)


         !intermediate clen-up

         CALL dbt_destroy(t_mo_coeffs)

         CALL dbcsr_distribution_release(mat_dist)

         CALL dbcsr_release(dbcsr_mo_coeffs)

         DEALLOCATE (ao_col_dist, ri_dist_3, ri_blk_size, ao_blk_size, ao_row_dist)


      END DO !ispin


      !create a empty tensor template for the fully contracted (ja|Io) MO integrals, for all spin

      !configureations: alpha-alpha|alpha-alpha, alpha-alpha|beta-beta, etc.

      ALLOCATE (ja_io_template(nspins, nspins))

      DO ispin = 1, nspins

         DO jspin = 1, nspins

            ALLOCATE (mo_dist_3(nblk_mos(jspin)))

            CALL dbt_default_distvec(nblk_occ(jspin), pdims(3), mo_blk_size(jspin)%array(1:nblk_occ(jspin)), &

                                     mo_dist_3(1:nblk_occ(jspin)))

            CALL dbt_default_distvec(nblk_virt(jspin), pdims(3), &

                                     mo_blk_size(jspin)%array(nblk_occ(jspin) + 1:nblk_mos(jspin)), &

                                     mo_dist_3(nblk_occ(jspin) + 1:nblk_mos(jspin)))

            CALL dbt_distribution_new(t_dist, t_pgrid, nd_dist_1=mo_row_dist(ispin)%array, &

                                      nd_dist_2=mo_col_dist(ispin)%array, nd_dist_3=mo_dist_3)


            CALL dbt_create(ja_io_template(ispin, jspin), name="(ja|Io)", dist=t_dist, map1_2d=[1, 2], &

                            map2_2d=[3], blk_size_1=mo_blk_size(ispin)%array, &

                            blk_size_2=mo_blk_size(ispin)%array, blk_size_3=mo_blk_size(jspin)%array)

            CALL dbt_distribution_destroy(t_dist)

            DEALLOCATE (mo_dist_3)

         END DO

      END DO


      !clean-up

      CALL dbt_pgrid_destroy(t_pgrid)

      DO ispin = 1, nspins

         DEALLOCATE (mo_blk_size(ispin)%array)

         DEALLOCATE (mo_col_dist(ispin)%array)

         DEALLOCATE (mo_row_dist(ispin)%array)

      END DO


      CALL timestop(handle)


   END SUBROUTINE contract_aos_to_mos


! **************************************************************************************************

!> \brief Contracts the (oI|Y) tensors, for each donor MO

!> \param oI_Y the contracted tensr. It is assumed to be allocated outside of this routine

!> \param ispin ...

!> \param ao_blk_size ...

!> \param mo_blk_size ...

!> \param ri_blk_size ...

!> \param t_mo_coeffs ...

!> \param donor_state ...

!> \param xas_tdp_env ...

!> \param xas_tdp_control ...

!> \param qs_env ...

! **************************************************************************************************

   SUBROUTINE get_oiy_tensors(oI_Y, ispin, ao_blk_size, mo_blk_size, ri_blk_size, t_mo_coeffs, &

                              donor_state, xas_tdp_env, xas_tdp_control, qs_env)


      TYPE(dbt_type), ALLOCATABLE, DIMENSION(:), &

         INTENT(INOUT)                                   :: oi_y

      INTEGER, INTENT(IN)                                :: ispin

      INTEGER, DIMENSION(:), POINTER                     :: ao_blk_size, mo_blk_size, ri_blk_size

      TYPE(dbt_type), INTENT(inout)                      :: t_mo_coeffs

      TYPE(donor_state_type), POINTER                    :: donor_state

      TYPE(xas_tdp_env_type), POINTER                    :: xas_tdp_env

      TYPE(xas_tdp_control_type), POINTER                :: xas_tdp_control

      TYPE(qs_environment_type), POINTER                 :: qs_env


      CHARACTER(len=*), PARAMETER                        :: routinen = 'get_oIY_tensors'


      INTEGER                                            :: bo(2), handle, i, ido_mo, ind(2), natom, &

                                                            nblk_aos, nblk_mos, nblk_ri, ndo_mo, &

                                                            pdims_2d(2), proc_id

      INTEGER, DIMENSION(:), POINTER                     :: ao_row_dist, mo_row_dist, ri_col_dist

      INTEGER, DIMENSION(:, :), POINTER                  :: mat_pgrid

      LOGICAL                                            :: found

      REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: pblock

      TYPE(dbcsr_distribution_type), POINTER             :: std_mat_dist

      TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: pi_y

      TYPE(dbt_distribution_type)                        :: t_dist

      TYPE(dbt_iterator_type)                            :: iter

      TYPE(dbt_pgrid_type)                               :: t_pgrid

      TYPE(dbt_type)                                     :: t_pi_y, t_work

      TYPE(mp_para_env_type), POINTER                    :: para_env


      CALL timeset(routinen, handle)


      CALL get_qs_env(qs_env, natom=natom, para_env=para_env, dbcsr_dist=std_mat_dist)

      ndo_mo = donor_state%ndo_mo

      nblk_aos = SIZE(ao_blk_size)

      nblk_mos = SIZE(mo_blk_size)

      nblk_ri = SIZE(ri_blk_size)


      !We first contract (pq|X) over q into I using kernel routines (goes over all MOs and spins)

      CALL contract2_ao_to_domo(pi_y, "EXCHANGE", donor_state, xas_tdp_env, xas_tdp_control, qs_env)


      !multiply by (X|Y)^-1

      CALL ri_all_blocks_mm(pi_y, xas_tdp_env%ri_inv_ex)


      !get standaed 2d matrix proc grid

      CALL dbcsr_distribution_get(std_mat_dist, pgrid=mat_pgrid)


      !Loop over donor MOs of this spin

      DO ido_mo = (ispin - 1)*ndo_mo + 1, ispin*ndo_mo


         !cast the matrix into a tensor

         CALL dbt_create(pi_y(ido_mo)%matrix, t_work)

         CALL dbt_copy_matrix_to_tensor(pi_y(ido_mo)%matrix, t_work)


         !find col proc_id of the only populated column of t_work

         ALLOCATE (ri_col_dist(natom))

         CALL dbt_get_info(t_work, proc_dist_2=ri_col_dist)

         proc_id = ri_col_dist(donor_state%at_index)

         DEALLOCATE (ri_col_dist)


         !preapre (oI_Y) tensor and (pI|Y) tensor in proper dist and blk sizes

         pdims_2d(1) = SIZE(mat_pgrid, 1); pdims_2d(2) = SIZE(mat_pgrid, 2)

         CALL dbt_pgrid_create(para_env, pdims_2d, t_pgrid)


         ALLOCATE (ri_col_dist(nblk_ri), ao_row_dist(nblk_aos), mo_row_dist(nblk_mos))

         CALL dbt_get_info(t_work, proc_dist_1=ao_row_dist)

         ri_col_dist = proc_id


         CALL dbt_distribution_new(t_dist, t_pgrid, nd_dist_1=ao_row_dist, nd_dist_2=ri_col_dist)

         CALL dbt_create(t_pi_y, name="(pI|Y)", dist=t_dist, map1_2d=[1], map2_2d=[2], &

                         blk_size_1=ao_blk_size, blk_size_2=ri_blk_size)

         CALL dbt_distribution_destroy(t_dist)


         !copy block by block, dist match


!$OMP PARALLEL DEFAULT(NONE) SHARED(t_work,t_pI_Y,nblk_ri,ri_blk_size,ao_blk_size) &

!$OMP PRIVATE(iter,ind,pblock,found,bo)

         CALL dbt_iterator_start(iter, t_work)

         DO WHILE (dbt_iterator_blocks_left(iter))

            CALL dbt_iterator_next_block(iter, ind)

            CALL dbt_get_block(t_work, ind, pblock, found)


            DO i = 1, nblk_ri

               bo(1) = sum(ri_blk_size(1:i - 1)) + 1

               bo(2) = bo(1) + ri_blk_size(i) - 1

               CALL dbt_put_block(t_pi_y, [ind(1), i], [ao_blk_size(ind(1)), ri_blk_size(i)], &

                                  pblock(:, bo(1):bo(2)))

            END DO


            DEALLOCATE (pblock)

         END DO

         CALL dbt_iterator_stop(iter)

!$OMP END PARALLEL

         CALL dbt_finalize(t_pi_y)


         !get optimal pgrid  for (oI|Y)

         CALL dbt_pgrid_destroy(t_pgrid)

         pdims_2d = 0

         CALL dbt_pgrid_create(para_env, pdims_2d, t_pgrid, tensor_dims=[nblk_mos, nblk_ri])


         CALL dbt_default_distvec(nblk_aos, pdims_2d(1), ao_blk_size, ao_row_dist)

         CALL dbt_default_distvec(nblk_mos, pdims_2d(1), mo_blk_size, mo_row_dist)

         CALL dbt_default_distvec(nblk_ri, pdims_2d(2), ri_blk_size, ri_col_dist)


         !transfer pI_Y to the correct pgrid

         CALL dbt_destroy(t_work)

         CALL dbt_distribution_new(t_dist, t_pgrid, nd_dist_1=ao_row_dist, nd_dist_2=ri_col_dist)

         CALL dbt_create(t_work, name="t_pI_Y", dist=t_dist, map1_2d=[1], map2_2d=[2], &

                         blk_size_1=ao_blk_size, blk_size_2=ri_blk_size)

         CALL dbt_copy(t_pi_y, t_work, move_data=.true.)

         CALL dbt_distribution_destroy(t_dist)


         !create (oI|Y)

         CALL dbt_distribution_new(t_dist, t_pgrid, nd_dist_1=mo_row_dist, nd_dist_2=ri_col_dist)

         CALL dbt_create(oi_y(ido_mo), name="(oI|Y)", dist=t_dist, map1_2d=[1], map2_2d=[2], &

                         blk_size_1=mo_blk_size, blk_size_2=ri_blk_size)

         CALL dbt_distribution_destroy(t_dist)


         !contract (pI|Y) into (oI|Y)

         CALL dbt_contract(alpha=1.0_dp, tensor_1=t_work, tensor_2=t_mo_coeffs, &

                           beta=0.0_dp, tensor_3=oi_y(ido_mo), contract_1=[1], &

                           notcontract_1=[2], contract_2=[1], notcontract_2=[2], &

                           map_1=[2], map_2=[1]) !no bound, all MOs needed


         !intermediate clean-up

         CALL dbt_destroy(t_work)

         CALL dbt_destroy(t_pi_y)

         CALL dbt_pgrid_destroy(t_pgrid)

         DEALLOCATE (ri_col_dist, ao_row_dist, mo_row_dist)


      END DO !ido_mo


      !final clean-up

      CALL dbcsr_deallocate_matrix_set(pi_y)


      CALL timestop(handle)


   END SUBROUTINE get_oiy_tensors


! **************************************************************************************************

!> \brief Computes the same spin, occupied-occupied-virtual MO contribution to the electron propagator:

!>        0.5 * sum_ajk |<Ia||jk>|^2/(omega + eps_a - epsj_j - eps_k) and its 1st derivative wrt omega:

!>        -0.5 * sum_ajk |<Ia||jk>|^2/(omega + eps_a - epsj_j - eps_k)**2

!> \param contrib ...

!> \param dev the first derivative

!> \param ja_Ik_diff ... contains the (ja|Ik) - (ka|Ij) tensor

!> \param occ_evals ...

!> \param virt_evals ...

!> \param omega ...

!> \param c_ss ...

!> \note since the is same-spin, there is only one possibility for occ_evals and virt_evals

! **************************************************************************************************

   SUBROUTINE calc_ss_oov_contrib(contrib, dev, ja_Ik_diff, occ_evals, virt_evals, omega, c_ss)


      REAL(dp), INTENT(inout)                            :: contrib, dev

      TYPE(dbt_type), INTENT(inout)                      :: ja_ik_diff

      REAL(dp), DIMENSION(:), INTENT(IN)                 :: occ_evals, virt_evals

      REAL(dp), INTENT(in)                               :: omega, c_ss


      CHARACTER(len=*), PARAMETER :: routinen = 'calc_ss_oov_contrib'


      INTEGER                                            :: a, boff(3), bsize(3), handle, idx1, &

                                                            idx2, idx3, ind(3), j, k, nocc

      LOGICAL                                            :: found

      REAL(dp)                                           :: denom, tmp

      REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: tensor_blk

      TYPE(dbt_iterator_type)                            :: iter


      CALL timeset(routinen, handle)


      !<Ia||jk> = <Ia|jk> - <Ia|kj> = (Ij|ak) - (Ik|aj) = (ka|Ij) - (ja|Ik)

      !Note: the same spin contribution only involve spib-orbitals that are all of the same spin


      nocc = SIZE(occ_evals, 1)


      !Iterate over the tensors and sum. Both tensors have same dist


!$OMP PARALLEL DEFAULT(NONE) REDUCTION(+:contrib,dev) &

!$OMP SHARED(ja_Ik_diff,occ_evals,virt_evals,omega,c_ss,nocc) &

!$OMP PRIVATE(iter,ind,boff,bsize,tensor_blk,found,idx1,idx2,idx3,j,A,k,denom,tmp)

      CALL dbt_iterator_start(iter, ja_ik_diff)

      DO WHILE (dbt_iterator_blocks_left(iter))

         CALL dbt_iterator_next_block(iter, ind, blk_offset=boff, blk_size=bsize)

         CALL dbt_get_block(ja_ik_diff, ind, tensor_blk, found)


         IF (found) THEN


            DO idx3 = 1, bsize(3)

               DO idx2 = 1, bsize(2)

                  DO idx1 = 1, bsize(1)


                     !get proper MO indices

                     j = boff(1) + idx1 - 1

                     a = boff(2) + idx2 - 1 - nocc

                     k = boff(3) + idx3 - 1


                     !the denominator

                     denom = omega + virt_evals(a) - occ_evals(j) - occ_evals(k)


                     !the same spin contribution

                     tmp = c_ss*tensor_blk(idx1, idx2, idx3)**2


                     contrib = contrib + 0.5_dp*tmp/denom

                     dev = dev - 0.5_dp*tmp/denom**2


                  END DO

               END DO

            END DO

         END IF

         DEALLOCATE (tensor_blk)

      END DO

      CALL dbt_iterator_stop(iter)

!$OMP END PARALLEL


      CALL timestop(handle)


   END SUBROUTINE calc_ss_oov_contrib


! **************************************************************************************************

!> \brief Computes the opposite spin, occupied-occupied-virtual MO contribution to the electron propagator:

!>        0.5 * sum_ajk |<Ia||jk>|^2/(omega + eps_a - epsj_j - eps_k) and its 1st derivative wrt omega:

!>        -0.5 * sum_ajk |<Ia||jk>|^2/(omega + eps_a - epsj_j - eps_k)**2

!> \param contrib ...

!> \param dev the first derivative

!> \param ja_Ik ...

!> \param j_evals ocucpied evals for j MO

!> \param a_evals virtual evals for a MO

!> \param k_evals ocucpied evals for k MO

!> \param omega ...

!> \param c_os ...

!> \param a_offset the number of occupied MOs for the same spin as a MOs

!> \note since this is opposite-spin, evals might be different for different spins

! **************************************************************************************************

   SUBROUTINE calc_os_oov_contrib(contrib, dev, ja_Ik, j_evals, a_evals, k_evals, omega, c_os, a_offset)


      REAL(dp), INTENT(inout)                            :: contrib, dev

      TYPE(dbt_type), INTENT(inout)                      :: ja_ik

      REAL(dp), DIMENSION(:), INTENT(IN)                 :: j_evals, a_evals, k_evals

      REAL(dp), INTENT(in)                               :: omega, c_os

      INTEGER, INTENT(IN)                                :: a_offset


      CHARACTER(len=*), PARAMETER :: routinen = 'calc_os_oov_contrib'


      INTEGER                                            :: a, boff(3), bsize(3), handle, idx1, &

                                                            idx2, idx3, ind(3), j, k

      LOGICAL                                            :: found

      REAL(dp)                                           :: denom, tmp

      REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: ja_ik_blk

      TYPE(dbt_iterator_type)                            :: iter


      CALL timeset(routinen, handle)


      !<Ia||jk> = <Ia|jk> - <Ia|kj> = (Ij|ak) - (Ik|aj) = (ka|Ij) - (ja|Ik)

      !Note: the opposite spin contribution comes in 2 parts, once (ka|Ij) and once (ja|Ik) only,

      !      where both spin-orbitals on one center have the same spin, but it is the opposite of

      !      the spin on the other center. Because it is eventually summed, can consider only one

      !      of the 2 terms, but with a factor 2


      !Iterate over the tensor and sum


!$OMP PARALLEL DEFAULT(NONE) REDUCTION(+:contrib,dev) &

!$OMP SHARED(ja_Ik,j_evals,a_evals,k_evals,omega,c_os,a_offset) &

!$OMP PRIVATE(iter,ind,boff,bsize,ja_Ik_blk,found,idx1,idx2,idx3,j,A,k,denom,tmp)

      CALL dbt_iterator_start(iter, ja_ik)

      DO WHILE (dbt_iterator_blocks_left(iter))

         CALL dbt_iterator_next_block(iter, ind, blk_offset=boff, blk_size=bsize)

         CALL dbt_get_block(ja_ik, ind, ja_ik_blk, found)


         IF (found) THEN


            DO idx3 = 1, bsize(3)

               DO idx2 = 1, bsize(2)

                  DO idx1 = 1, bsize(1)


                     !get proper MO indices

                     j = boff(1) + idx1 - 1

                     a = boff(2) + idx2 - 1 - a_offset

                     k = boff(3) + idx3 - 1


                     !the denominator

                     denom = omega + a_evals(a) - j_evals(j) - k_evals(k)


                     !the opposite spin contribution

                     tmp = c_os*ja_ik_blk(idx1, idx2, idx3)**2


                     !take factor 2 into acocunt (2 x 0.5 = 1)

                     contrib = contrib + tmp/denom

                     dev = dev - tmp/denom**2


                  END DO

               END DO

            END DO

         END IF

         DEALLOCATE (ja_ik_blk)

      END DO

      CALL dbt_iterator_stop(iter)

!$OMP END PARALLEL


      CALL timestop(handle)


   END SUBROUTINE calc_os_oov_contrib


! **************************************************************************************************

!> \brief Computes the same-spin occupied-virtual-virtual MO contribution to the electron propagator:

!>        0.5 * sum_abj |<Ij||ab>|^2/(omega + eps_j - eps_a - eps_b) as well as its first derivative:

!>        -0.5 * sum_abj |<Ij||ab>|^2/(omega + eps_j - eps_a - eps_b)**2

!> \param contrib ...

!> \param dev the first derivative

!> \param aj_Ib_diff contatins the (aj|Ib) - (bj|Ia) tensor

!> \param occ_evals ...

!> \param virt_evals ...

!> \param omega ...

!> \param c_ss ...

!> \note since the is same-spin, there is only one possibility for occ_evals and virt_evals

! **************************************************************************************************

   SUBROUTINE calc_ss_ovv_contrib(contrib, dev, aj_Ib_diff, occ_evals, virt_evals, omega, c_ss)


      REAL(dp), INTENT(inout)                            :: contrib, dev

      TYPE(dbt_type), INTENT(inout)                      :: aj_ib_diff

      REAL(dp), DIMENSION(:), INTENT(IN)                 :: occ_evals, virt_evals

      REAL(dp), INTENT(in)                               :: omega, c_ss


      CHARACTER(len=*), PARAMETER :: routinen = 'calc_ss_ovv_contrib'


      INTEGER                                            :: a, b, boff(3), bsize(3), handle, idx1, &

                                                            idx2, idx3, ind(3), j, nocc

      LOGICAL                                            :: found

      REAL(dp)                                           :: denom, tmp

      REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: tensor_blk

      TYPE(dbt_iterator_type)                            :: iter


      CALL timeset(routinen, handle)


      !<Ij||ab> = <Ij|ab> - <Ij|ba> = (Ia|jb) - (Ib|ja) = (jb|Ia) - (ja|Ib)

      !Notes: only non-zero contribution if all MOs have the same spin


      nocc = SIZE(occ_evals, 1)


      !tensors have matching distributions, can do that safely


!$OMP PARALLEL DEFAULT(NONE) REDUCTION(+:contrib,dev) &

!$OMP SHARED(aj_Ib_diff,occ_evals,virt_evals,omega,c_ss,nocc) &

!$OMP PRIVATE(iter,ind,boff,bsize,tensor_blk,found,idx1,idx2,idx3,j,A,b,denom,tmp)

      CALL dbt_iterator_start(iter, aj_ib_diff)

      DO WHILE (dbt_iterator_blocks_left(iter))

         CALL dbt_iterator_next_block(iter, ind, blk_offset=boff, blk_size=bsize)

         CALL dbt_get_block(aj_ib_diff, ind, tensor_blk, found)


         IF (found) THEN


            DO idx3 = 1, bsize(3)

               DO idx2 = 1, bsize(2)

                  DO idx1 = 1, bsize(1)


                     !get proper MO indices

                     a = boff(1) + idx1 - 1 - nocc

                     j = boff(2) + idx2 - 1

                     b = boff(3) + idx3 - 1 - nocc


                     !the common denominator

                     denom = omega + occ_evals(j) - virt_evals(a) - virt_evals(b)


                     !the same spin contribution

                     tmp = c_ss*tensor_blk(idx1, idx2, idx3)**2


                     contrib = contrib + 0.5_dp*tmp/denom

                     dev = dev - 0.5_dp*tmp/denom**2


                  END DO

               END DO

            END DO

         END IF

         DEALLOCATE (tensor_blk)

      END DO

      CALL dbt_iterator_stop(iter)

!$OMP END PARALLEL


      CALL timestop(handle)


   END SUBROUTINE calc_ss_ovv_contrib


! **************************************************************************************************

!> \brief Computes the opposite-spin occupied-virtual-virtual MO contribution to the electron propagator:

!>        0.5 * sum_abj |<Ij||ab>|^2/(omega + eps_j - eps_a - eps_b) as well as its first derivative:

!>        -0.5 * sum_abj |<Ij||ab>|^2/(omega + eps_j - eps_a - eps_b)**2

!> \param contrib ...

!> \param dev the first derivative

!> \param aj_Ib ...

!> \param a_evals virtual evals for a MO

!> \param j_evals occupied evals for j MO

!> \param b_evals virtual evals for b MO

!> \param omega ...

!> \param c_os ...

!> \param a_offset number of occupied MOs for the same spin as a MO

!> \param b_offset number of occupied MOs for the same spin as b MO

!> \note since this is opposite-spin, evals might be different for different spins

! **************************************************************************************************

   SUBROUTINE calc_os_ovv_contrib(contrib, dev, aj_Ib, a_evals, j_evals, b_evals, omega, c_os, &

                                  a_offset, b_offset)


      REAL(dp), INTENT(inout)                            :: contrib, dev

      TYPE(dbt_type), INTENT(inout)                      :: aj_ib

      REAL(dp), DIMENSION(:), INTENT(IN)                 :: a_evals, j_evals, b_evals

      REAL(dp), INTENT(in)                               :: omega, c_os

      INTEGER, INTENT(IN)                                :: a_offset, b_offset


      CHARACTER(len=*), PARAMETER :: routinen = 'calc_os_ovv_contrib'


      INTEGER                                            :: a, b, boff(3), bsize(3), handle, idx1, &

                                                            idx2, idx3, ind(3), j

      LOGICAL                                            :: found

      REAL(dp)                                           :: denom, tmp

      REAL(dp), ALLOCATABLE, DIMENSION(:, :, :)          :: aj_ib_blk

      TYPE(dbt_iterator_type)                            :: iter


      CALL timeset(routinen, handle)


      !<Ij||ab> = <Ij|ab> - <Ij|ba> = (Ia|jb) - (Ib|ja) = (jb|Ia) - (ja|Ib)

      !Notes: only 2 distinct contributions, once from (jb|Ia) and once form (ja|Ib) only, when the 2

      !       MOs on one center have one spin and the 2 MOs on the other center have another spin

      !       In the end, the sum is such that can take one of those with a factor 2


!$OMP PARALLEL DEFAULT(NONE) REDUCTION(+:contrib,dev) &

!$OMP SHARED(aj_Ib,a_evals,j_evals,b_evals,omega,c_os,a_offset,b_offset) &

!$OMP PRIVATE(iter,ind,boff,bsize,aj_Ib_blk,found,idx1,idx2,idx3,j,A,b,denom,tmp)

      CALL dbt_iterator_start(iter, aj_ib)

      DO WHILE (dbt_iterator_blocks_left(iter))

         CALL dbt_iterator_next_block(iter, ind, blk_offset=boff, blk_size=bsize)

         CALL dbt_get_block(aj_ib, ind, aj_ib_blk, found)


         IF (found) THEN


            DO idx3 = 1, bsize(3)

               DO idx2 = 1, bsize(2)

                  DO idx1 = 1, bsize(1)


                     !get proper MO indices

                     a = boff(1) + idx1 - 1 - a_offset

                     j = boff(2) + idx2 - 1

                     b = boff(3) + idx3 - 1 - b_offset


                     !the denominator

                     denom = omega + j_evals(j) - a_evals(a) - b_evals(b)


                     !the opposite-spin contribution. Factor 2 taken into account (2 x 0.5 = 1)

                     tmp = c_os*(aj_ib_blk(idx1, idx2, idx3))**2


                     contrib = contrib + tmp/denom

                     dev = dev - tmp/denom**2


                  END DO

               END DO

            END DO

         END IF

         DEALLOCATE (aj_ib_blk)

      END DO

      CALL dbt_iterator_stop(iter)

!$OMP END PARALLEL


      CALL timestop(handle)


   END SUBROUTINE calc_os_ovv_contrib


! **************************************************************************************************

!> \brief We try to compute the spin-orbit splitting via perturbation theory. We keep it

!>\        cheap by only inculding the degenerate states (2p, 3d, 3p, etc.).

!> \param soc_shifts the SOC corrected orbital shifts to apply to original energies, for both spins

!> \param donor_state ...

!> \param xas_tdp_env ...

!> \param xas_tdp_control ...

!> \param qs_env ...

! **************************************************************************************************


   SUBROUTINE get_soc_splitting(soc_shifts, donor_state, xas_tdp_env, xas_tdp_control, qs_env)


      REAL(dp), ALLOCATABLE, DIMENSION(:, :), &

         INTENT(out)                                     :: soc_shifts

      TYPE(donor_state_type), POINTER                    :: donor_state

      TYPE(xas_tdp_env_type), POINTER                    :: xas_tdp_env

      TYPE(xas_tdp_control_type), POINTER                :: xas_tdp_control

      TYPE(qs_environment_type), POINTER                 :: qs_env


      CHARACTER(len=*), PARAMETER                        :: routinen = 'get_soc_splitting'


      COMPLEX(dp), ALLOCATABLE, DIMENSION(:, :)          :: evecs, hami

      INTEGER                                            :: beta_spin, handle, ialpha, ibeta, &

                                                            ido_mo, ispin, nao, ndo_mo, ndo_so, &

                                                            nspins

      REAL(dp)                                           :: alpha_tot_contrib, beta_tot_contrib

      REAL(dp), ALLOCATABLE, DIMENSION(:)                :: evals

      REAL(dp), ALLOCATABLE, DIMENSION(:, :)             :: tmp_shifts

      TYPE(cp_blacs_env_type), POINTER                   :: blacs_env

      TYPE(cp_cfm_type)                                  :: hami_cfm

      TYPE(cp_fm_struct_type), POINTER                   :: ao_domo_struct, domo_domo_struct, &

                                                            doso_doso_struct

      TYPE(cp_fm_type)                                   :: alpha_gs_coeffs, ao_domo_work, &

                                                            beta_gs_coeffs, domo_domo_work, &

                                                            img_fm, real_fm

      TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: matrix_ks

      TYPE(dbcsr_type), POINTER                          :: orb_soc_x, orb_soc_y, orb_soc_z

      TYPE(mp_para_env_type), POINTER                    :: para_env


      NULLIFY (matrix_ks, para_env, blacs_env, ao_domo_struct, domo_domo_struct, &

               doso_doso_struct, orb_soc_x, orb_soc_y, orb_soc_z)


      CALL timeset(routinen, handle)


      ! Idea: we compute the SOC matrix in the space of the degenerate spin-orbitals, add it to

      !       the KS matrix in the same basis, diagonalize the whole thing and get the corrected energies

      !       for SOC


      CALL get_qs_env(qs_env, matrix_ks=matrix_ks, para_env=para_env, blacs_env=blacs_env)


      orb_soc_x => xas_tdp_env%orb_soc(1)%matrix

      orb_soc_y => xas_tdp_env%orb_soc(2)%matrix

      orb_soc_z => xas_tdp_env%orb_soc(3)%matrix


      ! Whether it is open-shell or not, we have 2*ndo_mo spin-orbitals

      nspins = 2

      ndo_mo = donor_state%ndo_mo

      ndo_so = nspins*ndo_mo

      CALL dbcsr_get_info(matrix_ks(1)%matrix, nfullrows_total=nao)


      ! Build the fm infrastructure

      CALL cp_fm_struct_create(ao_domo_struct, context=blacs_env, para_env=para_env, &

                               nrow_global=nao, ncol_global=ndo_mo)

      CALL cp_fm_struct_create(domo_domo_struct, context=blacs_env, para_env=para_env, &

                               nrow_global=ndo_mo, ncol_global=ndo_mo)

      CALL cp_fm_struct_create(doso_doso_struct, context=blacs_env, para_env=para_env, &

                               nrow_global=ndo_so, ncol_global=ndo_so)


      CALL cp_fm_create(alpha_gs_coeffs, ao_domo_struct)

      CALL cp_fm_create(beta_gs_coeffs, ao_domo_struct)

      CALL cp_fm_create(ao_domo_work, ao_domo_struct)

      CALL cp_fm_create(domo_domo_work, domo_domo_struct)

      CALL cp_fm_create(real_fm, doso_doso_struct)

      CALL cp_fm_create(img_fm, doso_doso_struct)


      ! Put the gs_coeffs in the correct format.

      IF (xas_tdp_control%do_uks) THEN


         CALL cp_fm_to_fm_submat(msource=donor_state%gs_coeffs, mtarget=alpha_gs_coeffs, nrow=nao, &

                                 ncol=ndo_mo, s_firstrow=1, s_firstcol=1, t_firstrow=1, t_firstcol=1)

         CALL cp_fm_to_fm_submat(msource=donor_state%gs_coeffs, mtarget=beta_gs_coeffs, nrow=nao, &

                                 ncol=ndo_mo, s_firstrow=1, s_firstcol=ndo_mo + 1, t_firstrow=1, t_firstcol=1)


      ELSE


         CALL cp_fm_to_fm(donor_state%gs_coeffs, alpha_gs_coeffs)

         CALL cp_fm_to_fm(donor_state%gs_coeffs, beta_gs_coeffs)

      END IF


      ! Compute the KS matrix in this basis, add it to the real part of the final matrix

      !alpha-alpha block in upper left quadrant

      CALL cp_dbcsr_sm_fm_multiply(matrix_ks(1)%matrix, alpha_gs_coeffs, ao_domo_work, ncol=ndo_mo)

      CALL parallel_gemm('T', 'N', ndo_mo, ndo_mo, nao, 1.0_dp, alpha_gs_coeffs, ao_domo_work, 0.0_dp, &

                         domo_domo_work)

      CALL cp_fm_to_fm_submat(msource=domo_domo_work, mtarget=real_fm, nrow=ndo_mo, ncol=ndo_mo, &

                              s_firstrow=1, s_firstcol=1, t_firstrow=1, t_firstcol=1)


      !beta-beta block in lower right quadrant

      beta_spin = 1; IF (xas_tdp_control%do_uks .OR. xas_tdp_control%do_roks) beta_spin = 2

      CALL cp_dbcsr_sm_fm_multiply(matrix_ks(beta_spin)%matrix, beta_gs_coeffs, ao_domo_work, ncol=ndo_mo)

      CALL parallel_gemm('T', 'N', ndo_mo, ndo_mo, nao, 1.0_dp, beta_gs_coeffs, ao_domo_work, 0.0_dp, &

                         domo_domo_work)

      CALL cp_fm_to_fm_submat(msource=domo_domo_work, mtarget=real_fm, nrow=ndo_mo, ncol=ndo_mo, &

                              s_firstrow=1, s_firstcol=1, t_firstrow=ndo_mo + 1, t_firstcol=ndo_mo + 1)


      ! Compute the SOC matrix elements and add them to the real or imaginary part of the matrix

      ! alpha-alpha block, only Hz not zero, purely imaginary, addition

      CALL cp_dbcsr_sm_fm_multiply(orb_soc_z, alpha_gs_coeffs, ao_domo_work, ncol=ndo_mo)

      CALL parallel_gemm('T', 'N', ndo_mo, ndo_mo, nao, 1.0_dp, alpha_gs_coeffs, ao_domo_work, 0.0_dp, &

                         domo_domo_work)

      CALL cp_fm_to_fm_submat(msource=domo_domo_work, mtarget=img_fm, nrow=ndo_mo, ncol=ndo_mo, &

                              s_firstrow=1, s_firstcol=1, t_firstrow=1, t_firstcol=1)


      ! beta-beta block, only Hz not zero, purely imaginary, substraciton

      CALL cp_dbcsr_sm_fm_multiply(orb_soc_z, beta_gs_coeffs, ao_domo_work, ncol=ndo_mo)

      CALL parallel_gemm('T', 'N', ndo_mo, ndo_mo, nao, -1.0_dp, beta_gs_coeffs, ao_domo_work, 0.0_dp, &

                         domo_domo_work)

      CALL cp_fm_to_fm_submat(msource=domo_domo_work, mtarget=img_fm, nrow=ndo_mo, ncol=ndo_mo, &

                              s_firstrow=1, s_firstcol=1, t_firstrow=ndo_mo + 1, t_firstcol=ndo_mo + 1)


      ! alpha-beta block, two non-zero terms in Hx and Hy

      ! Hx term, purely imaginary, addition

      CALL cp_dbcsr_sm_fm_multiply(orb_soc_x, beta_gs_coeffs, ao_domo_work, ncol=ndo_mo)

      CALL parallel_gemm('T', 'N', ndo_mo, ndo_mo, nao, 1.0_dp, alpha_gs_coeffs, ao_domo_work, 0.0_dp, &

                         domo_domo_work)

      CALL cp_fm_to_fm_submat(msource=domo_domo_work, mtarget=img_fm, nrow=ndo_mo, ncol=ndo_mo, &

                              s_firstrow=1, s_firstcol=1, t_firstrow=1, t_firstcol=ndo_mo + 1)

      ! Hy term, purely real, addition

      CALL cp_dbcsr_sm_fm_multiply(orb_soc_y, beta_gs_coeffs, ao_domo_work, ncol=ndo_mo)

      CALL parallel_gemm('T', 'N', ndo_mo, ndo_mo, nao, 1.0_dp, alpha_gs_coeffs, ao_domo_work, 0.0_dp, &

                         domo_domo_work)

      CALL cp_fm_to_fm_submat(msource=domo_domo_work, mtarget=real_fm, nrow=ndo_mo, ncol=ndo_mo, &

                              s_firstrow=1, s_firstcol=1, t_firstrow=1, t_firstcol=ndo_mo + 1)


      ! beta-alpha block, two non-zero terms in Hx and Hy

      ! Hx term, purely imaginary, addition

      CALL cp_dbcsr_sm_fm_multiply(orb_soc_x, alpha_gs_coeffs, ao_domo_work, ncol=ndo_mo)

      CALL parallel_gemm('T', 'N', ndo_mo, ndo_mo, nao, 1.0_dp, beta_gs_coeffs, ao_domo_work, 0.0_dp, &

                         domo_domo_work)

      CALL cp_fm_to_fm_submat(msource=domo_domo_work, mtarget=img_fm, nrow=ndo_mo, ncol=ndo_mo, &

                              s_firstrow=1, s_firstcol=1, t_firstrow=ndo_mo + 1, t_firstcol=1)

      ! Hy term, purely real, substraction

      CALL cp_dbcsr_sm_fm_multiply(orb_soc_y, alpha_gs_coeffs, ao_domo_work, ncol=ndo_mo)

      CALL parallel_gemm('T', 'N', ndo_mo, ndo_mo, nao, -1.0_dp, beta_gs_coeffs, ao_domo_work, 0.0_dp, &

                         domo_domo_work)

      CALL cp_fm_to_fm_submat(msource=domo_domo_work, mtarget=real_fm, nrow=ndo_mo, ncol=ndo_mo, &

                              s_firstrow=1, s_firstcol=1, t_firstrow=ndo_mo + 1, t_firstcol=1)


      ! Cast everything in complex fm format

      CALL cp_cfm_create(hami_cfm, doso_doso_struct)

      CALL cp_fm_to_cfm(real_fm, img_fm, hami_cfm)


      ! And diagonalize. Since tiny matrix (6x6), diagonalize locally

      ALLOCATE (evals(ndo_so), evecs(ndo_so, ndo_so), hami(ndo_so, ndo_so))

      CALL cp_cfm_get_submatrix(hami_cfm, hami)

      CALL complex_diag(hami, evecs, evals)


      !The SOC corrected KS eigenvalues

      ALLOCATE (tmp_shifts(ndo_mo, 2))


      ialpha = 1; ibeta = 1;

      DO ido_mo = 1, ndo_so

         !need to find out whether the eigenvalue corresponds to an alpha or beta spin-orbtial

         alpha_tot_contrib = real(dot_product(evecs(1:ndo_mo, ido_mo), evecs(1:ndo_mo, ido_mo)))

         beta_tot_contrib = real(dot_product(evecs(ndo_mo + 1:ndo_so, ido_mo), evecs(ndo_mo + 1:ndo_so, ido_mo)))


         IF (alpha_tot_contrib > beta_tot_contrib) THEN

            tmp_shifts(ialpha, 1) = evals(ido_mo)

            ialpha = ialpha + 1

         ELSE

            tmp_shifts(ibeta, 2) = evals(ido_mo)

            ibeta = ibeta + 1

         END IF

      END DO


      !compute shift from KS evals

      ALLOCATE (soc_shifts(ndo_mo, SIZE(donor_state%energy_evals, 2)))

      DO ispin = 1, SIZE(donor_state%energy_evals, 2)

         soc_shifts(:, ispin) = tmp_shifts(:, ispin) - donor_state%energy_evals(:, ispin)

      END DO


      ! clean-up

      CALL cp_fm_release(alpha_gs_coeffs)

      CALL cp_fm_release(beta_gs_coeffs)

      CALL cp_fm_release(ao_domo_work)

      CALL cp_fm_release(domo_domo_work)

      CALL cp_fm_release(real_fm)

      CALL cp_fm_release(img_fm)


      CALL cp_cfm_release(hami_cfm)


      CALL cp_fm_struct_release(ao_domo_struct)

      CALL cp_fm_struct_release(domo_domo_struct)

      CALL cp_fm_struct_release(doso_doso_struct)


      CALL timestop(handle)


   END SUBROUTINE get_soc_splitting


END MODULE xas_tdp_correction

idx2
#define idx2(a, i, j)
Definition grid_dgemm_tensor_local.h:258

idx3
#define idx3(a, i, j, k)
Definition grid_dgemm_tensor_local.h:257

cp_dbcsr_operations::dbcsr_deallocate_matrix_set
Definition cp_dbcsr_operations.F:89

cp_fm_types::cp_fm_release
Definition cp_fm_types.F:90

cp_fm_types::cp_fm_to_fm
Definition cp_fm_types.F:85

parallel_gemm_api::parallel_gemm
Definition parallel_gemm_api.F:38

admm_types
Types and set/get functions for auxiliary density matrix methods.
Definition admm_types.F:15

admm_utils
Contains methods used in the context of density fitting.
Definition admm_utils.F:15

admm_utils::admm_correct_for_eigenvalues
subroutine, public admm_correct_for_eigenvalues(ispin, admm_env, ks_matrix)
...
Definition admm_utils.F:53

bibliography
collects all references to literature in CP2K as new algorithms / method are included from literature...
Definition bibliography.F:28

bibliography::shigeta2001
integer, save, public shigeta2001
Definition bibliography.F:43

bibliography::bussy2021b
integer, save, public bussy2021b
Definition bibliography.F:43

cp_array_utils
various utilities that regard array of different kinds: output, allocation,... maybe it is not a good...
Definition cp_array_utils.F:18

cp_blacs_env
methods related to the blacs parallel environment
Definition cp_blacs_env.F:15

cp_cfm_types
Represents a complex full matrix distributed on many processors.
Definition cp_cfm_types.F:12

cp_cfm_types::cp_cfm_get_submatrix
subroutine, public cp_cfm_get_submatrix(fm, target_m, start_row, start_col, n_rows, n_cols, transpose)
Extract a sub-matrix from the full matrix: op(target_m)(1:n_rows,1:n_cols) = fm(start_row:start_row+n...
Definition cp_cfm_types.F:333

cp_cfm_types::cp_cfm_create
subroutine, public cp_cfm_create(matrix, matrix_struct, name)
Creates a new full matrix with the given structure.
Definition cp_cfm_types.F:121

cp_cfm_types::cp_cfm_release
subroutine, public cp_cfm_release(matrix)
Releases a full matrix.
Definition cp_cfm_types.F:159

cp_cfm_types::cp_fm_to_cfm
subroutine, public cp_fm_to_cfm(msourcer, msourcei, mtarget)
Construct a complex full matrix by taking its real and imaginary parts from two separate real-value f...
Definition cp_cfm_types.F:817

cp_control_types
Defines control structures, which contain the parameters and the settings for the DFT-based calculati...
Definition cp_control_types.F:12

cp_dbcsr_operations
DBCSR operations in CP2K.
Definition cp_dbcsr_operations.F:18

cp_dbcsr_operations::cp_dbcsr_sm_fm_multiply
subroutine, public cp_dbcsr_sm_fm_multiply(matrix, fm_in, fm_out, ncol, alpha, beta)
multiply a dbcsr with a fm matrix
Definition cp_dbcsr_operations.F:744

cp_dbcsr_operations::copy_fm_to_dbcsr
subroutine, public copy_fm_to_dbcsr(fm, matrix, keep_sparsity)
Copy a BLACS matrix to a dbcsr matrix.
Definition cp_dbcsr_operations.F:118

cp_fm_diag
used for collecting some of the diagonalization schemes available for cp_fm_type. cp_fm_power also mo...
Definition cp_fm_diag.F:17

cp_fm_diag::choose_eigv_solver
subroutine, public choose_eigv_solver(matrix, eigenvectors, eigenvalues, info)
Choose the Eigensolver depending on which library is available ELPA seems to be unstable for small sy...
Definition cp_fm_diag.F:208

cp_fm_struct
represent the structure of a full matrix
Definition cp_fm_struct.F:14

cp_fm_struct::cp_fm_struct_create
subroutine, public cp_fm_struct_create(fmstruct, para_env, context, nrow_global, ncol_global, nrow_block, ncol_block, descriptor, first_p_pos, local_leading_dimension, template_fmstruct, square_blocks, force_block)
allocates and initializes a full matrix structure
Definition cp_fm_struct.F:125

cp_fm_struct::cp_fm_struct_release
subroutine, public cp_fm_struct_release(fmstruct)
releases a full matrix structure
Definition cp_fm_struct.F:320

cp_fm_types
represent a full matrix distributed on many processors
Definition cp_fm_types.F:15

cp_fm_types::cp_fm_get_diag
subroutine, public cp_fm_get_diag(matrix, diag)
returns the diagonal elements of a fm
Definition cp_fm_types.F:570

cp_fm_types::cp_fm_to_fm_submat
subroutine, public cp_fm_to_fm_submat(msource, mtarget, nrow, ncol, s_firstrow, s_firstcol, t_firstrow, t_firstcol)
copy just a part ot the matrix
Definition cp_fm_types.F:1473

cp_fm_types::cp_fm_get_submatrix
subroutine, public cp_fm_get_submatrix(fm, target_m, start_row, start_col, n_rows, n_cols, transpose)
gets a submatrix of a full matrix op(target_m)(1:n_rows,1:n_cols) =fm(start_row:start_row+n_rows,...
Definition cp_fm_types.F:901

cp_fm_types::cp_fm_create
subroutine, public cp_fm_create(matrix, matrix_struct, name, use_sp)
creates a new full matrix with the given structure
Definition cp_fm_types.F:167

cp_log_handling
various routines to log and control the output. The idea is that decisions about where to log should ...
Definition cp_log_handling.F:41

cp_log_handling::cp_logger_get_default_io_unit
integer function, public cp_logger_get_default_io_unit(logger)
returns the unit nr for the ionode (-1 on all other processors) skips as well checks if the procs cal...
Definition cp_log_handling.F:515

dbt_api
This is the start of a dbt_api, all publically needed functions are exported here....
Definition dbt_api.F:17

hfx_admm_utils
Utilities for hfx and admm methods.
Definition hfx_admm_utils.F:17

hfx_admm_utils::create_admm_xc_section
subroutine, public create_admm_xc_section(x_data, xc_section, admm_env)
This routine modifies the xc section depending on the potential type used for the HF exchange and the...
Definition hfx_admm_utils.F:1460

input_section_types
objects that represent the structure of input sections and the data contained in an input section
Definition input_section_types.F:15

input_section_types::section_vals_create
recursive subroutine, public section_vals_create(section_vals, section)
creates a object where to store the values of a section
Definition input_section_types.F:601

input_section_types::section_vals_retain
subroutine, public section_vals_retain(section_vals)
retains the given section values (see doc/ReferenceCounting.html)
Definition input_section_types.F:631

input_section_types::section_vals_get_subs_vals
recursive type(section_vals_type) function, pointer, public section_vals_get_subs_vals(section_vals, subsection_name, i_rep_section, can_return_null)
returns the values of the requested subsection
Definition input_section_types.F:724

input_section_types::section_vals_set_subs_vals
subroutine, public section_vals_set_subs_vals(section_vals, subsection_name, new_section_vals, i_rep_section)
replaces of the requested subsection with the one given
Definition input_section_types.F:1679

input_section_types::section_vals_release
recursive subroutine, public section_vals_release(section_vals)
releases the given object
Definition input_section_types.F:646

kinds
Defines the basic variable types.
Definition kinds.F:23

kinds::dp
integer, parameter, public dp
Definition kinds.F:34

machine
Machine interface based on Fortran 2003 and POSIX.
Definition machine.F:17

machine::m_flush
subroutine, public m_flush(lunit)
flushes units if the &GLOBAL flag is set accordingly
Definition machine.F:106

mathlib
Collection of simple mathematical functions and subroutines.
Definition mathlib.F:15

mathlib::complex_diag
subroutine, public complex_diag(matrix, eigenvectors, eigenvalues)
Diagonalizes a local complex Hermitian matrix using LAPACK. Based on cp_cfm_heevd.
Definition mathlib.F:1746

message_passing
Interface to the message passing library MPI.
Definition message_passing.F:23

parallel_gemm_api
basic linear algebra operations for full matrixes
Definition parallel_gemm_api.F:14

physcon
Definition of physical constants:
Definition physcon.F:68

physcon::evolt
real(kind=dp), parameter, public evolt
Definition physcon.F:183

qs_environment_types
Definition qs_environment_types.F:14

qs_environment_types::get_qs_env
subroutine, public get_qs_env(qs_env, atomic_kind_set, qs_kind_set, cell, super_cell, cell_ref, use_ref_cell, kpoints, dft_control, mos, sab_orb, sab_all, qmmm, qmmm_periodic, sac_ae, sac_ppl, sac_lri, sap_ppnl, sab_vdw, sab_scp, sap_oce, sab_lrc, sab_se, sab_xtbe, sab_tbe, sab_core, sab_xb, sab_xtb_nonbond, sab_almo, sab_kp, sab_kp_nosym, particle_set, energy, force, matrix_h, matrix_h_im, matrix_ks, matrix_ks_im, matrix_vxc, run_rtp, rtp, matrix_h_kp, matrix_h_im_kp, matrix_ks_kp, matrix_ks_im_kp, matrix_vxc_kp, kinetic_kp, matrix_s_kp, matrix_w_kp, matrix_s_ri_aux_kp, matrix_s, matrix_s_ri_aux, matrix_w, matrix_p_mp2, matrix_p_mp2_admm, rho, rho_xc, pw_env, ewald_env, ewald_pw, active_space, mpools, input, para_env, blacs_env, scf_control, rel_control, kinetic, qs_charges, vppl, rho_core, rho_nlcc, rho_nlcc_g, ks_env, ks_qmmm_env, wf_history, scf_env, local_particles, local_molecules, distribution_2d, dbcsr_dist, molecule_kind_set, molecule_set, subsys, cp_subsys, oce, local_rho_set, rho_atom_set, task_list, task_list_soft, rho0_atom_set, rho0_mpole, rhoz_set, ecoul_1c, rho0_s_rs, rho0_s_gs, do_kpoints, has_unit_metric, requires_mo_derivs, mo_derivs, mo_loc_history, nkind, natom, nelectron_total, nelectron_spin, efield, neighbor_list_id, linres_control, xas_env, virial, cp_ddapc_env, cp_ddapc_ewald, outer_scf_history, outer_scf_ihistory, x_data, et_coupling, dftb_potential, results, se_taper, se_store_int_env, se_nddo_mpole, se_nonbond_env, admm_env, lri_env, lri_density, exstate_env, ec_env, dispersion_env, gcp_env, vee, rho_external, external_vxc, mask, mp2_env, bs_env, kg_env, wanniercentres, atprop, ls_scf_env, do_transport, transport_env, v_hartree_rspace, s_mstruct_changed, rho_changed, potential_changed, forces_up_to_date, mscfg_env, almo_scf_env, gradient_history, variable_history, embed_pot, spin_embed_pot, polar_env, mos_last_converged, rhs)
Get the QUICKSTEP environment.
Definition qs_environment_types.F:502

qs_ks_methods
routines that build the Kohn-Sham matrix (i.e calculate the coulomb and xc parts
Definition qs_ks_methods.F:22

qs_ks_methods::qs_ks_build_kohn_sham_matrix
subroutine, public qs_ks_build_kohn_sham_matrix(qs_env, calculate_forces, just_energy, print_active, ext_ks_matrix)
routine where the real calculations are made: the KS matrix is calculated
Definition qs_ks_methods.F:173

qs_mo_types
Definition and initialisation of the mo data type.
Definition qs_mo_types.F:22

qs_mo_types::duplicate_mo_set
subroutine, public duplicate_mo_set(mo_set_new, mo_set_old)
allocate a new mo_set, and copy the old data
Definition qs_mo_types.F:149

qs_mo_types::deallocate_mo_set
subroutine, public deallocate_mo_set(mo_set)
Deallocate a wavefunction data structure.
Definition qs_mo_types.F:352

qs_mo_types::get_mo_set
subroutine, public get_mo_set(mo_set, maxocc, homo, lfomo, nao, nelectron, n_el_f, nmo, eigenvalues, occupation_numbers, mo_coeff, mo_coeff_b, uniform_occupation, kts, mu, flexible_electron_count)
Get the components of a MO set data structure.
Definition qs_mo_types.F:397

qs_mo_types::reassign_allocated_mos
subroutine, public reassign_allocated_mos(mo_set_new, mo_set_old)
reassign an already allocated mo_set
Definition qs_mo_types.F:109

util
All kind of helpful little routines.
Definition util.F:14

util::get_limit
pure integer function, dimension(2), public get_limit(m, n, me)
divide m entries into n parts, return size of part me
Definition util.F:333

xas_tdp_correction
Second order perturbation correction to XAS_TDP spectra (i.e. shift)
Definition xas_tdp_correction.F:13

xas_tdp_correction::gw2x_shift
subroutine, public gw2x_shift(donor_state, xas_tdp_env, xas_tdp_control, qs_env)
Computes the ionization potential using the GW2X method of Shigeta et. al. The result cam be used for...
Definition xas_tdp_correction.F:103

xas_tdp_correction::get_soc_splitting
subroutine, public get_soc_splitting(soc_shifts, donor_state, xas_tdp_env, xas_tdp_control, qs_env)
We try to compute the spin-orbit splitting via perturbation theory. We keep it \ cheap by only inculd...
Definition xas_tdp_correction.F:1734

xas_tdp_kernel
All the kernel specific subroutines for XAS TDP calculations.
Definition xas_tdp_kernel.F:13

xas_tdp_kernel::contract2_ao_to_domo
subroutine, public contract2_ao_to_domo(contr_int, op_type, donor_state, xas_tdp_env, xas_tdp_control, qs_env)
Contract the ri 3-center integrals stored in a tensor with repect to the donor MOs coeffs,...
Definition xas_tdp_kernel.F:976

xas_tdp_kernel::ri_all_blocks_mm
subroutine, public ri_all_blocks_mm(contr_int, pq)
Multiply all the blocks of a contracted RI integral (aI|P) by a matrix of type (P|....
Definition xas_tdp_kernel.F:1267

xas_tdp_types
Define XAS TDP control type and associated create, release, etc subroutines, as well as XAS TDP envir...
Definition xas_tdp_types.F:13

admm_types::admm_type
stores some data used in wavefunction fitting
Definition admm_types.F:120

cp_array_utils::cp_1d_i_p_type
represent a pointer to a 1d array
Definition cp_array_utils.F:221

cp_array_utils::cp_1d_r_p_type
represent a pointer to a 1d array
Definition cp_array_utils.F:99

cp_blacs_env::cp_blacs_env_type
represent a blacs multidimensional parallel environment (for the mpi corrispective see cp_paratypes/m...
Definition cp_blacs_env.F:53

cp_cfm_types::cp_cfm_type
Represent a complex full matrix.
Definition cp_cfm_types.F:68

cp_control_types::dft_control_type
Definition cp_control_types.F:559

cp_fm_struct::cp_fm_struct_p_type
Definition cp_fm_struct.F:93

cp_fm_struct::cp_fm_struct_type
keeps the information about the structure of a full matrix
Definition cp_fm_struct.F:82

cp_fm_types::cp_fm_type
represent a full matrix
Definition cp_fm_types.F:116

input_section_types::section_vals_type
stores the values of a section
Definition input_section_types.F:126

message_passing::mp_para_env_type
stores all the informations relevant to an mpi environment
Definition message_passing.F:763

qs_environment_types::qs_environment_type
Definition qs_environment_types.F:215

qs_mo_types::mo_set_type
Definition qs_mo_types.F:46

xas_tdp_types::donor_state_type
Type containing informations about a single donor state.
Definition xas_tdp_types.F:269

xas_tdp_types::xas_tdp_control_type
Type containing control information for TDP XAS calculations.
Definition xas_tdp_types.F:102

xas_tdp_types::xas_tdp_env_type
Type containing informations such as inputs and results for TDP XAS calculations.
Definition xas_tdp_types.F:194