rpa_grad.F (git:1f285aa)
!--------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                               !
!   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                     !
!                                                                                                    !
!   SPDX-License-Identifier: GPL-2.0-or-later                                                        !
!--------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \brief Routines to calculate RI-RPA and SOS-MP2 gradients
!> \par History
!>      10.2021 created [Frederick Stein]
! **************************************************************************************************
MODULE rpa_grad
   USE iso_c_binding, ONLY: c_null_ptr, &
                            c_ptr, &
                            c_associated
   USE cp_array_utils, ONLY: cp_1d_r_cp_type, &
                             cp_3d_r_cp_type
   USE cp_blacs_env, ONLY: cp_blacs_env_type
   USE cp_fm_basic_linalg, ONLY: cp_fm_geadd, &
                                 cp_fm_scale_and_add, &
                                 cp_fm_upper_to_full
   USE cp_fm_cholesky, ONLY: cp_fm_cholesky_invert
   USE cp_fm_struct, ONLY: cp_fm_struct_get, &
                           cp_fm_struct_type
   USE cp_fm_types, ONLY: &
      cp_fm_create, cp_fm_get_info, cp_fm_indxg2l, cp_fm_indxg2p, cp_fm_release, cp_fm_set_all, &
      cp_fm_to_fm, cp_fm_to_fm_submat_general, cp_fm_type
   USE dgemm_counter_types, ONLY: dgemm_counter_start, &
                                  dgemm_counter_stop, &
                                  dgemm_counter_type
   USE group_dist_types, ONLY: create_group_dist, &
                               get_group_dist, &
                               group_dist_d1_type, &
                               group_dist_proc, &
                               maxsize, &
                               release_group_dist
   USE kahan_sum, ONLY: accurate_dot_product, &
                        accurate_dot_product_2
   USE kinds, ONLY: dp, &
                    int_8
   USE local_gemm_api, ONLY: local_gemm, &
                             local_gemm_create, &
                             local_gemm_destroy, &
                             local_gemm_pu_gpu, &
                             local_gemm_set_op_threshold_gpu
   USE machine, ONLY: m_flush, &
                      m_memory
   USE mathconstants, ONLY: pi
   USE message_passing, ONLY: mp_comm_type, &
                              mp_para_env_type, &
                              mp_request_null, &
                              mp_request_type, &
                              mp_waitall, &
                              mp_waitany
   USE mp2_ri_grad_util, ONLY: array2fm, &
                               fm2array, &
                               prepare_redistribution
   USE mp2_types, ONLY: mp2_type, &
                        one_dim_int_array, &
                        two_dim_int_array, &
                        two_dim_real_array
   USE parallel_gemm_api, ONLY: parallel_gemm
   USE qs_environment_types, ONLY: get_qs_env, &
                                   qs_environment_type
   USE rpa_util, ONLY: calc_fm_mat_s_rpa, &
                       remove_scaling_factor_rpa
   USE util, ONLY: get_limit
#include "./base/base_uses.f90"
75 
76  IMPLICIT NONE
77 
78  PRIVATE
79 
80  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'rpa_grad'
81 
83 
84  TYPE sos_mp2_grad_work_type
85  PRIVATE
86  INTEGER, DIMENSION(:, :), ALLOCATABLE :: pair_list
87  TYPE(one_dim_int_array), DIMENSION(:), ALLOCATABLE :: index2send, index2recv
88  REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: p
89  END TYPE
90 
91  TYPE rpa_grad_work_type
92  TYPE(cp_fm_type) :: fm_mat_Q_copy = cp_fm_type()
93  TYPE(one_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2send
94  TYPE(two_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2recv
95  TYPE(group_dist_d1_type), DIMENSION(:), ALLOCATABLE :: gd_homo, gd_virtual
96  INTEGER, DIMENSION(2) :: grid = -1, mepos = -1
97  TYPE(two_dim_real_array), DIMENSION(:), ALLOCATABLE :: P_ij, P_ab
98  END TYPE
99 
100  TYPE rpa_grad_type
101  PRIVATE
102  TYPE(cp_fm_type) :: fm_Gamma_PQ = cp_fm_type()
103  TYPE(cp_fm_type), DIMENSION(:), ALLOCATABLE :: fm_Y
104  TYPE(sos_mp2_grad_work_type), ALLOCATABLE, DIMENSION(:) :: sos_mp2_work_occ, sos_mp2_work_virt
105  TYPE(rpa_grad_work_type) :: rpa_work
106  END TYPE
107 
108  INTEGER, PARAMETER :: spla_threshold = 128*128*128*2
109  INTEGER, PARAMETER :: blksize_threshold = 4
110 
111 CONTAINS

! **************************************************************************************************
!> \brief Calculates the necessary minimum memory for the gradient code in MiB
!> \param homo ...
!> \param virtual ...
!> \param dimen_RI ...
!> \param mem_per_rank ...
!> \param mem_per_repl ...
!> \param do_ri_sos_laplace_mp2 ...
! **************************************************************************************************
   PURE SUBROUTINE rpa_grad_needed_mem(homo, virtual, dimen_RI, mem_per_rank, mem_per_repl, do_ri_sos_laplace_mp2)
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
      INTEGER, INTENT(IN) :: dimen_RI
      REAL(KIND=dp), INTENT(INOUT) :: mem_per_rank, mem_per_repl
      LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2

      REAL(KIND=dp) :: mem_iaK, mem_KL, mem_Pab, mem_Pij

      mem_iaK = SUM(REAL(virtual, KIND=dp)*homo)*dimen_RI
      mem_Pij = SUM(REAL(homo, KIND=dp)**2)
      mem_Pab = SUM(REAL(virtual, KIND=dp)**2)
      mem_KL = REAL(dimen_RI, KIND=dp)*dimen_RI

      ! Required iaK matrices:
      !   Ytot_iaP = sum_tau Y_iaP(tau)
      !   Y_iaP(tau) = S_iaP(tau)*Q_PQ(tau) (work array)
      ! Required density matrices:
      !   Pij (local)
      !   Pab (local)
      ! Additionally with SOS-MP2:
      !   send and receive buffers for degenerate orbital pairs (rough estimate: everything)
      ! Additionally with RPA:
      !   copy of the work matrix
      !   receive buffer for the calculation of the density matrix
      !   copy of the matrix Q
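      ! Example (closed shell): homo = 100, virtual = 900 and dimen_RI = 3000 give
      ! mem_iaK = 3000*100*900 = 2.7E+8 elements, i.e. 2.7E+8*8/1024**2 ~ 2060 MiB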
      mem_per_rank = mem_per_rank + (mem_Pij + mem_Pab)*8.0_dp/(1024**2)
      mem_per_repl = mem_per_repl + (mem_iaK + 2.0_dp*mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
      IF (.NOT. do_ri_sos_laplace_mp2) THEN
         mem_per_repl = mem_per_repl + (mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
      END IF

   END SUBROUTINE rpa_grad_needed_mem

! **************************************************************************************************
!> \brief Creates the arrays of a rpa_grad_type
!> \param rpa_grad ...
!> \param fm_mat_Q ...
!> \param fm_mat_S ...
!> \param homo ...
!> \param virtual ...
!> \param mp2_env ...
!> \param Eigenval ...
!> \param unit_nr ...
!> \param do_ri_sos_laplace_mp2 ...
! **************************************************************************************************
   SUBROUTINE rpa_grad_create(rpa_grad, fm_mat_Q, fm_mat_S, &
                              homo, virtual, mp2_env, Eigenval, unit_nr, do_ri_sos_laplace_mp2)
      TYPE(rpa_grad_type), INTENT(OUT) :: rpa_grad
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
      TYPE(mp2_type), INTENT(INOUT) :: mp2_env
      REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
      INTEGER, INTENT(IN) :: unit_nr
      LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_create'

      INTEGER :: handle, ispin, nrow_local, nspins

      CALL timeset(routineN, handle)

      CALL cp_fm_create(rpa_grad%fm_Gamma_PQ, matrix_struct=fm_mat_Q%matrix_struct)
      CALL cp_fm_set_all(rpa_grad%fm_Gamma_PQ, 0.0_dp)

      nspins = SIZE(fm_mat_S)

      ALLOCATE (rpa_grad%fm_Y(nspins))
      DO ispin = 1, nspins
         CALL cp_fm_create(rpa_grad%fm_Y(ispin), fm_mat_S(ispin)%matrix_struct)
      END DO

      IF (do_ri_sos_laplace_mp2) THEN
         CALL sos_mp2_work_type_create(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
                                       unit_nr, Eigenval, homo, virtual, mp2_env%ri_grad%eps_canonical, fm_mat_S)
      ELSE
         CALL rpa_work_type_create(rpa_grad%rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
      END IF

      ! Set the block size
      CALL cp_fm_struct_get(fm_mat_S(1)%matrix_struct, nrow_local=nrow_local)
      IF (mp2_env%ri_grad%dot_blksize < 1) mp2_env%ri_grad%dot_blksize = nrow_local
      mp2_env%ri_grad%dot_blksize = MIN(mp2_env%ri_grad%dot_blksize, nrow_local)
      IF (unit_nr > 0) THEN
         WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Block size for the contraction:', mp2_env%ri_grad%dot_blksize
         CALL m_flush(unit_nr)
      END IF
      CALL fm_mat_S(1)%matrix_struct%para_env%sync()

      CALL timestop(handle)

   END SUBROUTINE rpa_grad_create

! **************************************************************************************************
!> \brief Creates the work arrays for the SOS-MP2 gradients: near-degenerate pair lists,
!>        density-matrix buffers and communication indices
!> \param sos_mp2_work_occ ...
!> \param sos_mp2_work_virt ...
!> \param unit_nr ...
!> \param Eigenval ...
!> \param homo ...
!> \param virtual ...
!> \param eps_degenerate ...
!> \param fm_mat_S ...
! **************************************************************************************************
   SUBROUTINE sos_mp2_work_type_create(sos_mp2_work_occ, sos_mp2_work_virt, unit_nr, &
                                       Eigenval, homo, virtual, eps_degenerate, fm_mat_S)
      TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
         DIMENSION(:), INTENT(OUT) :: sos_mp2_work_occ, sos_mp2_work_virt
      INTEGER, INTENT(IN) :: unit_nr
      REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
      REAL(KIND=dp), INTENT(IN) :: eps_degenerate
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S

      CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_work_type_create'

      INTEGER :: handle, ispin, nspins

      CALL timeset(routineN, handle)

      nspins = SIZE(fm_mat_S)
      ALLOCATE (sos_mp2_work_occ(nspins), sos_mp2_work_virt(nspins))
      DO ispin = 1, nspins

         CALL create_list_nearly_degen_pairs(Eigenval(1:homo(ispin), ispin), &
                                             eps_degenerate, sos_mp2_work_occ(ispin)%pair_list)
         IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
            "MO_INFO| Number of ij pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
         ALLOCATE (sos_mp2_work_occ(ispin)%P(homo(ispin) + SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)))
         sos_mp2_work_occ(ispin)%P = 0.0_dp
         CALL prepare_comm_Pij(sos_mp2_work_occ(ispin), virtual(ispin), fm_mat_S(ispin))

         CALL create_list_nearly_degen_pairs(Eigenval(homo(ispin) + 1:, ispin), &
                                             eps_degenerate, sos_mp2_work_virt(ispin)%pair_list)
         IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
            "MO_INFO| Number of ab pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
         ALLOCATE (sos_mp2_work_virt(ispin)%P(virtual(ispin) + SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)))
         sos_mp2_work_virt(ispin)%P = 0.0_dp
         CALL prepare_comm_Pab(sos_mp2_work_virt(ispin), virtual(ispin), fm_mat_S(ispin))
      END DO

      CALL timestop(handle)

   END SUBROUTINE

! **************************************************************************************************
!> \brief Creates the work environment for the RPA density matrices: process grid,
!>        orbital distributions and communication indices
!> \param rpa_work ...
!> \param fm_mat_Q ...
!> \param fm_mat_S ...
!> \param homo ...
!> \param virtual ...
! **************************************************************************************************
   SUBROUTINE rpa_work_type_create(rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
      TYPE(rpa_grad_work_type), INTENT(OUT) :: rpa_work
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_work_type_create'

      INTEGER :: avirt, col_global, col_local, first_p_pos(2), first_p_pos_col, handle, iocc, &
                 ispin, my_a, my_a_end, my_a_size, my_a_start, my_i, my_i_end, my_i_size, my_i_start, &
                 my_pcol, ncol_block, ncol_local, nspins, num_pe_col, proc_homo, proc_homo_send, &
                 proc_recv, proc_send, proc_virtual, proc_virtual_send
      INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
      INTEGER, DIMENSION(:), POINTER :: col_indices

      CALL timeset(routineN, handle)

      CALL cp_fm_create(rpa_work%fm_mat_Q_copy, matrix_struct=fm_mat_Q%matrix_struct)

      CALL fm_mat_S(1)%matrix_struct%context%get(number_of_process_columns=num_pe_col, my_process_column=my_pcol)

      nspins = SIZE(fm_mat_S)

      ALLOCATE (rpa_work%index2send(0:num_pe_col - 1, nspins), &
                rpa_work%index2recv(0:num_pe_col - 1, nspins), &
                rpa_work%gd_homo(nspins), rpa_work%gd_virtual(nspins), &
                data2send(0:num_pe_col - 1), data2recv(0:num_pe_col - 1), &
                rpa_work%P_ij(nspins), rpa_work%P_ab(nspins))

      ! Determine the new process grid
      proc_homo = MAX(1, CEILING(SQRT(REAL(num_pe_col, KIND=dp))))
      DO WHILE (MOD(num_pe_col, proc_homo) /= 0)
         proc_homo = proc_homo - 1
      END DO
      proc_virtual = num_pe_col/proc_homo

      rpa_work%grid(1) = proc_virtual
      rpa_work%grid(2) = proc_homo

      rpa_work%mepos(1) = MOD(my_pcol, proc_virtual)
      rpa_work%mepos(2) = my_pcol/proc_virtual
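      ! Example: num_pe_col = 12 gives proc_homo = 4 (CEILING(SQRT(12.0)) = 4 divides 12)
      ! and proc_virtual = 3, i.e. grid = (3, 4); process column 7 then gets
      ! mepos = (MOD(7, 3), 7/3) = (1, 2)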

      DO ispin = 1, nspins

         ! Determine the distributions of the orbitals
         CALL create_group_dist(rpa_work%gd_homo(ispin), proc_homo, homo(ispin))
         CALL create_group_dist(rpa_work%gd_virtual(ispin), proc_virtual, virtual(ispin))

         CALL cp_fm_struct_get(fm_mat_S(ispin)%matrix_struct, ncol_local=ncol_local, col_indices=col_indices, &
                               first_p_pos=first_p_pos, ncol_block=ncol_block)
         first_p_pos_col = first_p_pos(2)

         data2send = 0
         ! Count the amount of data to send to each process
         DO col_local = 1, ncol_local
            col_global = col_indices(col_local)

            iocc = (col_global - 1)/virtual(ispin) + 1
            avirt = col_global - (iocc - 1)*virtual(ispin)
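            ! The composite column index is col_global = (iocc - 1)*virtual + avirt,
            ! e.g. virtual = 50 and col_global = 123 decode to iocc = 3 and avirt = 23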

            proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
            proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)

            proc_send = proc_homo_send*proc_virtual + proc_virtual_send

            data2send(proc_send) = data2send(proc_send) + 1
         END DO

         DO proc_send = 0, num_pe_col - 1
            ALLOCATE (rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)))
         END DO

         ! Prepare the indices
         data2send = 0
         DO col_local = 1, ncol_local
            col_global = col_indices(col_local)

            iocc = (col_global - 1)/virtual(ispin) + 1
            avirt = col_global - (iocc - 1)*virtual(ispin)

            proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
            proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)

            proc_send = proc_homo_send*proc_virtual + proc_virtual_send

            data2send(proc_send) = data2send(proc_send) + 1

            rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)) = col_local
         END DO

         ! Count the amount of data to receive from each process
         CALL get_group_dist(rpa_work%gd_homo(ispin), my_pcol/proc_virtual, my_i_start, my_i_end, my_i_size)
         CALL get_group_dist(rpa_work%gd_virtual(ispin), MOD(my_pcol, proc_virtual), my_a_start, my_a_end, my_a_size)

         data2recv = 0
         DO my_i = my_i_start, my_i_end
            DO my_a = my_a_start, my_a_end
               proc_recv = cp_fm_indxg2p((my_i - 1)*virtual(ispin) + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
               data2recv(proc_recv) = data2recv(proc_recv) + 1
            END DO
         END DO

         DO proc_recv = 0, num_pe_col - 1
            ALLOCATE (rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)))
         END DO

         data2recv = 0
         DO my_i = my_i_start, my_i_end
            DO my_a = my_a_start, my_a_end
               proc_recv = cp_fm_indxg2p((my_i - 1)*virtual(ispin) + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
               data2recv(proc_recv) = data2recv(proc_recv) + 1

               rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)) = my_i - my_i_start + 1
               rpa_work%index2recv(proc_recv, ispin)%array(1, data2recv(proc_recv)) = my_a - my_a_start + 1
            END DO
         END DO

         ALLOCATE (rpa_work%P_ij(ispin)%array(my_i_size, homo(ispin)), &
                   rpa_work%P_ab(ispin)%array(my_a_size, virtual(ispin)))
         rpa_work%P_ij(ispin)%array(:, :) = 0.0_dp
         rpa_work%P_ab(ispin)%array(:, :) = 0.0_dp

      END DO

      DEALLOCATE (data2send, data2recv)

      CALL timestop(handle)

   END SUBROUTINE

! **************************************************************************************************
!> \brief Creates the list of nearly degenerate orbital pairs (i /= j with |E_i - E_j| < eps_degen)
!> \param Eigenval ...
!> \param eps_degen ...
!> \param pair_list ...
! **************************************************************************************************
   SUBROUTINE create_list_nearly_degen_pairs(Eigenval, eps_degen, pair_list)
      REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
      REAL(KIND=dp), INTENT(IN) :: eps_degen
      INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: pair_list

      INTEGER :: my_i, my_j, num_orbitals, num_pairs, &
                 pair_counter

      num_orbitals = SIZE(Eigenval)

      ! Determine the number of nearly degenerate orbital pairs,
      ! skipping the trivial diagonal elements
      num_pairs = 0
      DO my_i = 1, num_orbitals
         DO my_j = 1, num_orbitals
            IF (my_i == my_j) CYCLE
            IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) num_pairs = num_pairs + 1
         END DO
      END DO
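      ! Only the diagonal is skipped, so each unordered pair is counted twice:
      ! e.g. Eigenval = (-0.5, -0.5 + 1.0E-9, -0.3) with eps_degen = 1.0E-6
      ! yields the pairs (1,2) and (2,1)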
      ALLOCATE (pair_list(2, num_pairs))

      ! Record the required pairs
      pair_counter = 1
      DO my_i = 1, num_orbitals
         DO my_j = 1, num_orbitals
            IF (my_i == my_j) CYCLE
            IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) THEN
               pair_list(1, pair_counter) = my_i
               pair_list(2, pair_counter) = my_j
               pair_counter = pair_counter + 1
            END IF
         END DO
      END DO

   END SUBROUTINE create_list_nearly_degen_pairs

! **************************************************************************************************
!> \brief Prepares the communication of the near-degenerate ij contributions to P_ij
!> \param sos_mp2_work ...
!> \param virtual ...
!> \param fm_mat_S ...
! **************************************************************************************************
   SUBROUTINE prepare_comm_Pij(sos_mp2_work, virtual, fm_mat_S)
      TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work
      INTEGER, INTENT(IN) :: virtual
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S

      CHARACTER(LEN=*), PARAMETER :: routineN = 'prepare_comm_Pij'

      INTEGER :: avirt, col_global, col_local, counter, first_p_pos(2), first_p_pos_col, handle, &
                 ij_counter, iocc, my_i, my_j, my_pcol, my_prow, ncol_block, ncol_local, nrow_local, &
                 num_ij_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, tag
      INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
      INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
      INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
      TYPE(cp_blacs_env_type), POINTER :: context
      TYPE(mp_comm_type) :: comm_exchange
      TYPE(mp_para_env_type), POINTER :: para_env

      CALL timeset(routineN, handle)

      tag = 44

      CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
      ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
                sos_mp2_work%index2recv(0:num_pe_col - 1))

      ALLOCATE (data2send(0:num_pe_col - 1))
      ALLOCATE (data2recv(0:num_pe_col - 1))

      CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
                            ncol_local=ncol_local, col_indices=col_indices, &
                            context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
      CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
                       blacs2mpi=blacs2mpi)
      first_p_pos_col = first_p_pos(2)

      num_ij_pairs = SIZE(sos_mp2_work%pair_list, 2)

      IF (num_ij_pairs > 0) THEN

         CALL comm_exchange%from_split(para_env, my_prow)
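         ! Splitting with the process row as color yields a communicator over the
         ! process columns of this row, so all exchanges below stay within one BLACS row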

         data2send = 0
         data2recv = 0

         DO proc_shift = 0, num_pe_col - 1
            pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)

            counter = 0
            DO col_local = 1, ncol_local
               col_global = col_indices(col_local)

               iocc = MAX(1, col_global - 1)/virtual + 1
               avirt = col_global - (iocc - 1)*virtual

               DO ij_counter = 1, num_ij_pairs

                  my_i = sos_mp2_work%pair_list(1, ij_counter)
                  my_j = sos_mp2_work%pair_list(2, ij_counter)

                  IF (iocc /= my_j) CYCLE
                  pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
                  IF (pcol /= pcol_send) CYCLE

                  counter = counter + 1

                  EXIT

               END DO
            END DO
            data2send(pcol_send) = counter
         END DO

         CALL comm_exchange%alltoall(data2send, data2recv, 1)

         DO proc_shift = 0, num_pe_col - 1
            pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
            pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)

            ! Collect the indices and exchange them
            ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))

            counter = 0
            DO col_local = 1, ncol_local
               col_global = col_indices(col_local)

               iocc = MAX(1, col_global - 1)/virtual + 1
               avirt = col_global - (iocc - 1)*virtual

               DO ij_counter = 1, num_ij_pairs

                  my_i = sos_mp2_work%pair_list(1, ij_counter)
                  my_j = sos_mp2_work%pair_list(2, ij_counter)

                  IF (iocc /= my_j) CYCLE
                  pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
                  IF (pcol /= pcol_send) CYCLE

                  counter = counter + 1

                  sos_mp2_work%index2send(pcol_send)%array(counter) = col_global

                  EXIT

               END DO
            END DO

            ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))

            CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
                                   sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)

            ! Convert global coordinates to local coordinates, as we always work with the latter
            DO counter = 1, data2send(pcol_send)
               sos_mp2_work%index2send(pcol_send)%array(counter) = &
                  cp_fm_indxg2l(sos_mp2_work%index2send(pcol_send)%array(counter), &
                                ncol_block, 0, first_p_pos_col, num_pe_col)
            END DO
         END DO

         CALL comm_exchange%free()
      END IF

      DEALLOCATE (data2send, data2recv)

      CALL timestop(handle)

   END SUBROUTINE

! **************************************************************************************************
!> \brief Prepares the communication of the near-degenerate ab contributions to P_ab
!> \param sos_mp2_work ...
!> \param virtual ...
!> \param fm_mat_S ...
! **************************************************************************************************
   SUBROUTINE prepare_comm_Pab(sos_mp2_work, virtual, fm_mat_S)
      TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work
      INTEGER, INTENT(IN) :: virtual
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S

      CHARACTER(LEN=*), PARAMETER :: routineN = 'prepare_comm_Pab'

      INTEGER :: ab_counter, avirt, col_global, col_local, counter, first_p_pos(2), &
                 first_p_pos_col, handle, iocc, my_a, my_b, my_pcol, my_prow, ncol_block, ncol_local, &
                 nrow_local, num_ab_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, tag
      INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
      INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
      INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
      TYPE(cp_blacs_env_type), POINTER :: context
      TYPE(mp_comm_type) :: comm_exchange
      TYPE(mp_para_env_type), POINTER :: para_env

      CALL timeset(routineN, handle)

      tag = 44

      CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
      ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
                sos_mp2_work%index2recv(0:num_pe_col - 1))

      num_ab_pairs = SIZE(sos_mp2_work%pair_list, 2)
      IF (num_ab_pairs > 0) THEN

         CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
                               ncol_local=ncol_local, col_indices=col_indices, &
                               context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
         CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
                          blacs2mpi=blacs2mpi)
         first_p_pos_col = first_p_pos(2)

         CALL comm_exchange%from_split(para_env, my_prow)

         ALLOCATE (data2send(0:num_pe_col - 1))
         ALLOCATE (data2recv(0:num_pe_col - 1))

         data2send = 0
         data2recv = 0
         DO proc_shift = 0, num_pe_col - 1
            pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
            pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)

            counter = 0
            DO col_local = 1, ncol_local
               col_global = col_indices(col_local)

               iocc = MAX(1, col_global - 1)/virtual + 1
               avirt = col_global - (iocc - 1)*virtual

               DO ab_counter = 1, num_ab_pairs

                  my_a = sos_mp2_work%pair_list(1, ab_counter)
                  my_b = sos_mp2_work%pair_list(2, ab_counter)

                  IF (avirt /= my_b) CYCLE
                  pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
                  IF (pcol /= pcol_send) CYCLE

                  counter = counter + 1

                  EXIT

               END DO
            END DO
            data2send(pcol_send) = counter
         END DO

         CALL comm_exchange%alltoall(data2send, data2recv, 1)

         DO proc_shift = 0, num_pe_col - 1
            pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
            pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)

            ! Collect the indices and exchange them
            ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))

            counter = 0
            DO col_local = 1, ncol_local
               col_global = col_indices(col_local)

               iocc = MAX(1, col_global - 1)/virtual + 1
               avirt = col_global - (iocc - 1)*virtual

               DO ab_counter = 1, num_ab_pairs

                  my_a = sos_mp2_work%pair_list(1, ab_counter)
                  my_b = sos_mp2_work%pair_list(2, ab_counter)

                  IF (avirt /= my_b) CYCLE
                  pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
                  IF (pcol /= pcol_send) CYCLE

                  counter = counter + 1

                  sos_mp2_work%index2send(pcol_send)%array(counter) = col_global

                  EXIT

               END DO
            END DO

            ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))

            CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
                                   sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)

            ! Convert global coordinates to local coordinates, as we always work with the latter
            DO counter = 1, data2send(pcol_send)
               sos_mp2_work%index2send(pcol_send)%array(counter) = &
                  cp_fm_indxg2l(sos_mp2_work%index2send(pcol_send)%array(counter), &
                                ncol_block, 0, first_p_pos_col, num_pe_col)
            END DO
         END DO

         CALL comm_exchange%free()
         DEALLOCATE (data2send, data2recv)

      END IF

      CALL timestop(handle)

   END SUBROUTINE

! **************************************************************************************************
!> \brief Keeps a copy of the matrix Q for the gradient calculation
!> \param fm_mat_Q ...
!> \param rpa_grad ...
! **************************************************************************************************
   SUBROUTINE rpa_grad_copy_Q(fm_mat_Q, rpa_grad)
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
      TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad

      CALL cp_fm_to_fm(fm_mat_Q, rpa_grad%rpa_work%fm_mat_Q_copy)

   END SUBROUTINE

! **************************************************************************************************
!> \brief Carries out the matrix operations of one quadrature point for the RPA/SOS-MP2 gradients
!> \param mp2_env ...
!> \param rpa_grad ...
!> \param do_ri_sos_laplace_mp2 ...
!> \param fm_mat_Q ...
!> \param fm_mat_Q_gemm ...
!> \param dgemm_counter ...
!> \param fm_mat_S ...
!> \param omega ...
!> \param homo ...
!> \param virtual ...
!> \param Eigenval ...
!> \param weight ...
!> \param unit_nr ...
! **************************************************************************************************
   SUBROUTINE rpa_grad_matrix_operations(mp2_env, rpa_grad, do_ri_sos_laplace_mp2, fm_mat_Q, fm_mat_Q_gemm, &
                                         dgemm_counter, fm_mat_S, omega, homo, virtual, Eigenval, weight, unit_nr)
      TYPE(mp2_type), INTENT(INOUT) :: mp2_env
      TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad
      LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_Q, fm_mat_Q_gemm
      TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
      TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
      REAL(KIND=dp), INTENT(IN) :: omega
      INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
      REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
      REAL(KIND=dp), INTENT(IN) :: weight
      INTEGER, INTENT(IN) :: unit_nr

      CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_matrix_operations'

      INTEGER :: col_global, col_local, dimen_ia, &
                 dimen_RI, handle, handle2, ispin, &
                 jspin, ncol_local, nrow_local, nspins, &
                 row_local
      INTEGER, DIMENSION(:), POINTER :: col_indices, row_indices
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
         TARGET :: mat_S_3D, mat_work_iaP_3D
      TYPE(cp_fm_type) :: fm_work_iaP, fm_work_PQ

      CALL timeset(routineN, handle)

      nspins = SIZE(fm_mat_Q)

      CALL cp_fm_get_info(fm_mat_Q(1), nrow_global=dimen_RI, nrow_local=nrow_local, ncol_local=ncol_local, &
                          col_indices=col_indices, row_indices=row_indices)

      IF (.NOT. do_ri_sos_laplace_mp2) THEN
         CALL cp_fm_create(fm_work_PQ, fm_mat_Q(1)%matrix_struct)

         ! Calculate [1+Q(iw')]^-1
         CALL cp_fm_cholesky_invert(fm_mat_Q(1))
         ! Symmetrize the result, fm_work_PQ is only a work matrix
         CALL cp_fm_upper_to_full(fm_mat_Q(1), fm_work_PQ)

         CALL cp_fm_release(fm_work_PQ)

         DO col_local = 1, ncol_local
            col_global = col_indices(col_local)
            DO row_local = 1, nrow_local
               IF (col_global == row_indices(row_local)) THEN
                  fm_mat_Q(1)%local_data(row_local, col_local) = fm_mat_Q(1)%local_data(row_local, col_local) - 1.0_dp
                  EXIT
               END IF
            END DO
         END DO
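         ! fm_mat_Q(1) now holds [1+Q(iw')]^-1 - 1, so the GEMM below accumulates
         ! Gamma_PQ <- Gamma_PQ + weight * Q(iw') * ([1+Q(iw')]^-1 - 1)
         ! over the frequency quadrature points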

         CALL timeset(routineN//"_PQ", handle2)
         CALL dgemm_counter_start(dgemm_counter)
         CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
                            matrix_a=rpa_grad%rpa_work%fm_mat_Q_copy, matrix_b=fm_mat_Q(1), beta=1.0_dp, &
                            matrix_c=rpa_grad%fm_Gamma_PQ)
         CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
         CALL timestop(handle2)

         CALL cp_fm_to_fm_submat_general(fm_mat_Q(1), fm_mat_Q_gemm(1), dimen_RI, dimen_RI, 1, 1, 1, 1, &
                                         fm_mat_Q_gemm(1)%matrix_struct%context)
      END IF

      DO ispin = 1, nspins
         IF (do_ri_sos_laplace_mp2) THEN
            ! For SOS-MP2, the Q matrix to contract with always belongs to the other spin
            jspin = nspins - ispin + 1
         ELSE
            ! whereas RPA always uses the first matrix
            jspin = 1
         END IF

         IF (do_ri_sos_laplace_mp2) THEN
            CALL timeset(routineN//"_PQ", handle2)
            CALL dgemm_counter_start(dgemm_counter)
            CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
                               matrix_a=fm_mat_Q(ispin), matrix_b=fm_mat_Q(jspin), beta=1.0_dp, &
                               matrix_c=rpa_grad%fm_Gamma_PQ)
            CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
            CALL timestop(handle2)

            CALL cp_fm_to_fm_submat_general(fm_mat_Q(jspin), fm_mat_Q_gemm(jspin), dimen_RI, dimen_RI, 1, 1, 1, 1, &
                                            fm_mat_Q_gemm(jspin)%matrix_struct%context)
         ELSE
            CALL calc_fm_mat_S_rpa(fm_mat_S(ispin), .TRUE., virtual(ispin), Eigenval(:, ispin), &
                                   homo(ispin), omega, 0.0_dp)
         END IF

         CALL timeset(routineN//"_contr_S", handle2)
         CALL cp_fm_create(fm_work_iaP, rpa_grad%fm_Y(ispin)%matrix_struct)

         CALL cp_fm_get_info(fm_mat_S(ispin), ncol_global=dimen_ia)

         CALL dgemm_counter_start(dgemm_counter)
         CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_ia, k=dimen_RI, alpha=1.0_dp, &
                            matrix_a=fm_mat_Q_gemm(jspin), matrix_b=fm_mat_S(ispin), beta=0.0_dp, &
                            matrix_c=fm_work_iaP)
         CALL dgemm_counter_stop(dgemm_counter, dimen_ia, dimen_RI, dimen_RI)
         CALL timestop(handle2)

         IF (do_ri_sos_laplace_mp2) THEN
            CALL calc_P_sos_mp2(homo(ispin), fm_mat_S(ispin), fm_work_iaP, &
                                rpa_grad%sos_mp2_work_occ(ispin), rpa_grad%sos_mp2_work_virt(ispin), &
                                omega, weight, virtual(ispin), Eigenval(:, ispin), mp2_env%ri_grad%dot_blksize)

            CALL calc_fm_mat_S_laplace(fm_work_iaP, homo(ispin), virtual(ispin), Eigenval(:, ispin), omega)

            CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)

            CALL cp_fm_release(fm_work_iaP)
         ELSE
            ! To save memory, we add the contribution to Y already now
            CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)

            ! Redistribute both matrices and deallocate fm_work_iaP
            CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
                                       fm_work_iaP, mat_work_iaP_3D, &
                                       rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
                                       rpa_grad%rpa_work%mepos)
            CALL cp_fm_release(fm_work_iaP)

            CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
                                       fm_mat_S(ispin), mat_S_3D, &
                                       rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
                                       rpa_grad%rpa_work%mepos)

            ! Now collect the density matrix
            CALL calc_P_rpa(mat_S_3D, mat_work_iaP_3D, rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
                            rpa_grad%rpa_work%grid, rpa_grad%rpa_work%mepos, &
                            fm_mat_S(ispin)%matrix_struct, &
                            rpa_grad%rpa_work%P_ij(ispin)%array, rpa_grad%rpa_work%P_ab(ispin)%array, &
                            weight, omega, Eigenval(:, ispin), homo(ispin), unit_nr, mp2_env)

            DEALLOCATE (mat_work_iaP_3D, mat_S_3D)

            CALL remove_scaling_factor_rpa(fm_mat_S(ispin), virtual(ispin), Eigenval(:, ispin), homo(ispin), omega)

         END IF

      END DO

      CALL timestop(handle)

   END SUBROUTINE

! **************************************************************************************************
!> \brief Calculates the SOS-MP2 contributions to the density matrices
!> \param homo ...
!> \param fm_mat_S ...
!> \param fm_work_iaP ...
!> \param sos_mp2_work_occ ...
!> \param sos_mp2_work_virt ...
!> \param omega ...
!> \param weight ...
!> \param virtual ...
!> \param Eigenval ...
!> \param dot_blksize ...
! **************************************************************************************************
   SUBROUTINE calc_P_sos_mp2(homo, fm_mat_S, fm_work_iaP, sos_mp2_work_occ, sos_mp2_work_virt, &
                             omega, weight, virtual, Eigenval, dot_blksize)
      INTEGER, INTENT(IN) :: homo
      TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S, fm_work_iaP
      TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work_occ, sos_mp2_work_virt
      REAL(KIND=dp), INTENT(IN) :: omega, weight
      INTEGER, INTENT(IN) :: virtual
      REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
      INTEGER, INTENT(IN) :: dot_blksize

      CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_sos_mp2'

      INTEGER :: avirt, col_global, col_local, handle, &
                 handle2, iocc, my_a, my_i, ncol_local, &
                 nrow_local, num_ab_pairs, num_ij_pairs
      INTEGER, DIMENSION(:), POINTER :: col_indices
      REAL(KIND=dp) :: ddot, trace

      CALL timeset(routineN, handle)

      CALL cp_fm_get_info(fm_mat_S, col_indices=col_indices, ncol_local=ncol_local, nrow_local=nrow_local)

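      ! Diagonal contributions: with W = Q*S (fm_work_iaP), the traces below accumulate
      ! P_ii <- P_ii - weight*omega * sum_aP S_iaP W_iaP and
      ! P_aa <- P_aa + weight*omega * sum_iP S_iaP W_iaP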
      CALL timeset(routineN//"_Pij_diag", handle2)
      DO my_i = 1, homo
         ! Collect the contributions of the matrix elements

         trace = 0.0_dp

         DO col_local = 1, ncol_local
            col_global = col_indices(col_local)

            iocc = MAX(1, col_global - 1)/virtual + 1
            avirt = col_global - (iocc - 1)*virtual

            IF (iocc == my_i) trace = trace + &
               ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
         END DO

         sos_mp2_work_occ%P(my_i) = sos_mp2_work_occ%P(my_i) - trace*omega*weight

      END DO
      CALL timestop(handle2)

      CALL timeset(routineN//"_Pab_diag", handle2)
      DO my_a = 1, virtual
         ! Collect the contributions of the matrix elements

         trace = 0.0_dp

         DO col_local = 1, ncol_local
            col_global = col_indices(col_local)

            iocc = MAX(1, col_global - 1)/virtual + 1
            avirt = col_global - (iocc - 1)*virtual

            IF (avirt == my_a) trace = trace + &
               ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
         END DO

         sos_mp2_work_virt%P(my_a) = sos_mp2_work_virt%P(my_a) + trace*omega*weight

      END DO
      CALL timestop(handle2)

      ! Loop over list and carry out operations
      num_ij_pairs = SIZE(sos_mp2_work_occ%pair_list, 2)
      num_ab_pairs = SIZE(sos_mp2_work_virt%pair_list, 2)
      IF (num_ij_pairs > 0) THEN
         CALL calc_Pij_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_occ%pair_list, &
                             virtual, sos_mp2_work_occ%P(homo + 1:), Eigenval(:homo), omega, weight, &
                             sos_mp2_work_occ%index2send, sos_mp2_work_occ%index2recv, dot_blksize)
      END IF
      IF (num_ab_pairs > 0) THEN
         CALL calc_Pab_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_virt%pair_list, &
                             virtual, sos_mp2_work_virt%P(virtual + 1:), Eigenval(homo + 1:), omega, weight, &
                             sos_mp2_work_virt%index2send, sos_mp2_work_virt%index2recv, dot_blksize)
      END IF

      CALL timestop(handle)

   END SUBROUTINE

! **************************************************************************************************
!> \brief Calculates the RPA contributions to the density matrices P_ij and P_ab
!> \param mat_S_1D ...
!> \param mat_work_iaP_3D ...
!> \param gd_homo ...
!> \param gd_virtual ...
!> \param grid ...
!> \param mepos ...
!> \param fm_struct_S ...
!> \param P_ij ...
!> \param P_ab ...
!> \param weight ...
!> \param omega ...
!> \param Eigenval ...
!> \param homo ...
!> \param unit_nr ...
!> \param mp2_env ...
! **************************************************************************************************
   SUBROUTINE calc_P_rpa(mat_S_1D, mat_work_iaP_3D, gd_homo, gd_virtual, grid, mepos, &
                         fm_struct_S, P_ij, P_ab, weight, omega, Eigenval, homo, unit_nr, mp2_env)
      REAL(KIND=dp), DIMENSION(*), INTENT(INOUT), TARGET :: mat_S_1D
      REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: mat_work_iaP_3D
      TYPE(group_dist_d1_type), INTENT(IN) :: gd_homo, gd_virtual
      INTEGER, DIMENSION(2), INTENT(IN) :: grid, mepos
      TYPE(cp_fm_struct_type), INTENT(IN), POINTER :: fm_struct_S
      REAL(KIND=dp), DIMENSION(:, :) :: P_ij, P_ab
      REAL(KIND=dp), INTENT(IN) :: weight, omega
      REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
      INTEGER, INTENT(IN) :: homo, unit_nr
      TYPE(mp2_type), INTENT(INOUT) :: mp2_env

      CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa'

      INTEGER :: completed, handle, handle2, my_a_end, my_a_size, my_a_start, my_i_end, my_i_size, &
                 my_i_start, my_p_size, my_prow, number_of_parallel_channels, proc_a_recv, proc_a_send, &
                 proc_i_recv, proc_i_send, proc_recv, proc_send, proc_shift, recv_a_end, recv_a_size, &
                 recv_a_start, recv_i_end, recv_i_size, recv_i_start, tag
      INTEGER(KIND=int_8) :: mem, number_of_elements_per_blk
      INTEGER, ALLOCATABLE, DIMENSION(:) :: procs_recv
      INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
      REAL(KIND=dp) :: mem_per_block, mem_real
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: buffer_compens_1D
      REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: mat_S_3D
      TYPE(cp_1d_r_cp_type), ALLOCATABLE, DIMENSION(:) :: buffer_1D
      TYPE(cp_3d_r_cp_type), ALLOCATABLE, DIMENSION(:) :: buffer_3D
      TYPE(mp_para_env_type), POINTER :: para_env
      TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_requests, send_requests

      CALL timeset(routineN, handle)

      ! We allocate it at every step to reduce potential memory conflicts with COSMA
      IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
         IF (.NOT. c_associated(mp2_env%local_gemm_ctx)) THEN
            CALL local_gemm_create(mp2_env%local_gemm_ctx, local_gemm_pu_gpu)
            CALL local_gemm_set_op_threshold_gpu(mp2_env%local_gemm_ctx, spla_threshold)
         END IF
      END IF

      tag = 47

      my_p_size = SIZE(mat_work_iaP_3D, 1)

      CALL cp_fm_struct_get(fm_struct_S, para_env=para_env)
      CALL fm_struct_S%context%get(my_process_row=my_prow, blacs2mpi=blacs2mpi, para_env=para_env)

      CALL get_group_dist(gd_virtual, mepos(1), my_a_start, my_a_end, my_a_size)
      CALL get_group_dist(gd_homo, mepos(2), my_i_start, my_i_end, my_i_size)

      ! We have to remap the indices because mp_sendrecv requires a 3D array (because of mat_work_iaP_3D)
      ! whereas dgemm requires 2D arrays. Fortran 2008 allows pointer remapping independently of the
      ! rank, but GCC 7 does not properly support it
      mat_S_3D(1:my_p_size, 1:my_a_size, 1:my_i_size) => mat_S_1D(1:INT(my_p_size, int_8)*my_a_size*my_i_size)
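      ! Fortran is column-major, so element (p, a, i) of mat_S_3D corresponds to
      ! mat_S_1D(p + (a - 1)*my_p_size + (i - 1)*my_p_size*my_a_size),
      ! e.g. my_p_size = 100, my_a_size = 20: (p, a, i) = (5, 2, 3) maps to index 4105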

      number_of_elements_per_blk = MAX(INT(maxsize(gd_homo), KIND=int_8)*my_a_size, &
                                       INT(maxsize(gd_virtual), KIND=int_8)*my_i_size)*my_p_size

      ! Determine the available memory and estimate the number of possible parallel communication channels
      CALL m_memory(mem)
      mem_real = REAL(mem, KIND=dp)
      mem_per_block = REAL(number_of_elements_per_blk, KIND=dp)*8.0_dp
      number_of_parallel_channels = MAX(1, MIN(MAXVAL(grid) - 1, FLOOR(mem_real/mem_per_block)))
      CALL para_env%min(number_of_parallel_channels)
      IF (mp2_env%ri_grad%max_parallel_comm > 0) &
         number_of_parallel_channels = MIN(number_of_parallel_channels, mp2_env%ri_grad%max_parallel_comm)
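      ! Example: with MAXVAL(grid) = 8, 4 GiB of free memory and 512 MiB per block,
      ! MAX(1, MIN(8 - 1, FLOOR(4096/512))) = MIN(7, 8) = 7 channels would be chosen here,
      ! before the global minimum over all ranks and the max_parallel_comm cap are applied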

      IF (unit_nr > 0) THEN
         WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Number of parallel communication channels:', number_of_parallel_channels
         CALL m_flush(unit_nr)
      END IF
      CALL para_env%sync()
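      ! Each process owns one (P, a_local, i_local) block of S and of the work matrix.
      ! P_ab couples all pairs of virtual blocks, so the work matrix is circulated
      ! around the ring formed by the virtual dimension of the grid (first loop below);
      ! afterwards the same is done along the occupied dimension for P_ij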

      ALLOCATE (buffer_1D(number_of_parallel_channels))
      DO proc_shift = 1, number_of_parallel_channels
         ALLOCATE (buffer_1D(proc_shift)%array(number_of_elements_per_blk))
      END DO

      ALLOCATE (buffer_3D(number_of_parallel_channels))

      ! Allocate the buffers for the vector version of the Kahan summation
      IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
         ALLOCATE (buffer_compens_1D(2*MAX(my_a_size*maxsize(gd_virtual), my_i_size*maxsize(gd_homo))))
      END IF

      IF (number_of_parallel_channels > 1) THEN
         ALLOCATE (procs_recv(number_of_parallel_channels))
         ALLOCATE (recv_requests(number_of_parallel_channels))
         ALLOCATE (send_requests(MAXVAL(grid) - 1))
      END IF

      IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
         CALL timeset(routineN//"_comm_a", handle2)
         recv_requests(:) = mp_request_null
         procs_recv(:) = -1
         DO proc_shift = 1, MIN(grid(1) - 1, number_of_parallel_channels)
            proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
            proc_recv = mepos(2)*grid(1) + proc_a_recv

            CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)

            buffer_3D(proc_shift)%array(1:my_p_size, 1:recv_a_size, 1:my_i_size) => &
               buffer_1D(proc_shift)%array(1:INT(my_p_size, KIND=int_8)*recv_a_size*my_i_size)

            CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
                                recv_requests(proc_shift), tag)

            procs_recv(proc_shift) = proc_a_recv
         END DO

         send_requests(:) = mp_request_null
         DO proc_shift = 1, grid(1) - 1
            proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
            proc_send = mepos(2)*grid(1) + proc_a_send

            CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
                                send_requests(proc_shift), tag)
         END DO
         CALL timestop(handle2)
      END IF

      CALL calc_P_rpa_a(P_ab(:, my_a_start:my_a_end), &
                        mat_S_3D, mat_work_iaP_3D, &
                        mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
                        Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
                        Eigenval(homo + my_a_start:homo + my_a_end), omega, weight)

      DO proc_shift = 1, grid(1) - 1
         CALL timeset(routineN//"_comm_a", handle2)
         IF (number_of_parallel_channels > 1) THEN
            CALL mp_waitany(recv_requests, completed)

            CALL get_group_dist(gd_virtual, procs_recv(completed), recv_a_start, recv_a_end, recv_a_size)
         ELSE
            proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
            proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))

            proc_send = mepos(2)*grid(1) + proc_a_send
            proc_recv = mepos(2)*grid(1) + proc_a_recv

            CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)

            buffer_3D(1)%array(1:my_p_size, 1:recv_a_size, 1:my_i_size) => &
               buffer_1D(1)%array(1:INT(my_p_size, KIND=int_8)*recv_a_size*my_i_size)

            CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
                                   buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
            completed = 1
         END IF
         CALL timestop(handle2)

         CALL calc_P_rpa_a(P_ab(:, recv_a_start:recv_a_end), &
                           mat_S_3D, buffer_3D(completed)%array, &
                           mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
                           Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
                           Eigenval(homo + recv_a_start:homo + recv_a_end), omega, weight)

         IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(1)) THEN
            proc_a_recv = MODULO(mepos(1) - proc_shift - number_of_parallel_channels, grid(1))
            proc_recv = mepos(2)*grid(1) + proc_a_recv

            CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)

            buffer_3D(completed)%array(1:my_p_size, 1:recv_a_size, 1:my_i_size) => &
               buffer_1D(completed)%array(1:INT(my_p_size, KIND=int_8)*recv_a_size*my_i_size)

            CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
                                recv_requests(completed), tag)

            procs_recv(completed) = proc_a_recv
         END IF
      END DO

      IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
         CALL mp_waitall(send_requests)
      END IF

      IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
         recv_requests(:) = mp_request_null
         procs_recv(:) = -1
         DO proc_shift = 1, MIN(grid(2) - 1, number_of_parallel_channels)
            proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
            proc_recv = proc_i_recv*grid(1) + mepos(1)

            CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)

            buffer_3D(proc_shift)%array(1:my_p_size, 1:my_a_size, 1:recv_i_size) => &
               buffer_1D(proc_shift)%array(1:INT(my_p_size, KIND=int_8)*my_a_size*recv_i_size)

            CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
                                recv_requests(proc_shift), tag)

            procs_recv(proc_shift) = proc_i_recv
         END DO

         send_requests(:) = mp_request_null
         DO proc_shift = 1, grid(2) - 1
            proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
            proc_send = proc_i_send*grid(1) + mepos(1)

            CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
                                send_requests(proc_shift), tag)
         END DO
      END IF

      CALL calc_P_rpa_i(P_ij(:, my_i_start:my_i_end), &
                        mat_S_3D, mat_work_iaP_3D, &
                        mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
                        Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
                        Eigenval(my_i_start:my_i_end), omega, weight)

      DO proc_shift = 1, grid(2) - 1
         CALL timeset(routineN//"_comm_i", handle2)
         IF (number_of_parallel_channels > 1) THEN
            CALL mp_waitany(recv_requests, completed)

            CALL get_group_dist(gd_homo, procs_recv(completed), recv_i_start, recv_i_end, recv_i_size)
         ELSE
            proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
            proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))

            proc_send = proc_i_send*grid(1) + mepos(1)
            proc_recv = proc_i_recv*grid(1) + mepos(1)

            CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)

            buffer_3D(1)%array(1:my_p_size, 1:my_a_size, 1:recv_i_size) => &
               buffer_1D(1)%array(1:INT(my_p_size, KIND=int_8)*my_a_size*recv_i_size)

            CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
                                   buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
            completed = 1
         END IF
         CALL timestop(handle2)

         CALL calc_P_rpa_i(P_ij(:, recv_i_start:recv_i_end), &
                           mat_S_3D, buffer_3D(completed)%array, &
                           mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
                           Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
                           Eigenval(recv_i_start:recv_i_end), omega, weight)

         IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(2)) THEN
            proc_i_recv = MODULO(mepos(2) - proc_shift - number_of_parallel_channels, grid(2))
            proc_recv = proc_i_recv*grid(1) + mepos(1)

            CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)

            buffer_3D(completed)%array(1:my_p_size, 1:my_a_size, 1:recv_i_size) => &
               buffer_1D(completed)%array(1:INT(my_p_size, KIND=int_8)*my_a_size*recv_i_size)

            CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
                                recv_requests(completed), tag)

            procs_recv(completed) = proc_i_recv
         END IF
      END DO

      IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
         CALL mp_waitall(send_requests)
      END IF

      IF (number_of_parallel_channels > 1) THEN
         DEALLOCATE (procs_recv)
         DEALLOCATE (recv_requests)
         DEALLOCATE (send_requests)
      END IF

      IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
         ! Release the memory allocated by local_gemm when running on GPUs;
         ! local_gemm_ctx is null in CPU-only runs
         CALL local_gemm_destroy(mp2_env%local_gemm_ctx)
         mp2_env%local_gemm_ctx = c_null_ptr
         DEALLOCATE (buffer_compens_1D)
      END IF

      DO proc_shift = 1, number_of_parallel_channels
         NULLIFY (buffer_3D(proc_shift)%array)
         DEALLOCATE (buffer_1D(proc_shift)%array)
      END DO
      DEALLOCATE (buffer_3D, buffer_1D)

      CALL timestop(handle)

   END SUBROUTINE

! **************************************************************************************************
!> \brief Contracts the matrices into the virtual-virtual density block P_ab
!> \param P_ab ...
!> \param mat_S ...
!> \param mat_work ...
!> \param dot_blksize ...
!> \param buffer_1D ...
!> \param local_gemm_ctx ...
!> \param my_eval_virt ...
!> \param my_eval_occ ...
!> \param recv_eval_virt ...
!> \param omega ...
!> \param weight ...
! **************************************************************************************************
   SUBROUTINE calc_P_rpa_a(P_ab, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
                           my_eval_virt, my_eval_occ, recv_eval_virt, omega, weight)
      REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: P_ab
      REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: mat_S, mat_work
      INTEGER, INTENT(IN) :: dot_blksize
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
         INTENT(INOUT), TARGET :: buffer_1D
      TYPE(c_ptr), INTENT(INOUT) :: local_gemm_ctx
      REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, my_eval_occ, recv_eval_virt
      REAL(KIND=dp), INTENT(IN) :: omega, weight

      CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa_a'

      INTEGER :: handle, my_a, my_a_size, my_i, &
                 my_i_size, my_p_size, p_end, p_start, &
                 recv_a_size, stripesize
      REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_compens, buffer_unscaled

      CALL timeset(routineN, handle)

      my_i_size = SIZE(mat_S, 3)
      recv_a_size = SIZE(mat_work, 2)
      my_a_size = SIZE(mat_S, 2)
      my_p_size = SIZE(mat_S, 1)

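      ! Both branches below accumulate, with Kahan compensation,
      !   P_ab(a,b) <- P_ab(a,b) - weight * sum_{i,P} S_Pai * W_Pbi
      !                * (1 - omega**2/((eps_a - eps_i)*(eps_b - eps_i)))
      ! either as blocked GEMMs over stripes of dot_blksize rows (GPU-capable path)
      ! or as compensated dot products (small-block path)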
      IF (dot_blksize >= blksize_threshold) THEN
         buffer_compens(1:my_a_size, 1:recv_a_size) => buffer_1D(1:my_a_size*recv_a_size)
         buffer_compens = 0.0_dp
         buffer_unscaled(1:my_a_size, 1:recv_a_size) => buffer_1D(my_a_size*recv_a_size + 1:2*my_a_size*recv_a_size)

         ! This loop carries out the actual tensor contraction in stripes of dot_blksize
         DO my_i = 1, my_i_size
            DO p_start = 1, my_p_size, dot_blksize
               stripesize = MIN(dot_blksize, my_p_size - p_start + 1)
               p_end = p_start + stripesize - 1

               CALL local_gemm("T", "N", my_a_size, recv_a_size, stripesize, &
                               -weight, mat_S(p_start:p_end, :, my_i), stripesize, &
                               mat_work(p_start:p_end, :, my_i), stripesize, &
                               0.0_dp, buffer_unscaled, my_a_size, local_gemm_ctx)

               CALL scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
                                                      my_eval_virt, recv_eval_virt, my_eval_occ(my_i))

               CALL kahan_step(buffer_compens, P_ab)
            END DO
         END DO
      ELSE
         BLOCK
            INTEGER :: recv_a
            REAL(KIND=dp) :: tmp, e_i, e_a, e_b, omega2, my_compens, my_p, s
            omega2 = -omega**2
!$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
!$OMP SHARED(my_a_size,recv_a_size,my_i_size,mat_S,my_eval_virt,recv_eval_virt,my_eval_occ,omega2,&
!$OMP P_ab,weight,mat_work)&
!$OMP PRIVATE(tmp,my_a,recv_a,my_i,e_a,e_b,e_i,my_compens,my_p,s)
            DO my_a = 1, my_a_size
               DO recv_a = 1, recv_a_size
                  e_a = my_eval_virt(my_a)
                  e_b = recv_eval_virt(recv_a)
                  my_p = P_ab(my_a, recv_a)
                  my_compens = 0.0_dp
                  DO my_i = 1, my_i_size
                     e_i = -my_eval_occ(my_i)
                     tmp = -weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, recv_a, my_i)) &
                           *(1.0_dp + omega2/((e_a + e_i)*(e_b + e_i))) - my_compens
                     s = my_p + tmp
                     my_compens = (s - my_p) - tmp
                     my_p = s
                  END DO
                  P_ab(my_a, recv_a) = my_p
               END DO
            END DO
         END BLOCK
      END IF

      CALL timestop(handle)

   END SUBROUTINE calc_P_rpa_a

! **************************************************************************************************
!> \brief Contracts the matrices into the occupied-occupied density block P_ij
!> \param P_ij ...
!> \param mat_S ...
!> \param mat_work ...
!> \param dot_blksize ...
!> \param buffer_1D ...
!> \param local_gemm_ctx ...
!> \param my_eval_virt ...
!> \param my_eval_occ ...
!> \param recv_eval_occ ...
!> \param omega ...
!> \param weight ...
! **************************************************************************************************
   SUBROUTINE calc_P_rpa_i(P_ij, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
                           my_eval_virt, my_eval_occ, recv_eval_occ, omega, weight)
      REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: P_ij
      REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: mat_S, mat_work
      INTEGER, INTENT(IN) :: dot_blksize
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
         INTENT(INOUT), TARGET :: buffer_1D
      TYPE(c_ptr), INTENT(INOUT) :: local_gemm_ctx
      REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, my_eval_occ, recv_eval_occ
      REAL(KIND=dp), INTENT(IN) :: omega, weight

      CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa_i'

      INTEGER :: handle, my_a, my_a_size, my_i, &
                 my_i_size, my_p_size, p_end, p_start, &
                 recv_i_size, stripesize
      REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_compens, buffer_unscaled

      CALL timeset(routineN, handle)

      my_i_size = SIZE(mat_S, 3)
      recv_i_size = SIZE(mat_work, 3)
      my_a_size = SIZE(mat_S, 2)
      my_p_size = SIZE(mat_S, 1)

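      ! Analogous to calc_P_rpa_a, both branches accumulate with Kahan compensation
      !   P_ij(i,j) <- P_ij(i,j) + weight * sum_{a,P} S_Pai * W_Paj
      !                * (1 - omega**2/((eps_a - eps_i)*(eps_a - eps_j)))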
      IF (dot_blksize >= blksize_threshold) THEN
         buffer_compens(1:my_i_size, 1:recv_i_size) => buffer_1D(1:my_i_size*recv_i_size)
         buffer_compens = 0.0_dp
         buffer_unscaled(1:my_i_size, 1:recv_i_size) => buffer_1D(my_i_size*recv_i_size + 1:2*my_i_size*recv_i_size)

         ! This loop carries out the actual tensor contraction in stripes of dot_blksize
         DO my_a = 1, my_a_size
            DO p_start = 1, my_p_size, dot_blksize
               stripesize = MIN(dot_blksize, my_p_size - p_start + 1)
               p_end = p_start + stripesize - 1

               CALL local_gemm("T", "N", my_i_size, recv_i_size, stripesize, &
                               weight, mat_S(p_start:p_end, my_a, :), stripesize, &
                               mat_work(p_start:p_end, my_a, :), stripesize, &
                               0.0_dp, buffer_unscaled, my_i_size, local_gemm_ctx)

               CALL scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
                                                     my_eval_occ, recv_eval_occ, my_eval_virt(my_a))

               CALL kahan_step(buffer_compens, P_ij)
            END DO
         END DO
      ELSE
         BLOCK
            REAL(KIND=dp) :: tmp, e_i, e_a, e_j, omega2, my_compens, my_p, s
            INTEGER :: recv_i
            omega2 = -omega**2
!$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
!$OMP SHARED(my_a_size,recv_i_size,my_i_size,mat_S,my_eval_occ,my_eval_virt,omega2,&
!$OMP recv_eval_occ,P_ij,weight,mat_work)&
!$OMP PRIVATE(tmp,my_a,recv_i,my_i,e_i,e_j,e_a,my_compens,my_p,s)
            DO my_i = 1, my_i_size
               DO recv_i = 1, recv_i_size
                  e_i = my_eval_occ(my_i)
                  e_j = recv_eval_occ(recv_i)
                  my_p = P_ij(my_i, recv_i)
                  my_compens = 0.0_dp
                  DO my_a = 1, my_a_size
                     e_a = my_eval_virt(my_a)
                     tmp = weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, my_a, recv_i)) &
                           *(1.0_dp + omega2/((e_a - e_i)*(e_a - e_j))) - my_compens
                     s = my_p + tmp
                     my_compens = (s - my_p) - tmp
                     my_p = s
                  END DO
                  P_ij(my_i, recv_i) = my_p
               END DO
            END DO
         END BLOCK
      END IF

      CALL timestop(handle)

   END SUBROUTINE calc_P_rpa_i

1475 ! **************************************************************************************************
1476 !> \brief Elementwise Kahan summation step: add the compensated increments in compens to P and keep the new rounding compensation
1477 !> \param compens ...
1478 !> \param P ...
1479 ! **************************************************************************************************
1480  SUBROUTINE kahan_step(compens, P)
1481  REAL(kind=dp), DIMENSION(:, :), INTENT(INOUT) :: compens, p
1482 
1483  CHARACTER(LEN=*), PARAMETER :: routinen = 'kahan_step'
1484 
1485  INTEGER :: handle, i, j
1486  REAL(kind=dp) :: my_compens, my_p, s
1487 
1488  CALL timeset(routinen, handle)
1489 
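 ! Classic Kahan compensated summation, applied elementwise: compens holds the
 ! increment y (prepared in the scale_buffer_and_add_compens_* routines),
 ! s = P + y is the new sum, and (s - P) - y recovers the rounding error of
 ! the addition, which is fed back into the next increment.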
1490 !$OMP PARALLEL DO DEFAULT(NONE) SHARED(P,compens) PRIVATE(i,my_p,my_compens,s, j) COLLAPSE(2)
1491  DO j = 1, SIZE(compens, 2)
1492  DO i = 1, SIZE(compens, 1)
1493  my_p = p(i, j)
1494  my_compens = compens(i, j)
1495  s = my_p + my_compens
1496  compens(i, j) = (s - my_p) - my_compens
1497  p(i, j) = s
1498  END DO
1499  END DO
1500 !$OMP END PARALLEL DO
1501 
1502  CALL timestop(handle)
1503 
1504  END SUBROUTINE
1505 
1506 ! **************************************************************************************************
1507 !> \brief Scale the contracted buffer by the virtual-virtual frequency factor and subtract the previous Kahan compensation
1508 !> \param buffer_unscaled ...
1509 !> \param buffer_compens ...
1510 !> \param omega ...
1511 !> \param my_eval_virt ...
1512 !> \param recv_eval_virt ...
1513 !> \param my_eval_occ ...
1514 ! **************************************************************************************************
1515  SUBROUTINE scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
1516  my_eval_virt, recv_eval_virt, my_eval_occ)
1517  REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: buffer_unscaled
1518  REAL(kind=dp), DIMENSION(:, :), INTENT(INOUT) :: buffer_compens
1519  REAL(kind=dp), INTENT(IN) :: omega
1520  REAL(kind=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, recv_eval_virt
1521  REAL(kind=dp), INTENT(IN) :: my_eval_occ
1522 
1523  CHARACTER(LEN=*), PARAMETER :: routinen = 'scale_buffer_and_add_compens_virt'
1524 
1525  INTEGER :: handle, my_a, my_b
1526 
1527  CALL timeset(routinen, handle)
1528 
1529 !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_unscaled,buffer_compens,omega,&
1530 !$OMP my_eval_virt,recv_eval_virt,my_eval_occ) PRIVATE(my_a,my_b)
1531  DO my_b = 1, SIZE(buffer_compens, 2)
1532  DO my_a = 1, SIZE(buffer_compens, 1)
1533  buffer_compens(my_a, my_b) = buffer_unscaled(my_a, my_b) &
1534  *(1.0_dp - omega**2/((my_eval_virt(my_a) - my_eval_occ)*(recv_eval_virt(my_b) - my_eval_occ))) &
1535  - buffer_compens(my_a, my_b)
1536  END DO
1537  END DO
1538 !$OMP END PARALLEL DO
1539 
1540  CALL timestop(handle)
1541 
1542  END SUBROUTINE scale_buffer_and_add_compens_virt
1543 
1544 ! **************************************************************************************************
1545 !> \brief Scale the contracted buffer by the occupied-occupied frequency factor and subtract the previous Kahan compensation
1546 !> \param buffer_unscaled ...
1547 !> \param buffer_compens ...
1548 !> \param omega ...
1549 !> \param my_eval_occ ...
1550 !> \param recv_eval_occ ...
1551 !> \param my_eval_virt ...
1552 ! **************************************************************************************************
1553  SUBROUTINE scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
1554  my_eval_occ, recv_eval_occ, my_eval_virt)
1555  REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: buffer_unscaled
1556  REAL(kind=dp), DIMENSION(:, :), INTENT(INOUT) :: buffer_compens
1557  REAL(kind=dp), INTENT(IN) :: omega
1558  REAL(kind=dp), DIMENSION(:), INTENT(IN) :: my_eval_occ, recv_eval_occ
1559  REAL(kind=dp), INTENT(IN) :: my_eval_virt
1560 
1561  CHARACTER(LEN=*), PARAMETER :: routinen = 'scale_buffer_and_add_compens_occ'
1562 
1563  INTEGER :: handle, my_i, my_j
1564 
1565  CALL timeset(routinen, handle)
1566 
1567 !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_compens,buffer_unscaled,omega,&
1568 !$OMP my_eval_virt,my_eval_occ,recv_eval_occ) PRIVATE(my_i,my_j)
1569  DO my_j = 1, SIZE(buffer_compens, 2)
1570  DO my_i = 1, SIZE(buffer_compens, 1)
1571  buffer_compens(my_i, my_j) = buffer_unscaled(my_i, my_j) &
1572  *(1.0_dp - omega**2/((my_eval_virt - my_eval_occ(my_i))*(my_eval_virt - recv_eval_occ(my_j)))) &
1573  - buffer_compens(my_i, my_j)
1574  END DO
1575  END DO
1576 !$OMP END PARALLEL DO
1577 
1578  CALL timestop(handle)
1579 
1580  END SUBROUTINE scale_buffer_and_add_compens_occ
1581 
1582 ! **************************************************************************************************
1583 !> \brief Numerically stable evaluation of sinh(x)/x
1584 !> \param x ...
1585 !> \return ...
1586 ! **************************************************************************************************
1587  ELEMENTAL FUNCTION sinh_over_x(x) RESULT(res)
1588  REAL(kind=dp), INTENT(IN) :: x
1589  REAL(kind=dp) :: res
1590 
1591  ! Calculate sinh(x)/x
1592  ! Split the interval to prevent numerical instabilities
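 ! At the threshold the first neglected term x**4/120 is about 7e-17,
 ! below double-precision resolution, so the truncated series is exact
 ! to machine accuracy.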
1593  IF (abs(x) > 3.0e-4_dp) THEN
1594  res = sinh(x)/x
1595  ELSE
1596  res = 1.0_dp + x**2/6.0_dp
1597  END IF
1598 
1599  END FUNCTION sinh_over_x
1600 
1601 ! **************************************************************************************************
1602 !> \brief Calculate the off-diagonal P_ij elements for the listed pairs of (near-)degenerate occupied orbitals
1603 !> \param fm_work_iaP ...
1604 !> \param fm_mat_S ...
1605 !> \param pair_list ...
1606 !> \param virtual ...
1607 !> \param P_ij ...
1608 !> \param Eigenval ...
1609 !> \param omega ...
1610 !> \param weight ...
1611 !> \param index2send ...
1612 !> \param index2recv ...
1613 !> \param dot_blksize ...
1614 ! **************************************************************************************************
1615  SUBROUTINE calc_pij_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ij, Eigenval, &
1616  omega, weight, index2send, index2recv, dot_blksize)
1617  TYPE(cp_fm_type), INTENT(IN) :: fm_work_iap, fm_mat_s
1618  INTEGER, DIMENSION(:, :), INTENT(IN) :: pair_list
1619  INTEGER, INTENT(IN) :: virtual
1620  REAL(kind=dp), DIMENSION(:), INTENT(INOUT) :: p_ij
1621  REAL(kind=dp), DIMENSION(:), INTENT(IN) :: eigenval
1622  REAL(kind=dp), INTENT(IN) :: omega, weight
1623  TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
1624  INTEGER, INTENT(IN) :: dot_blksize
1625 
1626  CHARACTER(LEN=*), PARAMETER :: routinen = 'calc_Pij_degen'
1627 
1628  INTEGER :: avirt, col_global, col_local, counter, first_p_pos(2), first_p_pos_col, handle, &
1629  handle2, ij_counter, iocc, my_col_local, my_i, my_j, my_pcol, my_prow, ncol_block, &
1630  ncol_local, nrow_local, num_ij_pairs, num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, &
1631  recv_size, send_size, size_recv_buffer, size_send_buffer, tag
1632  INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
1633  INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1634  REAL(kind=dp) :: trace
1635  REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1636  TYPE(cp_blacs_env_type), POINTER :: context
1637  TYPE(mp_para_env_type), POINTER :: para_env
1638 
1639  CALL timeset(routinen, handle)
1640 
1641  CALL cp_fm_struct_get(fm_work_iap%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
1642  ncol_local=ncol_local, col_indices=col_indices, &
1643  context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
1644  CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1645  number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1646  first_p_pos_col = first_p_pos(2)
1647 
1648  num_ij_pairs = SIZE(pair_list, 2)
1649 
1650  tag = 42
1651 
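 ! First accumulate the locally available part: for every ij pair, trace the
 ! columns (i,a) of fm_mat_S against the columns (j,a) of fm_work_iaP over the
 ! RI index; columns held by other process columns follow via the ring
 ! exchange below.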
1652  DO ij_counter = 1, num_ij_pairs
1653 
1654  my_i = pair_list(1, ij_counter)
1655  my_j = pair_list(2, ij_counter)
1656 
1657  trace = 0.0_dp
1658 
1659  DO col_local = 1, ncol_local
1660  col_global = col_indices(col_local)
1661 
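 ! Decode the composite column index: col_global = (iocc - 1)*virtual + avirt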
1662  iocc = max(1, col_global - 1)/virtual + 1
1663  avirt = col_global - (iocc - 1)*virtual
1664 
1665  IF (iocc /= my_j) cycle
1666  pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
1667  IF (pcol /= my_pcol) cycle
1668 
1669  my_col_local = cp_fm_indxg2l((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
1670 
1671  trace = trace + accurate_dot_product_2(fm_mat_s%local_data(:, my_col_local), fm_work_iap%local_data(:, col_local), &
1672  dot_blksize)
1673  END DO
1674 
1675  p_ij(ij_counter) = p_ij(ij_counter) - trace*sinh_over_x(0.5_dp*(eigenval(my_i) - eigenval(my_j))*omega)*omega*weight
1676 
1677  END DO
1678 
1679  IF (num_pe_col > 1) THEN
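 ! Ring exchange over the process columns: in step proc_shift, data is sent to
 ! column my_pcol + proc_shift and received from my_pcol - proc_shift, with the
 ! buffers sized once for the largest message.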
1680  size_send_buffer = 0
1681  size_recv_buffer = 0
1682  DO proc_shift = 1, num_pe_col - 1
1683  pcol_send = modulo(my_pcol + proc_shift, num_pe_col)
1684  pcol_recv = modulo(my_pcol - proc_shift, num_pe_col)
1685 
1686  IF (ALLOCATED(index2send(pcol_send)%array)) &
1687  size_send_buffer = max(size_send_buffer, SIZE(index2send(pcol_send)%array))
1688 
1689  IF (ALLOCATED(index2recv(pcol_recv)%array)) &
1690  size_recv_buffer = max(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
1691  END DO
1692 
1693  ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1694 
1695  DO proc_shift = 1, num_pe_col - 1
1696  pcol_send = modulo(my_pcol + proc_shift, num_pe_col)
1697  pcol_recv = modulo(my_pcol - proc_shift, num_pe_col)
1698 
1699  ! Collect data and exchange
1700  send_size = 0
1701  IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
1702 
1703  DO counter = 1, send_size
1704  buffer_send(:, counter) = fm_work_iap%local_data(:, index2send(pcol_send)%array(counter))
1705  END DO
1706 
1707  recv_size = 0
1708  IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
1709  IF (recv_size > 0) THEN
1710  CALL timeset(routinen//"_send", handle2)
1711  IF (send_size > 0) THEN
1712  CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
1713  buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1714  ELSE
1715  CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1716  END IF
1717  CALL timestop(handle2)
1718 
1719  DO ij_counter = 1, num_ij_pairs
1720  ! Collect the contributions of the matrix elements
1721 
1722  my_i = pair_list(1, ij_counter)
1723  my_j = pair_list(2, ij_counter)
1724 
1725  trace = 0.0_dp
1726 
1727  DO col_local = 1, recv_size
1728  col_global = index2recv(pcol_recv)%array(col_local)
1729 
1730  iocc = max(1, col_global - 1)/virtual + 1
1731  IF (iocc /= my_j) cycle
1732  avirt = col_global - (iocc - 1)*virtual
1733  pcol = cp_fm_indxg2p((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
1734  IF (pcol /= my_pcol) cycle
1735 
1736  my_col_local = cp_fm_indxg2l((my_i - 1)*virtual + avirt, ncol_block, 0, first_p_pos_col, num_pe_col)
1737 
1738  trace = trace + accurate_dot_product_2(fm_mat_s%local_data(:, my_col_local), buffer_recv(:, col_local), &
1739  dot_blksize)
1740  END DO
1741 
1742  p_ij(ij_counter) = p_ij(ij_counter) &
1743  - trace*sinh_over_x(0.5_dp*(eigenval(my_i) - eigenval(my_j))*omega)*omega*weight
1744  END DO
1745  ELSE IF (send_size > 0) THEN
1746  CALL timeset(routinen//"_send", handle2)
1747  CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
1748  CALL timestop(handle2)
1749  END IF
1750  END DO
1751  IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
1752  IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
1753  END IF
1754 
1755  CALL timestop(handle)
1756 
1757  END SUBROUTINE
1758 
1759 ! **************************************************************************************************
1760 !> \brief Calculate the off-diagonal P_ab elements for the listed pairs of (near-)degenerate virtual orbitals
1761 !> \param fm_work_iaP ...
1762 !> \param fm_mat_S ...
1763 !> \param pair_list ...
1764 !> \param virtual ...
1765 !> \param P_ab ...
1766 !> \param Eigenval ...
1767 !> \param omega ...
1768 !> \param weight ...
1769 !> \param index2send ...
1770 !> \param index2recv ...
1771 !> \param dot_blksize ...
1772 ! **************************************************************************************************
1773  SUBROUTINE calc_pab_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ab, Eigenval, &
1774  omega, weight, index2send, index2recv, dot_blksize)
1775  TYPE(cp_fm_type), INTENT(IN) :: fm_work_iap, fm_mat_s
1776  INTEGER, DIMENSION(:, :), INTENT(IN) :: pair_list
1777  INTEGER, INTENT(IN) :: virtual
1778  REAL(kind=dp), DIMENSION(:), INTENT(INOUT) :: p_ab
1779  REAL(kind=dp), DIMENSION(:), INTENT(IN) :: eigenval
1780  REAL(kind=dp), INTENT(IN) :: omega, weight
1781  TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
1782  INTEGER, INTENT(IN) :: dot_blksize
1783 
1784  CHARACTER(LEN=*), PARAMETER :: routinen = 'calc_Pab_degen'
1785 
1786  INTEGER :: ab_counter, avirt, col_global, col_local, counter, first_p_pos(2), &
1787  first_p_pos_col, handle, handle2, iocc, my_a, my_b, my_col_local, my_pcol, my_prow, &
1788  ncol_block, ncol_local, nrow_local, num_ab_pairs, num_pe_col, pcol, pcol_recv, pcol_send, &
1789  proc_shift, recv_size, send_size, size_recv_buffer, size_send_buffer, tag
1790  INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
1791  INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1792  REAL(kind=dp) :: trace
1793  REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1794  TYPE(cp_blacs_env_type), POINTER :: context
1795  TYPE(mp_para_env_type), POINTER :: para_env
1796 
1797  CALL timeset(routinen, handle)
1798 
1799  CALL cp_fm_struct_get(fm_work_iap%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
1800  ncol_local=ncol_local, col_indices=col_indices, &
1801  context=context, ncol_block=ncol_block, first_p_pos=first_p_pos, nrow_local=nrow_local)
1802  CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1803  number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1804  first_p_pos_col = first_p_pos(2)
1805 
1806  num_ab_pairs = SIZE(pair_list, 2)
1807 
1808  tag = 43
1809 
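 ! Same two-stage scheme as in calc_Pij_degen, now tracing over the occupied
 ! index for pairs of virtual orbitals.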
1810  DO ab_counter = 1, num_ab_pairs
1811 
1812  my_a = pair_list(1, ab_counter)
1813  my_b = pair_list(2, ab_counter)
1814 
1815  trace = 0.0_dp
1816 
1817  DO col_local = 1, ncol_local
1818  col_global = col_indices(col_local)
1819 
1820  iocc = max(1, col_global - 1)/virtual + 1
1821  avirt = col_global - (iocc - 1)*virtual
1822 
1823  IF (avirt /= my_b) cycle
1824  pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
1825  IF (pcol /= my_pcol) cycle
1826  my_col_local = cp_fm_indxg2l((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
1827 
1828  trace = trace + accurate_dot_product_2(fm_mat_s%local_data(:, my_col_local), fm_work_iap%local_data(:, col_local), &
1829  dot_blksize)
1830 
1831  END DO
1832 
1833  p_ab(ab_counter) = p_ab(ab_counter) &
1834  + trace*sinh_over_x(0.5_dp*(eigenval(my_a) - eigenval(my_b))*omega)*omega*weight
1835 
1836  END DO
1837 
1838  IF (num_pe_col > 1) THEN
1839  size_send_buffer = 0
1840  size_recv_buffer = 0
1841  DO proc_shift = 1, num_pe_col - 1
1842  pcol_send = modulo(my_pcol + proc_shift, num_pe_col)
1843  pcol_recv = modulo(my_pcol - proc_shift, num_pe_col)
1844 
1845  IF (ALLOCATED(index2send(pcol_send)%array)) &
1846  size_send_buffer = max(size_send_buffer, SIZE(index2send(pcol_send)%array))
1847 
1848  IF (ALLOCATED(index2recv(pcol_recv)%array)) &
1849  size_recv_buffer = max(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
1850  END DO
1851 
1852  ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1853 
1854  DO proc_shift = 1, num_pe_col - 1
1855  pcol_send = modulo(my_pcol + proc_shift, num_pe_col)
1856  pcol_recv = modulo(my_pcol - proc_shift, num_pe_col)
1857 
1858  ! Collect data and exchange
1859  send_size = 0
1860  IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
1861 
1862  DO counter = 1, send_size
1863  buffer_send(:, counter) = fm_work_iap%local_data(:, index2send(pcol_send)%array(counter))
1864  END DO
1865 
1866  recv_size = 0
1867  IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
1868  IF (recv_size > 0) THEN
1869  CALL timeset(routinen//"_send", handle2)
1870  IF (send_size > 0) THEN
1871  CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
1872  buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1873  ELSE
1874  CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1875  END IF
1876  CALL timestop(handle2)
1877 
1878  DO ab_counter = 1, num_ab_pairs
1879  ! Collect the contributions of the matrix elements
1880 
1881  my_a = pair_list(1, ab_counter)
1882  my_b = pair_list(2, ab_counter)
1883 
1884  trace = 0.0_dp
1885 
1886  DO col_local = 1, SIZE(index2recv(pcol_recv)%array)
1887  col_global = index2recv(pcol_recv)%array(col_local)
1888 
1889  iocc = max(1, col_global - 1)/virtual + 1
1890  avirt = col_global - (iocc - 1)*virtual
1891  IF (avirt /= my_b) cycle
1892  pcol = cp_fm_indxg2p((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
1893  IF (pcol /= my_pcol) cycle
1894 
1895  my_col_local = cp_fm_indxg2l((iocc - 1)*virtual + my_a, ncol_block, 0, first_p_pos_col, num_pe_col)
1896 
1897  trace = trace + accurate_dot_product_2(fm_mat_s%local_data(:, my_col_local), buffer_recv(:, col_local), &
1898  dot_blksize)
1899  END DO
1900 
1901  p_ab(ab_counter) = p_ab(ab_counter) &
1902  + trace*sinh_over_x(0.5_dp*(eigenval(my_a) - eigenval(my_b))*omega)*omega*weight
1903 
1904  END DO
1905  ELSE IF (send_size > 0) THEN
1906  CALL timeset(routinen//"_send", handle2)
1907  CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
1908  CALL timestop(handle2)
1909  END IF
1910  END DO
1911  IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
1912  IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
1913  END IF
1914 
1915  CALL timestop(handle)
1916 
1917  END SUBROUTINE
1918 
1919 ! **************************************************************************************************
1920 !> \brief Redistribute fm_mat_S into the process-local 3D array mat_S_3D (RI index x virtual x occupied)
1921 !> \param index2send ...
1922 !> \param index2recv ...
1923 !> \param fm_mat_S ...
1924 !> \param mat_S_3D ...
1925 !> \param gd_homo ...
1926 !> \param gd_virtual ...
1927 !> \param mepos ...
1928 ! **************************************************************************************************
1929  SUBROUTINE redistribute_fm_mat_s(index2send, index2recv, fm_mat_S, mat_S_3D, gd_homo, gd_virtual, mepos)
1930  TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send
1931  TYPE(two_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2recv
1932  TYPE(cp_fm_type), INTENT(IN) :: fm_mat_s
1933  REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
1934  INTENT(OUT) :: mat_s_3d
1935  TYPE(group_dist_d1_type), INTENT(IN) :: gd_homo, gd_virtual
1936  INTEGER, DIMENSION(2), INTENT(IN) :: mepos
1937 
1938  CHARACTER(LEN=*), PARAMETER :: routinen = 'redistribute_fm_mat_S'
1939 
1940  INTEGER :: col_local, handle, my_a, my_homo, my_i, my_pcol, my_prow, my_virtual, nrow_local, &
1941  num_pe_col, proc_recv, proc_send, proc_shift, recv_size, send_size, size_recv_buffer, &
1942  size_send_buffer, tag
1943  INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1944  REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1945  TYPE(mp_para_env_type), POINTER :: para_env
1946 
1947  CALL timeset(routinen, handle)
1948 
1949  tag = 46
1950 
1951  CALL fm_mat_s%matrix_struct%context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1952  number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1953 
1954  CALL cp_fm_struct_get(fm_mat_s%matrix_struct, nrow_local=nrow_local, para_env=para_env)
1955 
1956  CALL get_group_dist(gd_homo, mepos(2), sizes=my_homo)
1957  CALL get_group_dist(gd_virtual, mepos(1), sizes=my_virtual)
1958 
1959  ALLOCATE (mat_s_3d(nrow_local, my_virtual, my_homo))
1960 
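 ! Copy the columns this process already owns: index2send(my_pcol) lists the
 ! local fm columns, index2recv(my_pcol) the (a,i) slot they take in mat_S_3D.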
1961  IF (ALLOCATED(index2send(my_pcol)%array)) THEN
1962  DO col_local = 1, SIZE(index2send(my_pcol)%array)
1963  my_a = index2recv(my_pcol)%array(1, col_local)
1964  my_i = index2recv(my_pcol)%array(2, col_local)
1965  mat_s_3d(:, my_a, my_i) = fm_mat_s%local_data(:, index2send(my_pcol)%array(col_local))
1966  END DO
1967  END IF
1968 
1969  IF (num_pe_col > 1) THEN
1970  size_send_buffer = 0
1971  size_recv_buffer = 0
1972  DO proc_shift = 1, num_pe_col - 1
1973  proc_send = modulo(my_pcol + proc_shift, num_pe_col)
1974  proc_recv = modulo(my_pcol - proc_shift, num_pe_col)
1975 
1976  send_size = 0
1977  IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
1978  size_send_buffer = max(size_send_buffer, send_size)
1979 
1980  recv_size = 0
1981  IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array)
1982  size_recv_buffer = max(size_recv_buffer, recv_size)
1983 
1984  END DO
1985 
1986  ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1987 
1988  DO proc_shift = 1, num_pe_col - 1
1989  proc_send = modulo(my_pcol + proc_shift, num_pe_col)
1990  proc_recv = modulo(my_pcol - proc_shift, num_pe_col)
1991 
1992  send_size = 0
1993  IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
1994  DO col_local = 1, send_size
1995  buffer_send(:, col_local) = fm_mat_s%local_data(:, index2send(proc_send)%array(col_local))
1996  END DO
1997 
1998  recv_size = 0
1999  IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array, 2)
2000  IF (recv_size > 0) THEN
2001  IF (send_size > 0) THEN
2002  CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), &
2003  buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
2004  ELSE
2005  CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
2006  END IF
2007 
2008  DO col_local = 1, recv_size
2009  my_a = index2recv(proc_recv)%array(1, col_local)
2010  my_i = index2recv(proc_recv)%array(2, col_local)
2011  mat_s_3d(:, my_a, my_i) = buffer_recv(:, col_local)
2012  END DO
2013  ELSE IF (send_size > 0) THEN
2014  CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), tag)
2015  END IF
2016 
2017  END DO
2018 
2019  IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
2020  IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
2021  END IF
2022 
2023  CALL timestop(handle)
2024 
2025  END SUBROUTINE
2026 
2027 ! **************************************************************************************************
2028 !> \brief Finalize the RPA/SOS-MP2 gradients: assemble Gamma_PQ, convert Y into G_P_ia and collect the density matrix blocks
2029 !> \param rpa_grad ...
2030 !> \param mp2_env ...
2031 !> \param para_env_sub ...
2032 !> \param para_env ...
2033 !> \param qs_env ...
2034 !> \param gd_array ...
2035 !> \param color_sub ...
2036 !> \param do_ri_sos_laplace_mp2 ...
2037 !> \param homo ...
2038 !> \param virtual ...
2039 ! **************************************************************************************************
2040  SUBROUTINE rpa_grad_finalize(rpa_grad, mp2_env, para_env_sub, para_env, qs_env, gd_array, &
2041  color_sub, do_ri_sos_laplace_mp2, homo, virtual)
2042  TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad
2043  TYPE(mp2_type), INTENT(INOUT) :: mp2_env
2044  TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env_sub, para_env
2045  TYPE(qs_environment_type), INTENT(IN), POINTER :: qs_env
2046  TYPE(group_dist_d1_type) :: gd_array
2047  INTEGER, INTENT(IN) :: color_sub
2048  LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2
2049  INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2050 
2051  CHARACTER(LEN=*), PARAMETER :: routinen = 'rpa_grad_finalize'
2052 
2053  INTEGER :: dimen_ia, dimen_ri, handle, iib, ispin, my_group_l_end, my_group_l_size, &
2054  my_group_l_start, my_ia_end, my_ia_size, my_ia_start, my_p_end, my_p_size, my_p_start, &
2055  ngroup, nspins, pos_group, pos_sub, proc
2056  INTEGER, ALLOCATABLE, DIMENSION(:) :: pos_info
2057  INTEGER, ALLOCATABLE, DIMENSION(:, :) :: group_grid_2_mepos, mepos_2_grid_group
2058  REAL(kind=dp) :: my_scale
2059  REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: gamma_2d
2060  TYPE(cp_blacs_env_type), POINTER :: blacs_env
2061  TYPE(cp_fm_struct_type), POINTER :: fm_struct
2062  TYPE(cp_fm_type) :: fm_g_p_ia, fm_pq, fm_pq_2, fm_pq_half, &
2063  fm_work_pq, fm_work_pq_2, fm_y, &
2064  operator_half
2065  TYPE(group_dist_d1_type) :: gd_array_new, gd_ia, gd_p, gd_p_new
2066 
2067  CALL timeset(routinen, handle)
2068 
2069  ! Release unnecessary matrices to save memory for next steps
2070 
2071  nspins = SIZE(rpa_grad%fm_Y)
2072 
2073  ! Scaling factor is required to scale the density matrices and the Gamma matrices later
2074  IF (do_ri_sos_laplace_mp2) THEN
2075  my_scale = mp2_env%scale_s
2076  ELSE
2077  my_scale = -mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
2078  IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
2079  END IF
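 ! The 1/(2*pi) stems from the frequency integration of the RPA correlation
 ! energy; the additional factor 1/2 matches the weight convention of the
 ! minimax grids.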
2080 
2081  IF (do_ri_sos_laplace_mp2) THEN
2082  CALL sos_mp2_grad_finalize(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
2083  para_env, para_env_sub, homo, virtual, mp2_env)
2084  ELSE
2085  CALL rpa_grad_work_finalize(rpa_grad%rpa_work, mp2_env, homo, &
2086  virtual, para_env, para_env_sub)
2087  END IF
2088 
2089  CALL get_qs_env(qs_env, blacs_env=blacs_env)
2090 
2091  CALL cp_fm_get_info(rpa_grad%fm_Gamma_PQ, ncol_global=dimen_ri)
2092 
2093  NULLIFY (fm_struct)
2094  CALL cp_fm_struct_create(fm_struct, context=blacs_env, nrow_global=dimen_ri, &
2095  ncol_global=dimen_ri, para_env=para_env)
2096  CALL cp_fm_create(fm_pq, fm_struct)
2097  CALL cp_fm_create(fm_work_pq, fm_struct)
2098  IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2099  CALL cp_fm_create(fm_pq_2, fm_struct)
2100  END IF
2101  CALL cp_fm_struct_release(fm_struct)
2102  CALL cp_fm_set_all(fm_pq, 0.0_dp)
2103 
2104  ! We still have to left- and right-multiply it with PQ_half
2105  CALL dereplicate_and_sum_fm(rpa_grad%fm_Gamma_PQ, fm_pq)
2106 
2107  ngroup = para_env%num_pe/para_env_sub%num_pe
2108 
2109  CALL prepare_redistribution(para_env, para_env_sub, ngroup, &
2110  group_grid_2_mepos, mepos_2_grid_group, pos_info=pos_info)
2111 
2112  ! Create fm_PQ_half
2113  CALL create_group_dist(gd_p, para_env_sub%num_pe, dimen_ri)
2114  CALL get_group_dist(gd_p, para_env_sub%mepos, my_p_start, my_p_end, my_p_size)
2115 
2116  CALL get_group_dist(gd_array, color_sub, my_group_l_start, my_group_l_end, my_group_l_size)
2117 
2118  CALL create_group_dist(gd_p_new, para_env%num_pe)
2119  CALL create_group_dist(gd_array_new, para_env%num_pe)
2120 
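 ! Merge the subgroup distributions into global ones: rank proc belongs to
 ! subgroup proc/para_env_sub%num_pe and holds in-subgroup position
 ! pos_info(proc).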
2121  DO proc = 0, para_env%num_pe - 1
2122  ! calculate position of the group
2123  pos_group = proc/para_env_sub%num_pe
2124  ! calculate position in the subgroup
2125  pos_sub = pos_info(proc)
2126  ! 1 -> rows, 2 -> cols
2127  CALL get_group_dist(gd_array, pos_group, gd_array_new, proc)
2128  CALL get_group_dist(gd_p, pos_sub, gd_p_new, proc)
2129  END DO
2130 
2131  DEALLOCATE (pos_info)
2132  CALL release_group_dist(gd_p)
2133 
2134  CALL array2fm(mp2_env%ri_grad%PQ_half, fm_pq%matrix_struct, &
2135  my_p_start, my_p_end, &
2136  my_group_l_start, my_group_l_end, &
2137  gd_p_new, gd_array_new, &
2138  group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
2139  fm_pq_half)
2140 
2141  IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2142  CALL array2fm(mp2_env%ri_grad%operator_half, fm_pq%matrix_struct, my_p_start, my_p_end, &
2143  my_group_l_start, my_group_l_end, &
2144  gd_p_new, gd_array_new, &
2145  group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
2146  operator_half)
2147  END IF
2148 
2149  ! deallocate the info array
2150  CALL release_group_dist(gd_p_new)
2151  CALL release_group_dist(gd_array_new)
2152 
2153  IF (compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2154 ! Finish Gamma_PQ
2155  CALL parallel_gemm(transa="N", transb="T", m=dimen_ri, n=dimen_ri, k=dimen_ri, alpha=1.0_dp, &
2156  matrix_a=fm_pq, matrix_b=fm_pq_half, beta=0.0_dp, &
2157  matrix_c=fm_work_pq)
2158 
2159  CALL parallel_gemm(transa="N", transb="N", m=dimen_ri, n=dimen_ri, k=dimen_ri, alpha=-my_scale, &
2160  matrix_a=fm_pq_half, matrix_b=fm_work_pq, beta=0.0_dp, &
2161  matrix_c=fm_pq)
2162 
2163  CALL cp_fm_release(fm_work_pq)
2164  ELSE
2165  CALL parallel_gemm(transa="N", transb="T", m=dimen_ri, n=dimen_ri, k=dimen_ri, alpha=1.0_dp, &
2166  matrix_a=fm_pq, matrix_b=operator_half, beta=0.0_dp, &
2167  matrix_c=fm_work_pq)
2168 
2169  CALL parallel_gemm(transa="N", transb="N", m=dimen_ri, n=dimen_ri, k=dimen_ri, alpha=my_scale, &
2170  matrix_a=operator_half, matrix_b=fm_work_pq, beta=0.0_dp, &
2171  matrix_c=fm_pq)
2172  CALL cp_fm_release(operator_half)
2173 
2174  CALL cp_fm_create(fm_work_pq_2, fm_pq%matrix_struct, name="fm_Gamma_PQ_2")
2175  CALL parallel_gemm(transa="N", transb="N", m=dimen_ri, n=dimen_ri, k=dimen_ri, alpha=-my_scale, &
2176  matrix_a=fm_pq_half, matrix_b=fm_work_pq, beta=0.0_dp, &
2177  matrix_c=fm_work_pq_2)
2178  CALL cp_fm_to_fm(fm_work_pq_2, fm_pq_2)
2179  CALL cp_fm_geadd(1.0_dp, "T", fm_work_pq_2, 1.0_dp, fm_pq_2)
2180  CALL cp_fm_release(fm_work_pq_2)
2181  CALL cp_fm_release(fm_work_pq)
2182  END IF
2183 
2184  ALLOCATE (mp2_env%ri_grad%Gamma_PQ(my_p_size, my_group_l_size))
2185  CALL fm2array(mp2_env%ri_grad%Gamma_PQ, &
2186  my_p_size, my_p_start, my_p_end, &
2187  my_group_l_size, my_group_l_start, my_group_l_end, &
2188  group_grid_2_mepos, mepos_2_grid_group, &
2189  para_env_sub%num_pe, ngroup, &
2190  fm_pq)
2191 
2192  IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2193  ALLOCATE (mp2_env%ri_grad%Gamma_PQ_2(my_p_size, my_group_l_size))
2194  CALL fm2array(mp2_env%ri_grad%Gamma_PQ_2, my_p_size, my_p_start, my_p_end, &
2195  my_group_l_size, my_group_l_start, my_group_l_end, &
2196  group_grid_2_mepos, mepos_2_grid_group, &
2197  para_env_sub%num_pe, ngroup, &
2198  fm_pq_2)
2199  END IF
2200 
2201 ! Now, Gamma_Pia
2202  ALLOCATE (mp2_env%ri_grad%G_P_ia(my_group_l_size, nspins))
2203  DO ispin = 1, nspins
2204  DO iib = 1, my_group_l_size
2205  NULLIFY (mp2_env%ri_grad%G_P_ia(iib, ispin)%matrix)
2206  END DO
2207  END DO
2208 
2209  ! Redistribute the Y matrix
2210  DO ispin = 1, nspins
2211  ! Collect all data of columns for the own sub group locally
2212  CALL cp_fm_get_info(rpa_grad%fm_Y(ispin), ncol_global=dimen_ia)
2213 
2214  CALL get_qs_env(qs_env, blacs_env=blacs_env)
2215 
2216  NULLIFY (fm_struct)
2217  CALL cp_fm_struct_create(fm_struct, template_fmstruct=fm_pq_half%matrix_struct, nrow_global=dimen_ia)
2218  CALL cp_fm_create(fm_y, fm_struct)
2219  CALL cp_fm_struct_release(fm_struct)
2220  CALL cp_fm_set_all(fm_y, 0.0_dp)
2221 
2222  CALL dereplicate_and_sum_fm(rpa_grad%fm_Y(ispin), fm_y)
2223 
2224  CALL cp_fm_create(fm_g_p_ia, fm_y%matrix_struct)
2225  CALL cp_fm_set_all(fm_g_p_ia, 0.0_dp)
2226 
2227  CALL parallel_gemm(transa="N", transb="T", m=dimen_ia, n=dimen_ri, k=dimen_ri, alpha=my_scale, &
2228  matrix_a=fm_y, matrix_b=fm_pq_half, beta=0.0_dp, &
2229  matrix_c=fm_g_p_ia)
2230 
2231  CALL cp_fm_release(fm_y)
2232 
2233  CALL create_group_dist(gd_ia, para_env_sub%num_pe, dimen_ia)
2234  CALL get_group_dist(gd_ia, para_env_sub%mepos, my_ia_start, my_ia_end, my_ia_size)
2235 
2236  CALL fm2array(gamma_2d, my_ia_size, my_ia_start, my_ia_end, &
2237  my_group_l_size, my_group_l_start, my_group_l_end, &
2238  group_grid_2_mepos, mepos_2_grid_group, &
2239  para_env_sub%num_pe, ngroup, &
2240  fm_g_p_ia)
2241 
2242  ! create the Gamma_ia_P in DBCSR style
2243  CALL create_dbcsr_gamma(gamma_2d, homo(ispin), virtual(ispin), dimen_ia, para_env_sub, &
2244  my_ia_start, my_ia_end, my_group_l_size, gd_ia, &
2245  mp2_env%ri_grad%G_P_ia(:, ispin), mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)
2246 
2247  CALL release_group_dist(gd_ia)
2248 
2249  END DO
2250  DEALLOCATE (rpa_grad%fm_Y)
2251  CALL cp_fm_release(fm_pq_half)
2252 
2253  CALL timestop(handle)
2254 
2255  END SUBROUTINE rpa_grad_finalize
2256 
2257 ! **************************************************************************************************
2258 !> \brief Sum up, symmetrize and redistribute the SOS-MP2 density matrix blocks P_ij and P_ab
2259 !> \param sos_mp2_work_occ ...
2260 !> \param sos_mp2_work_virt ...
2261 !> \param para_env ...
2262 !> \param para_env_sub ...
2263 !> \param homo ...
2264 !> \param virtual ...
2265 !> \param mp2_env ...
2266 ! **************************************************************************************************
2267  SUBROUTINE sos_mp2_grad_finalize(sos_mp2_work_occ, sos_mp2_work_virt, para_env, para_env_sub, homo, virtual, mp2_env)
2268  TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
2269  DIMENSION(:), INTENT(INOUT) :: sos_mp2_work_occ, sos_mp2_work_virt
2270  TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
2271  INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2272  TYPE(mp2_type), INTENT(INOUT) :: mp2_env
2273 
2274  CHARACTER(LEN=*), PARAMETER :: routinen = 'sos_mp2_grad_finalize'
2275 
2276  INTEGER :: ab_counter, handle, ij_counter, ispin, &
2277  itmp(2), my_a, my_b, my_b_end, &
2278  my_b_size, my_b_start, my_i, my_j, &
2279  nspins, pcol
2280  REAL(kind=dp) :: my_scale
2281 
2282  CALL timeset(routinen, handle)
2283 
2284  nspins = SIZE(sos_mp2_work_occ)
2285  my_scale = mp2_env%scale_s
2286 
2287  DO ispin = 1, nspins
2288  DO pcol = 0, SIZE(sos_mp2_work_occ(ispin)%index2send, 1) - 1
2289  IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
2290  DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
2291  IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2recv(pcol)%array)) &
2292  DEALLOCATE (sos_mp2_work_occ(ispin)%index2recv(pcol)%array)
2293  IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
2294  DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
2295  IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2send(pcol)%array)) &
2296  DEALLOCATE (sos_mp2_work_virt(ispin)%index2send(pcol)%array)
2297  END DO
2298  DEALLOCATE (sos_mp2_work_occ(ispin)%index2send, &
2299  sos_mp2_work_occ(ispin)%index2recv, &
2300  sos_mp2_work_virt(ispin)%index2send, &
2301  sos_mp2_work_virt(ispin)%index2recv)
2302  END DO
2303 
2304  ! Sum P_ij and P_ab and redistribute them
2305  DO ispin = 1, nspins
2306  CALL para_env%sum(sos_mp2_work_occ(ispin)%P)
2307 
2308  ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
2309  mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
2310  DO my_i = 1, homo(ispin)
2311  mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_i) = my_scale*sos_mp2_work_occ(ispin)%P(my_i)
2312  END DO
2313  DO ij_counter = 1, SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
2314  my_i = sos_mp2_work_occ(ispin)%pair_list(1, ij_counter)
2315  my_j = sos_mp2_work_occ(ispin)%pair_list(2, ij_counter)
2316 
2317  mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = my_scale*sos_mp2_work_occ(ispin)%P(homo(ispin) + ij_counter)
2318  END DO
2319  DEALLOCATE (sos_mp2_work_occ(ispin)%P, sos_mp2_work_occ(ispin)%pair_list)
2320 
2321  ! Symmetrize P_ij
2322  mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
2323  transpose(mp2_env%ri_grad%P_ij(ispin)%array))
2324 
2325  ! The first index of P_ab has to be distributed within the subgroups, so sum it up first and add the required elements later
2326  CALL para_env%sum(sos_mp2_work_virt(ispin)%P)
2327 
2328  itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
2329  my_b_size = itmp(2) - itmp(1) + 1
2330  my_b_start = itmp(1)
2331  my_b_end = itmp(2)
2332 
2333  ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_b_size, virtual(ispin)))
2334  mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
2335  DO my_a = itmp(1), itmp(2)
2336  mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_a) = my_scale*sos_mp2_work_virt(ispin)%P(my_a)
2337  END DO
2338  DO ab_counter = 1, SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
2339  my_a = sos_mp2_work_virt(ispin)%pair_list(1, ab_counter)
2340  my_b = sos_mp2_work_virt(ispin)%pair_list(2, ab_counter)
2341 
2342  IF (my_a >= itmp(1) .AND. my_a <= itmp(2)) mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_b) = &
2343  my_scale*sos_mp2_work_virt(ispin)%P(virtual(ispin) + ab_counter)
2344  END DO
2345 
2346  DEALLOCATE (sos_mp2_work_virt(ispin)%P, sos_mp2_work_virt(ispin)%pair_list)
2347 
2348  ! Symmetrize P_ab
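 ! P_ab is distributed over its first index: symmetrize the locally held
 ! diagonal block directly, then ring-exchange transposed blocks with the
 ! other subgroup ranks to average with the mirrored elements.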
2349  IF (para_env_sub%num_pe > 1) THEN
2350  block
2351  INTEGER :: send_a_start, send_a_end, send_a_size, &
2352  recv_a_start, recv_a_end, recv_a_size, proc_shift, proc_send, proc_recv
2353  REAL(kind=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1d
2354  REAL(kind=dp), DIMENSION(:, :), POINTER :: buffer_send
2355  REAL(kind=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
2356  TYPE(group_dist_d1_type) :: gd_virtual_sub
2357 
2358  CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
2359 
2360  mp2_env%ri_grad%P_ab(ispin)%array(:, my_b_start:my_b_end) = &
2361  0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_b_start:my_b_end) &
2362  + transpose(mp2_env%ri_grad%P_ab(ispin)%array(:, my_b_start:my_b_end)))
2363 
2364  ALLOCATE (buffer_send_1d(my_b_size*maxsize(gd_virtual_sub)))
2365  ALLOCATE (buffer_recv(my_b_size, maxsize(gd_virtual_sub)))
2366 
2367  DO proc_shift = 1, para_env_sub%num_pe - 1
2368 
2369  proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2370  proc_recv = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2371 
2372  CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
2373  CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
2374 
2375  buffer_send(1:send_a_size, 1:my_b_size) => buffer_send_1d(1:my_b_size*send_a_size)
2376 
2377  buffer_send(:send_a_size, :) = transpose(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
2378  CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
2379  buffer_recv(:, :recv_a_size), proc_recv)
2380 
2381  mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
2382  0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
2383 
2384  END DO
2385 
2386  DEALLOCATE (buffer_send_1d, buffer_recv)
2387 
2388  CALL release_group_dist(gd_virtual_sub)
2389  END block
2390  ELSE
2391  mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
2392  transpose(mp2_env%ri_grad%P_ab(ispin)%array))
2393  END IF
2394 
2395  END DO
2396  DEALLOCATE (sos_mp2_work_occ, sos_mp2_work_virt)
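 ! Closed-shell case: factor 2 for spin degeneracy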
2397  IF (nspins == 1) THEN
2398  mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
2399  mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
2400  END IF
2401 
2402  CALL timestop(handle)
2403 
2404  END SUBROUTINE
2405 
2406 ! **************************************************************************************************
2407 !> \brief Sum up, symmetrize and redistribute the RPA density matrix blocks P_ij and P_ab
2408 !> \param rpa_work ...
2409 !> \param mp2_env ...
2410 !> \param homo ...
2411 !> \param virtual ...
2412 !> \param para_env ...
2413 !> \param para_env_sub ...
2414 ! **************************************************************************************************
2415  SUBROUTINE rpa_grad_work_finalize(rpa_work, mp2_env, homo, virtual, para_env, para_env_sub)
2416  TYPE(rpa_grad_work_type), INTENT(INOUT) :: rpa_work
2417  TYPE(mp2_type), INTENT(INOUT) :: mp2_env
2418  INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2419  TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
2420 
2421  CHARACTER(LEN=*), PARAMETER :: routinen = 'rpa_grad_work_finalize'
2422 
2423  INTEGER :: handle, ispin, itmp(2), my_a_end, my_a_size, my_a_start, my_b_end, my_b_size, &
2424  my_b_start, my_i_end, my_i_size, my_i_start, nspins, proc, proc_recv, proc_send, &
2425  proc_shift, recv_a_end, recv_a_size, recv_a_start, recv_end, recv_start, send_a_end, &
2426  send_a_size, send_a_start, send_end, send_start, size_recv_buffer, size_send_buffer
2427  REAL(kind=dp) :: my_scale
2428  REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
2429  TYPE(group_dist_d1_type) :: gd_a_sub, gd_virtual_sub
2430 
2431  CALL timeset(routinen, handle)
2432 
2433  nspins = SIZE(homo)
2434  my_scale = mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
2435  IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
2436 
2437  CALL cp_fm_release(rpa_work%fm_mat_Q_copy)
2438 
2439  DO ispin = 1, nspins
2440  DO proc = 0, SIZE(rpa_work%index2send, 1) - 1
2441  IF (ALLOCATED(rpa_work%index2send(proc, ispin)%array)) DEALLOCATE (rpa_work%index2send(proc, ispin)%array)
2442  IF (ALLOCATED(rpa_work%index2recv(proc, ispin)%array)) DEALLOCATE (rpa_work%index2recv(proc, ispin)%array)
2443  END DO
2444  END DO
2445  DEALLOCATE (rpa_work%index2send, rpa_work%index2recv)
2446 
2447  DO ispin = 1, nspins
2448  CALL get_group_dist(rpa_work%gd_homo(ispin), rpa_work%mepos(2), my_i_start, my_i_end, my_i_size)
2449  CALL release_group_dist(rpa_work%gd_homo(ispin))
2450 
2451  ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
2452  mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
2453  mp2_env%ri_grad%P_ij(ispin)%array(my_i_start:my_i_end, :) = my_scale*rpa_work%P_ij(ispin)%array
2454  DEALLOCATE (rpa_work%P_ij(ispin)%array)
2455  CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
2456 
2457  ! Symmetrize P_ij
2458  mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
2459  transpose(mp2_env%ri_grad%P_ij(ispin)%array))
2460 
2461  itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
2462  my_b_start = itmp(1)
2463  my_b_end = itmp(2)
2464  my_b_size = my_b_end - my_b_start + 1
2465 
2466  ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_b_size, virtual(ispin)))
2467  mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
2468 
2469  CALL get_group_dist(rpa_work%gd_virtual(ispin), rpa_work%mepos(1), my_a_start, my_a_end, my_a_size)
2470  CALL release_group_dist(rpa_work%gd_virtual(ispin))
2471  ! This group distribution describes which part of P_ab each process currently owns
2472  CALL create_group_dist(gd_a_sub, my_a_start, my_a_end, my_a_size, para_env_sub)
2473  ! This group distribution describes which part of P_ab each process will own afterwards
2474  CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
2475 
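 ! Overlap of the currently owned slice [my_a_start, my_a_end] with the target
 ! slice [my_b_start, my_b_end], in local indices; an empty overlap gives
 ! start > end and the copy degenerates to nothing.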
2476  ! Calculate local indices of the common range of own matrix and send process
2477  send_start = max(1, my_b_start - my_a_start + 1)
2478  send_end = min(my_a_size, my_b_end - my_a_start + 1)
2479 
2480  ! Same for recv process but with reverse positions
2481  recv_start = max(1, my_a_start - my_b_start + 1)
2482  recv_end = min(my_b_size, my_a_end - my_b_start + 1)
2483 
2484  mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
2485  my_scale*rpa_work%P_ab(ispin)%array(send_start:send_end, :)
2486 
2487  IF (para_env_sub%num_pe > 1) THEN
2488  size_send_buffer = 0
2489  size_recv_buffer = 0
2490  DO proc_shift = 1, para_env_sub%num_pe - 1
2491  proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2492  proc_recv = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2493 
2494  CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
2495  CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)
2496 
2497  ! Calculate local indices of the common range of own matrix and send process
2498  send_start = max(1, send_a_start - my_a_start + 1)
2499  send_end = min(my_a_size, send_a_end - my_a_start + 1)
2500 
2501  size_send_buffer = max(size_send_buffer, max(send_end - send_start + 1, 0))
2502 
2503  ! Same for recv process but with reverse positions
2504  recv_start = max(1, recv_a_start - my_b_start + 1)
2505  recv_end = min(my_b_size, recv_a_end - my_b_start + 1)
2506 
2507  size_recv_buffer = max(size_recv_buffer, max(recv_end - recv_start + 1, 0))
2508  END DO
2509  ALLOCATE (buffer_send(size_send_buffer, virtual(ispin)), buffer_recv(size_recv_buffer, virtual(ispin)))
2510 
2511  DO proc_shift = 1, para_env_sub%num_pe - 1
2512  proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2513  proc_recv = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2514 
2515  CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
2516  CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)
2517 
2518  ! Calculate local indices of the common range of own matrix and send process
2519  send_start = max(1, send_a_start - my_a_start + 1)
2520  send_end = min(my_a_size, send_a_end - my_a_start + 1)
2521  buffer_send(1:max(send_end - send_start + 1, 0), :) = rpa_work%P_ab(ispin)%array(send_start:send_end, :)
2522 
2523  ! Same for recv process but with reverse positions
2524  recv_start = max(1, recv_a_start - my_b_start + 1)
2525  recv_end = min(my_b_size, recv_a_end - my_b_start + 1)
2526 
2527  CALL para_env_sub%sendrecv(buffer_send(1:max(send_end - send_start + 1, 0), :), proc_send, &
2528  buffer_recv(1:max(recv_end - recv_start + 1, 0), :), proc_recv)
2529 
2530  mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
2531  mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) + &
2532  my_scale*buffer_recv(1:max(recv_end - recv_start + 1, 0), :)
2533 
2534  END DO
2535 
2536  IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
2537  IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
2538  END IF
2539  DEALLOCATE (rpa_work%P_ab(ispin)%array)
2540 
2541  CALL release_group_dist(gd_a_sub)
2542 
2543  block
2544  TYPE(mp_comm_type) :: comm_exchange
2545  CALL comm_exchange%from_split(para_env, para_env_sub%mepos)
2546  CALL comm_exchange%sum(mp2_env%ri_grad%P_ab(ispin)%array)
2547  CALL comm_exchange%free()
2548  END block
2549 
2550  ! Symmetrize P_ab
2551  IF (para_env_sub%num_pe > 1) THEN
2552  block
2553  REAL(kind=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1d
2554  REAL(kind=dp), DIMENSION(:, :), POINTER :: buffer_send
2555  REAL(kind=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
2556 
2557  mp2_env%ri_grad%P_ab(ispin)%array(:, my_b_start:my_b_end) = &
2558  0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_b_start:my_b_end) &
2559  + transpose(mp2_env%ri_grad%P_ab(ispin)%array(:, my_b_start:my_b_end)))
2560 
2561  ALLOCATE (buffer_send_1d(my_b_size*maxsize(gd_virtual_sub)))
2562  ALLOCATE (buffer_recv(my_b_size, maxsize(gd_virtual_sub)))
2563 
2564  DO proc_shift = 1, para_env_sub%num_pe - 1
2565 
2566  proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2567  proc_recv = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2568 
2569  CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
2570  CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
2571 
2572  buffer_send(1:send_a_size, 1:my_b_size) => buffer_send_1d(1:my_b_size*send_a_size)
2573 
2574  buffer_send(:send_a_size, :) = transpose(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
2575  CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
2576  buffer_recv(:, :recv_a_size), proc_recv)
2577 
2578  mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
2579  0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
2580 
2581  END DO
2582 
2583  DEALLOCATE (buffer_send_1d, buffer_recv)
2584  END block
2585  ELSE
2586  mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
2587  transpose(mp2_env%ri_grad%P_ab(ispin)%array))
2588  END IF
2589 
2590  CALL release_group_dist(gd_virtual_sub)
2591 
2592  END DO
2593  DEALLOCATE (rpa_work%gd_homo, rpa_work%gd_virtual, rpa_work%P_ij, rpa_work%P_ab)
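 ! Closed-shell case: factor 2 for spin degeneracy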
2594  IF (nspins == 1) THEN
2595  mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
2596  mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
2597  END IF
2598 
2599  CALL timestop(handle)
2600  END SUBROUTINE
2601 
2602 ! **************************************************************************************************
2603 !> \brief Dereplicate data from fm_sub and collect in fm_global, overlapping data will be added
2604 !> \param fm_sub replicated matrix (same size in all subgroups), will be released on output
2605 !> \param fm_global global matrix, on output contains the redistributed sum of the replicated matrices
2606 ! **************************************************************************************************
2607  SUBROUTINE dereplicate_and_sum_fm(fm_sub, fm_global)
2608  TYPE(cp_fm_type), INTENT(INOUT) :: fm_sub, fm_global
2609 
2610  CHARACTER(LEN=*), PARAMETER :: routinen = 'dereplicate_and_sum_fm'
2611 
2612  INTEGER :: col_local, elements2recv_col, elements2recv_row, elements2send_col, &
2613  elements2send_row, first_p_pos_global(2), first_p_pos_sub(2), handle, handle2, &
2614  mypcol_global, myprow_global, ncol_block_global, ncol_block_sub, ncol_local_global, &
2615  ncol_local_sub, npcol_global, npcol_sub, nprow_global, nprow_sub, nrow_block_global, &
2616  nrow_block_sub, nrow_local_global, nrow_local_sub, pcol_recv, pcol_send, proc_recv, &
2617  proc_send, proc_send_global, proc_shift, prow_recv, prow_send, row_local, tag
2618  INTEGER(int_8) :: size_recv_buffer, size_send_buffer
2619  INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv_col, data2recv_row, &
2620  data2send_col, data2send_row, &
2621  subgroup2mepos
2622  INTEGER, DIMENSION(:), POINTER :: col_indices_global, col_indices_sub, &
2623  row_indices_global, row_indices_sub
2624  INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi_global, blacs2mpi_sub, &
2625  mpi2blacs_global, mpi2blacs_sub
2626  REAL(kind=dp), ALLOCATABLE, DIMENSION(:), TARGET :: recv_buffer_1d, send_buffer_1d
2627  REAL(kind=dp), DIMENSION(:, :), POINTER :: recv_buffer, send_buffer
2628  TYPE(mp_para_env_type), POINTER :: para_env, para_env_sub
2629  TYPE(one_dim_int_array), ALLOCATABLE, DIMENSION(:) :: index2recv_col, index2recv_row, &
2630  index2send_col, index2send_row
2631 
2632  CALL timeset(routinen, handle)
2633 
2634  tag = 1
2635 
2636  nprow_sub = fm_sub%matrix_struct%context%num_pe(1)
2637  npcol_sub = fm_sub%matrix_struct%context%num_pe(2)
2638 
2639  myprow_global = fm_global%matrix_struct%context%mepos(1)
2640  mypcol_global = fm_global%matrix_struct%context%mepos(2)
2641  nprow_global = fm_global%matrix_struct%context%num_pe(1)
2642  npcol_global = fm_global%matrix_struct%context%num_pe(2)
2643 
2644  CALL cp_fm_get_info(fm_sub, col_indices=col_indices_sub, row_indices=row_indices_sub, &
2645  nrow_local=nrow_local_sub, ncol_local=ncol_local_sub)
2646  CALL cp_fm_struct_get(fm_sub%matrix_struct, para_env=para_env_sub, first_p_pos=first_p_pos_sub, &
2647  nrow_block=nrow_block_sub, ncol_block=ncol_block_sub)
2648  CALL cp_fm_struct_get(fm_global%matrix_struct, first_p_pos=first_p_pos_global, nrow_block=nrow_block_global, &
2649  ncol_block=ncol_block_global, para_env=para_env, &
2650  col_indices=col_indices_global, row_indices=row_indices_global, &
2651  nrow_local=nrow_local_global, ncol_local=ncol_local_global)
2652  CALL fm_sub%matrix_struct%context%get(blacs2mpi=blacs2mpi_sub, mpi2blacs=mpi2blacs_sub)
2653  CALL fm_global%matrix_struct%context%get(blacs2mpi=blacs2mpi_global, mpi2blacs=mpi2blacs_global)
2654 
2655  IF (para_env%num_pe /= para_env_sub%num_pe) THEN
2656  block
2657  TYPE(mp_comm_type) :: comm_exchange
2658  comm_exchange = fm_sub%matrix_struct%context%interconnect(para_env)
2659  CALL comm_exchange%sum(fm_sub%local_data)
2660  CALL comm_exchange%free()
2661  END block
2662  END IF
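 ! From here on every rank holds the subgroup-summed data; what remains is a
 ! pure redistribution (with transpose) from the subgroup BLACS layout to the
 ! global one.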
2663 
2664  ALLOCATE (subgroup2mepos(0:para_env_sub%num_pe - 1))
2665  CALL para_env_sub%allgather(para_env%mepos, subgroup2mepos)
2666 
2667  CALL timeset(routinen//"_data2", handle2)
2668  ! Create a map of how much data has to be sent to each process coordinate; rows and columns are interchanged to transpose the matrices
2669  CALL get_elements2send(data2send_col, npcol_global, row_indices_sub, ncol_block_global, first_p_pos_global(2), index2send_col)
2670  CALL get_elements2send(data2send_row, nprow_global, col_indices_sub, nrow_block_global, first_p_pos_global(1), index2send_row)
2671 
2672  ! The same maps in reverse for the receive side
2674  CALL get_elements2send(data2recv_col, npcol_sub, row_indices_global, ncol_block_sub, first_p_pos_sub(2), index2recv_col)
2675  CALL get_elements2send(data2recv_row, nprow_sub, col_indices_global, nrow_block_sub, first_p_pos_sub(1), index2recv_row)
2676  CALL timestop(handle2)
2677 
2678  CALL timeset(routinen//"_local", handle2)
2679  ! Loop over local data and transpose
2680  prow_send = mpi2blacs_global(1, para_env%mepos)
2681  pcol_send = mpi2blacs_global(2, para_env%mepos)
2682  prow_recv = mpi2blacs_sub(1, para_env_sub%mepos)
2683  pcol_recv = mpi2blacs_sub(2, para_env_sub%mepos)
2684  elements2recv_col = data2recv_col(pcol_recv)
2685  elements2recv_row = data2recv_row(prow_recv)
2686 
2687 !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
2688 !$OMP SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
2689 !$OMP index2recv_col,index2recv_row,pcol_recv,prow_recv, &
2690 !$OMP fm_sub,index2send_col,index2send_row,pcol_send,prow_send)
2691  DO col_local = 1, elements2recv_col
2692  DO row_local = 1, elements2recv_row
2693  fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
2694  index2recv_row(prow_recv)%array(row_local)) &
2695  = fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
2696  index2send_row(prow_send)%array(col_local))
2697  END DO
2698  END DO
2699 !$OMP END PARALLEL DO
2700  CALL timestop(handle2)
2701 
2702  IF (para_env_sub%num_pe > 1) THEN
2703  size_send_buffer = 0_int_8
2704  size_recv_buffer = 0_int_8
2705  ! Loop over all processes in para_env_sub
2706  DO proc_shift = 1, para_env_sub%num_pe - 1
2707  proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2708  proc_recv = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2709 
2710  proc_send_global = subgroup2mepos(proc_send)
2711  prow_send = mpi2blacs_global(1, proc_send_global)
2712  pcol_send = mpi2blacs_global(2, proc_send_global)
2713  elements2send_col = data2send_col(pcol_send)
2714  elements2send_row = data2send_row(prow_send)
2715 
2716  size_send_buffer = max(size_send_buffer, int(elements2send_col, int_8)*elements2send_row)
2717 
2718  prow_recv = mpi2blacs_sub(1, proc_recv)
2719  pcol_recv = mpi2blacs_sub(2, proc_recv)
2720  elements2recv_col = data2recv_col(pcol_recv)
2721  elements2recv_row = data2recv_row(prow_recv)
2722 
2723  size_recv_buffer = max(size_recv_buffer, int(elements2recv_col, int_8)*elements2recv_row)
2724  END DO
2725  ALLOCATE (send_buffer_1d(size_send_buffer), recv_buffer_1d(size_recv_buffer))
2726 
2727  ! Loop over all processes in para_env_sub
2728  DO proc_shift = 1, para_env_sub%num_pe - 1
2729  proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2730  proc_recv = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2731 
2732  proc_send_global = subgroup2mepos(proc_send)
2733  prow_send = mpi2blacs_global(1, proc_send_global)
2734  pcol_send = mpi2blacs_global(2, proc_send_global)
2735  elements2send_col = data2send_col(pcol_send)
2736  elements2send_row = data2send_row(prow_send)
2737 
2738  CALL timeset(routinen//"_pack", handle2)
2739  ! Loop over local data and pack the buffer
2740  ! Transpose the matrix already
2741  send_buffer(1:elements2send_row, 1:elements2send_col) => send_buffer_1d(1:int(elements2send_row, int_8)*elements2send_col)
2742 !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
2743 !$OMP SHARED(elements2send_col,elements2send_row,send_buffer,fm_sub,&
2744 !$OMP index2send_col,index2send_row,pcol_send,prow_send)
2745  DO row_local = 1, elements2send_col
2746  DO col_local = 1, elements2send_row
2747  send_buffer(col_local, row_local) = &
2748  fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
2749  index2send_row(prow_send)%array(col_local))
2750  END DO
2751  END DO
2752 !$OMP END PARALLEL DO
2753  CALL timestop(handle2)
2754 
2755  prow_recv = mpi2blacs_sub(1, proc_recv)
2756  pcol_recv = mpi2blacs_sub(2, proc_recv)
2757  elements2recv_col = data2recv_col(pcol_recv)
2758  elements2recv_row = data2recv_row(prow_recv)
2759 
2760  ! Exchange the buffers, skipping zero-sized messages
2761  recv_buffer(1:elements2recv_col, 1:elements2recv_row) => recv_buffer_1d(1:int(elements2recv_row, int_8)*elements2recv_col)
2762  IF (SIZE(recv_buffer) > 0_int_8) THEN
2763  IF (SIZE(send_buffer) > 0_int_8) THEN
2764  CALL para_env_sub%sendrecv(send_buffer, proc_send, recv_buffer, proc_recv, tag)
2765  ELSE
2766  CALL para_env_sub%recv(recv_buffer, proc_recv, tag)
2767  END IF
2768 
2769  CALL timeset(routinen//"_unpack", handle2)
2770 !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
2771 !$OMP SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
2772 !$OMP index2recv_col,index2recv_row,pcol_recv,prow_recv)
2773  DO col_local = 1, elements2recv_col
2774  DO row_local = 1, elements2recv_row
2775  fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
2776  index2recv_row(prow_recv)%array(row_local)) &
2777  = recv_buffer(col_local, row_local)
2778  END DO
2779  END DO
2780 !$OMP END PARALLEL DO
2781  CALL timestop(handle2)
2782  ELSE IF (SIZE(send_buffer) > 0_int_8) THEN
2783  CALL para_env_sub%send(send_buffer, proc_send, tag)
2784  END IF
2785  END DO
2786  END IF
2787 
2788  DEALLOCATE (data2send_col, data2send_row, data2recv_col, data2recv_row)
2789  DO proc_shift = 0, npcol_global - 1
2790  DEALLOCATE (index2send_col(proc_shift)%array)
2791  END DO
2792  DO proc_shift = 0, npcol_sub - 1
2793  DEALLOCATE (index2recv_col(proc_shift)%array)
2794  END DO
2795  DO proc_shift = 0, nprow_global - 1
2796  DEALLOCATE (index2send_row(proc_shift)%array)
2797  END DO
2798  DO proc_shift = 0, nprow_sub - 1
2799  DEALLOCATE (index2recv_row(proc_shift)%array)
2800  END DO
2801  DEALLOCATE (index2send_col, index2recv_col, index2send_row, index2recv_row)
2802 
2803  CALL cp_fm_release(fm_sub)
2804 
2805  CALL timestop(handle)
2806 
2807  END SUBROUTINE dereplicate_and_sum_fm
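The pairwise exchange loop above follows a classic ring schedule: at shift s, every rank sends to mepos+s and receives from mepos-s, both taken MODULO num_pe (which, unlike MOD, stays non-negative for negative arguments), so each pair of ranks communicates exactly once per shift and no rank sits idle. A minimal, self-contained sketch of just this partner arithmetic, using raw MPI instead of the mp_para_env_type wrappers (buffer contents here are illustrative):

 PROGRAM ring_shift_demo
    USE mpi
    IMPLICIT NONE
    INTEGER :: ierr, me, npe, shift, proc_send, proc_recv
    INTEGER :: sendval, recvval, stat(MPI_STATUS_SIZE)
    INTEGER, PARAMETER :: tag = 0

    CALL MPI_Init(ierr)
    CALL MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
    CALL MPI_Comm_size(MPI_COMM_WORLD, npe, ierr)
    DO shift = 1, npe - 1
       ! Same partner arithmetic as in dereplicate_and_sum_fm
       proc_send = MODULO(me + shift, npe)
       proc_recv = MODULO(me - shift, npe)
       sendval = me
       CALL MPI_Sendrecv(sendval, 1, MPI_INTEGER, proc_send, tag, &
                         recvval, 1, MPI_INTEGER, proc_recv, tag, &
                         MPI_COMM_WORLD, stat, ierr)
    END DO
    CALL MPI_Finalize(ierr)
 END PROGRAM ring_shift_demo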
2808 
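Note also how send_buffer and recv_buffer above are rank-2 pointers remapped onto single rank-1 allocations that were sized once at the maximum over all partners, so every partner gets the shape it needs without any reallocation inside the loop. A minimal sketch of this pointer bounds remapping (standard Fortran; the sizes below are illustrative):

 PROGRAM pointer_remap_demo
    IMPLICIT NONE
    REAL, ALLOCATABLE, TARGET :: buf_1d(:)
    REAL, POINTER :: buf_2d(:, :)
    INTEGER :: m, n

    ! One allocation at the maximum size needed over all partners
    ALLOCATE (buf_1d(64))

    ! Reinterpret the leading m*n elements as an m-by-n matrix
    m = 4; n = 3
    buf_2d(1:m, 1:n) => buf_1d(1:m*n)
    buf_2d = 0.0
    buf_2d(2, 3) = 1.0   ! column-major: stored at buf_1d(2 + (3-1)*m) = buf_1d(10)
    PRINT *, buf_1d(10)

    ! The same storage is reused later with a different shape
    m = 8; n = 2
    buf_2d(1:m, 1:n) => buf_1d(1:m*n)
 END PROGRAM pointer_remap_demo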
2809 ! **************************************************************************************************
2810 !> \brief Determine, for each process along one dimension of the other grid, how many locally held elements it owns and which local indices they have
2811 !> \param data2send number of locally held elements owned by each process
2812 !> \param np_global number of processes along the considered dimension of the other grid
2813 !> \param indices_sub global indices of the locally held rows/columns
2814 !> \param n_block_global block size of the other matrix along the considered dimension
2815 !> \param first_p_pos_global coordinate of the process holding the first row/column
2816 !> \param index2send for each process, the list of local indices to exchange with it
2817 ! **************************************************************************************************
2818  SUBROUTINE get_elements2send(data2send, np_global, indices_sub, n_block_global, first_p_pos_global, index2send)
2819  INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: data2send
2820  INTEGER, INTENT(IN) :: np_global
2821  INTEGER, DIMENSION(:), INTENT(IN) :: indices_sub
2822  INTEGER, INTENT(IN) :: n_block_global, first_p_pos_global
2823  TYPE(one_dim_int_array), ALLOCATABLE, &
2824  DIMENSION(:), INTENT(OUT) :: index2send
2825 
2826  INTEGER :: i_global, i_local, proc
2827 
2828  ALLOCATE (data2send(0:np_global - 1))
2829  data2send = 0
2830  DO i_local = 1, SIZE(indices_sub)
2831  i_global = indices_sub(i_local)
2832  proc = cp_fm_indxg2p(i_global, n_block_global, 0, first_p_pos_global, np_global)
2833  data2send(proc) = data2send(proc) + 1
2834  END DO
2835 
2836  ALLOCATE (index2send(0:np_global - 1))
2837  DO proc = 0, np_global - 1
2838  ALLOCATE (index2send(proc)%array(data2send(proc)))
2839  ! Initialize to an invalid index so that any entry left unassigned makes the code crash
2840  index2send(proc)%array = -1
2841  END DO
2842 
2843  data2send = 0
2844  DO i_local = 1, SIZE(indices_sub)
2845  i_global = indices_sub(i_local)
2846  proc = cp_fm_indxg2p(i_global, n_block_global, 0, first_p_pos_global, np_global)
2847  data2send(proc) = data2send(proc) + 1
2848  index2send(proc)%array(data2send(proc)) = i_local
2849  END DO
2850 
2851  END SUBROUTINE get_elements2send
2852 
2853 END MODULE rpa_grad
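get_elements2send sizes each per-process index list with a counting pass and then fills it with a second, identical pass, so every list is allocated exactly once at its final size. A minimal, self-contained sketch of the same two-pass pattern, with the standard block-cyclic owner formula (the one behind ScaLAPACK's INDXG2P, which cp_fm_indxg2p wraps) as a stand-in; all names and sizes here are illustrative:

 PROGRAM two_pass_buckets
    IMPLICIT NONE
    TYPE :: int_list
       INTEGER, ALLOCATABLE :: array(:)
    END TYPE
    INTEGER, PARAMETER :: np = 4, nb = 2, src = 0
    INTEGER :: indices(10) = [3, 7, 1, 12, 8, 5, 9, 2, 11, 6]
    INTEGER :: counts(0:np - 1), proc, i
    TYPE(int_list) :: buckets(0:np - 1)

    ! Pass 1: count how many of the local entries each process owns
    counts = 0
    DO i = 1, SIZE(indices)
       proc = owner(indices(i))
       counts(proc) = counts(proc) + 1
    END DO
    DO proc = 0, np - 1
       ALLOCATE (buckets(proc)%array(counts(proc)))
       buckets(proc)%array = -1   ! invalid marker, as in the module above
    END DO

    ! Pass 2: rerun the identical loop to fill the exactly sized buckets
    counts = 0
    DO i = 1, SIZE(indices)
       proc = owner(indices(i))
       counts(proc) = counts(proc) + 1
       buckets(proc)%array(counts(proc)) = i   ! store the LOCAL position
    END DO

    DO proc = 0, np - 1
       PRINT '(A,I0,A,*(1X,I0))', "proc ", proc, ":", buckets(proc)%array
    END DO
 CONTAINS
    ! Standard block-cyclic owner formula, cf. ScaLAPACK's INDXG2P
    PURE INTEGER FUNCTION owner(i_global)
       INTEGER, INTENT(IN) :: i_global
       owner = MOD(src + (i_global - 1)/nb, np)
    END FUNCTION owner
 END PROGRAM two_pass_buckets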