(git:ed6f26b)
Loading...
Searching...
No Matches
mp2_ri_gpw.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Routines to calculate RI-GPW-MP2 energy using pw
10!> \par History
11!> 06.2012 created [Mauro Del Ben]
12!> 03.2019 Refactored from mp2_ri_gpw [Frederick Stein]
13! **************************************************************************************************
23 maxsize,&
25 USE kinds, ONLY: dp,&
26 int_8
29 USE machine, ONLY: m_flush,&
30 m_memory,&
32 USE message_passing, ONLY: mp_comm_type,&
35 USE mp2_types, ONLY: mp2_type,&
37
38!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num
39#include "./base/base_uses.f90"
40
41 IMPLICIT NONE
42
43 PRIVATE
44
45 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_gpw'
46
47 PUBLIC :: mp2_ri_gpw_compute_en
48
49CONTAINS
50
! **************************************************************************************************
!> \brief Computes the RI-GPW-MP2 correlation energy (Coulomb and, optionally, exchange part)
!>        using a replication-blocked communication scheme over ij pairs; optionally accumulates
!>        the intermediates required for MP2 gradients (Gamma_P_ia, P_ij, P_ab, t_ab).
!> \param Emp2_Cou accumulated Coulomb-like MP2 energy contribution (updated)
!> \param Emp2_EX accumulated exchange-like MP2 energy contribution (updated)
!> \param Emp2_S accumulated opposite-spin (singlet-like) component (updated)
!> \param Emp2_T accumulated same-spin (triplet-like) component (updated)
!> \param BIb_C three-index RI integrals, one entry per spin; replicated/reordered in place here
!> \param mp2_env MP2 environment (scaling factors, gradient intermediates, local GEMM context)
!> \param para_env global parallel environment
!> \param para_env_sub subgroup parallel environment
!> \param color_sub color identifying this process' subgroup
!> \param gd_array distribution of the RI (auxiliary) index over subgroups
!> \param gd_B_virtual distribution of the virtual index within a subgroup, one per spin
!> \param Eigenval orbital energies per spin channel
!> \param nmo total number of molecular orbitals
!> \param homo number of occupied orbitals per spin channel
!> \param dimen_RI size of the RI (auxiliary) basis
!> \param unit_nr output unit number (>0 on the I/O rank only)
!> \param calc_forces whether gradient intermediates have to be computed
!> \param calc_ex whether the exchange energy term has to be computed
! **************************************************************************************************
   SUBROUTINE mp2_ri_gpw_compute_en(Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
                                    gd_array, gd_B_virtual, &
                                    Eigenval, nmo, homo, dimen_RI, unit_nr, calc_forces, calc_ex)
      REAL(kind=dp), INTENT(INOUT) :: emp2_cou, emp2_ex, emp2_s, emp2_t
      TYPE(three_dim_real_array), DIMENSION(:), &
         INTENT(INOUT) :: bib_c
      TYPE(mp2_type) :: mp2_env
      TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
      INTEGER, INTENT(IN) :: color_sub
      TYPE(group_dist_d1_type), INTENT(INOUT) :: gd_array
      INTEGER, DIMENSION(:), INTENT(IN) :: homo
      INTEGER, INTENT(IN) :: nmo
      REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: eigenval
      TYPE(group_dist_d1_type), DIMENSION(SIZE(homo)), &
         INTENT(INOUT) :: gd_b_virtual
      INTEGER, INTENT(IN) :: dimen_ri, unit_nr
      LOGICAL, INTENT(IN) :: calc_forces, calc_ex

      CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_gpw_compute_en'

      INTEGER :: a, a_global, b, b_global, block_size, decil, end_point, handle, handle2, handle3, &
         iib, ij_counter, ij_counter_send, ij_index, integ_group_size, ispin, jjb, jspin, &
         max_ij_pairs, my_block_size, my_group_l_end, my_group_l_size, my_group_l_size_orig, &
         my_group_l_start, my_i, my_ij_pairs, my_j, my_new_group_l_size, ngroup, nspins, &
         num_integ_group, proc_receive, proc_send, proc_shift, rec_b_size, rec_b_virtual_end, &
         rec_b_virtual_start, rec_l_size, send_b_size, send_b_virtual_end, send_b_virtual_start, &
         send_i, send_ij_index, send_j, start_point, tag, total_ij_pairs
      INTEGER, ALLOCATABLE, DIMENSION(:) :: integ_group_pos2color_sub, my_b_size, &
         my_b_virtual_end, my_b_virtual_start, num_ij_pairs, sizes_array_orig, virtual
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ij_map
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: ranges_info_array
      LOGICAL :: my_alpha_beta_case, my_beta_beta_case, &
                 my_open_shell_ss
      REAL(kind=dp) :: amp_fac, my_emp2_cou, my_emp2_ex, &
                       sym_fac, t_new, t_start
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:), TARGET :: buffer_1d
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :), &
         TARGET :: local_ab, local_ba, t_ab
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
         TARGET :: local_i_al, local_j_al, y_i_ap, y_j_ap
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         POINTER :: external_ab, external_i_al
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :, :), &
         POINTER :: bi_c_rec
      TYPE(dgemm_counter_type) :: dgemm_counter
      TYPE(mp_comm_type) :: comm_exchange, comm_rep
      TYPE(three_dim_real_array), ALLOCATABLE, &
         DIMENSION(:) :: b_ia_q

      CALL timeset(routinen, handle)

      nspins = SIZE(homo)

      ALLOCATE (virtual(nspins))
      virtual(:) = nmo - homo(:)

      ! local portion of the virtual index held by this rank, per spin
      ALLOCATE (my_b_size(nspins), my_b_virtual_start(nspins), my_b_virtual_end(nspins))
      DO ispin = 1, nspins
         CALL get_group_dist(gd_b_virtual(ispin), para_env_sub%mepos, &
                             my_b_virtual_start(ispin), my_b_virtual_end(ispin), my_b_size(ispin))
      END DO

      CALL get_group_dist(gd_array, color_sub, my_group_l_start, my_group_l_end, my_group_l_size)

      CALL dgemm_counter_init(dgemm_counter, unit_nr, mp2_env%ri_mp2%print_dgemm_info)

      ! local_gemm_ctx has a very large memory footprint the first time this routine is
      ! called.
      CALL mp2_env%local_gemm_ctx%create(local_gemm_pu_gpu)
      CALL mp2_env%local_gemm_ctx%set_op_threshold_gpu(128*128*128*2)

      CALL mp2_ri_get_integ_group_size( &
         mp2_env, para_env, para_env_sub, gd_array, gd_b_virtual, &
         homo, dimen_ri, unit_nr, &
         integ_group_size, ngroup, &
         num_integ_group, virtual, calc_forces)

      ! now create a group that contains all the proc that have the same virtual starting point
      ! in the integ group
      CALL mp2_ri_create_group( &
         para_env, para_env_sub, color_sub, &
         gd_array%sizes, calc_forces, &
         integ_group_size, my_group_l_end, &
         my_group_l_size, my_group_l_size_orig, my_group_l_start, my_new_group_l_size, &
         integ_group_pos2color_sub, sizes_array_orig, &
         ranges_info_array, comm_exchange, comm_rep, num_integ_group)

      ! We cannot fix the tag because of the recv routine
      tag = 42

      DO jspin = 1, nspins

         CALL replicate_iak_2intgroup(bib_c(jspin)%array, comm_exchange, comm_rep, &
                                      homo(jspin), gd_array%sizes, my_b_size(jspin), &
                                      my_group_l_size, ranges_info_array)

         DO ispin = 1, jspin

            IF (unit_nr > 0) THEN
               IF (nspins == 1) THEN
                  WRITE (unit_nr, *) "Start loop run"
               ELSE IF (ispin == 1 .AND. jspin == 1) THEN
                  WRITE (unit_nr, *) "Start loop run alpha-alpha"
               ELSE IF (ispin == 1 .AND. jspin == 2) THEN
                  WRITE (unit_nr, *) "Start loop run alpha-beta"
               ELSE IF (ispin == 2 .AND. jspin == 2) THEN
                  WRITE (unit_nr, *) "Start loop run beta-beta"
               END IF
               CALL m_flush(unit_nr)
            END IF

            my_open_shell_ss = (nspins == 2) .AND. (ispin == jspin)

            ! t_ab = amp_fac*(:,a|:,b)-(:,b|:,a)
            ! If we calculate the gradient we need to distinguish
            ! between alpha-alpha and beta-beta cases for UMP2

            my_beta_beta_case = .false.
            my_alpha_beta_case = .false.
            IF (ispin /= jspin) THEN
               my_alpha_beta_case = .true.
            ELSE IF (my_open_shell_ss) THEN
               IF (ispin == 2) my_beta_beta_case = .true.
            END IF

            amp_fac = mp2_env%scale_S + mp2_env%scale_T
            IF (my_alpha_beta_case .OR. my_open_shell_ss) amp_fac = mp2_env%scale_T

            CALL mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_b_size, &
                                        my_group_l_size, calc_forces, ispin, jspin, local_ba)

            CALL mp2_ri_get_block_size( &
               mp2_env, para_env, para_env_sub, gd_array, gd_b_virtual(ispin:jspin), &
               homo(ispin:jspin), virtual(ispin:jspin), dimen_ri, unit_nr, block_size, &
               ngroup, num_integ_group, my_open_shell_ss, calc_forces, buffer_1d)

            ! *****************************************************************
            ! **********  REPLICATION-BLOCKED COMMUNICATION SCHEME  ***********
            ! *****************************************************************
            ! introduce block size, the number of occupied orbitals has to be a
            ! multiple of the block size

            ! Calculate the maximum number of ij pairs that have to be computed
            ! among groups
            CALL mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo(ispin), homo(jspin), &
                                      block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_ss, unit_nr)

            ALLOCATE (num_ij_pairs(0:comm_exchange%num_pe - 1))
            CALL comm_exchange%allgather(my_ij_pairs, num_ij_pairs)

            max_ij_pairs = maxval(num_ij_pairs)

            ! start real stuff
            CALL mp2_ri_allocate_blk(dimen_ri, my_b_size, block_size, local_i_al, &
                                     local_j_al, calc_forces, y_i_ap, y_j_ap, ispin, jspin)

            CALL timeset(routinen//"_RI_loop", handle2)
            my_emp2_cou = 0.0_dp
            my_emp2_ex = 0.0_dp
            t_start = m_walltime()
            DO ij_index = 1, max_ij_pairs

               ! Prediction is unreliable if we are in the first step of the loop
               IF (unit_nr > 0 .AND. ij_index > 1) THEN
                  decil = ij_index*10/max_ij_pairs
                  IF (decil /= (ij_index - 1)*10/max_ij_pairs) THEN
                     t_new = m_walltime()
                     t_new = (t_new - t_start)/60.0_dp*(max_ij_pairs - ij_index + 1)/(ij_index - 1)
                     WRITE (unit_nr, fmt="(T3,A)") "Percentage of finished loop: "// &
                        cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
                     CALL m_flush(unit_nr)
                  END IF
               END IF

               IF (calc_forces) THEN
                  y_i_ap = 0.0_dp
                  y_j_ap = 0.0_dp
               END IF

               IF (ij_index <= my_ij_pairs) THEN
                  ! We have work to do
                  ij_counter = (ij_index - min(1, color_sub))*ngroup + color_sub
                  my_i = ij_map(1, ij_counter)
                  my_j = ij_map(2, ij_counter)
                  my_block_size = ij_map(3, ij_counter)

                  local_i_al = 0.0_dp
                  CALL fill_local_i_al(local_i_al(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
                                       bib_c(ispin)%array(:, :, my_i:my_i + my_block_size - 1))

                  local_j_al = 0.0_dp
                  CALL fill_local_i_al(local_j_al(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
                                       bib_c(jspin)%array(:, :, my_j:my_j + my_block_size - 1))

                  ! collect data from other proc
                  CALL timeset(routinen//"_comm", handle3)
                  DO proc_shift = 1, comm_exchange%num_pe - 1
                     proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
                     proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)

                     send_ij_index = num_ij_pairs(proc_send)

                     CALL get_group_dist(gd_array, proc_receive, sizes=rec_l_size)

                     IF (ij_index <= send_ij_index) THEN
                        ij_counter_send = (ij_index - min(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
                                          integ_group_pos2color_sub(proc_send)
                        send_i = ij_map(1, ij_counter_send)
                        send_j = ij_map(2, ij_counter_send)

                        ! occupied i
                        ! compute the buffer extent in int_8 arithmetic (as everywhere else in this
                        ! routine) to avoid 32-bit integer overflow for large cases
                        bi_c_rec(1:rec_l_size, 1:my_b_size(ispin), 1:my_block_size) => &
                           buffer_1d(1:int(rec_l_size, int_8)*my_b_size(ispin)*my_block_size)
                        bi_c_rec = 0.0_dp
                        CALL comm_exchange%sendrecv(bib_c(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
                                                    proc_send, bi_c_rec, proc_receive, tag)

                        CALL fill_local_i_al(local_i_al(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
                                             bi_c_rec(:, 1:my_b_size(ispin), :))

                        ! occupied j
                        bi_c_rec(1:rec_l_size, 1:my_b_size(jspin), 1:my_block_size) => &
                           buffer_1d(1:int(rec_l_size, int_8)*my_b_size(jspin)*my_block_size)
                        bi_c_rec = 0.0_dp
                        CALL comm_exchange%sendrecv(bib_c(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
                                                    proc_send, bi_c_rec, proc_receive, tag)

                        CALL fill_local_i_al(local_j_al(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
                                             bi_c_rec(:, 1:my_b_size(jspin), :))

                     ELSE
                        ! we send nothing while we know that we have to receive something

                        ! occupied i
                        bi_c_rec(1:rec_l_size, 1:my_b_size(ispin), 1:my_block_size) => &
                           buffer_1d(1:int(rec_l_size, int_8)*my_b_size(ispin)*my_block_size)
                        bi_c_rec = 0.0_dp
                        CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)

                        CALL fill_local_i_al(local_i_al(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
                                             bi_c_rec(:, 1:my_b_size(ispin), 1:my_block_size))

                        ! occupied j
                        bi_c_rec(1:rec_l_size, 1:my_b_size(jspin), 1:my_block_size) => &
                           buffer_1d(1:int(rec_l_size, int_8)*my_b_size(jspin)*my_block_size)
                        bi_c_rec = 0.0_dp
                        CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)

                        CALL fill_local_i_al(local_j_al(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
                                             bi_c_rec(:, 1:my_b_size(jspin), 1:my_block_size))

                     END IF

                  END DO

                  CALL timestop(handle3)

                  ! loop over the block elements
                  DO iib = 1, my_block_size
                     DO jjb = 1, my_block_size
                        CALL timeset(routinen//"_expansion", handle3)
                        ASSOCIATE (my_local_i_al => local_i_al(:, :, iib), my_local_j_al => local_j_al(:, :, jjb))

                           ! calculate the integrals (ia|jb) strating from my local data ...
                           local_ab = 0.0_dp
                           IF ((my_alpha_beta_case) .AND. (calc_forces)) THEN
                              local_ba = 0.0_dp
                           END IF
                           CALL dgemm_counter_start(dgemm_counter)
                           CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), my_b_size(jspin), dimen_ri, 1.0_dp, &
                                                            my_local_i_al, dimen_ri, my_local_j_al, dimen_ri, &
                                                            0.0_dp, local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), &
                                                            my_b_size(ispin))
                           ! Additional integrals only for alpha_beta case and forces
                           IF (my_alpha_beta_case .AND. calc_forces) THEN
                              local_ba(my_b_virtual_start(jspin):my_b_virtual_end(jspin), :) = &
                                 transpose(local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :))
                           END IF
                           ! ... and from the other of my subgroup
                           DO proc_shift = 1, para_env_sub%num_pe - 1
                              proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
                              proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

                              CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, &
                                                  rec_b_virtual_end, rec_b_size)

                              external_i_al(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, int_8)*rec_b_size)
                              external_i_al = 0.0_dp

                              CALL para_env_sub%sendrecv(my_local_i_al, proc_send, &
                                                         external_i_al, proc_receive, tag)

                              CALL mp2_env%local_gemm_ctx%gemm( &
                                 'T', 'N', rec_b_size, my_b_size(jspin), dimen_ri, 1.0_dp, &
                                 external_i_al, dimen_ri, my_local_j_al, dimen_ri, &
                                 0.0_dp, local_ab(rec_b_virtual_start:rec_b_virtual_end, 1:my_b_size(jspin)), rec_b_size)

                              ! Additional integrals only for alpha_beta case and forces
                              IF (my_alpha_beta_case .AND. calc_forces) THEN

                                 CALL get_group_dist(gd_b_virtual(jspin), proc_receive, rec_b_virtual_start, &
                                                     rec_b_virtual_end, rec_b_size)

                                 external_i_al(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, int_8)*rec_b_size)
                                 external_i_al = 0.0_dp

                                 CALL para_env_sub%sendrecv(my_local_j_al, proc_send, &
                                                            external_i_al, proc_receive, tag)

                                 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_b_size, my_b_size(ispin), dimen_ri, 1.0_dp, &
                                                                  external_i_al, dimen_ri, my_local_i_al, dimen_ri, &
                                                      0.0_dp, local_ba(rec_b_virtual_start:rec_b_virtual_end, 1:my_b_size(ispin)), &
                                                                  rec_b_size)
                              END IF
                           END DO
                           IF (my_alpha_beta_case .AND. calc_forces) THEN
                   ! Is just an approximation, but the call does not allow it, it ought to be (virtual_i*B_size_j+virtual_j*B_size_i)*dimen_RI
                              CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_b_size(ispin) + my_b_size(jspin), dimen_ri)
                           ELSE
                              CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_b_size(jspin), dimen_ri)
                           END IF
                           CALL timestop(handle3)

                           !sample peak memory
                           CALL m_memory()

                           CALL timeset(routinen//"_ener", handle3)
                           ! calculate coulomb only MP2
                           sym_fac = 2.0_dp
                           IF (my_i == my_j) sym_fac = 1.0_dp
                           IF (my_alpha_beta_case) sym_fac = 0.5_dp
                           DO b = 1, my_b_size(jspin)
                              b_global = b + my_b_virtual_start(jspin) - 1
                              DO a = 1, virtual(ispin)
                                 my_emp2_cou = my_emp2_cou - sym_fac*2.0_dp*local_ab(a, b)**2/ &
                                               (eigenval(homo(ispin) + a, ispin) + eigenval(homo(jspin) + b_global, jspin) - &
                                                eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, jspin))
                              END DO
                           END DO

                           IF (calc_ex) THEN
                              ! contract integrals with orbital energies for exchange MP2 energy
                              ! starting with local ...
                              IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) t_ab = 0.0_dp
                              DO b = 1, my_b_size(ispin)
                                 b_global = b + my_b_virtual_start(ispin) - 1
                                 DO a = 1, my_b_size(ispin)
                                    a_global = a + my_b_virtual_start(ispin) - 1
                                    my_emp2_ex = my_emp2_ex + sym_fac*local_ab(a_global, b)*local_ab(b_global, a)/ &
                                                 (eigenval(homo(ispin) + a_global, ispin) + eigenval(homo(ispin) + b_global, ispin) - &
                                                  eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, ispin))
                                    IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) THEN
                                       t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
                                                           (eigenval(homo(ispin) + a_global, ispin) + &
                                                            eigenval(homo(ispin) + b_global, ispin) - &
                                                            eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, ispin))
                                    END IF
                                 END DO
                              END DO
                              ! ... and then with external data
                              DO proc_shift = 1, para_env_sub%num_pe - 1
                                 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
                                 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

                                 CALL get_group_dist(gd_b_virtual(ispin), proc_receive, &
                                                     rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
                                 CALL get_group_dist(gd_b_virtual(ispin), proc_send, &
                                                     send_b_virtual_start, send_b_virtual_end, send_b_size)

                                 external_ab(1:my_b_size(ispin), 1:rec_b_size) => &
                                    buffer_1d(1:int(rec_b_size, int_8)*my_b_size(ispin))
                                 external_ab = 0.0_dp

                                 CALL para_env_sub%sendrecv(local_ab(send_b_virtual_start:send_b_virtual_end, 1:my_b_size(ispin)), &
                                                            proc_send, &
                                                            external_ab(1:my_b_size(ispin), 1:rec_b_size), proc_receive, tag)

                                 DO b = 1, my_b_size(ispin)
                                    b_global = b + my_b_virtual_start(ispin) - 1
                                    DO a = 1, rec_b_size
                                       a_global = a + rec_b_virtual_start - 1
                                       my_emp2_ex = my_emp2_ex + sym_fac*local_ab(a_global, b)*external_ab(b, a)/ &
                                                    (eigenval(homo(ispin) + a_global, ispin) + eigenval(homo(ispin) + b_global, ispin) - &
                                                     eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, ispin))
                                       IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) &
                                          t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
                                                              (eigenval(homo(ispin) + a_global, ispin) + &
                                                               eigenval(homo(ispin) + b_global, ispin) - &
                                                               eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, ispin))
                                    END DO
                                 END DO
                              END DO
                           END IF
                           CALL timestop(handle3)

                           IF (calc_forces) THEN
                              ! update P_ab, Gamma_P_ia
                              CALL mp2_update_p_gamma(mp2_env, para_env_sub, gd_b_virtual, &
                                                      eigenval, homo, dimen_ri, iib, jjb, my_b_size, &
                                                      my_b_virtual_end, my_b_virtual_start, my_i, my_j, virtual, &
                                                      local_ab, t_ab, my_local_i_al, my_local_j_al, &
                                                      my_open_shell_ss, y_i_ap(:, :, iib), y_j_ap(:, :, jjb), local_ba, &
                                                      ispin, jspin, dgemm_counter, buffer_1d)

                           END IF

                        END ASSOCIATE

                     END DO ! jjB
                  END DO ! iiB

               ELSE
                  ! We need it later in case of gradients
                  my_block_size = 1

                  CALL timeset(routinen//"_comm", handle3)
                  ! No work to do and we know that we have to receive nothing, but send something
                  ! send data to other proc
                  DO proc_shift = 1, comm_exchange%num_pe - 1
                     proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
                     proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)

                     send_ij_index = num_ij_pairs(proc_send)

                     IF (ij_index <= send_ij_index) THEN
                        ! something to send
                        ij_counter_send = (ij_index - min(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
                                          integ_group_pos2color_sub(proc_send)
                        send_i = ij_map(1, ij_counter_send)
                        send_j = ij_map(2, ij_counter_send)

                        ! occupied i
                        CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
                                                proc_send, tag)
                        ! occupied j
                        CALL comm_exchange%send(bib_c(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
                                                proc_send, tag)
                     END IF
                  END DO
                  CALL timestop(handle3)
               END IF

               ! redistribute gamma
               IF (calc_forces) THEN
                  CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(ispin)%array, ij_index, my_b_size(ispin), &
                                              my_block_size, my_group_l_size, my_i, my_ij_pairs, ngroup, &
                                              num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
                                              ij_map, ranges_info_array, y_i_ap(:, :, 1:my_block_size), comm_exchange, &
                                              gd_array%sizes, 1, buffer_1d)
                  CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(jspin)%array, ij_index, my_b_size(jspin), &
                                              my_block_size, my_group_l_size, my_j, my_ij_pairs, ngroup, &
                                              num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
                                              ij_map, ranges_info_array, y_j_ap(:, :, 1:my_block_size), comm_exchange, &
                                              gd_array%sizes, 2, buffer_1d)
               END IF

            END DO
            CALL timestop(handle2)

            DEALLOCATE (local_i_al)
            DEALLOCATE (local_j_al)
            DEALLOCATE (ij_map)
            DEALLOCATE (num_ij_pairs)
            DEALLOCATE (local_ab)

            IF (calc_forces) THEN
               DEALLOCATE (y_i_ap)
               DEALLOCATE (y_j_ap)
               IF (ALLOCATED(t_ab)) THEN
                  DEALLOCATE (t_ab)
               END IF
               DEALLOCATE (local_ba)

               ! here we check if there are almost degenerate ij
               ! pairs and we update P_ij with these contribution.
               ! If all pairs are degenerate with each other this step will scale O(N^6),
               ! if the number of degenerate pairs scales linearly with the system size
               ! this step will scale O(N^5).
               ! Start counting the number of almost degenerate ij pairs according
               ! to eps_canonical
               CALL quasi_degenerate_p_ij( &
                  mp2_env, eigenval(:, ispin:jspin), homo(ispin:jspin), virtual(ispin:jspin), my_open_shell_ss, &
                  my_beta_beta_case, bib_c(ispin:jspin), unit_nr, dimen_ri, &
                  my_b_size(ispin:jspin), ngroup, my_group_l_size, &
                  color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
                  my_b_virtual_start(ispin:jspin), my_b_virtual_end(ispin:jspin), gd_array%sizes, gd_b_virtual(ispin:jspin), &
                  integ_group_pos2color_sub, dgemm_counter, buffer_1d)

            END IF

            DEALLOCATE (buffer_1d)

            ! Dereplicate BIb_C and Gamma_P_ia to save memory
            ! These matrices will not be needed in that fashion anymore
            ! B_ia_Q will needed later
            IF (calc_forces .AND. jspin == nspins) THEN
               IF (.NOT. ALLOCATED(b_ia_q)) ALLOCATE (b_ia_q(nspins))
               ALLOCATE (b_ia_q(ispin)%array(homo(ispin), my_b_size(ispin), my_group_l_size_orig))
               b_ia_q(ispin)%array = 0.0_dp
               DO jjb = 1, homo(ispin)
                  DO iib = 1, my_b_size(ispin)
                     b_ia_q(ispin)%array(jjb, iib, 1:my_group_l_size_orig) = &
                        bib_c(ispin)%array(1:my_group_l_size_orig, iib, jjb)
                  END DO
               END DO
               DEALLOCATE (bib_c(ispin)%array)

               ! sum Gamma and dereplicate
               ALLOCATE (bib_c(ispin)%array(my_b_size(ispin), homo(ispin), my_group_l_size_orig))
               DO proc_shift = 1, comm_rep%num_pe - 1
                  ! invert order
                  proc_send = modulo(comm_rep%mepos - proc_shift, comm_rep%num_pe)
                  proc_receive = modulo(comm_rep%mepos + proc_shift, comm_rep%num_pe)

                  start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
                  end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)

                  CALL comm_rep%sendrecv(mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, start_point:end_point), &
                                         proc_send, bib_c(ispin)%array, proc_receive, tag)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
!$OMP SHARED(mp2_env,BIb_C,ispin,homo,my_B_size,my_group_L_size_orig)
                  mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_l_size_orig) = &
                     mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_l_size_orig) &
                     + bib_c(ispin)%array(:, :, :)
!$OMP END PARALLEL WORKSHARE
               END DO

               bib_c(ispin)%array(:, :, :) = mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_l_size_orig)
               DEALLOCATE (mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
               CALL move_alloc(bib_c(ispin)%array, mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
            ELSE IF (jspin == nspins) THEN
               DEALLOCATE (bib_c(ispin)%array)
            END IF

            CALL para_env%sum(my_emp2_cou)
            CALL para_env%sum(my_emp2_ex)

            ! accumulate the spin-resolved components of the correlation energy
            IF (my_open_shell_ss .OR. my_alpha_beta_case) THEN
               IF (my_alpha_beta_case) THEN
                  emp2_s = emp2_s + my_emp2_cou
                  emp2_cou = emp2_cou + my_emp2_cou
               ELSE
                  my_emp2_cou = my_emp2_cou*0.25_dp
                  my_emp2_ex = my_emp2_ex*0.5_dp
                  emp2_t = emp2_t + my_emp2_cou + my_emp2_ex
                  emp2_cou = emp2_cou + my_emp2_cou
                  emp2_ex = emp2_ex + my_emp2_ex
               END IF
            ELSE
               emp2_cou = emp2_cou + my_emp2_cou
               emp2_ex = emp2_ex + my_emp2_ex
            END IF
         END DO

      END DO

      DEALLOCATE (integ_group_pos2color_sub)
      DEALLOCATE (ranges_info_array)

      CALL comm_exchange%free()
      CALL comm_rep%free()

      IF (calc_forces) THEN
         ! recover original information (before replication)
         DEALLOCATE (gd_array%sizes)
         iib = SIZE(sizes_array_orig)
         ALLOCATE (gd_array%sizes(0:iib - 1))
         gd_array%sizes(:) = sizes_array_orig
         DEALLOCATE (sizes_array_orig)

         ! Remove replication from BIb_C and reorder the matrix
         my_group_l_size = my_group_l_size_orig

         ! B_ia_Q(ispin)%array will be deallocated inside of complete_gamma
         DO ispin = 1, nspins
            CALL complete_gamma(mp2_env, b_ia_q(ispin)%array, dimen_ri, homo(ispin), &
                                virtual(ispin), para_env, para_env_sub, ngroup, &
                                my_group_l_size, my_group_l_start, my_group_l_end, &
                                my_b_size(ispin), my_b_virtual_start(ispin), &
                                gd_array, gd_b_virtual(ispin), &
                                ispin)
         END DO
         DEALLOCATE (b_ia_q)

         IF (nspins == 1) mp2_env%ri_grad%P_ab(1)%array(:, :) = mp2_env%ri_grad%P_ab(1)%array(:, :)*2.0_dp
         BLOCK
            TYPE(mp_comm_type) :: comm
            CALL comm%from_split(para_env, para_env_sub%mepos)
            DO ispin = 1, nspins
               ! P_ab is only replicated over all subgroups
               CALL comm%sum(mp2_env%ri_grad%P_ab(ispin)%array)
               ! P_ij is replicated over all processes
               CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
            END DO
            CALL comm%free()
         END BLOCK
      END IF

      CALL release_group_dist(gd_array)
      DO ispin = 1, nspins
         IF (ALLOCATED(bib_c(ispin)%array)) DEALLOCATE (bib_c(ispin)%array)
         CALL release_group_dist(gd_b_virtual(ispin))
      END DO

      ! We do not need this matrix later, so deallocate it here to safe memory
      IF (calc_forces) DEALLOCATE (mp2_env%ri_grad%PQ_half)
      IF (calc_forces .AND. .NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) &
         DEALLOCATE (mp2_env%ri_grad%operator_half)

      CALL dgemm_counter_write(dgemm_counter, para_env)

      ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
      CALL mp2_env%local_gemm_ctx%destroy()
      CALL timestop(handle)

   END SUBROUTINE mp2_ri_gpw_compute_en
686
! **************************************************************************************************
!> \brief Copies a (possibly remote) chunk of three-index integrals into the replicated local
!>        array, remapping each replication range of the RI index.
!> \param local_i_aL destination array; first dimension is the replicated RI index
!> \param ranges_info_array per-range index info: rows 1/2 give the destination range in
!>        local_i_aL, rows 3/4 the source range in BIb_C_rec
!> \param BIb_C_rec source chunk of three-index integrals
! **************************************************************************************************
   SUBROUTINE fill_local_i_al(local_i_aL, ranges_info_array, BIb_C_rec)
      REAL(kind=dp), DIMENSION(:, :, :), INTENT(INOUT) :: local_i_al
      INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
      REAL(kind=dp), DIMENSION(:, :, :), INTENT(IN) :: bib_c_rec

      CHARACTER(LEN=*), PARAMETER :: routinen = 'fill_local_i_aL'

      INTEGER :: handle, rep, dst_first, dst_last, &
                 src_first, src_last

      CALL timeset(routinen, handle)

      ! one array copy per replication range
      DO rep = 1, SIZE(ranges_info_array, 2)
         dst_first = ranges_info_array(1, rep)
         dst_last = ranges_info_array(2, rep)
         src_first = ranges_info_array(3, rep)
         src_last = ranges_info_array(4, rep)

!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
!$OMP SHARED(BIb_C_rec,local_i_aL,dst_first,dst_last,src_first,src_last)
         local_i_al(dst_first:dst_last, :, :) = bib_c_rec(src_first:src_last, :, :)
!$OMP END PARALLEL WORKSHARE
      END DO

      CALL timestop(handle)

   END SUBROUTINE fill_local_i_al
720
! **************************************************************************************************
!> \brief Two-dimensional variant of fill_local_i_aL: copies a chunk of integrals into the
!>        replicated local array, remapping each replication range of the RI index.
!> \param local_i_aL destination array; first dimension is the replicated RI index
!> \param ranges_info_array per-range index info: rows 1/2 give the destination range in
!>        local_i_aL, rows 3/4 the source range in BIb_C_rec
!> \param BIb_C_rec source chunk of integrals
! **************************************************************************************************
   SUBROUTINE fill_local_i_al_2d(local_i_aL, ranges_info_array, BIb_C_rec)
      REAL(kind=dp), DIMENSION(:, :), INTENT(INOUT) :: local_i_al
      INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
      REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: bib_c_rec

      CHARACTER(LEN=*), PARAMETER :: routinen = 'fill_local_i_aL_2D'

      INTEGER :: handle, rep, dst_first, dst_last, &
                 src_first, src_last

      CALL timeset(routinen, handle)

      ! one array copy per replication range
      DO rep = 1, SIZE(ranges_info_array, 2)
         dst_first = ranges_info_array(1, rep)
         dst_last = ranges_info_array(2, rep)
         src_first = ranges_info_array(3, rep)
         src_last = ranges_info_array(4, rep)

!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
!$OMP SHARED(BIb_C_rec,local_i_aL,dst_first,dst_last,src_first,src_last)
         local_i_al(dst_first:dst_last, :) = bib_c_rec(src_first:src_last, :)
!$OMP END PARALLEL WORKSHARE
      END DO

      CALL timestop(handle)

   END SUBROUTINE fill_local_i_al_2d
754
! **************************************************************************************************
!> \brief Replicates the local slab of three-index integrals across the replication group and
!>        reorders the gathered slabs into the replicated L (RI) index layout. BIb_C is
!>        reallocated in place from (local L, B, homo) to (replicated L, B, homo).
!> \param BIb_C three-index integral slab, replaced by its replicated/reordered version
!> \param comm_exchange communicator of the exchange group (indexes ranges_info_array)
!> \param comm_rep communicator over which the slabs are gathered
!> \param homo number of occupied orbitals
!> \param sizes_array L-block sizes of all subgroups
!> \param my_B_size local size of the virtual index block
!> \param my_group_L_size size of the replicated L range owned by this group
!> \param ranges_info_array per-step index ranges; rows 3/4 give the destination range in BIb_C
! **************************************************************************************************
   SUBROUTINE replicate_iak_2intgroup(BIb_C, comm_exchange, comm_rep, homo, sizes_array, my_B_size, &
                                      my_group_L_size, ranges_info_array)
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
         INTENT(INOUT) :: bib_c
      TYPE(mp_comm_type), INTENT(IN) :: comm_exchange, comm_rep
      INTEGER, INTENT(IN) :: homo
      INTEGER, DIMENSION(:), INTENT(IN) :: sizes_array
      INTEGER, INTENT(IN) :: my_b_size, my_group_l_size
      INTEGER, DIMENSION(:, 0:, 0:), INTENT(IN) :: ranges_info_array

      CHARACTER(LEN=*), PARAMETER :: routinen = 'replicate_iaK_2intgroup'

      INTEGER :: handle, lmax, shift, src_proc, lo, hi
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: padded
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: gathered

      CALL timeset(routinen, handle)

      ! replication via mpi_allgather: pad every local slab to the largest L block so that
      ! all ranks contribute equally sized messages
      lmax = maxval(sizes_array)

      ALLOCATE (padded(lmax, my_b_size, homo))
      padded = 0.0_dp
      padded(1:SIZE(bib_c, 1), 1:my_b_size, 1:homo) = bib_c

      DEALLOCATE (bib_c)

      ALLOCATE (gathered(lmax, my_b_size, homo, 0:comm_rep%num_pe - 1))
      gathered = 0.0_dp

      CALL comm_rep%allgather(padded, gathered)

      DEALLOCATE (padded)

      ALLOCATE (bib_c(my_group_l_size, my_b_size, homo))
      bib_c = 0.0_dp

      ! place every gathered slab at its position inside the replicated L range
      DO shift = 0, comm_rep%num_pe - 1
         src_proc = modulo(comm_rep%mepos - shift, comm_rep%num_pe)

         lo = ranges_info_array(3, shift, comm_exchange%mepos)
         hi = ranges_info_array(4, shift, comm_exchange%mepos)

         bib_c(lo:hi, 1:my_b_size, 1:homo) = &
            gathered(1:hi - lo + 1, 1:my_b_size, 1:homo, src_proc)

      END DO

      DEALLOCATE (gathered)

      CALL timestop(handle)

   END SUBROUTINE replicate_iak_2intgroup
822
823! **************************************************************************************************
824!> \brief ...
825!> \param local_ab ...
826!> \param t_ab ...
827!> \param mp2_env ...
828!> \param homo ...
829!> \param virtual ...
830!> \param my_B_size ...
831!> \param my_group_L_size ...
832!> \param calc_forces ...
833!> \param ispin ...
834!> \param jspin ...
835!> \param local_ba ...
836! **************************************************************************************************
837 SUBROUTINE mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
838 my_group_L_size, calc_forces, ispin, jspin, local_ba)
839 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :), &
840 INTENT(OUT) :: local_ab, t_ab
841 TYPE(mp2_type) :: mp2_env
842 INTEGER, INTENT(IN) :: homo(2), virtual(2), my_b_size(2), &
843 my_group_l_size
844 LOGICAL, INTENT(IN) :: calc_forces
845 INTEGER, INTENT(IN) :: ispin, jspin
846 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :), &
847 INTENT(OUT) :: local_ba
848
849 CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_allocate_no_blk'
850
851 INTEGER :: handle
852
853 CALL timeset(routinen, handle)
854
855 ALLOCATE (local_ab(virtual(ispin), my_b_size(jspin)))
856 local_ab = 0.0_dp
857
858 IF (calc_forces) THEN
859 IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ij(jspin)%array)) THEN
860 ALLOCATE (mp2_env%ri_grad%P_ij(jspin)%array(homo(ispin), homo(ispin)))
861 mp2_env%ri_grad%P_ij(jspin)%array = 0.0_dp
862 END IF
863 IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ab(jspin)%array)) THEN
864 ALLOCATE (mp2_env%ri_grad%P_ab(jspin)%array(my_b_size(jspin), virtual(jspin)))
865 mp2_env%ri_grad%P_ab(jspin)%array = 0.0_dp
866 END IF
867 IF (.NOT. ALLOCATED(mp2_env%ri_grad%Gamma_P_ia(jspin)%array)) THEN
868 ALLOCATE (mp2_env%ri_grad%Gamma_P_ia(jspin)%array(my_b_size(jspin), homo(jspin), my_group_l_size))
869 mp2_env%ri_grad%Gamma_P_ia(jspin)%array = 0.0_dp
870 END IF
871
872 IF (ispin == jspin) THEN
873 ! For non-alpha-beta case we need amplitudes
874 ALLOCATE (t_ab(virtual(ispin), my_b_size(jspin)))
875
876 ! That is just a dummy. In that way, we can pass it as array to other routines w/o requirement for allocatable array
877 ALLOCATE (local_ba(1, 1))
878 ELSE
879 ! We need more integrals
880 ALLOCATE (local_ba(virtual(jspin), my_b_size(ispin)))
881 END IF
882 END IF
883 !
884
885 CALL timestop(handle)
886
887 END SUBROUTINE mp2_ri_allocate_no_blk
888
889! **************************************************************************************************
890!> \brief ...
891!> \param dimen_RI ...
892!> \param my_B_size ...
893!> \param block_size ...
894!> \param local_i_aL ...
895!> \param local_j_aL ...
896!> \param calc_forces ...
897!> \param Y_i_aP ...
898!> \param Y_j_aP ...
899!> \param ispin ...
900!> \param jspin ...
901! **************************************************************************************************
902 SUBROUTINE mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, &
903 local_i_aL, local_j_aL, calc_forces, &
904 Y_i_aP, Y_j_aP, ispin, jspin)
905 INTEGER, INTENT(IN) :: dimen_ri, my_b_size(2), block_size
906 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
907 INTENT(OUT) :: local_i_al, local_j_al
908 LOGICAL, INTENT(IN) :: calc_forces
909 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
910 INTENT(OUT) :: y_i_ap, y_j_ap
911 INTEGER, INTENT(IN) :: ispin, jspin
912
913 CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_allocate_blk'
914
915 INTEGER :: handle
916
917 CALL timeset(routinen, handle)
918
919 ALLOCATE (local_i_al(dimen_ri, my_b_size(ispin), block_size))
920 local_i_al = 0.0_dp
921 ALLOCATE (local_j_al(dimen_ri, my_b_size(jspin), block_size))
922 local_j_al = 0.0_dp
923
924 IF (calc_forces) THEN
925 ALLOCATE (y_i_ap(my_b_size(ispin), dimen_ri, block_size))
926 y_i_ap = 0.0_dp
927 ! For closed-shell, alpha-alpha and beta-beta my_B_size_beta=my_b_size
928 ! Not for alpha-beta case: Y_j_aP_beta is sent and received as Y_j_aP
929 ALLOCATE (y_j_ap(my_b_size(jspin), dimen_ri, block_size))
930 y_j_ap = 0.0_dp
931 END IF
932 !
933
934 CALL timestop(handle)
935
936 END SUBROUTINE mp2_ri_allocate_blk
937
938! **************************************************************************************************
939!> \brief ...
940!> \param my_alpha_beta_case ...
941!> \param total_ij_pairs ...
942!> \param homo ...
943!> \param homo_beta ...
944!> \param block_size ...
945!> \param ngroup ...
946!> \param ij_map ...
947!> \param color_sub ...
948!> \param my_ij_pairs ...
949!> \param my_open_shell_SS ...
950!> \param unit_nr ...
951! **************************************************************************************************
   SUBROUTINE mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo, homo_beta, &
                                   block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
      LOGICAL, INTENT(IN)                                :: my_alpha_beta_case
      INTEGER, INTENT(OUT)                               :: total_ij_pairs
      INTEGER, INTENT(IN)                                :: homo, homo_beta, block_size, ngroup
      INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ij_map
      INTEGER, INTENT(IN)                                :: color_sub
      INTEGER, INTENT(OUT)                               :: my_ij_pairs
      LOGICAL, INTENT(IN)                                :: my_open_shell_SS
      INTEGER, INTENT(IN)                                :: unit_nr

      CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_communication'

      INTEGER :: assigned_blocks, first_i_block, first_j_block, handle, iiB, ij_block_counter, &
         ij_counter, jjB, last_i_block, last_j_block, num_block_per_group, num_ij_blocks, &
         num_ij_blocks_beta, total_ij_block, total_ij_pairs_blocks
      LOGICAL, ALLOCATABLE, DIMENSION(:, :)              :: ij_marker

! Calculate the maximum number of ij pairs that have to be computed
! among groups

      CALL timeset(routineN, handle)

      ! The three branches below share the same strategy:
      ! 1) tile the relevant (i,j) index set with block_size x block_size blocks,
      !    keeping only a number of blocks divisible by ngroup (assigned_blocks)
      !    so that the blocks distribute evenly over the subgroups;
      ! 2) record every assigned block as one entry in ij_map
      !    (ij_map(1:2,:) = block start indices, ij_map(3,:) = block edge length);
      ! 3) sweep the full pair set once more and append every pair not covered by
      !    an assigned block as a single entry with block length 1.
      ! Entries are assigned round-robin over the subgroups via
      ! MOD(ij_counter, ngroup) == color_sub; my_ij_pairs counts this group's share.
      IF (.NOT. my_open_shell_SS .AND. .NOT. my_alpha_beta_case) THEN
         ! Closed-shell case: pairs with i <= j, diagonal included
         total_ij_pairs = homo*(1 + homo)/2
         num_ij_blocks = homo/block_size - 1

         first_i_block = 1
         last_i_block = block_size*(num_ij_blocks - 1)

         first_j_block = block_size + 1
         last_j_block = block_size*(num_ij_blocks + 1)

         ! count the blocks in the strictly upper block triangle
         ij_block_counter = 0
         DO iiB = first_i_block, last_i_block, block_size
            DO jjB = iiB + block_size, last_j_block, block_size
               ij_block_counter = ij_block_counter + 1
            END DO
         END DO

         total_ij_block = ij_block_counter
         num_block_per_group = total_ij_block/ngroup
         assigned_blocks = num_block_per_group*ngroup

         ! one map entry per assigned block plus one per leftover single pair
         total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))

         ALLOCATE (ij_marker(homo, homo))
         ij_marker = .TRUE.
         ALLOCATE (ij_map(3, total_ij_pairs_blocks))
         ij_map = 0
         ij_counter = 0
         my_ij_pairs = 0
         DO iiB = first_i_block, last_i_block, block_size
            DO jjB = iiB + block_size, last_j_block, block_size
               ! stop once the group-divisible quota of full blocks is filled
               ! (EXIT leaves only the inner loop; later outer iterations re-test and skip)
               IF (ij_counter + 1 > assigned_blocks) EXIT
               ij_counter = ij_counter + 1
               ! mark the pairs covered by this block as done
               ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
               ij_map(1, ij_counter) = iiB
               ij_map(2, ij_counter) = jjB
               ij_map(3, ij_counter) = block_size
               IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
            END DO
         END DO
         ! remaining (uncovered) pairs are communicated with block size 1
         DO iiB = 1, homo
            DO jjB = iiB, homo
               IF (ij_marker(iiB, jjB)) THEN
                  ij_counter = ij_counter + 1
                  ij_map(1, ij_counter) = iiB
                  ij_map(2, ij_counter) = jjB
                  ij_map(3, ij_counter) = 1
                  IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
               END IF
            END DO
         END DO
         DEALLOCATE (ij_marker)

      ELSE IF (.NOT. my_alpha_beta_case) THEN
         ! These are the cases alpha/alpha and beta/beta
         ! We do not have to consider the diagonal elements
         total_ij_pairs = homo*(homo - 1)/2
         num_ij_blocks = (homo - 1)/block_size - 1

         first_i_block = 1
         last_i_block = block_size*(num_ij_blocks - 1)

         ! We shift the blocks to prevent the calculation of the diagonal elements which always give zero
         first_j_block = block_size + 2
         last_j_block = block_size*(num_ij_blocks + 1) + 1

         ij_block_counter = 0
         DO iiB = first_i_block, last_i_block, block_size
            DO jjB = iiB + block_size + 1, last_j_block, block_size
               ij_block_counter = ij_block_counter + 1
            END DO
         END DO

         total_ij_block = ij_block_counter
         num_block_per_group = total_ij_block/ngroup
         assigned_blocks = num_block_per_group*ngroup

         total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))

         ALLOCATE (ij_marker(homo, homo))
         ij_marker = .TRUE.
         ALLOCATE (ij_map(3, total_ij_pairs_blocks))
         ij_map = 0
         ij_counter = 0
         my_ij_pairs = 0
         DO iiB = first_i_block, last_i_block, block_size
            DO jjB = iiB + block_size + 1, last_j_block, block_size
               IF (ij_counter + 1 > assigned_blocks) EXIT
               ij_counter = ij_counter + 1
               ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
               ij_map(1, ij_counter) = iiB
               ij_map(2, ij_counter) = jjB
               ij_map(3, ij_counter) = block_size
               IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
            END DO
         END DO
         ! leftover strictly-upper pairs (j > i) with block size 1
         DO iiB = 1, homo
            DO jjB = iiB + 1, homo
               IF (ij_marker(iiB, jjB)) THEN
                  ij_counter = ij_counter + 1
                  ij_map(1, ij_counter) = iiB
                  ij_map(2, ij_counter) = jjB
                  ij_map(3, ij_counter) = 1
                  IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
               END IF
            END DO
         END DO
         DEALLOCATE (ij_marker)

      ELSE
         ! Alpha-beta case: the full rectangular i(alpha) x j(beta) pair set
         total_ij_pairs = homo*homo_beta
         num_ij_blocks = homo/block_size
         num_ij_blocks_beta = homo_beta/block_size

         first_i_block = 1
         last_i_block = block_size*(num_ij_blocks - 1)

         first_j_block = 1
         last_j_block = block_size*(num_ij_blocks_beta - 1)

         ij_block_counter = 0
         DO iiB = first_i_block, last_i_block, block_size
            DO jjB = first_j_block, last_j_block, block_size
               ij_block_counter = ij_block_counter + 1
            END DO
         END DO

         total_ij_block = ij_block_counter
         num_block_per_group = total_ij_block/ngroup
         assigned_blocks = num_block_per_group*ngroup

         total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))

         ALLOCATE (ij_marker(homo, homo_beta))
         ij_marker = .TRUE.
         ALLOCATE (ij_map(3, total_ij_pairs_blocks))
         ij_map = 0
         ij_counter = 0
         my_ij_pairs = 0
         DO iiB = first_i_block, last_i_block, block_size
            DO jjB = first_j_block, last_j_block, block_size
               IF (ij_counter + 1 > assigned_blocks) EXIT
               ij_counter = ij_counter + 1
               ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
               ij_map(1, ij_counter) = iiB
               ij_map(2, ij_counter) = jjB
               ij_map(3, ij_counter) = block_size
               IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
            END DO
         END DO
         ! leftover rectangular pairs with block size 1
         DO iiB = 1, homo
            DO jjB = 1, homo_beta
               IF (ij_marker(iiB, jjB)) THEN
                  ij_counter = ij_counter + 1
                  ij_map(1, ij_counter) = iiB
                  ij_map(2, ij_counter) = jjB
                  ij_map(3, ij_counter) = 1
                  IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
               END IF
            END DO
         END DO
         DEALLOCATE (ij_marker)
      END IF

      ! report which fraction of the pairs could not be blocked
      IF (unit_nr > 0) THEN
         IF (block_size == 1) THEN
            WRITE (unit=unit_nr, fmt="(T3,A,T66,F15.1)") &
               "RI_INFO| Percentage of ij pairs communicated with block size 1:", 100.0_dp
         ELSE
            WRITE (unit=unit_nr, fmt="(T3,A,T66,F15.1)") &
               "RI_INFO| Percentage of ij pairs communicated with block size 1:", &
               100.0_dp*REAL((total_ij_pairs - assigned_blocks*(block_size**2)), KIND=dp)/REAL(total_ij_pairs, KIND=dp)
         END IF
         CALL m_flush(unit_nr)
      END IF

      CALL timestop(handle)

   END SUBROUTINE mp2_ri_communication
1154
1155! **************************************************************************************************
1156!> \brief ...
1157!> \param para_env ...
1158!> \param para_env_sub ...
1159!> \param color_sub ...
1160!> \param sizes_array ...
1161!> \param calc_forces ...
1162!> \param integ_group_size ...
1163!> \param my_group_L_end ...
1164!> \param my_group_L_size ...
1165!> \param my_group_L_size_orig ...
1166!> \param my_group_L_start ...
1167!> \param my_new_group_L_size ...
1168!> \param integ_group_pos2color_sub ...
1169!> \param sizes_array_orig ...
1170!> \param ranges_info_array ...
1171!> \param comm_exchange ...
1172!> \param comm_rep ...
1173!> \param num_integ_group ...
1174! **************************************************************************************************
   SUBROUTINE mp2_ri_create_group(para_env, para_env_sub, color_sub, &
                                  sizes_array, calc_forces, &
                                  integ_group_size, my_group_L_end, &
                                  my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
                                  integ_group_pos2color_sub, &
                                  sizes_array_orig, ranges_info_array, comm_exchange, comm_rep, num_integ_group)
      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
      INTEGER, INTENT(IN)                                :: color_sub
      INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(INOUT)  :: sizes_array
      LOGICAL, INTENT(IN)                                :: calc_forces
      INTEGER, INTENT(IN)                                :: integ_group_size, my_group_L_end
      INTEGER, INTENT(INOUT)                             :: my_group_L_size
      INTEGER, INTENT(OUT)                               :: my_group_L_size_orig
      INTEGER, INTENT(IN)                                :: my_group_L_start
      INTEGER, INTENT(INOUT)                             :: my_new_group_L_size
      INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: integ_group_pos2color_sub, &
                                                            sizes_array_orig
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
         INTENT(OUT)                                     :: ranges_info_array
      TYPE(mp_comm_type), INTENT(OUT)                    :: comm_exchange, comm_rep
      INTEGER, INTENT(IN)                                :: num_integ_group

      CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_create_group'

      INTEGER :: handle, iiB, proc_receive, proc_shift, &
         sub_sub_color
      INTEGER, ALLOCATABLE, DIMENSION(:) :: new_sizes_array, rep_ends_array, &
         rep_sizes_array, rep_starts_array
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: my_info

      CALL timeset(routineN, handle)
      !
      ! exchange group: ranks with the same position inside their subgroup,
      ! taken across the integration groups
      sub_sub_color = para_env_sub%mepos*num_integ_group + color_sub/integ_group_size
      CALL comm_exchange%from_split(para_env, sub_sub_color)

      ! create the replication group
      sub_sub_color = para_env_sub%mepos*comm_exchange%num_pe + comm_exchange%mepos
      CALL comm_rep%from_split(para_env, sub_sub_color)

      ! create the new limits for K according to the size
      ! of the integral group

      ! info array for replication
      ALLOCATE (rep_ends_array(0:comm_rep%num_pe - 1))
      ALLOCATE (rep_starts_array(0:comm_rep%num_pe - 1))
      ALLOCATE (rep_sizes_array(0:comm_rep%num_pe - 1))

      CALL comm_rep%allgather(my_group_L_size, rep_sizes_array)
      CALL comm_rep%allgather(my_group_L_start, rep_starts_array)
      CALL comm_rep%allgather(my_group_L_end, rep_ends_array)

      ! calculate my_new_group_L_size according to sizes_array
      my_new_group_L_size = my_group_L_size

      ! Info of this process:
      !   my_info(1,k)/my_info(2,k): global L start/end of the k-th received slab
      !   my_info(3,k)/my_info(4,k): local start/end of that slab inside the
      !                              replicated array (running offsets)
      ! Slab k = 0 is this process' own range; slab k > 0 comes from the rank
      ! shifted by k inside the replication group (same order as in
      ! replicate_iak_2intgroup).
      ALLOCATE (my_info(4, 0:comm_rep%num_pe - 1))
      my_info(1, 0) = my_group_L_start
      my_info(2, 0) = my_group_L_end
      my_info(3, 0) = 1
      my_info(4, 0) = my_group_L_size

      DO proc_shift = 1, comm_rep%num_pe - 1
         proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)

         my_new_group_L_size = my_new_group_L_size + rep_sizes_array(proc_receive)

         my_info(1, proc_shift) = rep_starts_array(proc_receive)
         my_info(2, proc_shift) = rep_ends_array(proc_receive)
         my_info(3, proc_shift) = my_info(4, proc_shift - 1) + 1
         my_info(4, proc_shift) = my_new_group_L_size

      END DO

      ! collect all my_info tables over the exchange group:
      ! ranges_info_array(:, shift, exchange_rank)
      ALLOCATE (new_sizes_array(0:comm_exchange%num_pe - 1))
      ALLOCATE (ranges_info_array(4, 0:comm_rep%num_pe - 1, 0:comm_exchange%num_pe - 1))
      CALL comm_exchange%allgather(my_new_group_L_size, new_sizes_array)
      CALL comm_exchange%allgather(my_info, ranges_info_array)

      DEALLOCATE (rep_sizes_array)
      DEALLOCATE (rep_starts_array)
      DEALLOCATE (rep_ends_array)

      ! map exchange-group position -> subgroup color
      ALLOCATE (integ_group_pos2color_sub(0:comm_exchange%num_pe - 1))
      CALL comm_exchange%allgather(color_sub, integ_group_pos2color_sub)

      ! keep a copy of the pre-replication sizes (needed later for the gradients)
      IF (calc_forces) THEN
         iiB = SIZE(sizes_array)
         ALLOCATE (sizes_array_orig(0:iiB - 1))
         sizes_array_orig(:) = sizes_array
      END IF

      ! from here on, sizes refer to the replicated (integration-group) layout
      my_group_L_size_orig = my_group_L_size
      my_group_L_size = my_new_group_L_size
      DEALLOCATE (sizes_array)

      ALLOCATE (sizes_array(0:integ_group_size - 1))
      sizes_array(:) = new_sizes_array

      DEALLOCATE (new_sizes_array)
      !
      CALL timestop(handle)

   END SUBROUTINE mp2_ri_create_group
1278
1279! **************************************************************************************************
1280!> \brief ...
1281!> \param mp2_env ...
1282!> \param para_env ...
1283!> \param para_env_sub ...
1284!> \param gd_array ...
1285!> \param gd_B_virtual ...
1286!> \param homo ...
1287!> \param dimen_RI ...
1288!> \param unit_nr ...
1289!> \param integ_group_size ...
1290!> \param ngroup ...
1291!> \param num_integ_group ...
1292!> \param virtual ...
1293!> \param calc_forces ...
1294! **************************************************************************************************
1295 SUBROUTINE mp2_ri_get_integ_group_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
1296 homo, dimen_RI, unit_nr, &
1297 integ_group_size, &
1298 ngroup, num_integ_group, &
1299 virtual, calc_forces)
1300 TYPE(mp2_type) :: mp2_env
1301 TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1302 TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
1303 TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_b_virtual
1304 INTEGER, DIMENSION(:), INTENT(IN) :: homo
1305 INTEGER, INTENT(IN) :: dimen_ri, unit_nr
1306 INTEGER, INTENT(OUT) :: integ_group_size, ngroup, num_integ_group
1307 INTEGER, DIMENSION(:), INTENT(IN) :: virtual
1308 LOGICAL, INTENT(IN) :: calc_forces
1309
1310 CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_get_integ_group_size'
1311
1312 INTEGER :: block_size, handle, iib, &
1313 max_repl_group_size, &
1314 min_integ_group_size
1315 INTEGER(KIND=int_8) :: mem
1316 LOGICAL :: calc_group_size
1317 REAL(kind=dp) :: factor, mem_base, mem_min, mem_per_blk, &
1318 mem_per_repl, mem_per_repl_blk, &
1319 mem_real
1320
1321 CALL timeset(routinen, handle)
1322
1323 ngroup = para_env%num_pe/para_env_sub%num_pe
1324
1325 calc_group_size = mp2_env%ri_mp2%number_integration_groups <= 0
1326 IF (.NOT. calc_group_size) THEN
1327 IF (mod(ngroup, mp2_env%ri_mp2%number_integration_groups) /= 0) calc_group_size = .true.
1328 END IF
1329
1330 IF (calc_group_size) THEN
1331 CALL m_memory(mem)
1332 mem_real = (mem + 1024*1024 - 1)/(1024*1024)
1333 CALL para_env%min(mem_real)
1334
1335 mem_base = 0.0_dp
1336 mem_per_blk = 0.0_dp
1337 mem_per_repl = 0.0_dp
1338 mem_per_repl_blk = 0.0_dp
1339
1340 ! BIB_C_copy
1341 mem_per_repl = mem_per_repl + maxval(max(real(homo, kind=dp)*maxsize(gd_array), real(dimen_ri, kind=dp))* &
1342 maxsize(gd_b_virtual))*8.0_dp/(1024**2)
1343 ! BIB_C
1344 mem_per_repl = mem_per_repl + sum(real(homo, kind=dp)*maxsize(gd_b_virtual))*maxsize(gd_array)*8.0_dp/(1024**2)
1345 ! BIB_C_rec
1346 mem_per_repl_blk = mem_per_repl_blk + real(maxval(maxsize(gd_b_virtual)), kind=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
1347 ! local_i_aL+local_j_aL
1348 mem_per_blk = mem_per_blk + 2.0_dp*maxval(maxsize(gd_b_virtual))*real(dimen_ri, kind=dp)*8.0_dp/(1024**2)
1349 ! local_ab
1350 mem_base = mem_base + maxval(real(virtual, kind=dp)*maxsize(gd_b_virtual))*8.0_dp/(1024**2)
1351 ! external_ab/external_i_aL
1352 mem_base = mem_base + real(max(dimen_ri, maxval(virtual)), kind=dp)*maxval(maxsize(gd_b_virtual))*8.0_dp/(1024**2)
1353
1354 IF (calc_forces) THEN
1355 ! Gamma_P_ia
1356 mem_per_repl = mem_per_repl + sum(real(homo, kind=dp)*maxsize(gd_array)* &
1357 maxsize(gd_b_virtual))*8.0_dp/(1024**2)
1358 ! Y_i_aP+Y_j_aP
1359 mem_per_blk = mem_per_blk + 2.0_dp*maxval(maxsize(gd_b_virtual))*dimen_ri*8.0_dp/(1024**2)
1360 ! local_ba/t_ab
1361 mem_base = mem_base + real(maxval(maxsize(gd_b_virtual)), kind=dp)*max(dimen_ri, maxval(virtual))*8.0_dp/(1024**2)
1362 ! P_ij
1363 mem_base = mem_base + sum(real(homo, kind=dp)*homo)*8.0_dp/(1024**2)
1364 ! P_ab
1365 mem_base = mem_base + sum(real(virtual, kind=dp)*maxsize(gd_b_virtual))*8.0_dp/(1024**2)
1366 ! send_ab/send_i_aL
1367 mem_base = mem_base + real(max(dimen_ri, maxval(virtual)), kind=dp)*maxval(maxsize(gd_b_virtual))*8.0_dp/(1024**2)
1368 END IF
1369
1370 ! This a first guess based on the assumption of optimal block sizes
1371 block_size = max(1, min(floor(sqrt(real(minval(homo), kind=dp))), floor(minval(homo)/sqrt(2.0_dp*ngroup))))
1372 IF (mp2_env%ri_mp2%block_size > 0) block_size = mp2_env%ri_mp2%block_size
1373
1374 mem_min = mem_base + mem_per_repl + (mem_per_blk + mem_per_repl_blk)*block_size
1375
1376 IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum available memory per MPI process:', &
1377 mem_real, ' MiB'
1378 IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum required memory per MPI process:', &
1379 mem_min, ' MiB'
1380
1381 ! We use the following communication model
1382 ! Comm(replication)+Comm(collection of data for ij pair)+Comm(contraction)
1383 ! One can show that the costs of the contraction step are independent of the block size and the replication group size
1384 ! With gradients, the other two steps are carried out twice (Y_i_aP -> Gamma_i_aP, and dereplication)
1385 ! NL ... number of RI basis functions
1386 ! NR ... replication group size
1387 ! NG ... number of sub groups
1388 ! NB ... Block size
1389 ! o ... number of occupied orbitals
1390 ! Then, we have the communication costs (in multiples of the original BIb_C matrix)
1391 ! (NR/NG)+(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
1392 ! and with gradients
1393 ! 2*(NR/NG)+2*(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
1394 ! We are looking for the minimum of the communication volume,
1395 ! thus, if the prefactor of (NR/NG) is smaller than zero, use the largest possible replication group size.
1396 ! If the factor is larger than zero, set the replication group size to 1. (For small systems and a large number of subgroups)
1397 ! Replication group size = 1 implies that the integration group size equals the number of subgroups
1398
1399 integ_group_size = ngroup
1400
1401 ! Multiply everything by homo*virtual to consider differences between spin channels in case of open-shell calculations
1402 factor = real(sum(homo*virtual), kind=dp) &
1403 - sum((real(maxval(homo), kind=dp)/block_size + block_size - 2.0_dp)*homo*virtual)/ngroup
1404 IF (SIZE(homo) == 2) factor = factor - 2.0_dp*product(homo)/block_size/ngroup*sum(homo*virtual)
1405
1406 IF (factor <= 0.0_dp) THEN
1407 ! Remove the fixed memory and divide by the memory per replication group size
1408 max_repl_group_size = min(max(floor((mem_real - mem_base - mem_per_blk*block_size)/ &
1409 (mem_per_repl + mem_per_repl_blk*block_size)), 1), ngroup)
1410 ! Convert to an integration group size
1411 min_integ_group_size = ngroup/max_repl_group_size
1412
1413 ! Ensure that the integration group size is a divisor of the number of sub groups
1414 DO iib = max(min(min_integ_group_size, ngroup), 1), ngroup
1415 ! check that the ngroup is a multiple of integ_group_size
1416 IF (mod(ngroup, iib) == 0) THEN
1417 integ_group_size = iib
1418 EXIT
1419 END IF
1420 integ_group_size = integ_group_size + 1
1421 END DO
1422 END IF
1423 ELSE ! We take the user provided group size
1424 integ_group_size = ngroup/mp2_env%ri_mp2%number_integration_groups
1425 END IF
1426
1427 IF (unit_nr > 0) THEN
1428 WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
1429 "RI_INFO| Group size for integral replication:", integ_group_size*para_env_sub%num_pe
1430 CALL m_flush(unit_nr)
1431 END IF
1432
1433 num_integ_group = ngroup/integ_group_size
1434
1435 CALL timestop(handle)
1436
1437 END SUBROUTINE mp2_ri_get_integ_group_size
1438
1439! **************************************************************************************************
1440!> \brief ...
1441!> \param mp2_env ...
1442!> \param para_env ...
1443!> \param para_env_sub ...
1444!> \param gd_array ...
1445!> \param gd_B_virtual ...
1446!> \param homo ...
1447!> \param virtual ...
1448!> \param dimen_RI ...
1449!> \param unit_nr ...
1450!> \param block_size ...
1451!> \param ngroup ...
1452!> \param num_integ_group ...
1453!> \param my_open_shell_ss ...
1454!> \param calc_forces ...
1455!> \param buffer_1D ...
1456! **************************************************************************************************
   SUBROUTINE mp2_ri_get_block_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
                                    homo, virtual, dimen_RI, unit_nr, &
                                    block_size, ngroup, num_integ_group, &
                                    my_open_shell_ss, calc_forces, buffer_1D)
      TYPE(mp2_type)                                     :: mp2_env
      TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
      TYPE(group_dist_d1_type), INTENT(IN)               :: gd_array
      TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
      INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
      INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
      INTEGER, INTENT(OUT)                               :: block_size, ngroup
      INTEGER, INTENT(IN)                                :: num_integ_group
      LOGICAL, INTENT(IN)                                :: my_open_shell_ss, calc_forces
      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
         INTENT(OUT)                                    :: buffer_1D

      CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_block_size'

      INTEGER :: best_block_size, handle, num_ij_blocks
      INTEGER(KIND=int_8) :: buffer_size, mem
      REAL(KIND=dp) :: mem_base, mem_per_blk, mem_per_repl_blk, &
         mem_real

      CALL timeset(routineN, handle)

      ngroup = para_env%num_pe/para_env_sub%num_pe

      ! minimum available memory (MiB, rounded up) over all MPI ranks
      CALL m_memory(mem)
      mem_real = (mem + 1024*1024 - 1)/(1024*1024)
      CALL para_env%min(mem_real)

      ! memory model (MiB): mem_base is block-size independent, mem_per_blk
      ! scales with the block size, mem_per_repl_blk additionally scales with
      ! the replication group size (ngroup/num_integ_group)
      mem_base = 0.0_dp
      mem_per_blk = 0.0_dp
      mem_per_repl_blk = 0.0_dp

      ! external_ab
      mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
      ! BIB_C_rec
      mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
      ! local_i_aL+local_j_aL
      mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
      ! Copy to keep arrays contiguous
      mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)

      IF (calc_forces) THEN
         ! Y_i_aP+Y_j_aP+BIb_C_send
         mem_per_blk = mem_per_blk + 3.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
         ! send_ab
         mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
      END IF

      best_block_size = 1

      ! Here we split the memory half for the communication, half for replication
      IF (mp2_env%ri_mp2%block_size > 0) THEN
         ! user-provided block size takes precedence
         best_block_size = mp2_env%ri_mp2%block_size
      ELSE
         ! start from the largest block size that fits into memory ...
         best_block_size = MAX(FLOOR((mem_real - mem_base)/(mem_per_blk + mem_per_repl_blk*ngroup/num_integ_group)), 1)

         ! ... then shrink it until every subgroup gets at least one full block
         ! (the block counting mirrors the branches of mp2_ri_communication)
         DO
            IF (SIZE(homo) == 1) THEN
               IF (.NOT. my_open_shell_ss) THEN
                  num_ij_blocks = (homo(1)/best_block_size)
                  num_ij_blocks = (num_ij_blocks*num_ij_blocks - num_ij_blocks)/2
               ELSE
                  num_ij_blocks = ((homo(1) - 1)/best_block_size)
                  num_ij_blocks = (num_ij_blocks*num_ij_blocks - num_ij_blocks)/2
               END IF
            ELSE
               num_ij_blocks = PRODUCT(homo/best_block_size)
            END IF
            ! Enforce at least one large block for each subgroup
            IF ((num_ij_blocks >= ngroup .AND. num_ij_blocks > 0) .OR. best_block_size == 1) THEN
               EXIT
            ELSE
               best_block_size = best_block_size - 1
            END IF
         END DO

         IF (SIZE(homo) == 1) THEN
            IF (my_open_shell_ss) THEN
               ! check that best_block_size is not bigger than sqrt(homo-1)
               ! Diagonal elements do not have to be considered
               best_block_size = MIN(FLOOR(SQRT(REAL(homo(1) - 1, KIND=dp))), best_block_size)
            ELSE
               ! check that best_block_size is not bigger than sqrt(homo)
               best_block_size = MIN(FLOOR(SQRT(REAL(homo(1), KIND=dp))), best_block_size)
            END IF
         END IF
      END IF
      block_size = MAX(1, best_block_size)

      IF (unit_nr > 0) THEN
         WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
            "RI_INFO| Block size:", block_size
         CALL m_flush(unit_nr)
      END IF

      ! Determine recv buffer size (BI_C_recv, external_i_aL, external_ab)
      buffer_size = MAX(INT(maxsize(gd_array), KIND=int_8)*block_size, INT(MAX(dimen_RI, MAXVAL(virtual)), KIND=int_8)) &
                    *MAXVAL(maxsize(gd_B_virtual))
      ! The send buffer has the same size as the recv buffer
      IF (calc_forces) buffer_size = buffer_size*2
      ALLOCATE (buffer_1D(buffer_size))

      CALL timestop(handle)

   END SUBROUTINE mp2_ri_get_block_size
1565
1566! **************************************************************************************************
1567!> \brief ...
1568!> \param mp2_env ...
1569!> \param para_env_sub ...
1570!> \param gd_B_virtual ...
1571!> \param Eigenval ...
1572!> \param homo ...
1573!> \param dimen_RI ...
1574!> \param iiB ...
1575!> \param jjB ...
1576!> \param my_B_size ...
1577!> \param my_B_virtual_end ...
1578!> \param my_B_virtual_start ...
1579!> \param my_i ...
1580!> \param my_j ...
1581!> \param virtual ...
1582!> \param local_ab ...
1583!> \param t_ab ...
1584!> \param my_local_i_aL ...
1585!> \param my_local_j_aL ...
1586!> \param open_ss ...
1587!> \param Y_i_aP ...
1588!> \param Y_j_aP ...
1589!> \param local_ba ...
1590!> \param ispin ...
1591!> \param jspin ...
1592!> \param dgemm_counter ...
1593!> \param buffer_1D ...
1594! **************************************************************************************************
   SUBROUTINE mp2_update_p_gamma(mp2_env, para_env_sub, gd_B_virtual, &
                                 Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
                                 my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, local_ab, &
                                 t_ab, my_local_i_aL, my_local_j_aL, open_ss, Y_i_aP, Y_j_aP, &
                                 local_ba, ispin, jspin, dgemm_counter, buffer_1D)
      ! Update the MP2-gradient density blocks for the occupied pair
      ! (my_i+iiB-1, my_j+jjB-1): divides the (ia|jb) integrals by the orbital-energy
      ! denominator, accumulates P_ab and the diagonal of P_ij, and builds the
      ! Y_i_aP/Y_j_aP intermediates that later form Gamma_P_ia.
      ! Communication with the other ranks of para_env_sub (which hold complementary
      ! virtual-index blocks) is done via ring-style sendrecv exchanges; buffer_1D
      ! provides the scratch space for the pointer-remapped send/receive views.
      TYPE(mp2_type) :: mp2_env
      TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub
      TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_b_virtual
      REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: eigenval
      INTEGER, DIMENSION(:), INTENT(IN) :: homo
      INTEGER, INTENT(IN) :: dimen_ri, iib, jjb
      INTEGER, DIMENSION(:), INTENT(IN) :: my_b_size, my_b_virtual_end, &
                                           my_b_virtual_start
      INTEGER, INTENT(IN) :: my_i, my_j
      INTEGER, DIMENSION(:), INTENT(IN) :: virtual
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         INTENT(INOUT), TARGET :: local_ab
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         INTENT(IN), TARGET :: t_ab, my_local_i_al, my_local_j_al
      LOGICAL, INTENT(IN) :: open_ss
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         INTENT(INOUT), TARGET :: y_i_ap, y_j_ap, local_ba
      INTEGER, INTENT(IN) :: ispin, jspin
      TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1d

      CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_update_P_gamma'

      INTEGER :: a, b, b_global, handle, proc_receive, proc_send, proc_shift, rec_b_size, &
                 rec_b_virtual_end, rec_b_virtual_start, send_b_size, send_b_virtual_end, &
                 send_b_virtual_start
      INTEGER(KIND=int_8) :: offset
      LOGICAL :: alpha_beta
      REAL(kind=dp) :: factor, p_ij_diag
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         POINTER :: external_ab, send_ab

      CALL timeset(routinen//"_Pia", handle)

      ! same-spin pair (alpha-alpha, beta-beta, closed shell) => alpha_beta = .FALSE.
      alpha_beta = .NOT. (ispin == jspin)
      ! closed-shell integrals carry a spin-degeneracy factor of 2
      IF (open_ss) THEN
         factor = 1.0_dp
      ELSE
         factor = 2.0_dp
      END IF
      ! divide the (ia|jb) integrals by Delta_ij^ab
      DO b = 1, my_b_size(jspin)
         b_global = b + my_b_virtual_start(jspin) - 1
         DO a = 1, virtual(ispin)
            local_ab(a, b) = -local_ab(a, b)/ &
                             (eigenval(homo(ispin) + a, ispin) + eigenval(homo(jspin) + b_global, jspin) - &
                              eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, jspin))
         END DO
      END DO
      IF (.NOT. (alpha_beta)) THEN
         ! diagonal element of P_ij from the contraction with the amplitudes t_ab
         p_ij_diag = -sum(local_ab*t_ab)*factor
      ELSE
         ! update diagonal part of P_ij
         p_ij_diag = -sum(local_ab*local_ab)*mp2_env%scale_S
         ! More integrals needed only for alpha-beta case: local_ba
         DO b = 1, my_b_size(ispin)
            b_global = b + my_b_virtual_start(ispin) - 1
            DO a = 1, virtual(jspin)
               local_ba(a, b) = -local_ba(a, b)/ &
                                (eigenval(homo(jspin) + a, jspin) + eigenval(homo(ispin) + b_global, ispin) - &
                                 eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, jspin))
            END DO
         END DO
      END IF

      ! P_ab and add diagonal part of P_ij

      CALL dgemm_counter_start(dgemm_counter)
      IF (.NOT. (alpha_beta)) THEN
         ! P_ab(ispin) += t_ab^T * local_ab on the locally owned virtual block
         CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), my_b_size(ispin), virtual(ispin), 1.0_dp, &
                                          t_ab, virtual(ispin), local_ab, virtual(ispin), &
                                          1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, &
                                                              my_b_virtual_start(ispin):my_b_virtual_end(ispin)), my_b_size(ispin))
         mp2_env%ri_grad%P_ij(ispin)%array(my_i + iib - 1, my_i + iib - 1) = &
            mp2_env%ri_grad%P_ij(ispin)%array(my_i + iib - 1, my_i + iib - 1) + p_ij_diag
      ELSE
         ! alpha-beta: both spin channels of P_ab and P_ij receive a contribution
         CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), my_b_size(ispin), virtual(jspin), mp2_env%scale_S, &
                                          local_ba, virtual(jspin), local_ba, virtual(jspin), 1.0_dp, &
                                          mp2_env%ri_grad%P_ab(ispin)%array(:, my_b_virtual_start(ispin):my_b_virtual_end(ispin)), my_b_size(ispin))

         mp2_env%ri_grad%P_ij(ispin)%array(my_i + iib - 1, my_i + iib - 1) = &
            mp2_env%ri_grad%P_ij(ispin)%array(my_i + iib - 1, my_i + iib - 1) + p_ij_diag

         CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(jspin), my_b_size(jspin), virtual(ispin), mp2_env%scale_S, &
                                          local_ab, virtual(ispin), local_ab, virtual(ispin), 1.0_dp, &
                                          mp2_env%ri_grad%P_ab(jspin)%array(:, my_b_virtual_start(jspin):my_b_virtual_end(jspin)), my_b_size(jspin))

         mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjb - 1, my_j + jjb - 1) = &
            mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjb - 1, my_j + jjb - 1) + p_ij_diag
      END IF
      ! The summation is over unique pairs. In alpha-beta case, all pairs are unique: subroutine is called for
      ! both i^alpha,j^beta and i^beta,j^alpha. Formally, my_i can be equal to my_j, but they are different
      ! due to spin in alpha-beta case.
      IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN

         ! i /= j same-spin pair: add the transposed (j,i) contribution as well
         CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_b_size(ispin), virtual(ispin), my_b_size(ispin), 1.0_dp, &
                                          t_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          local_ab, virtual(ispin), &
                                          1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array, my_b_size(ispin))

         mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjb - 1, my_j + jjb - 1) = &
            mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjb - 1, my_j + jjb - 1) + p_ij_diag
      END IF
      ! Ring exchange over the sub-group: send the local integral block proc_shift
      ! ranks "up", receive the matching block from proc_shift ranks "down"
      DO proc_shift = 1, para_env_sub%num_pe - 1
         proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
         proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

         CALL get_group_dist(gd_b_virtual(jspin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
         CALL get_group_dist(gd_b_virtual(jspin), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)

         ! 2D view of the scratch buffer shaped for the incoming block
         external_ab(1:virtual(ispin), 1:rec_b_size) => buffer_1d(1:int(virtual(ispin), int_8)*rec_b_size)
         external_ab = 0.0_dp

         CALL para_env_sub%sendrecv(local_ab, proc_send, &
                                    external_ab, proc_receive)

         IF (.NOT. (alpha_beta)) THEN
            CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), rec_b_size, virtual(ispin), 1.0_dp, &
                                             t_ab, virtual(ispin), external_ab, virtual(ispin), &
                                             1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_b_virtual_start:rec_b_virtual_end), my_b_size(ispin))
         ELSE
            CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(jspin), rec_b_size, virtual(ispin), mp2_env%scale_S, &
                                             local_ab, virtual(ispin), external_ab, virtual(ispin), &
                                             1.0_dp, mp2_env%ri_grad%P_ab(jspin)%array(:, rec_b_virtual_start:rec_b_virtual_end), &
                                             my_b_size(jspin))

            ! For alpha-beta part of alpha-density we need a new parallel code
            ! And new external_ab (of a different size)
            CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
            CALL get_group_dist(gd_b_virtual(ispin), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)
            external_ab(1:virtual(jspin), 1:rec_b_size) => buffer_1d(1:int(virtual(jspin), int_8)*rec_b_size)
            external_ab = 0.0_dp
            CALL para_env_sub%sendrecv(local_ba, proc_send, &
                                       external_ab, proc_receive)
            CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), rec_b_size, virtual(jspin), mp2_env%scale_S, &
                                             local_ba, virtual(jspin), external_ab, virtual(jspin), &
                                             1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_b_virtual_start:rec_b_virtual_end), my_b_size(ispin))
         END IF

         IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
            ! transposed (j,i) contribution: computed here for the remote virtual
            ! block and summed into P_ab on the receiving rank
            external_ab(1:my_b_size(ispin), 1:virtual(ispin)) => &
               buffer_1d(1:int(virtual(ispin), int_8)*my_b_size(ispin))
            external_ab = 0.0_dp

            ! the send view lives behind the receive view in the same scratch buffer
            offset = int(virtual(ispin), int_8)*my_b_size(ispin)

            send_ab(1:send_b_size, 1:virtual(ispin)) => buffer_1d(offset + 1:offset + int(send_b_size, int_8)*virtual(ispin))
            send_ab = 0.0_dp

            CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_b_size, virtual(ispin), my_b_size(ispin), 1.0_dp, &
                                             t_ab(send_b_virtual_start:send_b_virtual_end, :), send_b_size, &
                                             local_ab, virtual(ispin), 0.0_dp, send_ab, send_b_size)
            CALL para_env_sub%sendrecv(send_ab, proc_send, &
                                       external_ab, proc_receive)

            mp2_env%ri_grad%P_ab(ispin)%array(:, :) = mp2_env%ri_grad%P_ab(ispin)%array + external_ab
         END IF

      END DO
      ! FLOP accounting for the P_ab update above
      IF (.NOT. alpha_beta) THEN
         IF (my_i /= my_j) THEN
            CALL dgemm_counter_stop(dgemm_counter, 2*my_b_size(ispin), virtual(ispin), virtual(ispin))
         ELSE
            CALL dgemm_counter_stop(dgemm_counter, my_b_size(ispin), virtual(ispin), virtual(ispin))
         END IF
      ELSE
         CALL dgemm_counter_stop(dgemm_counter, sum(my_b_size), virtual(ispin), virtual(jspin))
      END IF
      CALL timestop(handle)

      ! Now, Gamma_P_ia (made of Y_ia_P)

      CALL timeset(routinen//"_Gamma", handle)
      CALL dgemm_counter_start(dgemm_counter)
      IF (.NOT. alpha_beta) THEN
         ! Alpha-alpha, beta-beta and closed shell
         CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_b_size(ispin), dimen_ri, my_b_size(ispin), 1.0_dp, &
                                          t_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          my_local_j_al, dimen_ri, 1.0_dp, y_i_ap, my_b_size(ispin))
      ELSE ! Alpha-beta
         CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_b_size(ispin), dimen_ri, my_b_size(jspin), mp2_env%scale_S, &
                                          local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          my_local_j_al, dimen_ri, 1.0_dp, y_i_ap, my_b_size(ispin))
         CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_b_size(jspin), dimen_ri, my_b_size(ispin), mp2_env%scale_S, &
                                          local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          my_local_i_al, dimen_ri, 1.0_dp, y_j_ap, my_b_size(jspin))
      END IF

      IF (para_env_sub%num_pe > 1) THEN
         ! receive view at the start of the scratch buffer, send view placed behind it
         external_ab(1:my_b_size(ispin), 1:dimen_ri) => buffer_1d(1:int(my_b_size(ispin), int_8)*dimen_ri)
         external_ab = 0.0_dp

         offset = int(my_b_size(ispin), int_8)*dimen_ri
      END IF
      !
      DO proc_shift = 1, para_env_sub%num_pe - 1
         proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
         proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

         CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
         CALL get_group_dist(gd_b_virtual(ispin), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)

         send_ab(1:send_b_size, 1:dimen_ri) => buffer_1d(offset + 1:offset + int(dimen_ri, int_8)*send_b_size)
         send_ab = 0.0_dp
         IF (.NOT. alpha_beta) THEN
            CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_b_size, dimen_ri, my_b_size(ispin), 1.0_dp, &
                                             t_ab(send_b_virtual_start:send_b_virtual_end, :), send_b_size, &
                                             my_local_j_al, dimen_ri, 0.0_dp, send_ab, send_b_size)
            CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)

            y_i_ap(:, :) = y_i_ap + external_ab

         ELSE ! Alpha-beta case
            ! Alpha-alpha part
            CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_b_size, dimen_ri, my_b_size(jspin), mp2_env%scale_S, &
                                             local_ab(send_b_virtual_start:send_b_virtual_end, :), send_b_size, &
                                             my_local_j_al, dimen_ri, 0.0_dp, send_ab, send_b_size)
            CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
            y_i_ap(:, :) = y_i_ap + external_ab
         END IF
      END DO

      IF (alpha_beta) THEN
         ! For beta-beta part (in alpha-beta case) we need a new parallel code
         IF (para_env_sub%num_pe > 1) THEN
            external_ab(1:my_b_size(jspin), 1:dimen_ri) => buffer_1d(1:int(my_b_size(jspin), int_8)*dimen_ri)
            external_ab = 0.0_dp

            offset = int(my_b_size(jspin), int_8)*dimen_ri
         END IF
         DO proc_shift = 1, para_env_sub%num_pe - 1
            proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
            proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

            CALL get_group_dist(gd_b_virtual(jspin), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)
            send_ab(1:send_b_size, 1:dimen_ri) => buffer_1d(offset + 1:offset + int(dimen_ri, int_8)*send_b_size)
            send_ab = 0.0_dp
            CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_b_size, dimen_ri, my_b_size(ispin), mp2_env%scale_S, &
                                             local_ba(send_b_virtual_start:send_b_virtual_end, :), send_b_size, &
                                             my_local_i_al, dimen_ri, 0.0_dp, send_ab, send_b_size)
            CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
            y_j_ap(:, :) = y_j_ap + external_ab

         END DO

         ! Here, we just use approximate bounds. For large systems virtual(ispin) is approx virtual(jspin), same for B_size
         CALL dgemm_counter_stop(dgemm_counter, 3*virtual(ispin), dimen_ri, my_b_size(jspin))
      ELSE
         CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), dimen_ri, my_b_size(ispin))
      END IF

      IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
         ! Alpha-alpha, beta-beta and closed shell
         ! i /= j: accumulate the symmetric contribution into Y_j_aP as well
         CALL dgemm_counter_start(dgemm_counter)
         CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_b_size(ispin), dimen_ri, my_b_size(ispin), 1.0_dp, &
                                          t_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          my_local_i_al, dimen_ri, 1.0_dp, y_j_ap, my_b_size(ispin))
         DO proc_shift = 1, para_env_sub%num_pe - 1
            proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
            proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

            CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)

            external_ab(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, int_8)*rec_b_size)
            external_ab = 0.0_dp

            CALL para_env_sub%sendrecv(my_local_i_al, proc_send, &
                                       external_ab, proc_receive)

            ! Alpha-alpha, beta-beta and closed shell
            CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_b_size(ispin), dimen_ri, rec_b_size, 1.0_dp, &
                                             t_ab(rec_b_virtual_start:rec_b_virtual_end, :), rec_b_size, &
                                             external_ab, dimen_ri, 1.0_dp, y_j_ap, my_b_size(ispin))
         END DO

         CALL dgemm_counter_stop(dgemm_counter, my_b_size(ispin), dimen_ri, virtual(ispin))
      END IF

      CALL timestop(handle)
   END SUBROUTINE mp2_update_p_gamma
1880
1881! **************************************************************************************************
1882!> \brief ...
1883!> \param Gamma_P_ia ...
1884!> \param ij_index ...
1885!> \param my_B_size ...
1886!> \param my_block_size ...
1887!> \param my_group_L_size ...
1888!> \param my_i ...
1889!> \param my_ij_pairs ...
1890!> \param ngroup ...
1891!> \param num_integ_group ...
1892!> \param integ_group_pos2color_sub ...
1893!> \param num_ij_pairs ...
1894!> \param ij_map ...
1895!> \param ranges_info_array ...
1896!> \param Y_i_aP ...
1897!> \param comm_exchange ...
1898!> \param sizes_array ...
1899!> \param spin ...
1900!> \param buffer_1D ...
1901! **************************************************************************************************
   SUBROUTINE mp2_redistribute_gamma(Gamma_P_ia, ij_index, my_B_size, &
                                     my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
                                     num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
                                     ij_map, ranges_info_array, Y_i_aP, comm_exchange, &
                                     sizes_array, spin, buffer_1D)

      ! Fold the locally computed Y_i_aP intermediate into the distributed
      ! Gamma_P_ia array: first the rank adds its own slice (ranges described by
      ! ranges_info_array), then a ring exchange over comm_exchange delivers each
      ! remaining slice to the rank that owns the corresponding RI-index range.
      ! Ranks with no ij pair of their own at this ij_index only receive.
      ! buffer_1D provides scratch space; 3D send/receive views are created into it
      ! by pointer rank remapping.
      REAL(kind=dp), DIMENSION(:, :, :), INTENT(INOUT) :: gamma_p_ia
      INTEGER, INTENT(IN) :: ij_index, my_b_size, my_block_size, &
                             my_group_l_size, my_i, my_ij_pairs, &
                             ngroup, num_integ_group
      INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub, num_ij_pairs
      INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(IN) :: ij_map
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
         INTENT(IN) :: ranges_info_array
      REAL(kind=dp), DIMENSION(:, :, :), INTENT(IN) :: y_i_ap
      TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
      INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
      INTEGER, INTENT(IN) :: spin
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1d

      CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_redistribute_gamma'

      INTEGER :: end_point, handle, handle2, iib, ij_counter_rec, irep, kkk, lll, lstart_pos, &
                 proc_receive, proc_send, proc_shift, rec_i, rec_ij_index, send_l_size, start_point, tag
      INTEGER(KIND=int_8) :: offset
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :, :), &
         POINTER :: bi_c_rec, bi_c_send

! In alpha-beta case Y_i_aP_beta is sent as Y_j_aP

      CALL timeset(routinen//"_comm2", handle)

      ! fixed message tag for all exchanges in this routine
      tag = 43

      IF (ij_index <= my_ij_pairs) THEN
         ! something to send
         ! start with myself
         CALL timeset(routinen//"_comm2_w", handle2)
         DO irep = 0, num_integ_group - 1
            ! map the locally stored RI range (start_point:end_point of Gamma_P_ia)
            ! to the matching range (lstart_pos...) of Y_i_aP
            lstart_pos = ranges_info_array(1, irep, comm_exchange%mepos)
            start_point = ranges_info_array(3, irep, comm_exchange%mepos)
            end_point = ranges_info_array(4, irep, comm_exchange%mepos)
!$OMP PARALLEL DO DEFAULT(NONE) &
!$OMP PRIVATE(kkk,lll,iiB) &
!$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
!$OMP Gamma_P_ia,my_i,my_B_size,Y_i_aP)
            DO kkk = start_point, end_point
               lll = kkk - start_point + lstart_pos
               DO iib = 1, my_block_size
                  gamma_p_ia(1:my_b_size, my_i + iib - 1, kkk) = &
                     gamma_p_ia(1:my_b_size, my_i + iib - 1, kkk) + &
                     y_i_ap(1:my_b_size, lll, iib)
               END DO
            END DO
!$OMP END PARALLEL DO
         END DO
         CALL timestop(handle2)

         ! Y_i_aP(my_B_size,dimen_RI,block_size)

         DO proc_shift = 1, comm_exchange%num_pe - 1
            proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
            proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)

            ! send view at the start of the scratch buffer, sized for proc_send's L range
            send_l_size = sizes_array(proc_send)
            bi_c_send(1:my_b_size, 1:my_block_size, 1:send_l_size) => &
               buffer_1d(1:int(my_b_size, int_8)*my_block_size*send_l_size)

            offset = int(my_b_size, int_8)*my_block_size*send_l_size

            CALL timeset(routinen//"_comm2_w", handle2)
            ! pack the slice of Y_i_aP that belongs to proc_send's RI ranges
            bi_c_send = 0.0_dp
            DO irep = 0, num_integ_group - 1
               lstart_pos = ranges_info_array(1, irep, proc_send)
               start_point = ranges_info_array(3, irep, proc_send)
               end_point = ranges_info_array(4, irep, proc_send)
!$OMP PARALLEL DO DEFAULT(NONE) &
!$OMP PRIVATE(kkk,lll,iiB) &
!$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
!$OMP BI_C_send,my_B_size,Y_i_aP)
               DO kkk = start_point, end_point
                  lll = kkk - start_point + lstart_pos
                  DO iib = 1, my_block_size
                     bi_c_send(1:my_b_size, iib, kkk) = y_i_ap(1:my_b_size, lll, iib)
                  END DO
               END DO
!$OMP END PARALLEL DO
            END DO
            CALL timestop(handle2)

            rec_ij_index = num_ij_pairs(proc_receive)

            IF (ij_index <= rec_ij_index) THEN
               ! we know that proc_receive has something to send for us, let's see what
               ! global ij-pair counter of the pair proc_receive is working on
               ij_counter_rec = &
                  (ij_index - min(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)

               ! first occupied index of the received pair
               rec_i = ij_map(spin, ij_counter_rec)

               ! receive view behind the send view in the same scratch buffer
               bi_c_rec(1:my_b_size, 1:my_block_size, 1:my_group_l_size) => &
                  buffer_1d(offset + 1:offset + int(my_b_size, int_8)*my_block_size*my_group_l_size)
               bi_c_rec = 0.0_dp

               CALL comm_exchange%sendrecv(bi_c_send, proc_send, &
                                           bi_c_rec, proc_receive, tag)

               CALL timeset(routinen//"_comm2_w", handle2)
               ! accumulate the received slice into the locally stored RI ranges
               DO irep = 0, num_integ_group - 1
                  start_point = ranges_info_array(3, irep, comm_exchange%mepos)
                  end_point = ranges_info_array(4, irep, comm_exchange%mepos)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
!$OMP SHARED(start_point,end_point,my_block_size,&
!$OMP Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
                  gamma_p_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
                     gamma_p_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
                     bi_c_rec(1:my_b_size, :, start_point:end_point)
!$OMP END PARALLEL WORKSHARE
               END DO
               CALL timestop(handle2)

            ELSE
               ! we have something to send but nothing to receive
               CALL comm_exchange%send(bi_c_send, proc_send, tag)

            END IF

         END DO

      ELSE
         ! nothing to send, check if we have to receive
         DO proc_shift = 1, comm_exchange%num_pe - 1
            proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
            proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
            rec_ij_index = num_ij_pairs(proc_receive)

            IF (ij_index <= rec_ij_index) THEN
               ! we know that proc_receive has something to send for us, let's see what
               ij_counter_rec = &
                  (ij_index - min(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)

               rec_i = ij_map(spin, ij_counter_rec)

               bi_c_rec(1:my_b_size, 1:my_block_size, 1:my_group_l_size) => &
                  buffer_1d(1:int(my_b_size, int_8)*my_block_size*my_group_l_size)

               bi_c_rec = 0.0_dp

               CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)

               CALL timeset(routinen//"_comm2_w", handle2)
               DO irep = 0, num_integ_group - 1
                  start_point = ranges_info_array(3, irep, comm_exchange%mepos)
                  end_point = ranges_info_array(4, irep, comm_exchange%mepos)
#if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
!$OMP SHARED(start_point,end_point,my_block_size,&
!$OMP Gamma_P_ia,rec_i,my_B_size,BI_C_rec)
#endif
                  gamma_p_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
                     gamma_p_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
                     bi_c_rec(1:my_b_size, :, start_point:end_point)
#if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
!$OMP END PARALLEL WORKSHARE
#endif
               END DO
               CALL timestop(handle2)

            END IF
         END DO

      END IF
      CALL timestop(handle)

   END SUBROUTINE mp2_redistribute_gamma
2076
2077! **************************************************************************************************
2078!> \brief ...
2079!> \param mp2_env ...
2080!> \param Eigenval ...
2081!> \param homo ...
2082!> \param virtual ...
2083!> \param open_shell ...
2084!> \param beta_beta ...
2085!> \param Bib_C ...
2086!> \param unit_nr ...
2087!> \param dimen_RI ...
2088!> \param my_B_size ...
2089!> \param ngroup ...
2090!> \param my_group_L_size ...
2091!> \param color_sub ...
2092!> \param ranges_info_array ...
2093!> \param comm_exchange ...
2094!> \param para_env_sub ...
2095!> \param para_env ...
2096!> \param my_B_virtual_start ...
2097!> \param my_B_virtual_end ...
2098!> \param sizes_array ...
2099!> \param gd_B_virtual ...
2100!> \param integ_group_pos2color_sub ...
2101!> \param dgemm_counter ...
2102!> \param buffer_1D ...
2103! **************************************************************************************************
2104 SUBROUTINE quasi_degenerate_p_ij(mp2_env, Eigenval, homo, virtual, open_shell, &
2105 beta_beta, Bib_C, unit_nr, dimen_RI, &
2106 my_B_size, ngroup, my_group_L_size, &
2107 color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
2108 my_B_virtual_start, my_B_virtual_end, sizes_array, gd_B_virtual, &
2109 integ_group_pos2color_sub, dgemm_counter, buffer_1D)
2110 TYPE(mp2_type) :: mp2_env
2111 REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: eigenval
2112 INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2113 LOGICAL, INTENT(IN) :: open_shell, beta_beta
2114 TYPE(three_dim_real_array), DIMENSION(:), &
2115 INTENT(IN) :: bib_c
2116 INTEGER, INTENT(IN) :: unit_nr, dimen_ri
2117 INTEGER, DIMENSION(:), INTENT(IN) :: my_b_size
2118 INTEGER, INTENT(IN) :: ngroup, my_group_l_size, color_sub
2119 INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
2120 INTENT(IN) :: ranges_info_array
2121 TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2122 TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub, para_env
2123 INTEGER, DIMENSION(:), INTENT(IN) :: my_b_virtual_start, my_b_virtual_end
2124 INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
2125 TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_b_virtual
2126 INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub
2127 TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
2128 REAL(kind=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1d
2129
2130 CHARACTER(LEN=*), PARAMETER :: routinen = 'quasi_degenerate_P_ij'
2131
2132 INTEGER :: a, a_global, b, b_global, block_size, decil, handle, handle2, ijk_counter, &
2133 ijk_counter_send, ijk_index, ispin, kkb, kspin, max_block_size, max_ijk, my_i, my_ijk, &
2134 my_j, my_k, my_last_k(2), my_virtual, nspins, proc_receive, proc_send, proc_shift, &
2135 rec_b_size, rec_b_virtual_end, rec_b_virtual_start, rec_l_size, send_b_size, &
2136 send_b_virtual_end, send_b_virtual_start, send_i, send_ijk_index, send_j, send_k, &
2137 size_b_i, size_b_k, tag, tag2
2138 INTEGER, ALLOCATABLE, DIMENSION(:) :: num_ijk
2139 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ijk_map, send_last_k
2140 LOGICAL :: alpha_beta, do_recv_i, do_recv_j, &
2141 do_recv_k, do_send_i, do_send_j, &
2142 do_send_k
2143 REAL(kind=dp) :: amp_fac, p_ij_elem, t_new, t_start
2144 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :), &
2145 TARGET :: local_ab, local_al_i, local_al_j, t_ab
2146 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: local_al_k
2147 REAL(kind=dp), DIMENSION(:, :), POINTER :: bi_c_rec, external_ab, external_al
2148 REAL(kind=dp), DIMENSION(:, :, :), POINTER :: bi_c_rec_3d
2149
2150 CALL timeset(routinen//"_ij_sing", handle)
2151
2152 tag = 44
2153 tag2 = 45
2154
2155 nspins = SIZE(bib_c)
2156 alpha_beta = (nspins == 2)
2157
2158 ! Set amplitude factor
2159 amp_fac = mp2_env%scale_S + mp2_env%scale_T
2160 IF (open_shell) amp_fac = mp2_env%scale_T
2161
2162 ALLOCATE (send_last_k(2, comm_exchange%num_pe - 1))
2163
2164 ! Loop(s) over orbital triplets
2165 DO ispin = 1, nspins
2166 size_b_i = my_b_size(ispin)
2167 IF (ispin == 1 .AND. alpha_beta) THEN
2168 kspin = 2
2169 ELSE
2170 kspin = 1
2171 END IF
2172 size_b_k = my_b_size(kspin)
2173
2174 ! Find the number of quasi-degenerate orbitals and orbital triplets
2175
2176 CALL find_quasi_degenerate_ij(my_ijk, homo(ispin), homo(kspin), eigenval(:, ispin), mp2_env, ijk_map, unit_nr, ngroup, &
2177 .NOT. beta_beta .AND. ispin /= 2, comm_exchange, num_ijk, max_ijk, color_sub, &
2178 SIZE(buffer_1d), my_group_l_size, size_b_k, para_env, virtual(ispin), size_b_i)
2179
2180 my_virtual = virtual(ispin)
2181 IF (SIZE(ijk_map, 2) > 0) THEN
2182 max_block_size = ijk_map(4, 1)
2183 ELSE
2184 max_block_size = 1
2185 END IF
2186
2187 ALLOCATE (local_al_i(dimen_ri, size_b_i))
2188 ALLOCATE (local_al_j(dimen_ri, size_b_i))
2189 ALLOCATE (local_al_k(dimen_ri, size_b_k, max_block_size))
2190 ALLOCATE (t_ab(my_virtual, size_b_k))
2191
2192 my_last_k = -1
2193 send_last_k = -1
2194
2195 t_start = m_walltime()
2196 DO ijk_index = 1, max_ijk
2197
2198 ! Prediction is unreliable if we are in the first step of the loop
2199 IF (unit_nr > 0 .AND. ijk_index > 1) THEN
2200 decil = ijk_index*10/max_ijk
2201 IF (decil /= (ijk_index - 1)*10/max_ijk) THEN
2202 t_new = m_walltime()
2203 t_new = (t_new - t_start)/60.0_dp*(max_ijk - ijk_index + 1)/(ijk_index - 1)
2204 WRITE (unit_nr, fmt="(T3,A)") "Percentage of finished loop: "// &
2205 cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
2206 CALL m_flush(unit_nr)
2207 END IF
2208 END IF
2209
2210 IF (ijk_index <= my_ijk) THEN
2211 ! work to be done
2212 ijk_counter = (ijk_index - min(1, color_sub))*ngroup + color_sub
2213 my_i = ijk_map(1, ijk_counter)
2214 my_j = ijk_map(2, ijk_counter)
2215 my_k = ijk_map(3, ijk_counter)
2216 block_size = ijk_map(4, ijk_counter)
2217
2218 do_recv_i = (ispin /= kspin) .OR. my_i < my_k .OR. my_i > my_k + block_size - 1
2219 do_recv_j = (ispin /= kspin) .OR. my_j < my_k .OR. my_j > my_k + block_size - 1
2220 do_recv_k = my_k /= my_last_k(1) .OR. my_k + block_size - 1 /= my_last_k(2)
2221 my_last_k(1) = my_k
2222 my_last_k(2) = my_k + block_size - 1
2223
2224 local_al_i = 0.0_dp
2225 IF (do_recv_i) THEN
2226 CALL fill_local_i_al_2d(local_al_i, ranges_info_array(:, :, comm_exchange%mepos), &
2227 bib_c(ispin)%array(:, :, my_i))
2228 END IF
2229
2230 local_al_j = 0.0_dp
2231 IF (do_recv_j) THEN
2232 CALL fill_local_i_al_2d(local_al_j, ranges_info_array(:, :, comm_exchange%mepos), &
2233 bib_c(ispin)%array(:, :, my_j))
2234 END IF
2235
2236 IF (do_recv_k) THEN
2237 local_al_k = 0.0_dp
2238 CALL fill_local_i_al(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, comm_exchange%mepos), &
2239 bib_c(kspin)%array(:, :, my_k:my_k + block_size - 1))
2240 END IF
2241
2242 CALL timeset(routinen//"_comm", handle2)
2243 DO proc_shift = 1, comm_exchange%num_pe - 1
2244 proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2245 proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2246
2247 send_ijk_index = num_ijk(proc_send)
2248
2249 rec_l_size = sizes_array(proc_receive)
2250 bi_c_rec(1:rec_l_size, 1:size_b_i) => buffer_1d(1:int(rec_l_size, kind=int_8)*size_b_i)
2251
2252 do_send_i = .false.
2253 do_send_j = .false.
2254 do_send_k = .false.
2255 IF (ijk_index <= send_ijk_index) THEN
2256 ! something to send
2257 ijk_counter_send = (ijk_index - min(1, integ_group_pos2color_sub(proc_send)))* &
2258 ngroup + integ_group_pos2color_sub(proc_send)
2259 send_i = ijk_map(1, ijk_counter_send)
2260 send_j = ijk_map(2, ijk_counter_send)
2261 send_k = ijk_map(3, ijk_counter_send)
2262
2263 do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2264 do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2265 do_send_k = send_k /= send_last_k(1, proc_shift) .OR. send_k + block_size - 1 /= send_last_k(2, proc_shift)
2266 send_last_k(1, proc_shift) = send_k
2267 send_last_k(2, proc_shift) = send_k + block_size - 1
2268 END IF
2269
2270 ! occupied i
2271 bi_c_rec = 0.0_dp
2272 IF (do_send_i) THEN
2273 IF (do_recv_i) THEN
2274 CALL comm_exchange%sendrecv(bib_c(ispin)%array(:, :, send_i), proc_send, &
2275 bi_c_rec, proc_receive, tag)
2276 ELSE
2277 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_i), proc_send, tag)
2278 END IF
2279 ELSE IF (do_recv_i) THEN
2280 CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)
2281 END IF
2282 IF (do_recv_i) THEN
2283 CALL fill_local_i_al_2d(local_al_i, ranges_info_array(:, :, proc_receive), bi_c_rec)
2284 END IF
2285
2286 ! occupied j
2287 bi_c_rec = 0.0_dp
2288 IF (do_send_j) THEN
2289 IF (do_recv_j) THEN
2290 CALL comm_exchange%sendrecv(bib_c(ispin)%array(:, :, send_j), proc_send, &
2291 bi_c_rec, proc_receive, tag)
2292 ELSE
2293 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_j), proc_send, tag)
2294 END IF
2295 ELSE IF (do_recv_j) THEN
2296 CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)
2297 END IF
2298 IF (do_recv_j) THEN
2299 CALL fill_local_i_al_2d(local_al_j, ranges_info_array(:, :, proc_receive), bi_c_rec)
2300 END IF
2301
2302 ! occupied k
2303 bi_c_rec_3d(1:rec_l_size, 1:size_b_k, 1:block_size) => &
2304 buffer_1d(1:int(rec_l_size, kind=int_8)*size_b_k*block_size)
2305 IF (do_send_k) THEN
2306 IF (do_recv_k) THEN
2307 CALL comm_exchange%sendrecv(bib_c(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, &
2308 bi_c_rec_3d, proc_receive, tag)
2309 ELSE
2310 CALL comm_exchange%send(bi_c_rec, proc_receive, tag)
2311 END IF
2312 ELSE IF (do_recv_k) THEN
2313 CALL comm_exchange%recv(bi_c_rec_3d, proc_receive, tag)
2314 END IF
2315 IF (do_recv_k) THEN
2316 CALL fill_local_i_al(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, proc_receive), bi_c_rec_3d)
2317 END IF
2318 END DO
2319
2320 IF (.NOT. do_recv_i) local_al_i(:, :) = local_al_k(:, :, my_i - my_k + 1)
2321 IF (.NOT. do_recv_j) local_al_j(:, :) = local_al_k(:, :, my_j - my_k + 1)
2322 CALL timestop(handle2)
2323
2324 ! expand integrals
2325 DO kkb = 1, block_size
2326 CALL timeset(routinen//"_exp_ik", handle2)
2327 CALL dgemm_counter_start(dgemm_counter)
2328 ALLOCATE (local_ab(my_virtual, size_b_k))
2329 local_ab = 0.0_dp
2330 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_b_i, size_b_k, dimen_ri, 1.0_dp, &
2331 local_al_i, dimen_ri, local_al_k(:, :, kkb), dimen_ri, &
2332 0.0_dp, local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), 1:size_b_k), size_b_i)
2333 DO proc_shift = 1, para_env_sub%num_pe - 1
2334 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2335 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2336
2337 CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
2338
2339 external_al(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, kind=int_8)*rec_b_size)
2340
2341 CALL comm_exchange%sendrecv(local_al_i, proc_send, &
2342 external_al, proc_receive, tag)
2343
2344 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_b_size, size_b_k, dimen_ri, 1.0_dp, &
2345 external_al, dimen_ri, local_al_k(:, :, kkb), dimen_ri, &
2346 0.0_dp, local_ab(rec_b_virtual_start:rec_b_virtual_end, 1:size_b_k), rec_b_size)
2347 END DO
2348 CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_b_k, dimen_ri)
2349 CALL timestop(handle2)
2350
2351 ! Amplitudes
2352 CALL timeset(routinen//"_tab", handle2)
2353 t_ab = 0.0_dp
2354 ! Alpha-alpha, beta-beta and closed shell
2355 IF (.NOT. alpha_beta) THEN
2356 DO b = 1, size_b_k
2357 b_global = b + my_b_virtual_start(1) - 1
2358 DO a = 1, my_b_size(1)
2359 a_global = a + my_b_virtual_start(1) - 1
2360 t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
2361 (eigenval(my_i, 1) + eigenval(my_k + kkb - 1, 1) &
2362 - eigenval(homo(1) + a_global, 1) - eigenval(homo(1) + b_global, 1))
2363 END DO
2364 END DO
2365 ELSE
2366 DO b = 1, size_b_k
2367 b_global = b + my_b_virtual_start(kspin) - 1
2368 DO a = 1, my_b_size(ispin)
2369 a_global = a + my_b_virtual_start(ispin) - 1
2370 t_ab(a_global, b) = mp2_env%scale_S*local_ab(a_global, b)/ &
2371 (eigenval(my_i, ispin) + eigenval(my_k + kkb - 1, kspin) &
2372 - eigenval(homo(ispin) + a_global, ispin) - eigenval(homo(kspin) + b_global, kspin))
2373 END DO
2374 END DO
2375 END IF
2376
2377 IF (.NOT. alpha_beta) THEN
2378 DO proc_shift = 1, para_env_sub%num_pe - 1
2379 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2380 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2381 CALL get_group_dist(gd_b_virtual(1), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
2382 CALL get_group_dist(gd_b_virtual(1), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)
2383
2384 external_ab(1:size_b_i, 1:rec_b_size) => buffer_1d(1:int(size_b_i, kind=int_8)*rec_b_size)
2385 CALL para_env_sub%sendrecv(local_ab(send_b_virtual_start:send_b_virtual_end, 1:size_b_k), proc_send, &
2386 external_ab(1:size_b_i, 1:rec_b_size), proc_receive, tag)
2387
2388 DO b = 1, my_b_size(1)
2389 b_global = b + my_b_virtual_start(1) - 1
2390 DO a = 1, rec_b_size
2391 a_global = a + rec_b_virtual_start - 1
2392 t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
2393 (eigenval(my_i, 1) + eigenval(my_k + kkb - 1, 1) &
2394 - eigenval(homo(1) + a_global, 1) - eigenval(homo(1) + b_global, 1))
2395 END DO
2396 END DO
2397 END DO
2398 END IF
2399 CALL timestop(handle2)
2400
2401 ! Expand the second set of integrals
2402 CALL timeset(routinen//"_exp_jk", handle2)
2403 local_ab = 0.0_dp
2404 CALL dgemm_counter_start(dgemm_counter)
2405 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_b_i, size_b_k, dimen_ri, 1.0_dp, &
2406 local_al_j, dimen_ri, local_al_k(:, :, kkb), dimen_ri, &
2407 0.0_dp, local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), 1:size_b_k), size_b_i)
2408 DO proc_shift = 1, para_env_sub%num_pe - 1
2409 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2410 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2411
2412 CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
2413
2414 external_al(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, kind=int_8)*rec_b_size)
2415
2416 CALL comm_exchange%sendrecv(local_al_j, proc_send, &
2417 external_al, proc_receive, tag)
2418 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_b_size, size_b_k, dimen_ri, 1.0_dp, &
2419 external_al, dimen_ri, local_al_k(:, :, kkb), dimen_ri, &
2420 0.0_dp, local_ab(rec_b_virtual_start:rec_b_virtual_end, 1:size_b_k), rec_b_size)
2421 END DO
2422 CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_b_k, dimen_ri)
2423 CALL timestop(handle2)
2424
2425 CALL timeset(routinen//"_Pij", handle2)
2426 DO b = 1, size_b_k
2427 b_global = b + my_b_virtual_start(kspin) - 1
2428 DO a = 1, my_b_size(ispin)
2429 a_global = a + my_b_virtual_start(ispin) - 1
2430 local_ab(a_global, b) = &
2431 local_ab(a_global, b)/(eigenval(my_j, ispin) + eigenval(my_k + kkb - 1, kspin) &
2432 - eigenval(homo(ispin) + a_global, ispin) - eigenval(homo(kspin) + b_global, kspin))
2433 END DO
2434 END DO
2435 !
2436 p_ij_elem = sum(local_ab*t_ab)
2437 DEALLOCATE (local_ab)
2438 IF ((.NOT. open_shell) .AND. (.NOT. alpha_beta)) THEN
2439 p_ij_elem = p_ij_elem*2.0_dp
2440 END IF
2441 IF (beta_beta) THEN
2442 mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) - p_ij_elem
2443 mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) - p_ij_elem
2444 ELSE
2445 mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) - p_ij_elem
2446 mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) - p_ij_elem
2447 END IF
2448 CALL timestop(handle2)
2449 END DO
2450 ELSE
2451 CALL timeset(routinen//"_comm", handle2)
 2452 ! no work to be done, possible messages to be exchanged
2453 DO proc_shift = 1, comm_exchange%num_pe - 1
2454 proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2455 proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2456
2457 send_ijk_index = num_ijk(proc_send)
2458
2459 IF (ijk_index <= send_ijk_index) THEN
 2460 ! something to send
2461 ijk_counter_send = (ijk_index - min(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
2462 integ_group_pos2color_sub(proc_send)
2463 send_i = ijk_map(1, ijk_counter_send)
2464 send_j = ijk_map(2, ijk_counter_send)
2465 send_k = ijk_map(3, ijk_counter_send)
2466 block_size = ijk_map(4, ijk_counter_send)
2467
2468 do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2469 do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2470 ! occupied i
2471 IF (do_send_i) THEN
2472 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_i), proc_send, tag)
2473 END IF
2474 ! occupied j
2475 IF (do_send_j) THEN
2476 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_j), proc_send, tag)
2477 END IF
2478 ! occupied k
2479 CALL comm_exchange%send(bib_c(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, tag)
2480 END IF
2481
2482 END DO ! proc loop
2483 CALL timestop(handle2)
2484 END IF
2485 END DO ! ijk_index loop
2486 DEALLOCATE (local_al_i)
2487 DEALLOCATE (local_al_j)
2488 DEALLOCATE (local_al_k)
2489 DEALLOCATE (t_ab)
2490 DEALLOCATE (ijk_map)
2491 END DO ! over number of loops (ispin)
2492 CALL timestop(handle)
2493
2494 END SUBROUTINE quasi_degenerate_p_ij
2495
2496! **************************************************************************************************
2497!> \brief ...
2498!> \param my_ijk ...
2499!> \param homo ...
2500!> \param homo_beta ...
2501!> \param Eigenval ...
2502!> \param mp2_env ...
2503!> \param ijk_map ...
2504!> \param unit_nr ...
2505!> \param ngroup ...
2506!> \param do_print_alpha ...
2507!> \param comm_exchange ...
2508!> \param num_ijk ...
2509!> \param max_ijk ...
2510!> \param color_sub ...
2511!> \param buffer_size ...
2512!> \param my_group_L_size ...
2513!> \param B_size_k ...
2514!> \param para_env ...
2515!> \param virtual ...
2516!> \param B_size_i ...
2517! **************************************************************************************************
2518 SUBROUTINE find_quasi_degenerate_ij(my_ijk, homo, homo_beta, Eigenval, mp2_env, ijk_map, unit_nr, ngroup, &
2519 do_print_alpha, comm_exchange, num_ijk, max_ijk, color_sub, &
2520 buffer_size, my_group_L_size, B_size_k, para_env, virtual, B_size_i)
2521
2522 INTEGER, INTENT(OUT) :: my_ijk
2523 INTEGER, INTENT(IN) :: homo, homo_beta
2524 REAL(kind=dp), DIMENSION(:), INTENT(IN) :: eigenval
2525 TYPE(mp2_type), INTENT(IN) :: mp2_env
2526 INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ijk_map
2527 INTEGER, INTENT(IN) :: unit_nr, ngroup
2528 LOGICAL, INTENT(IN) :: do_print_alpha
2529 TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2530 INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: num_ijk
2531 INTEGER, INTENT(OUT) :: max_ijk
2532 INTEGER, INTENT(IN) :: color_sub, buffer_size, my_group_l_size, &
2533 b_size_k
2534 TYPE(mp_para_env_type), INTENT(IN) :: para_env
2535 INTEGER, INTENT(IN) :: virtual, b_size_i
2536
2537 INTEGER :: block_size, communication_steps, communication_volume, iib, ij_counter, &
2538 ijk_counter, jjb, kkb, max_block_size, max_num_k_blocks, min_communication_volume, &
2539 my_steps, num_k_blocks, num_sing_ij, total_ijk
2540 INTEGER(KIND=int_8) :: mem
2541 LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ijk_marker
2542
2543 ALLOCATE (num_ijk(0:comm_exchange%num_pe - 1))
2544
2545 num_sing_ij = 0
2546 DO iib = 1, homo
2547 ! diagonal elements already updated
2548 DO jjb = iib + 1, homo
2549 IF (abs(eigenval(jjb) - eigenval(iib)) < mp2_env%ri_grad%eps_canonical) &
2550 num_sing_ij = num_sing_ij + 1
2551 END DO
2552 END DO
2553
2554 IF (unit_nr > 0) THEN
2555 IF (do_print_alpha) THEN
2556 WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
2557 "MO_INFO| Number of ij pairs below EPS_CANONICAL:", num_sing_ij
2558 ELSE
2559 WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
2560 "MO_INFO| Number of ij pairs (spin beta) below EPS_CANONICAL:", num_sing_ij
2561 END IF
2562 END IF
2563
2564 ! Determine the block size, first guess: use available buffer
2565 max_block_size = buffer_size/(my_group_l_size*b_size_k)
2566
2567 ! Second limit: memory
2568 CALL m_memory(mem)
2569 ! Convert to number of doubles
2570 mem = mem/8
2571 ! Remove local_ab (2x) and local_aL_i (2x)
2572 mem = mem - 2_int_8*(virtual*b_size_k + b_size_i*my_group_l_size)
2573 max_block_size = min(max_block_size, max(1, int(mem/(my_group_l_size*b_size_k), kind(max_block_size))))
2574
2575 ! Exchange the limit
2576 CALL para_env%min(max_block_size)
2577
2578 ! Find now the block size which minimizes the communication volume and then the number of communication steps
2579 block_size = 1
2580 min_communication_volume = 3*homo_beta*num_sing_ij
2581 communication_steps = 3*homo_beta*num_sing_ij
2582 DO iib = max_block_size, 2, -1
2583 max_num_k_blocks = homo_beta/iib*num_sing_ij
2584 num_k_blocks = max_num_k_blocks - mod(max_num_k_blocks, ngroup)
2585 communication_volume = num_k_blocks*(2 + iib) + 3*(homo_beta*num_sing_ij - iib*num_k_blocks)
2586 my_steps = num_k_blocks + homo_beta*num_sing_ij - iib*num_k_blocks
2587 IF (communication_volume < min_communication_volume) THEN
2588 block_size = iib
2589 min_communication_volume = communication_volume
2590 communication_steps = my_steps
2591 ELSE IF (communication_volume == min_communication_volume .AND. my_steps < communication_steps) THEN
2592 block_size = iib
2593 communication_steps = my_steps
2594 END IF
2595 END DO
2596
2597 IF (unit_nr > 0) THEN
2598 WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
2599 "MO_INFO| Block size:", block_size
2600 CALL m_flush(unit_nr)
2601 END IF
2602
2603 ! Calculate number of large blocks
2604 max_num_k_blocks = homo_beta/block_size*num_sing_ij
2605 num_k_blocks = max_num_k_blocks - mod(max_num_k_blocks, ngroup)
2606
2607 total_ijk = num_k_blocks + homo_beta*num_sing_ij - num_k_blocks*block_size
2608 ALLOCATE (ijk_map(4, total_ijk))
2609 ijk_map = 0
2610 ALLOCATE (ijk_marker(homo_beta, num_sing_ij))
2611 ijk_marker = .true.
2612
2613 my_ijk = 0
2614 ijk_counter = 0
2615 ij_counter = 0
2616 DO iib = 1, homo
2617 ! diagonal elements already updated
2618 DO jjb = iib + 1, homo
2619 IF (abs(eigenval(jjb) - eigenval(iib)) >= mp2_env%ri_grad%eps_canonical) cycle
2620 ij_counter = ij_counter + 1
2621 DO kkb = 1, homo_beta - mod(homo_beta, block_size), block_size
2622 IF (ijk_counter + 1 > num_k_blocks) EXIT
2623 ijk_counter = ijk_counter + 1
2624 ijk_marker(kkb:kkb + block_size - 1, ij_counter) = .false.
2625 ijk_map(1, ijk_counter) = iib
2626 ijk_map(2, ijk_counter) = jjb
2627 ijk_map(3, ijk_counter) = kkb
2628 ijk_map(4, ijk_counter) = block_size
2629 IF (mod(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2630 END DO
2631 END DO
2632 END DO
2633 ij_counter = 0
2634 DO iib = 1, homo
2635 ! diagonal elements already updated
2636 DO jjb = iib + 1, homo
2637 IF (abs(eigenval(jjb) - eigenval(iib)) >= mp2_env%ri_grad%eps_canonical) cycle
2638 ij_counter = ij_counter + 1
2639 DO kkb = 1, homo_beta
2640 IF (ijk_marker(kkb, ij_counter)) THEN
2641 ijk_counter = ijk_counter + 1
2642 ijk_map(1, ijk_counter) = iib
2643 ijk_map(2, ijk_counter) = jjb
2644 ijk_map(3, ijk_counter) = kkb
2645 ijk_map(4, ijk_counter) = 1
2646 IF (mod(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2647 END IF
2648 END DO
2649 END DO
2650 END DO
2651
2652 DEALLOCATE (ijk_marker)
2653
2654 CALL comm_exchange%allgather(my_ijk, num_ijk)
2655 max_ijk = maxval(num_ijk)
2656
2657 END SUBROUTINE find_quasi_degenerate_ij
2658
2659END MODULE mp2_ri_gpw
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
various routines to log and control the output. The idea is that decisions about where to log should ...
Counters to determine the performance of parallel DGEMMs.
elemental subroutine, public dgemm_counter_init(dgemm_counter, unit_nr, print_info)
Initialize a dgemm_counter.
subroutine, public dgemm_counter_write(dgemm_counter, para_env)
calculate and print flop rates
subroutine, public dgemm_counter_start(dgemm_counter)
start timer of the counter
subroutine, public dgemm_counter_stop(dgemm_counter, size1, size2, size3)
stop timer of the counter and provide matrix sizes
Types to describe group distributions.
elemental integer function, public maxsize(this)
...
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
2- and 3-center electron repulsion integral routines based on libint2 Currently available operators: ...
pure logical function, public compare_potential_types(potential1, potential2)
Helper function to compare libint_potential_types.
integer, parameter, public local_gemm_pu_gpu
Machine interface based on Fortran 2003 and POSIX.
Definition machine.F:17
subroutine, public m_memory(mem)
Returns the total amount of memory [bytes] in use, if known, zero otherwise.
Definition machine.F:440
subroutine, public m_flush(lunit)
flushes units if the &GLOBAL flag is set accordingly
Definition machine.F:130
real(kind=dp) function, public m_walltime()
returns time from a real-time clock, protected against rolling early/easily
Definition machine.F:147
Interface to the message passing library MPI.
Routines to calculate RI-GPW-MP2 energy using pw.
Definition mp2_ri_gpw.F:14
subroutine, public mp2_ri_gpw_compute_en(emp2_cou, emp2_ex, emp2_s, emp2_t, bib_c, mp2_env, para_env, para_env_sub, color_sub, gd_array, gd_b_virtual, eigenval, nmo, homo, dimen_ri, unit_nr, calc_forces, calc_ex)
...
Definition mp2_ri_gpw.F:75
Routines for calculating RI-MP2 gradients.
subroutine, public complete_gamma(mp2_env, b_ia_q, dimen_ri, homo, virtual, para_env, para_env_sub, ngroup, my_group_l_size, my_group_l_start, my_group_l_end, my_b_size, my_b_virtual_start, gd_array, gd_b_virtual, kspin)
complete the calculation of the Gamma matrices
Types needed for MP2 calculations.
Definition mp2_types.F:14
stores all the informations relevant to an mpi environment