(git:1155b05)
Loading...
Searching...
No Matches
mp2_ri_gpw.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2026 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Routines to calculate RI-GPW-MP2 energy using pw
10!> \par History
11!> 06.2012 created [Mauro Del Ben]
12!> 03.2019 Refactored from mp2_ri_gpw [Frederick Stein]
13! **************************************************************************************************
23 maxsize,&
25 USE kinds, ONLY: dp,&
26 int_8
29 USE machine, ONLY: m_flush,&
30 m_memory,&
32 USE message_passing, ONLY: mp_comm_type,&
35 USE mp2_types, ONLY: mp2_type,&
37
38!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num
39#include "./base/base_uses.f90"
40
41 IMPLICIT NONE
42
43 PRIVATE
44
45 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_gpw'
46
47 PUBLIC :: mp2_ri_gpw_compute_en
48
49CONTAINS
50
51! **************************************************************************************************
52!> \brief ...
53!> \param Emp2_Cou ...
54!> \param Emp2_EX ...
55!> \param Emp2_S ...
56!> \param Emp2_T ...
57!> \param BIb_C ...
58!> \param mp2_env ...
59!> \param para_env ...
60!> \param para_env_sub ...
61!> \param color_sub ...
62!> \param gd_array ...
63!> \param gd_B_virtual ...
64!> \param Eigenval ...
65!> \param nmo ...
66!> \param homo ...
67!> \param dimen_RI ...
68!> \param unit_nr ...
69!> \param calc_forces ...
70!> \param calc_ex ...
71! **************************************************************************************************
72 SUBROUTINE mp2_ri_gpw_compute_en(Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
73 gd_array, gd_B_virtual, &
74 Eigenval, nmo, homo, dimen_RI, unit_nr, calc_forces, calc_ex)
75 REAL(kind=dp), INTENT(INOUT) :: emp2_cou, emp2_ex, emp2_s, emp2_t
76 TYPE(three_dim_real_array), DIMENSION(:), &
77 INTENT(INOUT) :: bib_c
78 TYPE(mp2_type) :: mp2_env
79 TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
80 INTEGER, INTENT(IN) :: color_sub
81 TYPE(group_dist_d1_type), INTENT(INOUT) :: gd_array
82 INTEGER, DIMENSION(:), INTENT(IN) :: homo
83 INTEGER, INTENT(IN) :: nmo
84 REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: eigenval
85 TYPE(group_dist_d1_type), DIMENSION(SIZE(homo)), &
86 INTENT(INOUT) :: gd_b_virtual
87 INTEGER, INTENT(IN) :: dimen_ri, unit_nr
88 LOGICAL, INTENT(IN) :: calc_forces, calc_ex
89
90 CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_gpw_compute_en'
91
92 INTEGER :: a, a_global, b, b_global, block_size, decil, end_point, handle, handle2, handle3, &
93 iib, ij_counter, ij_counter_send, ij_index, integ_group_size, ispin, jjb, jspin, &
94 max_ij_pairs, my_block_size, my_group_l_end, my_group_l_size, my_group_l_size_orig, &
95 my_group_l_start, my_i, my_ij_pairs, my_j, my_new_group_l_size, ngroup, nspins, &
96 num_integ_group, proc_receive, proc_send, proc_shift, rec_b_size, rec_b_virtual_end, &
97 rec_b_virtual_start, rec_l_size, send_b_size, send_b_virtual_end, send_b_virtual_start, &
98 send_i, send_ij_index, send_j, start_point, tag, total_ij_pairs
99 INTEGER, ALLOCATABLE, DIMENSION(:) :: integ_group_pos2color_sub, my_b_size, &
100 my_b_virtual_end, my_b_virtual_start, num_ij_pairs, sizes_array_orig, virtual
101 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ij_map
102 INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: ranges_info_array
103 LOGICAL :: my_alpha_beta_case, my_beta_beta_case, &
104 my_open_shell_ss
105 REAL(kind=dp) :: amp_fac, my_emp2_cou, my_emp2_ex, &
106 sym_fac, t_new, t_start
107 REAL(kind=dp), ALLOCATABLE, DIMENSION(:), TARGET :: buffer_1d
108 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :), &
109 TARGET :: local_ab, local_ba, t_ab
110 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
111 TARGET :: local_i_al, local_j_al, y_i_ap, y_j_ap
112 REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
113 POINTER :: external_ab, external_i_al
114 REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :, :), &
115 POINTER :: bi_c_rec
116 TYPE(dgemm_counter_type) :: dgemm_counter
117 TYPE(mp_comm_type) :: comm_exchange, comm_rep
118 TYPE(three_dim_real_array), ALLOCATABLE, &
119 DIMENSION(:) :: b_ia_q
120
121 CALL timeset(routinen, handle)
122
123 nspins = SIZE(homo)
124
125 ALLOCATE (virtual(nspins))
126 virtual(:) = nmo - homo(:)
127
128 ALLOCATE (my_b_size(nspins), my_b_virtual_start(nspins), my_b_virtual_end(nspins))
129 DO ispin = 1, nspins
130 CALL get_group_dist(gd_b_virtual(ispin), para_env_sub%mepos, &
131 my_b_virtual_start(ispin), my_b_virtual_end(ispin), my_b_size(ispin))
132 END DO
133
134 CALL get_group_dist(gd_array, color_sub, my_group_l_start, my_group_l_end, my_group_l_size)
135
136 CALL dgemm_counter_init(dgemm_counter, unit_nr, mp2_env%ri_mp2%print_dgemm_info)
137
138 ! local_gemm_ctx has a very footprint the first time this routine is
139 ! called.
140 CALL mp2_env%local_gemm_ctx%create(local_gemm_pu_gpu)
141 CALL mp2_env%local_gemm_ctx%set_op_threshold_gpu(128*128*128*2)
142
143 CALL mp2_ri_get_integ_group_size( &
144 mp2_env, para_env, para_env_sub, gd_array, gd_b_virtual, &
145 homo, dimen_ri, unit_nr, &
146 integ_group_size, ngroup, &
147 num_integ_group, virtual, calc_forces)
148
149 ! now create a group that contains all the proc that have the same virtual starting point
150 ! in the integ group
151 CALL mp2_ri_create_group( &
152 para_env, para_env_sub, color_sub, &
153 gd_array%sizes, calc_forces, &
154 integ_group_size, my_group_l_end, &
155 my_group_l_size, my_group_l_size_orig, my_group_l_start, my_new_group_l_size, &
156 integ_group_pos2color_sub, sizes_array_orig, &
157 ranges_info_array, comm_exchange, comm_rep, num_integ_group)
158
159 ! We cannot fix the tag because of the recv routine
160 tag = 42
161
162 DO jspin = 1, nspins
163
164 CALL replicate_iak_2intgroup(bib_c(jspin)%array, comm_exchange, comm_rep, &
165 homo(jspin), gd_array%sizes, my_b_size(jspin), &
166 my_group_l_size, ranges_info_array)
167
168 DO ispin = 1, jspin
169
170 IF (unit_nr > 0) THEN
171 IF (nspins == 1) THEN
172 WRITE (unit_nr, *) "Start loop run"
173 ELSE IF (ispin == 1 .AND. jspin == 1) THEN
174 WRITE (unit_nr, *) "Start loop run alpha-alpha"
175 ELSE IF (ispin == 1 .AND. jspin == 2) THEN
176 WRITE (unit_nr, *) "Start loop run alpha-beta"
177 ELSE IF (ispin == 2 .AND. jspin == 2) THEN
178 WRITE (unit_nr, *) "Start loop run beta-beta"
179 END IF
180 CALL m_flush(unit_nr)
181 END IF
182
183 my_open_shell_ss = (nspins == 2) .AND. (ispin == jspin)
184
185 ! t_ab = amp_fac*(:,a|:,b)-(:,b|:,a)
186 ! If we calculate the gradient we need to distinguish
187 ! between alpha-alpha and beta-beta cases for UMP2
188
189 my_beta_beta_case = .false.
190 my_alpha_beta_case = .false.
191 IF (ispin /= jspin) THEN
192 my_alpha_beta_case = .true.
193 ELSE IF (my_open_shell_ss) THEN
194 IF (ispin == 2) my_beta_beta_case = .true.
195 END IF
196
197 amp_fac = mp2_env%scale_S + mp2_env%scale_T
198 IF (my_alpha_beta_case .OR. my_open_shell_ss) amp_fac = mp2_env%scale_T
199
200 CALL mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_b_size, &
201 my_group_l_size, calc_forces, ispin, jspin, local_ba)
202
203 CALL mp2_ri_get_block_size( &
204 mp2_env, para_env, para_env_sub, gd_array, gd_b_virtual(ispin:jspin), &
205 homo(ispin:jspin), virtual(ispin:jspin), dimen_ri, unit_nr, block_size, &
206 ngroup, num_integ_group, my_open_shell_ss, calc_forces, buffer_1d)
207
208 ! *****************************************************************
209 ! ********** REPLICATION-BLOCKED COMMUNICATION SCHEME ***********
210 ! *****************************************************************
211 ! introduce block size, the number of occupied orbitals has to be a
212 ! multiple of the block size
213
214 ! Calculate the maximum number of ij pairs that have to be computed
215 ! among groups
216 CALL mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo(ispin), homo(jspin), &
217 block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_ss, unit_nr)
218
219 ALLOCATE (num_ij_pairs(0:comm_exchange%num_pe - 1))
220 CALL comm_exchange%allgather(my_ij_pairs, num_ij_pairs)
221
222 max_ij_pairs = maxval(num_ij_pairs)
223
224 ! start real stuff
225 CALL mp2_ri_allocate_blk(dimen_ri, my_b_size, block_size, local_i_al, &
226 local_j_al, calc_forces, y_i_ap, y_j_ap, ispin, jspin)
227
228 CALL timeset(routinen//"_RI_loop", handle2)
229 my_emp2_cou = 0.0_dp
230 my_emp2_ex = 0.0_dp
231 t_start = m_walltime()
232 DO ij_index = 1, max_ij_pairs
233
234 ! Prediction is unreliable if we are in the first step of the loop
235 IF (unit_nr > 0 .AND. ij_index > 1) THEN
236 decil = ij_index*10/max_ij_pairs
237 IF (decil /= (ij_index - 1)*10/max_ij_pairs) THEN
238 t_new = m_walltime()
239 t_new = (t_new - t_start)/60.0_dp*(max_ij_pairs - ij_index + 1)/(ij_index - 1)
240 WRITE (unit_nr, fmt="(T3,A)") "Percentage of finished loop: "// &
241 cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
242 CALL m_flush(unit_nr)
243 END IF
244 END IF
245
246 IF (calc_forces) THEN
247 y_i_ap = 0.0_dp
248 y_j_ap = 0.0_dp
249 END IF
250
251 IF (ij_index <= my_ij_pairs) THEN
252 ! We have work to do
253 ij_counter = (ij_index - min(1, color_sub))*ngroup + color_sub
254 my_i = ij_map(1, ij_counter)
255 my_j = ij_map(2, ij_counter)
256 my_block_size = ij_map(3, ij_counter)
257
258 local_i_al = 0.0_dp
259 CALL fill_local_i_al(local_i_al(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
260 bib_c(ispin)%array(:, :, my_i:my_i + my_block_size - 1))
261
262 local_j_al = 0.0_dp
263 CALL fill_local_i_al(local_j_al(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
264 bib_c(jspin)%array(:, :, my_j:my_j + my_block_size - 1))
265
266 ! collect data from other proc
267 CALL timeset(routinen//"_comm", handle3)
268 DO proc_shift = 1, comm_exchange%num_pe - 1
269 proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
270 proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
271
272 send_ij_index = num_ij_pairs(proc_send)
273
274 CALL get_group_dist(gd_array, proc_receive, sizes=rec_l_size)
275
276 IF (ij_index <= send_ij_index) THEN
277 ij_counter_send = (ij_index - min(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
278 integ_group_pos2color_sub(proc_send)
279 send_i = ij_map(1, ij_counter_send)
280 send_j = ij_map(2, ij_counter_send)
281
282 ! occupied i
283 bi_c_rec(1:rec_l_size, 1:my_b_size(ispin), 1:my_block_size) => &
284 buffer_1d(1:rec_l_size*my_b_size(ispin)*my_block_size)
285 bi_c_rec = 0.0_dp
286 CALL comm_exchange%sendrecv(bib_c(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
287 proc_send, bi_c_rec, proc_receive, tag)
288
289 CALL fill_local_i_al(local_i_al(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
290 bi_c_rec(:, 1:my_b_size(ispin), :))
291
292 ! occupied j
293 bi_c_rec(1:rec_l_size, 1:my_b_size(jspin), 1:my_block_size) => &
294 buffer_1d(1:int(rec_l_size, int_8)*my_b_size(jspin)*my_block_size)
295 bi_c_rec = 0.0_dp
296 CALL comm_exchange%sendrecv(bib_c(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
297 proc_send, bi_c_rec, proc_receive, tag)
298
299 CALL fill_local_i_al(local_j_al(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
300 bi_c_rec(:, 1:my_b_size(jspin), :))
301
302 ELSE
303 ! we send nothing while we know that we have to receive something
304
305 ! occupied i
306 bi_c_rec(1:rec_l_size, 1:my_b_size(ispin), 1:my_block_size) => &
307 buffer_1d(1:int(rec_l_size, int_8)*my_b_size(ispin)*my_block_size)
308 bi_c_rec = 0.0_dp
309 CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)
310
311 CALL fill_local_i_al(local_i_al(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
312 bi_c_rec(:, 1:my_b_size(ispin), 1:my_block_size))
313
314 ! occupied j
315 bi_c_rec(1:rec_l_size, 1:my_b_size(jspin), 1:my_block_size) => &
316 buffer_1d(1:int(rec_l_size, int_8)*my_b_size(jspin)*my_block_size)
317 bi_c_rec = 0.0_dp
318 CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)
319
320 CALL fill_local_i_al(local_j_al(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
321 bi_c_rec(:, 1:my_b_size(jspin), 1:my_block_size))
322
323 END IF
324
325 END DO
326
327 CALL timestop(handle3)
328
329 ! loop over the block elements
330 DO iib = 1, my_block_size
331 DO jjb = 1, my_block_size
332 CALL timeset(routinen//"_expansion", handle3)
333 associate(my_local_i_al => local_i_al(:, :, iib), my_local_j_al => local_j_al(:, :, jjb))
334
335 ! calculate the integrals (ia|jb) strating from my local data ...
336 local_ab = 0.0_dp
337 IF ((my_alpha_beta_case) .AND. (calc_forces)) THEN
338 local_ba = 0.0_dp
339 END IF
340 CALL dgemm_counter_start(dgemm_counter)
341 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), my_b_size(jspin), dimen_ri, 1.0_dp, &
342 my_local_i_al, dimen_ri, my_local_j_al, dimen_ri, &
343 0.0_dp, local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), &
344 my_b_size(ispin))
345 ! Additional integrals only for alpha_beta case and forces
346 IF (my_alpha_beta_case .AND. calc_forces) THEN
347 local_ba(my_b_virtual_start(jspin):my_b_virtual_end(jspin), :) = &
348 transpose(local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :))
349 END IF
350 ! ... and from the other of my subgroup
351 DO proc_shift = 1, para_env_sub%num_pe - 1
352 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
353 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
354
355 CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, &
356 rec_b_virtual_end, rec_b_size)
357
358 external_i_al(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, int_8)*rec_b_size)
359 external_i_al = 0.0_dp
360
361 CALL para_env_sub%sendrecv(my_local_i_al, proc_send, &
362 external_i_al, proc_receive, tag)
363
364 CALL mp2_env%local_gemm_ctx%gemm( &
365 'T', 'N', rec_b_size, my_b_size(jspin), dimen_ri, 1.0_dp, &
366 external_i_al, dimen_ri, my_local_j_al, dimen_ri, &
367 0.0_dp, local_ab(rec_b_virtual_start:rec_b_virtual_end, 1:my_b_size(jspin)), rec_b_size)
368
369 ! Additional integrals only for alpha_beta case and forces
370 IF (my_alpha_beta_case .AND. calc_forces) THEN
371
372 CALL get_group_dist(gd_b_virtual(jspin), proc_receive, rec_b_virtual_start, &
373 rec_b_virtual_end, rec_b_size)
374
375 external_i_al(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, int_8)*rec_b_size)
376 external_i_al = 0.0_dp
377
378 CALL para_env_sub%sendrecv(my_local_j_al, proc_send, &
379 external_i_al, proc_receive, tag)
380
381 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_b_size, my_b_size(ispin), dimen_ri, 1.0_dp, &
382 external_i_al, dimen_ri, my_local_i_al, dimen_ri, &
383 0.0_dp, local_ba(rec_b_virtual_start:rec_b_virtual_end, 1:my_b_size(ispin)), rec_b_size)
384 END IF
385 END DO
386 IF (my_alpha_beta_case .AND. calc_forces) THEN
387 ! Is just an approximation, but the call does not allow it, it ought to be (virtual_i*B_size_j+virtual_j*B_size_i)*dimen_RI
388 CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_b_size(ispin) + my_b_size(jspin), dimen_ri)
389 ELSE
390 CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_b_size(jspin), dimen_ri)
391 END IF
392 CALL timestop(handle3)
393
394 !sample peak memory
395 CALL m_memory()
396
397 CALL timeset(routinen//"_ener", handle3)
398 ! calculate coulomb only MP2
399 sym_fac = 2.0_dp
400 IF (my_i == my_j) sym_fac = 1.0_dp
401 IF (my_alpha_beta_case) sym_fac = 0.5_dp
402 DO b = 1, my_b_size(jspin)
403 b_global = b + my_b_virtual_start(jspin) - 1
404 DO a = 1, virtual(ispin)
405 my_emp2_cou = my_emp2_cou - sym_fac*2.0_dp*local_ab(a, b)**2/ &
406 (eigenval(homo(ispin) + a, ispin) + eigenval(homo(jspin) + b_global, jspin) - &
407 eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, jspin))
408 END DO
409 END DO
410
411 IF (calc_ex) THEN
412 ! contract integrals with orbital energies for exchange MP2 energy
413 ! starting with local ...
414 IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) t_ab = 0.0_dp
415 DO b = 1, my_b_size(ispin)
416 b_global = b + my_b_virtual_start(ispin) - 1
417 DO a = 1, my_b_size(ispin)
418 a_global = a + my_b_virtual_start(ispin) - 1
419 my_emp2_ex = my_emp2_ex + sym_fac*local_ab(a_global, b)*local_ab(b_global, a)/ &
420 (eigenval(homo(ispin) + a_global, ispin) + eigenval(homo(ispin) + b_global, ispin) - &
421 eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, ispin))
422 IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) THEN
423 t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
424 (eigenval(homo(ispin) + a_global, ispin) + &
425 eigenval(homo(ispin) + b_global, ispin) - &
426 eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, ispin))
427 END IF
428 END DO
429 END DO
430 ! ... and then with external data
431 DO proc_shift = 1, para_env_sub%num_pe - 1
432 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
433 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
434
435 CALL get_group_dist(gd_b_virtual(ispin), proc_receive, &
436 rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
437 CALL get_group_dist(gd_b_virtual(ispin), proc_send, &
438 send_b_virtual_start, send_b_virtual_end, send_b_size)
439
440 external_ab(1:my_b_size(ispin), 1:rec_b_size) => &
441 buffer_1d(1:int(rec_b_size, int_8)*my_b_size(ispin))
442 external_ab = 0.0_dp
443
444 CALL para_env_sub%sendrecv(local_ab(send_b_virtual_start:send_b_virtual_end, 1:my_b_size(ispin)), proc_send, &
445 external_ab(1:my_b_size(ispin), 1:rec_b_size), proc_receive, tag)
446
447 DO b = 1, my_b_size(ispin)
448 b_global = b + my_b_virtual_start(ispin) - 1
449 DO a = 1, rec_b_size
450 a_global = a + rec_b_virtual_start - 1
451 my_emp2_ex = my_emp2_ex + sym_fac*local_ab(a_global, b)*external_ab(b, a)/ &
452 (eigenval(homo(ispin) + a_global, ispin) + eigenval(homo(ispin) + b_global, ispin) - &
453 eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, ispin))
454 IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) &
455 t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
456 (eigenval(homo(ispin) + a_global, ispin) + &
457 eigenval(homo(ispin) + b_global, ispin) - &
458 eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, ispin))
459 END DO
460 END DO
461 END DO
462 END IF
463 CALL timestop(handle3)
464
465 IF (calc_forces) THEN
466 ! update P_ab, Gamma_P_ia
467 CALL mp2_update_p_gamma(mp2_env, para_env_sub, gd_b_virtual, &
468 eigenval, homo, dimen_ri, iib, jjb, my_b_size, &
469 my_b_virtual_end, my_b_virtual_start, my_i, my_j, virtual, &
470 local_ab, t_ab, my_local_i_al, my_local_j_al, &
471 my_open_shell_ss, y_i_ap(:, :, iib), y_j_ap(:, :, jjb), local_ba, &
472 ispin, jspin, dgemm_counter, buffer_1d)
473
474 END IF
475
476 END associate
477
478 END DO ! jjB
479 END DO ! iiB
480
481 ELSE
482 ! We need it later in case of gradients
483 my_block_size = 1
484
485 CALL timeset(routinen//"_comm", handle3)
486 ! No work to do and we know that we have to receive nothing, but send something
487 ! send data to other proc
488 DO proc_shift = 1, comm_exchange%num_pe - 1
489 proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
490 proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
491
492 send_ij_index = num_ij_pairs(proc_send)
493
494 IF (ij_index <= send_ij_index) THEN
495 ! something to send
496 ij_counter_send = (ij_index - min(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
497 integ_group_pos2color_sub(proc_send)
498 send_i = ij_map(1, ij_counter_send)
499 send_j = ij_map(2, ij_counter_send)
500
501 ! occupied i
502 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
503 proc_send, tag)
504 ! occupied j
505 CALL comm_exchange%send(bib_c(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
506 proc_send, tag)
507 END IF
508 END DO
509 CALL timestop(handle3)
510 END IF
511
512 ! redistribute gamma
513 IF (calc_forces) THEN
514 CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(ispin)%array, ij_index, my_b_size(ispin), &
515 my_block_size, my_group_l_size, my_i, my_ij_pairs, ngroup, &
516 num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
517 ij_map, ranges_info_array, y_i_ap(:, :, 1:my_block_size), comm_exchange, &
518 gd_array%sizes, 1, buffer_1d)
519 CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(jspin)%array, ij_index, my_b_size(jspin), &
520 my_block_size, my_group_l_size, my_j, my_ij_pairs, ngroup, &
521 num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
522 ij_map, ranges_info_array, y_j_ap(:, :, 1:my_block_size), comm_exchange, &
523 gd_array%sizes, 2, buffer_1d)
524 END IF
525
526 END DO
527 CALL timestop(handle2)
528
529 DEALLOCATE (local_i_al)
530 DEALLOCATE (local_j_al)
531 DEALLOCATE (ij_map)
532 DEALLOCATE (num_ij_pairs)
533 DEALLOCATE (local_ab)
534
535 IF (calc_forces) THEN
536 DEALLOCATE (y_i_ap)
537 DEALLOCATE (y_j_ap)
538 IF (ALLOCATED(t_ab)) THEN
539 DEALLOCATE (t_ab)
540 END IF
541 DEALLOCATE (local_ba)
542
543 ! here we check if there are almost degenerate ij
544 ! pairs and we update P_ij with these contribution.
545 ! If all pairs are degenerate with each other this step will scale O(N^6),
546 ! if the number of degenerate pairs scales linearly with the system size
547 ! this step will scale O(N^5).
548 ! Start counting the number of almost degenerate ij pairs according
549 ! to eps_canonical
550 CALL quasi_degenerate_p_ij( &
551 mp2_env, eigenval(:, ispin:jspin), homo(ispin:jspin), virtual(ispin:jspin), my_open_shell_ss, &
552 my_beta_beta_case, bib_c(ispin:jspin), unit_nr, dimen_ri, &
553 my_b_size(ispin:jspin), ngroup, my_group_l_size, &
554 color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
555 my_b_virtual_start(ispin:jspin), my_b_virtual_end(ispin:jspin), gd_array%sizes, gd_b_virtual(ispin:jspin), &
556 integ_group_pos2color_sub, dgemm_counter, buffer_1d)
557
558 END IF
559
560 DEALLOCATE (buffer_1d)
561
562 ! Dereplicate BIb_C and Gamma_P_ia to save memory
563 ! These matrices will not be needed in that fashion anymore
564 ! B_ia_Q will needed later
565 IF (calc_forces .AND. jspin == nspins) THEN
566 IF (.NOT. ALLOCATED(b_ia_q)) ALLOCATE (b_ia_q(nspins))
567 ALLOCATE (b_ia_q(ispin)%array(homo(ispin), my_b_size(ispin), my_group_l_size_orig))
568 b_ia_q(ispin)%array = 0.0_dp
569 DO jjb = 1, homo(ispin)
570 DO iib = 1, my_b_size(ispin)
571 b_ia_q(ispin)%array(jjb, iib, 1:my_group_l_size_orig) = &
572 bib_c(ispin)%array(1:my_group_l_size_orig, iib, jjb)
573 END DO
574 END DO
575 DEALLOCATE (bib_c(ispin)%array)
576
577 ! sum Gamma and dereplicate
578 ALLOCATE (bib_c(ispin)%array(my_b_size(ispin), homo(ispin), my_group_l_size_orig))
579 DO proc_shift = 1, comm_rep%num_pe - 1
580 ! invert order
581 proc_send = modulo(comm_rep%mepos - proc_shift, comm_rep%num_pe)
582 proc_receive = modulo(comm_rep%mepos + proc_shift, comm_rep%num_pe)
583
584 start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
585 end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
586
587 CALL comm_rep%sendrecv(mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, start_point:end_point), &
588 proc_send, bib_c(ispin)%array, proc_receive, tag)
589!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
590!$OMP SHARED(mp2_env,BIb_C,ispin,homo,my_B_size,my_group_L_size_orig)
591 mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_l_size_orig) = &
592 mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_l_size_orig) &
593 + bib_c(ispin)%array(:, :, :)
594!$OMP END PARALLEL WORKSHARE
595 END DO
596
597 bib_c(ispin)%array(:, :, :) = mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_l_size_orig)
598 DEALLOCATE (mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
599 CALL move_alloc(bib_c(ispin)%array, mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
600 ELSE IF (jspin == nspins) THEN
601 DEALLOCATE (bib_c(ispin)%array)
602 END IF
603
604 CALL para_env%sum(my_emp2_cou)
605 CALL para_env%sum(my_emp2_ex)
606
607 IF (my_open_shell_ss .OR. my_alpha_beta_case) THEN
608 IF (my_alpha_beta_case) THEN
609 emp2_s = emp2_s + my_emp2_cou
610 emp2_cou = emp2_cou + my_emp2_cou
611 ELSE
612 my_emp2_cou = my_emp2_cou*0.25_dp
613 my_emp2_ex = my_emp2_ex*0.5_dp
614 emp2_t = emp2_t + my_emp2_cou + my_emp2_ex
615 emp2_cou = emp2_cou + my_emp2_cou
616 emp2_ex = emp2_ex + my_emp2_ex
617 END IF
618 ELSE
619 emp2_cou = emp2_cou + my_emp2_cou
620 emp2_ex = emp2_ex + my_emp2_ex
621 END IF
622 END DO
623
624 END DO
625
626 DEALLOCATE (integ_group_pos2color_sub)
627 DEALLOCATE (ranges_info_array)
628
629 CALL comm_exchange%free()
630 CALL comm_rep%free()
631
632 IF (calc_forces) THEN
633 ! recover original information (before replication)
634 DEALLOCATE (gd_array%sizes)
635 iib = SIZE(sizes_array_orig)
636 ALLOCATE (gd_array%sizes(0:iib - 1))
637 gd_array%sizes(:) = sizes_array_orig
638 DEALLOCATE (sizes_array_orig)
639
640 ! Remove replication from BIb_C and reorder the matrix
641 my_group_l_size = my_group_l_size_orig
642
643 ! B_ia_Q(ispin)%array will be deallocated inside of complete_gamma
644 DO ispin = 1, nspins
645 CALL complete_gamma(mp2_env, b_ia_q(ispin)%array, dimen_ri, homo(ispin), &
646 virtual(ispin), para_env, para_env_sub, ngroup, &
647 my_group_l_size, my_group_l_start, my_group_l_end, &
648 my_b_size(ispin), my_b_virtual_start(ispin), &
649 gd_array, gd_b_virtual(ispin), &
650 ispin)
651 END DO
652 DEALLOCATE (b_ia_q)
653
654 IF (nspins == 1) mp2_env%ri_grad%P_ab(1)%array(:, :) = mp2_env%ri_grad%P_ab(1)%array(:, :)*2.0_dp
655 block
656 TYPE(mp_comm_type) :: comm
657 CALL comm%from_split(para_env, para_env_sub%mepos)
658 DO ispin = 1, nspins
659 ! P_ab is only replicated over all subgroups
660 CALL comm%sum(mp2_env%ri_grad%P_ab(ispin)%array)
661 ! P_ij is replicated over all processes
662 CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
663 END DO
664 CALL comm%free()
665 END block
666 END IF
667
668 CALL release_group_dist(gd_array)
669 DO ispin = 1, nspins
670 IF (ALLOCATED(bib_c(ispin)%array)) DEALLOCATE (bib_c(ispin)%array)
671 CALL release_group_dist(gd_b_virtual(ispin))
672 END DO
673
674 ! We do not need this matrix later, so deallocate it here to safe memory
675 IF (calc_forces) DEALLOCATE (mp2_env%ri_grad%PQ_half)
676 IF (calc_forces .AND. .NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) &
677 DEALLOCATE (mp2_env%ri_grad%operator_half)
678
679 CALL dgemm_counter_write(dgemm_counter, para_env)
680
681 ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
682 CALL mp2_env%local_gemm_ctx%destroy()
683 CALL timestop(handle)
684
685 END SUBROUTINE mp2_ri_gpw_compute_en
686
687! **************************************************************************************************
688!> \brief ...
689!> \param local_i_aL ...
690!> \param ranges_info_array ...
691!> \param BIb_C_rec ...
692! **************************************************************************************************
693 SUBROUTINE fill_local_i_al(local_i_aL, ranges_info_array, BIb_C_rec)
694 REAL(kind=dp), DIMENSION(:, :, :), INTENT(INOUT) :: local_i_al
695 INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
696 REAL(kind=dp), DIMENSION(:, :, :), INTENT(IN) :: bib_c_rec
697
698 CHARACTER(LEN=*), PARAMETER :: routinen = 'fill_local_i_aL'
699
700 INTEGER :: end_point, handle, irep, lend_pos, &
701 lstart_pos, start_point
702
703 CALL timeset(routinen, handle)
704
705 DO irep = 1, SIZE(ranges_info_array, 2)
706 lstart_pos = ranges_info_array(1, irep)
707 lend_pos = ranges_info_array(2, irep)
708 start_point = ranges_info_array(3, irep)
709 end_point = ranges_info_array(4, irep)
710
711!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
712!$OMP SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
713 local_i_al(lstart_pos:lend_pos, :, :) = bib_c_rec(start_point:end_point, :, :)
714!$OMP END PARALLEL WORKSHARE
715 END DO
716
717 CALL timestop(handle)
718
719 END SUBROUTINE fill_local_i_al
720
721! **************************************************************************************************
722!> \brief ...
723!> \param local_i_aL ...
724!> \param ranges_info_array ...
725!> \param BIb_C_rec ...
726! **************************************************************************************************
727 SUBROUTINE fill_local_i_al_2d(local_i_aL, ranges_info_array, BIb_C_rec)
728 REAL(kind=dp), DIMENSION(:, :), INTENT(INOUT) :: local_i_al
729 INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
730 REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: bib_c_rec
731
732 CHARACTER(LEN=*), PARAMETER :: routinen = 'fill_local_i_aL_2D'
733
734 INTEGER :: end_point, handle, irep, lend_pos, &
735 lstart_pos, start_point
736
737 CALL timeset(routinen, handle)
738
739 DO irep = 1, SIZE(ranges_info_array, 2)
740 lstart_pos = ranges_info_array(1, irep)
741 lend_pos = ranges_info_array(2, irep)
742 start_point = ranges_info_array(3, irep)
743 end_point = ranges_info_array(4, irep)
744
745!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
746!$OMP SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
747 local_i_al(lstart_pos:lend_pos, :) = bib_c_rec(start_point:end_point, :)
748!$OMP END PARALLEL WORKSHARE
749 END DO
750
751 CALL timestop(handle)
752
753 END SUBROUTINE fill_local_i_al_2d
754
755! **************************************************************************************************
756!> \brief ...
757!> \param BIb_C ...
758!> \param comm_exchange ...
759!> \param comm_rep ...
760!> \param homo ...
761!> \param sizes_array ...
762!> \param my_B_size ...
763!> \param my_group_L_size ...
764!> \param ranges_info_array ...
765! **************************************************************************************************
   SUBROUTINE replicate_iak_2intgroup(BIb_C, comm_exchange, comm_rep, homo, sizes_array, my_B_size, &
                                      my_group_L_size, ranges_info_array)
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
         INTENT(INOUT) :: bib_c
      TYPE(mp_comm_type), INTENT(IN) :: comm_exchange, comm_rep
      INTEGER, INTENT(IN) :: homo
      INTEGER, DIMENSION(:), INTENT(IN) :: sizes_array
      INTEGER, INTENT(IN) :: my_b_size, my_group_l_size
      INTEGER, DIMENSION(:, 0:, 0:), INTENT(IN) :: ranges_info_array

      CHARACTER(LEN=*), PARAMETER :: routinen = 'replicate_iaK_2intgroup'

      INTEGER :: end_point, handle, max_l_size, &
                 proc_receive, proc_shift, start_point
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: bib_c_copy
      REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: bib_c_gather
      ! Replicate the (L, a, i) integral block BIb_C across the replication
      ! communicator comm_rep: each rank contributes its own L-chunk and, on
      ! return, BIb_C holds the concatenation (size my_group_L_size along dim 1)
      ! of all chunks owned by the replication group, ordered via
      ! ranges_info_array. BIb_C is deallocated and reallocated with the new
      ! (larger) first extent, so its shape changes across this call.

      CALL timeset(routinen, handle)

      ! replication scheme using mpi_allgather
      ! get the max L size of the
      max_l_size = maxval(sizes_array)

      ! pad the local chunk to the maximum L size so that a single allgather
      ! with uniform message size can be used
      ALLOCATE (bib_c_copy(max_l_size, my_b_size, homo))
      bib_c_copy = 0.0_dp
      bib_c_copy(1:SIZE(bib_c, 1), 1:my_b_size, 1:homo) = bib_c

      ! free the original array before allocating the gather buffer to keep
      ! the peak memory footprint down
      DEALLOCATE (bib_c)

      ALLOCATE (bib_c_gather(max_l_size, my_b_size, homo, 0:comm_rep%num_pe - 1))
      bib_c_gather = 0.0_dp

      CALL comm_rep%allgather(bib_c_copy, bib_c_gather)

      DEALLOCATE (bib_c_copy)

      ALLOCATE (bib_c(my_group_l_size, my_b_size, homo))
      bib_c = 0.0_dp

      ! reorder data
      ! NOTE(review): rows 3/4 of ranges_info_array appear to give the
      ! destination window for the chunk gathered from rank proc_receive
      ! (= mepos - proc_shift, i.e. gather order inverted) — layout defined
      ! in mp2_ri_create_group, not visible here; confirm before refactoring.
      DO proc_shift = 0, comm_rep%num_pe - 1
         proc_receive = modulo(comm_rep%mepos - proc_shift, comm_rep%num_pe)

         start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
         end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)

         ! copy the (unpadded) leading part of each gathered chunk into its
         ! destination window along the first (RI/L) dimension
         bib_c(start_point:end_point, 1:my_b_size, 1:homo) = &
            bib_c_gather(1:end_point - start_point + 1, 1:my_b_size, 1:homo, proc_receive)

      END DO

      DEALLOCATE (bib_c_gather)

      CALL timestop(handle)

   END SUBROUTINE replicate_iak_2intgroup
822
823! **************************************************************************************************
824!> \brief ...
825!> \param local_ab ...
826!> \param t_ab ...
827!> \param mp2_env ...
828!> \param homo ...
829!> \param virtual ...
830!> \param my_B_size ...
831!> \param my_group_L_size ...
832!> \param calc_forces ...
833!> \param ispin ...
834!> \param jspin ...
835!> \param local_ba ...
836! **************************************************************************************************
837 SUBROUTINE mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
838 my_group_L_size, calc_forces, ispin, jspin, local_ba)
839 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :), &
840 INTENT(OUT) :: local_ab, t_ab
841 TYPE(mp2_type) :: mp2_env
842 INTEGER, INTENT(IN) :: homo(2), virtual(2), my_b_size(2), &
843 my_group_l_size
844 LOGICAL, INTENT(IN) :: calc_forces
845 INTEGER, INTENT(IN) :: ispin, jspin
846 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :), &
847 INTENT(OUT) :: local_ba
848
849 CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_allocate_no_blk'
850
851 INTEGER :: handle
852
853 CALL timeset(routinen, handle)
854
855 ALLOCATE (local_ab(virtual(ispin), my_b_size(jspin)))
856 local_ab = 0.0_dp
857
858 IF (calc_forces) THEN
859 IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ij(jspin)%array)) THEN
860 ALLOCATE (mp2_env%ri_grad%P_ij(jspin)%array(homo(ispin), homo(ispin)))
861 mp2_env%ri_grad%P_ij(jspin)%array = 0.0_dp
862 END IF
863 IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ab(jspin)%array)) THEN
864 ALLOCATE (mp2_env%ri_grad%P_ab(jspin)%array(my_b_size(jspin), virtual(jspin)))
865 mp2_env%ri_grad%P_ab(jspin)%array = 0.0_dp
866 END IF
867 IF (.NOT. ALLOCATED(mp2_env%ri_grad%Gamma_P_ia(jspin)%array)) THEN
868 ALLOCATE (mp2_env%ri_grad%Gamma_P_ia(jspin)%array(my_b_size(jspin), homo(jspin), my_group_l_size))
869 mp2_env%ri_grad%Gamma_P_ia(jspin)%array = 0.0_dp
870 END IF
871
872 IF (ispin == jspin) THEN
873 ! For non-alpha-beta case we need amplitudes
874 ALLOCATE (t_ab(virtual(ispin), my_b_size(jspin)))
875
876 ! That is just a dummy. In that way, we can pass it as array to other routines w/o requirement for allocatable array
877 ALLOCATE (local_ba(1, 1))
878 ELSE
879 ! We need more integrals
880 ALLOCATE (local_ba(virtual(jspin), my_b_size(ispin)))
881 END IF
882 END IF
883 !
884
885 CALL timestop(handle)
886
887 END SUBROUTINE mp2_ri_allocate_no_blk
888
889! **************************************************************************************************
890!> \brief ...
891!> \param dimen_RI ...
892!> \param my_B_size ...
893!> \param block_size ...
894!> \param local_i_aL ...
895!> \param local_j_aL ...
896!> \param calc_forces ...
897!> \param Y_i_aP ...
898!> \param Y_j_aP ...
899!> \param ispin ...
900!> \param jspin ...
901! **************************************************************************************************
902 SUBROUTINE mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, &
903 local_i_aL, local_j_aL, calc_forces, &
904 Y_i_aP, Y_j_aP, ispin, jspin)
905 INTEGER, INTENT(IN) :: dimen_ri, my_b_size(2), block_size
906 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
907 INTENT(OUT) :: local_i_al, local_j_al
908 LOGICAL, INTENT(IN) :: calc_forces
909 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :), &
910 INTENT(OUT) :: y_i_ap, y_j_ap
911 INTEGER, INTENT(IN) :: ispin, jspin
912
913 CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_allocate_blk'
914
915 INTEGER :: handle
916
917 CALL timeset(routinen, handle)
918
919 ALLOCATE (local_i_al(dimen_ri, my_b_size(ispin), block_size))
920 local_i_al = 0.0_dp
921 ALLOCATE (local_j_al(dimen_ri, my_b_size(jspin), block_size))
922 local_j_al = 0.0_dp
923
924 IF (calc_forces) THEN
925 ALLOCATE (y_i_ap(my_b_size(ispin), dimen_ri, block_size))
926 y_i_ap = 0.0_dp
927 ! For closed-shell, alpha-alpha and beta-beta my_B_size_beta=my_b_size
928 ! Not for alpha-beta case: Y_j_aP_beta is sent and received as Y_j_aP
929 ALLOCATE (y_j_ap(my_b_size(jspin), dimen_ri, block_size))
930 y_j_ap = 0.0_dp
931 END IF
932 !
933
934 CALL timestop(handle)
935
936 END SUBROUTINE mp2_ri_allocate_blk
937
938! **************************************************************************************************
939!> \brief ...
940!> \param my_alpha_beta_case ...
941!> \param total_ij_pairs ...
942!> \param homo ...
943!> \param homo_beta ...
944!> \param block_size ...
945!> \param ngroup ...
946!> \param ij_map ...
947!> \param color_sub ...
948!> \param my_ij_pairs ...
949!> \param my_open_shell_SS ...
950!> \param unit_nr ...
951! **************************************************************************************************
   SUBROUTINE mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo, homo_beta, &
                                   block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
      LOGICAL, INTENT(IN) :: my_alpha_beta_case
      INTEGER, INTENT(OUT) :: total_ij_pairs
      INTEGER, INTENT(IN) :: homo, homo_beta, block_size, ngroup
      INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ij_map
      INTEGER, INTENT(IN) :: color_sub
      INTEGER, INTENT(OUT) :: my_ij_pairs
      LOGICAL, INTENT(IN) :: my_open_shell_ss
      INTEGER, INTENT(IN) :: unit_nr

      CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_communication'

      INTEGER :: assigned_blocks, first_i_block, first_j_block, handle, iib, ij_block_counter, &
                 ij_counter, jjb, last_i_block, last_j_block, num_block_per_group, num_ij_blocks, &
                 num_ij_blocks_beta, total_ij_block, total_ij_pairs_blocks
      LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ij_marker

! Calculate the maximum number of ij pairs that have to be computed
! among groups.
!
! ij_map(1:3, k) holds (i_start, j_start, size) of work unit k: either a
! block_size x block_size block of (i,j) pairs or a single pair (size 1).
! Work unit k is assigned to the subgroup with mod(k, ngroup) == color_sub;
! my_ij_pairs counts the units assigned to this process' subgroup.

      CALL timeset(routinen, handle)

      IF (.NOT. my_open_shell_ss .AND. .NOT. my_alpha_beta_case) THEN
         ! Closed-shell case: all pairs i <= j of the upper triangle,
         ! diagonal included.
         total_ij_pairs = homo*(1 + homo)/2
         num_ij_blocks = homo/block_size - 1

         first_i_block = 1
         last_i_block = block_size*(num_ij_blocks - 1)

         ! j blocks start one block after the i blocks so that every block is
         ! strictly above the diagonal
         first_j_block = block_size + 1
         last_j_block = block_size*(num_ij_blocks + 1)

         ! First pass: count the blocks that tile the strict upper triangle
         ij_block_counter = 0
         DO iib = first_i_block, last_i_block, block_size
            DO jjb = iib + block_size, last_j_block, block_size
               ij_block_counter = ij_block_counter + 1
            END DO
         END DO

         ! Only a multiple of ngroup blocks is distributed block-wise; the
         ! remaining pairs are handled individually (size-1 entries below)
         total_ij_block = ij_block_counter
         num_block_per_group = total_ij_block/ngroup
         assigned_blocks = num_block_per_group*ngroup

         ! Total number of work units: blocks + leftover single pairs
         total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))

         ! ij_marker tracks which (i,j) pairs are already covered by a block
         ALLOCATE (ij_marker(homo, homo))
         ij_marker = .true.
         ALLOCATE (ij_map(3, total_ij_pairs_blocks))
         ij_map = 0
         ij_counter = 0
         my_ij_pairs = 0
         ! Second pass: register the block-wise work units
         DO iib = first_i_block, last_i_block, block_size
            DO jjb = iib + block_size, last_j_block, block_size
               ! Once the evenly-divisible count is reached, stop assigning
               ! blocks (EXIT leaves only the inner loop; remaining pairs fall
               ! through to the size-1 sweep below)
               IF (ij_counter + 1 > assigned_blocks) EXIT
               ij_counter = ij_counter + 1
               ij_marker(iib:iib + block_size - 1, jjb:jjb + block_size - 1) = .false.
               ij_map(1, ij_counter) = iib
               ij_map(2, ij_counter) = jjb
               ij_map(3, ij_counter) = block_size
               IF (mod(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
            END DO
         END DO
         ! Sweep over all i <= j pairs and register everything not yet covered
         ! by a block as an individual (size-1) work unit
         DO iib = 1, homo
            DO jjb = iib, homo
               IF (ij_marker(iib, jjb)) THEN
                  ij_counter = ij_counter + 1
                  ij_map(1, ij_counter) = iib
                  ij_map(2, ij_counter) = jjb
                  ij_map(3, ij_counter) = 1
                  IF (mod(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
               END IF
            END DO
         END DO
         DEALLOCATE (ij_marker)

      ELSE IF (.NOT. my_alpha_beta_case) THEN
         ! These are the cases alpha/alpha and beta/beta.
         ! We do not have to consider the diagonal elements (they contribute
         ! zero for same-spin pairs), hence strictly i < j.
         total_ij_pairs = homo*(homo - 1)/2
         num_ij_blocks = (homo - 1)/block_size - 1

         first_i_block = 1
         last_i_block = block_size*(num_ij_blocks - 1)

         ! We shift the blocks to prevent the calculation of the diagonal elements which always give zero
         first_j_block = block_size + 2
         last_j_block = block_size*(num_ij_blocks + 1) + 1

         ! First pass: count blocks (same scheme as the closed-shell branch,
         ! with the j range shifted by one to skip the diagonal)
         ij_block_counter = 0
         DO iib = first_i_block, last_i_block, block_size
            DO jjb = iib + block_size + 1, last_j_block, block_size
               ij_block_counter = ij_block_counter + 1
            END DO
         END DO

         total_ij_block = ij_block_counter
         num_block_per_group = total_ij_block/ngroup
         assigned_blocks = num_block_per_group*ngroup

         total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))

         ALLOCATE (ij_marker(homo, homo))
         ij_marker = .true.
         ALLOCATE (ij_map(3, total_ij_pairs_blocks))
         ij_map = 0
         ij_counter = 0
         my_ij_pairs = 0
         ! Second pass: register block-wise work units
         DO iib = first_i_block, last_i_block, block_size
            DO jjb = iib + block_size + 1, last_j_block, block_size
               IF (ij_counter + 1 > assigned_blocks) EXIT
               ij_counter = ij_counter + 1
               ij_marker(iib:iib + block_size - 1, jjb:jjb + block_size - 1) = .false.
               ij_map(1, ij_counter) = iib
               ij_map(2, ij_counter) = jjb
               ij_map(3, ij_counter) = block_size
               IF (mod(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
            END DO
         END DO
         ! Leftover pairs (strictly above the diagonal) as size-1 units
         DO iib = 1, homo
            DO jjb = iib + 1, homo
               IF (ij_marker(iib, jjb)) THEN
                  ij_counter = ij_counter + 1
                  ij_map(1, ij_counter) = iib
                  ij_map(2, ij_counter) = jjb
                  ij_map(3, ij_counter) = 1
                  IF (mod(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
               END IF
            END DO
         END DO
         DEALLOCATE (ij_marker)

      ELSE
         ! Alpha-beta case: the full rectangular i (alpha) x j (beta) grid
         total_ij_pairs = homo*homo_beta
         num_ij_blocks = homo/block_size
         num_ij_blocks_beta = homo_beta/block_size

         first_i_block = 1
         last_i_block = block_size*(num_ij_blocks - 1)

         first_j_block = 1
         last_j_block = block_size*(num_ij_blocks_beta - 1)

         ! First pass: count blocks tiling the rectangle
         ij_block_counter = 0
         DO iib = first_i_block, last_i_block, block_size
            DO jjb = first_j_block, last_j_block, block_size
               ij_block_counter = ij_block_counter + 1
            END DO
         END DO

         total_ij_block = ij_block_counter
         num_block_per_group = total_ij_block/ngroup
         assigned_blocks = num_block_per_group*ngroup

         total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))

         ALLOCATE (ij_marker(homo, homo_beta))
         ij_marker = .true.
         ALLOCATE (ij_map(3, total_ij_pairs_blocks))
         ij_map = 0
         ij_counter = 0
         my_ij_pairs = 0
         ! Second pass: register block-wise work units
         DO iib = first_i_block, last_i_block, block_size
            DO jjb = first_j_block, last_j_block, block_size
               IF (ij_counter + 1 > assigned_blocks) EXIT
               ij_counter = ij_counter + 1
               ij_marker(iib:iib + block_size - 1, jjb:jjb + block_size - 1) = .false.
               ij_map(1, ij_counter) = iib
               ij_map(2, ij_counter) = jjb
               ij_map(3, ij_counter) = block_size
               IF (mod(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
            END DO
         END DO
         ! Leftover pairs of the rectangle as size-1 units
         DO iib = 1, homo
            DO jjb = 1, homo_beta
               IF (ij_marker(iib, jjb)) THEN
                  ij_counter = ij_counter + 1
                  ij_map(1, ij_counter) = iib
                  ij_map(2, ij_counter) = jjb
                  ij_map(3, ij_counter) = 1
                  IF (mod(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
               END IF
            END DO
         END DO
         DEALLOCATE (ij_marker)
      END IF

      ! Report which fraction of pairs could not be blocked and is therefore
      ! communicated pair-by-pair
      IF (unit_nr > 0) THEN
         IF (block_size == 1) THEN
            WRITE (unit=unit_nr, fmt="(T3,A,T66,F15.1)") &
               "RI_INFO| Percentage of ij pairs communicated with block size 1:", 100.0_dp
         ELSE
            WRITE (unit=unit_nr, fmt="(T3,A,T66,F15.1)") &
               "RI_INFO| Percentage of ij pairs communicated with block size 1:", &
               100.0_dp*real((total_ij_pairs - assigned_blocks*(block_size**2)), kind=dp)/real(total_ij_pairs, kind=dp)
         END IF
         CALL m_flush(unit_nr)
      END IF

      CALL timestop(handle)

   END SUBROUTINE mp2_ri_communication
1154
1155! **************************************************************************************************
1156!> \brief ...
1157!> \param para_env ...
1158!> \param para_env_sub ...
1159!> \param color_sub ...
1160!> \param sizes_array ...
1161!> \param calc_forces ...
1162!> \param integ_group_size ...
1163!> \param my_group_L_end ...
1164!> \param my_group_L_size ...
1165!> \param my_group_L_size_orig ...
1166!> \param my_group_L_start ...
1167!> \param my_new_group_L_size ...
1168!> \param integ_group_pos2color_sub ...
1169!> \param sizes_array_orig ...
1170!> \param ranges_info_array ...
1171!> \param comm_exchange ...
1172!> \param comm_rep ...
1173!> \param num_integ_group ...
1174! **************************************************************************************************
   SUBROUTINE mp2_ri_create_group(para_env, para_env_sub, color_sub, &
                                  sizes_array, calc_forces, &
                                  integ_group_size, my_group_L_end, &
                                  my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
                                  integ_group_pos2color_sub, &
                                  sizes_array_orig, ranges_info_array, comm_exchange, comm_rep, num_integ_group)
      TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
      INTEGER, INTENT(IN) :: color_sub
      INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(INOUT) :: sizes_array
      LOGICAL, INTENT(IN) :: calc_forces
      INTEGER, INTENT(IN) :: integ_group_size, my_group_l_end
      INTEGER, INTENT(INOUT) :: my_group_l_size
      INTEGER, INTENT(OUT) :: my_group_l_size_orig
      INTEGER, INTENT(IN) :: my_group_l_start
      INTEGER, INTENT(INOUT) :: my_new_group_l_size
      INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: integ_group_pos2color_sub, &
                                                         sizes_array_orig
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
         INTENT(OUT) :: ranges_info_array
      TYPE(mp_comm_type), INTENT(OUT) :: comm_exchange, comm_rep
      INTEGER, INTENT(IN) :: num_integ_group

      CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_create_group'

      INTEGER :: handle, iib, proc_receive, proc_shift, &
                 sub_sub_color
      INTEGER, ALLOCATABLE, DIMENSION(:) :: new_sizes_array, rep_ends_array, &
                                            rep_sizes_array, rep_starts_array
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: my_info

      CALL timeset(routinen, handle)

      ! Create the exchange communicator: one color per (position in subgroup,
      ! integration group) combination
      sub_sub_color = para_env_sub%mepos*num_integ_group + color_sub/integ_group_size
      CALL comm_exchange%from_split(para_env, sub_sub_color)

      ! create the replication group
      sub_sub_color = para_env_sub%mepos*comm_exchange%num_pe + comm_exchange%mepos
      CALL comm_rep%from_split(para_env, sub_sub_color)

      ! create the new limits for K according to the size
      ! of the integral group

      ! info array for replication: per-rank L sizes, starts and ends within
      ! the replication group
      ALLOCATE (rep_ends_array(0:comm_rep%num_pe - 1))
      ALLOCATE (rep_starts_array(0:comm_rep%num_pe - 1))
      ALLOCATE (rep_sizes_array(0:comm_rep%num_pe - 1))

      CALL comm_rep%allgather(my_group_l_size, rep_sizes_array)
      CALL comm_rep%allgather(my_group_l_start, rep_starts_array)
      CALL comm_rep%allgather(my_group_l_end, rep_ends_array)

      ! calculate my_new_group_L_size according to sizes_array
      my_new_group_l_size = my_group_l_size

      ! Info of this process: for each replication shift,
      !   my_info(1:2, s) = global L start/end of the contributing rank
      !   my_info(3:4, s) = start/end of that contribution in the merged
      !                     local L index (consumed as components 3/4 of
      !                     ranges_info_array in replicate_iak_2intgroup)
      ALLOCATE (my_info(4, 0:comm_rep%num_pe - 1))
      my_info(1, 0) = my_group_l_start
      my_info(2, 0) = my_group_l_end
      my_info(3, 0) = 1
      my_info(4, 0) = my_group_l_size

      ! Accumulate the ranges of the other replication ranks; the receive
      ! order (mepos - shift, wrapped) must match the allgather layout used
      ! during replication
      DO proc_shift = 1, comm_rep%num_pe - 1
         proc_receive = modulo(comm_rep%mepos - proc_shift, comm_rep%num_pe)

         my_new_group_l_size = my_new_group_l_size + rep_sizes_array(proc_receive)

         my_info(1, proc_shift) = rep_starts_array(proc_receive)
         my_info(2, proc_shift) = rep_ends_array(proc_receive)
         my_info(3, proc_shift) = my_info(4, proc_shift - 1) + 1
         my_info(4, proc_shift) = my_new_group_l_size

      END DO

      ! Share the merged sizes and range tables across the exchange group
      ALLOCATE (new_sizes_array(0:comm_exchange%num_pe - 1))
      ALLOCATE (ranges_info_array(4, 0:comm_rep%num_pe - 1, 0:comm_exchange%num_pe - 1))
      CALL comm_exchange%allgather(my_new_group_l_size, new_sizes_array)
      CALL comm_exchange%allgather(my_info, ranges_info_array)

      DEALLOCATE (rep_sizes_array)
      DEALLOCATE (rep_starts_array)
      DEALLOCATE (rep_ends_array)

      ! Map exchange-group position -> original subgroup color
      ALLOCATE (integ_group_pos2color_sub(0:comm_exchange%num_pe - 1))
      CALL comm_exchange%allgather(color_sub, integ_group_pos2color_sub)

      ! Keep a copy of the pre-replication sizes; the gradient code needs the
      ! original distribution to de-replicate later
      IF (calc_forces) THEN
         iib = SIZE(sizes_array)
         ALLOCATE (sizes_array_orig(0:iib - 1))
         sizes_array_orig(:) = sizes_array
      END IF

      ! Switch this process over to the merged (replicated) L distribution
      my_group_l_size_orig = my_group_l_size
      my_group_l_size = my_new_group_l_size
      DEALLOCATE (sizes_array)

      ALLOCATE (sizes_array(0:integ_group_size - 1))
      sizes_array(:) = new_sizes_array

      DEALLOCATE (new_sizes_array)

      CALL timestop(handle)

   END SUBROUTINE mp2_ri_create_group
1278
1279! **************************************************************************************************
1280!> \brief ...
1281!> \param mp2_env ...
1282!> \param para_env ...
1283!> \param para_env_sub ...
1284!> \param gd_array ...
1285!> \param gd_B_virtual ...
1286!> \param homo ...
1287!> \param dimen_RI ...
1288!> \param unit_nr ...
1289!> \param integ_group_size ...
1290!> \param ngroup ...
1291!> \param num_integ_group ...
1292!> \param virtual ...
1293!> \param calc_forces ...
1294! **************************************************************************************************
   SUBROUTINE mp2_ri_get_integ_group_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
                                          homo, dimen_RI, unit_nr, &
                                          integ_group_size, &
                                          ngroup, num_integ_group, &
                                          virtual, calc_forces)
      TYPE(mp2_type) :: mp2_env
      TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
      TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
      TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_b_virtual
      INTEGER, DIMENSION(:), INTENT(IN) :: homo
      INTEGER, INTENT(IN) :: dimen_ri, unit_nr
      INTEGER, INTENT(OUT) :: integ_group_size, ngroup, num_integ_group
      INTEGER, DIMENSION(:), INTENT(IN) :: virtual
      LOGICAL, INTENT(IN) :: calc_forces

      CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_get_integ_group_size'

      INTEGER :: block_size, handle, iib, &
                 max_repl_group_size, &
                 min_integ_group_size
      INTEGER(KIND=int_8) :: mem
      LOGICAL :: calc_group_size
      REAL(kind=dp) :: factor, mem_base, mem_min, mem_per_blk, &
                       mem_per_repl, mem_per_repl_blk, &
                       mem_real

      CALL timeset(routinen, handle)

      ! Number of subgroups
      ngroup = para_env%num_pe/para_env_sub%num_pe

      ! A non-positive user setting (or one that does not divide ngroup)
      ! means the group size is determined automatically from memory
      calc_group_size = mp2_env%ri_mp2%number_integration_groups <= 0
      IF (.NOT. calc_group_size) THEN
         IF (mod(ngroup, mp2_env%ri_mp2%number_integration_groups) /= 0) calc_group_size = .true.
      END IF

      IF (calc_group_size) THEN
         ! Available memory per rank in MiB (rounded up), reduced to the
         ! global minimum and capped by the user-specified MP2 memory limit
         CALL m_memory(mem)
         mem_real = (mem + 1024*1024 - 1)/(1024*1024)
         CALL para_env%min(mem_real)
         mem_real = min(mem_real, mp2_env%mp2_memory)

         ! Memory model (all in MiB):
         !   mem_base          - fixed cost, independent of block/replication size
         !   mem_per_blk       - cost proportional to the block size
         !   mem_per_repl      - cost proportional to the replication group size
         !   mem_per_repl_blk  - cost proportional to both
         mem_base = 0.0_dp
         mem_per_blk = 0.0_dp
         mem_per_repl = 0.0_dp
         mem_per_repl_blk = 0.0_dp

         ! BIB_C_copy
         mem_per_repl = mem_per_repl + maxval(max(real(homo, kind=dp)*maxsize(gd_array), real(dimen_ri, kind=dp))* &
                                              maxsize(gd_b_virtual))*8.0_dp/(1024**2)
         ! BIB_C
         mem_per_repl = mem_per_repl + sum(real(homo, kind=dp)*maxsize(gd_b_virtual))*maxsize(gd_array)*8.0_dp/(1024**2)
         ! BIB_C_rec
         mem_per_repl_blk = mem_per_repl_blk + real(maxval(maxsize(gd_b_virtual)), kind=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
         ! local_i_aL+local_j_aL
         mem_per_blk = mem_per_blk + 2.0_dp*maxval(maxsize(gd_b_virtual))*real(dimen_ri, kind=dp)*8.0_dp/(1024**2)
         ! local_ab
         mem_base = mem_base + maxval(real(virtual, kind=dp)*maxsize(gd_b_virtual))*8.0_dp/(1024**2)
         ! external_ab/external_i_aL
         mem_base = mem_base + real(max(dimen_ri, maxval(virtual)), kind=dp)*maxval(maxsize(gd_b_virtual))*8.0_dp/(1024**2)

         IF (calc_forces) THEN
            ! Gamma_P_ia
            mem_per_repl = mem_per_repl + sum(real(homo, kind=dp)*maxsize(gd_array)* &
                                              maxsize(gd_b_virtual))*8.0_dp/(1024**2)
            ! Y_i_aP+Y_j_aP
            mem_per_blk = mem_per_blk + 2.0_dp*maxval(maxsize(gd_b_virtual))*dimen_ri*8.0_dp/(1024**2)
            ! local_ba/t_ab
            mem_base = mem_base + real(maxval(maxsize(gd_b_virtual)), kind=dp)*max(dimen_ri, maxval(virtual))*8.0_dp/(1024**2)
            ! P_ij
            mem_base = mem_base + sum(real(homo, kind=dp)*homo)*8.0_dp/(1024**2)
            ! P_ab
            mem_base = mem_base + sum(real(virtual, kind=dp)*maxsize(gd_b_virtual))*8.0_dp/(1024**2)
            ! send_ab/send_i_aL
            mem_base = mem_base + real(max(dimen_ri, maxval(virtual)), kind=dp)*maxval(maxsize(gd_b_virtual))*8.0_dp/(1024**2)
         END IF

         ! This a first guess based on the assumption of optimal block sizes
         block_size = max(1, min(floor(sqrt(real(minval(homo), kind=dp))), floor(minval(homo)/sqrt(2.0_dp*ngroup))))
         IF (mp2_env%ri_mp2%block_size > 0) block_size = mp2_env%ri_mp2%block_size

         mem_min = mem_base + mem_per_repl + (mem_per_blk + mem_per_repl_blk)*block_size

         IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum available memory per MPI process:', &
            mem_real, ' MiB'
         IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum required memory per MPI process:', &
            mem_min, ' MiB'

         ! We use the following communication model
         ! Comm(replication)+Comm(collection of data for ij pair)+Comm(contraction)
         ! One can show that the costs of the contraction step are independent of the block size and the replication group size
         ! With gradients, the other two steps are carried out twice (Y_i_aP -> Gamma_i_aP, and dereplication)
         ! NL ... number of RI basis functions
         ! NR ... replication group size
         ! NG ... number of sub groups
         ! NB ... Block size
         ! o ... number of occupied orbitals
         ! Then, we have the communication costs (in multiples of the original BIb_C matrix)
         ! (NR/NG)+(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
         ! and with gradients
         ! 2*(NR/NG)+2*(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
         ! We are looking for the minimum of the communication volume,
         ! thus, if the prefactor of (NR/NG) is smaller than zero, use the largest possible replication group size.
         ! If the factor is larger than zero, set the replication group size to 1. (For small systems and a large number of subgroups)
         ! Replication group size = 1 implies that the integration group size equals the number of subgroups

         integ_group_size = ngroup

         ! Multiply everything by homo*virtual to consider differences between spin channels in case of open-shell calculations
         factor = real(sum(homo*virtual), kind=dp) &
                  - sum((real(maxval(homo), kind=dp)/block_size + block_size - 2.0_dp)*homo*virtual)/ngroup
         IF (SIZE(homo) == 2) factor = factor - 2.0_dp*product(homo)/block_size/ngroup*sum(homo*virtual)

         IF (factor <= 0.0_dp) THEN
            ! Remove the fixed memory and divide by the memory per replication group size
            max_repl_group_size = min(max(floor((mem_real - mem_base - mem_per_blk*block_size)/ &
                                                (mem_per_repl + mem_per_repl_blk*block_size)), 1), ngroup)
            ! Convert to an integration group size
            min_integ_group_size = ngroup/max_repl_group_size

            ! Ensure that the integration group size is a divisor of the number of sub groups
            DO iib = max(min(min_integ_group_size, ngroup), 1), ngroup
               ! check that the ngroup is a multiple of integ_group_size
               IF (mod(ngroup, iib) == 0) THEN
                  integ_group_size = iib
                  EXIT
               END IF
               ! NOTE(review): this increment appears to be dead code: it is
               ! either overwritten by the assignment above, or superseded when
               ! the loop reaches iib = ngroup (which always divides ngroup)
               integ_group_size = integ_group_size + 1
            END DO
         END IF
      ELSE ! We take the user provided group size
         integ_group_size = ngroup/mp2_env%ri_mp2%number_integration_groups
      END IF

      IF (unit_nr > 0) THEN
         WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
            "RI_INFO| Group size for integral replication:", integ_group_size*para_env_sub%num_pe
         CALL m_flush(unit_nr)
      END IF

      ! Number of integration groups resulting from the chosen group size
      num_integ_group = ngroup/integ_group_size

      CALL timestop(handle)

   END SUBROUTINE mp2_ri_get_integ_group_size
1439
1440! **************************************************************************************************
1441!> \brief ...
1442!> \param mp2_env ...
1443!> \param para_env ...
1444!> \param para_env_sub ...
1445!> \param gd_array ...
1446!> \param gd_B_virtual ...
1447!> \param homo ...
1448!> \param virtual ...
1449!> \param dimen_RI ...
1450!> \param unit_nr ...
1451!> \param block_size ...
1452!> \param ngroup ...
1453!> \param num_integ_group ...
1454!> \param my_open_shell_ss ...
1455!> \param calc_forces ...
1456!> \param buffer_1D ...
1457! **************************************************************************************************
1458 SUBROUTINE mp2_ri_get_block_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
1459 homo, virtual, dimen_RI, unit_nr, &
1460 block_size, ngroup, num_integ_group, &
1461 my_open_shell_ss, calc_forces, buffer_1D)
1462 TYPE(mp2_type) :: mp2_env
1463 TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1464 TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
1465 TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_b_virtual
1466 INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
1467 INTEGER, INTENT(IN) :: dimen_ri, unit_nr
1468 INTEGER, INTENT(OUT) :: block_size, ngroup
1469 INTEGER, INTENT(IN) :: num_integ_group
1470 LOGICAL, INTENT(IN) :: my_open_shell_ss, calc_forces
1471 REAL(kind=dp), ALLOCATABLE, DIMENSION(:), &
1472 INTENT(OUT) :: buffer_1d
1473
1474 CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_ri_get_block_size'
1475
1476 INTEGER :: best_block_size, handle, num_ij_blocks
1477 INTEGER(KIND=int_8) :: buffer_size, mem
1478 REAL(kind=dp) :: mem_base, mem_per_blk, mem_per_repl_blk, &
1479 mem_real
1480
1481 CALL timeset(routinen, handle)
1482
1483 ngroup = para_env%num_pe/para_env_sub%num_pe
1484
1485 CALL m_memory(mem)
1486 mem_real = (mem + 1024*1024 - 1)/(1024*1024)
1487 CALL para_env%min(mem_real)
1488
1489 mem_base = 0.0_dp
1490 mem_per_blk = 0.0_dp
1491 mem_per_repl_blk = 0.0_dp
1492
1493 ! external_ab
1494 mem_base = mem_base + maxval(maxsize(gd_b_virtual))*max(dimen_ri, maxval(virtual))*8.0_dp/(1024**2)
1495 ! BIB_C_rec
1496 mem_per_repl_blk = mem_per_repl_blk + real(maxval(maxsize(gd_b_virtual)), kind=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
1497 ! local_i_aL+local_j_aL
1498 mem_per_blk = mem_per_blk + 2.0_dp*maxval(maxsize(gd_b_virtual))*real(dimen_ri, kind=dp)*8.0_dp/(1024**2)
1499 ! Copy to keep arrays contiguous
1500 mem_base = mem_base + maxval(maxsize(gd_b_virtual))*max(dimen_ri, maxval(virtual))*8.0_dp/(1024**2)
1501
1502 IF (calc_forces) THEN
1503 ! Y_i_aP+Y_j_aP+BIb_C_send
1504 mem_per_blk = mem_per_blk + 3.0_dp*maxval(maxsize(gd_b_virtual))*dimen_ri*8.0_dp/(1024**2)
1505 ! send_ab
1506 mem_base = mem_base + maxval(maxsize(gd_b_virtual))*max(dimen_ri, maxval(virtual))*8.0_dp/(1024**2)
1507 END IF
1508
1509 best_block_size = 1
1510
1511 ! Here we split the memory half for the communication, half for replication
1512 IF (mp2_env%ri_mp2%block_size > 0) THEN
1513 best_block_size = mp2_env%ri_mp2%block_size
1514 ELSE
1515 best_block_size = max(floor((mem_real - mem_base)/(mem_per_blk + mem_per_repl_blk*ngroup/num_integ_group)), 1)
1516
1517 DO
1518 IF (SIZE(homo) == 1) THEN
1519 IF (.NOT. my_open_shell_ss) THEN
1520 num_ij_blocks = (homo(1)/best_block_size)
1521 num_ij_blocks = (num_ij_blocks*num_ij_blocks - num_ij_blocks)/2
1522 ELSE
1523 num_ij_blocks = ((homo(1) - 1)/best_block_size)
1524 num_ij_blocks = (num_ij_blocks*num_ij_blocks - num_ij_blocks)/2
1525 END IF
1526 ELSE
1527 num_ij_blocks = product(homo/best_block_size)
1528 END IF
1529 ! Enforce at least one large block for each subgroup
1530 IF ((num_ij_blocks >= ngroup .AND. num_ij_blocks > 0) .OR. best_block_size == 1) THEN
1531 EXIT
1532 ELSE
1533 best_block_size = best_block_size - 1
1534 END IF
1535 END DO
1536
1537 IF (SIZE(homo) == 1) THEN
1538 IF (my_open_shell_ss) THEN
1539 ! check that best_block_size is not bigger than sqrt(homo-1)
1540 ! Diagonal elements do not have to be considered
1541 best_block_size = min(floor(sqrt(real(homo(1) - 1, kind=dp))), best_block_size)
1542 ELSE
1543 ! check that best_block_size is not bigger than sqrt(homo)
1544 best_block_size = min(floor(sqrt(real(homo(1), kind=dp))), best_block_size)
1545 END IF
1546 END IF
1547 END IF
1548 block_size = max(1, best_block_size)
1549
1550 IF (unit_nr > 0) THEN
1551 WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
1552 "RI_INFO| Block size:", block_size
1553 CALL m_flush(unit_nr)
1554 END IF
1555
1556 ! Determine recv buffer size (BI_C_recv, external_i_aL, external_ab)
1557 buffer_size = max(int(maxsize(gd_array), kind=int_8)*block_size, int(max(dimen_ri, maxval(virtual)), kind=int_8)) &
1558 *maxval(maxsize(gd_b_virtual))
1559 ! The send buffer has the same size as the recv buffer
1560 IF (calc_forces) buffer_size = buffer_size*2
1561 ALLOCATE (buffer_1d(buffer_size))
1562
1563 CALL timestop(handle)
1564
1565 END SUBROUTINE mp2_ri_get_block_size
1566
1567! **************************************************************************************************
1568!> \brief ...
1569!> \param mp2_env ...
1570!> \param para_env_sub ...
1571!> \param gd_B_virtual ...
1572!> \param Eigenval ...
1573!> \param homo ...
1574!> \param dimen_RI ...
1575!> \param iiB ...
1576!> \param jjB ...
1577!> \param my_B_size ...
1578!> \param my_B_virtual_end ...
1579!> \param my_B_virtual_start ...
1580!> \param my_i ...
1581!> \param my_j ...
1582!> \param virtual ...
1583!> \param local_ab ...
1584!> \param t_ab ...
1585!> \param my_local_i_aL ...
1586!> \param my_local_j_aL ...
1587!> \param open_ss ...
1588!> \param Y_i_aP ...
1589!> \param Y_j_aP ...
1590!> \param local_ba ...
1591!> \param ispin ...
1592!> \param jspin ...
1593!> \param dgemm_counter ...
1594!> \param buffer_1D ...
1595! **************************************************************************************************
   SUBROUTINE mp2_update_p_gamma(mp2_env, para_env_sub, gd_B_virtual, &
                                 Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
                                 my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, local_ab, &
                                 t_ab, my_local_i_aL, my_local_j_aL, open_ss, Y_i_aP, Y_j_aP, &
                                 local_ba, ispin, jspin, dgemm_counter, buffer_1D)
      TYPE(mp2_type) :: mp2_env
      TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub
      TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_b_virtual
      REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: eigenval
      INTEGER, DIMENSION(:), INTENT(IN) :: homo
      INTEGER, INTENT(IN) :: dimen_ri, iib, jjb
      INTEGER, DIMENSION(:), INTENT(IN) :: my_b_size, my_b_virtual_end, &
                                           my_b_virtual_start
      INTEGER, INTENT(IN) :: my_i, my_j
      INTEGER, DIMENSION(:), INTENT(IN) :: virtual
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         INTENT(INOUT), TARGET :: local_ab
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         INTENT(IN), TARGET :: t_ab, my_local_i_al, my_local_j_al
      LOGICAL, INTENT(IN) :: open_ss
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         INTENT(INOUT), TARGET :: y_i_ap, y_j_ap, local_ba
      INTEGER, INTENT(IN) :: ispin, jspin
      TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1d

      CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_update_P_gamma'

      INTEGER :: a, b, b_global, handle, proc_receive, proc_send, proc_shift, rec_b_size, &
         rec_b_virtual_end, rec_b_virtual_start, send_b_size, send_b_virtual_end, &
         send_b_virtual_start
      INTEGER(KIND=int_8) :: offset
      LOGICAL :: alpha_beta
      REAL(kind=dp) :: factor, p_ij_diag
      ! external_ab/send_ab are views into the caller-provided scratch
      ! buffer_1D; no fresh allocations are done inside this routine.
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
         POINTER :: external_ab, send_ab

      ! Accumulates, for one occupied pair (my_i, my_j), the contributions of
      ! the current (ia|jb) integral block to the MP2-gradient intermediates:
      ! the virtual-virtual density P_ab, the diagonal of the occupied density
      ! P_ij, and the Y_i_aP/Y_j_aP intermediates that later form Gamma_P_ia.
      ! Remote virtual blocks are gathered via a ring of sendrecv exchanges
      ! over para_env_sub using buffer_1D as communication scratch space.
      CALL timeset(routinen//"_Pia", handle)

      ! Same-spin pairs (ispin == jspin) use the antisymmetrized amplitudes
      ! t_ab; the alpha-beta case works with the plain divided integrals
      ! local_ab/local_ba instead.
      alpha_beta = .NOT. (ispin == jspin)
      ! Closed-shell integrals carry an extra factor 2 from spin summation.
      IF (open_ss) THEN
         factor = 1.0_dp
      ELSE
         factor = 2.0_dp
      END IF
      ! divide the (ia|jb) integrals by Delta_ij^ab
      ! (i.e. turn them into minus the first-order amplitudes' numerator/denominator)
      DO b = 1, my_b_size(jspin)
         b_global = b + my_b_virtual_start(jspin) - 1
         DO a = 1, virtual(ispin)
            local_ab(a, b) = -local_ab(a, b)/ &
                             (eigenval(homo(ispin) + a, ispin) + eigenval(homo(jspin) + b_global, jspin) - &
                              eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, jspin))
         END DO
      END DO
      IF (.NOT. (alpha_beta)) THEN
         ! diagonal P_ij contribution from the antisymmetrized amplitudes
         p_ij_diag = -sum(local_ab*t_ab)*factor
      ELSE
         ! update diagonal part of P_ij
         p_ij_diag = -sum(local_ab*local_ab)*mp2_env%scale_S
         ! More integrals needed only for alpha-beta case: local_ba
         ! (the (jb|ia) block, divided by the same energy denominator)
         DO b = 1, my_b_size(ispin)
            b_global = b + my_b_virtual_start(ispin) - 1
            DO a = 1, virtual(jspin)
               local_ba(a, b) = -local_ba(a, b)/ &
                                (eigenval(homo(jspin) + a, jspin) + eigenval(homo(ispin) + b_global, ispin) - &
                                 eigenval(my_i + iib - 1, ispin) - eigenval(my_j + jjb - 1, jspin))
            END DO
         END DO
      END IF

      ! P_ab and add diagonal part of P_ij

      CALL dgemm_counter_start(dgemm_counter)
      IF (.NOT. (alpha_beta)) THEN
         ! P_ab += t_ab^T * local_ab for the locally held virtual column block
         CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), my_b_size(ispin), virtual(ispin), 1.0_dp, &
                                          t_ab, virtual(ispin), local_ab, virtual(ispin), &
                                          1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, &
                                                                                    my_b_virtual_start(ispin):my_b_virtual_end(ispin)), my_b_size(ispin))
         mp2_env%ri_grad%P_ij(ispin)%array(my_i + iib - 1, my_i + iib - 1) = &
            mp2_env%ri_grad%P_ij(ispin)%array(my_i + iib - 1, my_i + iib - 1) + p_ij_diag
      ELSE
         ! alpha-beta: both spin channels of P_ab get a contribution,
         ! built from local_ba (ispin channel) and local_ab (jspin channel)
         CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), my_b_size(ispin), virtual(jspin), mp2_env%scale_S, &
                                          local_ba, virtual(jspin), local_ba, virtual(jspin), 1.0_dp, &
                                          mp2_env%ri_grad%P_ab(ispin)%array(:, my_b_virtual_start(ispin):my_b_virtual_end(ispin)), my_b_size(ispin))

         mp2_env%ri_grad%P_ij(ispin)%array(my_i + iib - 1, my_i + iib - 1) = &
            mp2_env%ri_grad%P_ij(ispin)%array(my_i + iib - 1, my_i + iib - 1) + p_ij_diag

         CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(jspin), my_b_size(jspin), virtual(ispin), mp2_env%scale_S, &
                                          local_ab, virtual(ispin), local_ab, virtual(ispin), 1.0_dp, &
                                          mp2_env%ri_grad%P_ab(jspin)%array(:, my_b_virtual_start(jspin):my_b_virtual_end(jspin)), my_b_size(jspin))

         mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjb - 1, my_j + jjb - 1) = &
            mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjb - 1, my_j + jjb - 1) + p_ij_diag
      END IF
      ! The summation is over unique pairs. In alpha-beta case, all pairs are unique: subroutine is called for
      ! both i^alpha,j^beta and i^beta,j^alpha. Formally, my_i can be equal to my_j, but they are different
      ! due to spin in alpha-beta case.
      IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN

         ! off-diagonal same-spin pair counts twice: add transposed contribution
         CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_b_size(ispin), virtual(ispin), my_b_size(ispin), 1.0_dp, &
                                          t_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          local_ab, virtual(ispin), &
                                          1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array, my_b_size(ispin))

         mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjb - 1, my_j + jjb - 1) = &
            mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjb - 1, my_j + jjb - 1) + p_ij_diag
      END IF
      ! Ring exchange: rotate local_ab/local_ba slices around the sub-group so
      ! every rank can contract its local data against all remote virtual blocks.
      DO proc_shift = 1, para_env_sub%num_pe - 1
         proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
         proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

         CALL get_group_dist(gd_b_virtual(jspin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
         CALL get_group_dist(gd_b_virtual(jspin), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)

         ! reinterpret the scratch buffer as the receive matrix for this partner
         external_ab(1:virtual(ispin), 1:rec_b_size) => buffer_1d(1:int(virtual(ispin), int_8)*rec_b_size)
         external_ab = 0.0_dp

         CALL para_env_sub%sendrecv(local_ab, proc_send, &
                                    external_ab, proc_receive)

         IF (.NOT. (alpha_beta)) THEN
            CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), rec_b_size, virtual(ispin), 1.0_dp, &
                                             t_ab, virtual(ispin), external_ab, virtual(ispin), &
                                             1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_b_virtual_start:rec_b_virtual_end), my_b_size(ispin))
         ELSE
            CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(jspin), rec_b_size, virtual(ispin), mp2_env%scale_S, &
                                             local_ab, virtual(ispin), external_ab, virtual(ispin), &
                                             1.0_dp, mp2_env%ri_grad%P_ab(jspin)%array(:, rec_b_virtual_start:rec_b_virtual_end), &
                                             my_b_size(jspin))

            ! For alpha-beta part of alpha-density we need a new parallel code
            ! And new external_ab (of a different size)
            CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
            CALL get_group_dist(gd_b_virtual(ispin), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)
            external_ab(1:virtual(jspin), 1:rec_b_size) => buffer_1d(1:int(virtual(jspin), int_8)*rec_b_size)
            external_ab = 0.0_dp
            CALL para_env_sub%sendrecv(local_ba, proc_send, &
                                       external_ab, proc_receive)
            CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_b_size(ispin), rec_b_size, virtual(jspin), mp2_env%scale_S, &
                                             local_ba, virtual(jspin), external_ab, virtual(jspin), &
                                             1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_b_virtual_start:rec_b_virtual_end), my_b_size(ispin))
         END IF

         IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
            ! transposed contribution for the remote block: compute the partner's
            ! share locally (send_ab), exchange, and add what we receive to P_ab
            external_ab(1:my_b_size(ispin), 1:virtual(ispin)) => &
               buffer_1d(1:int(virtual(ispin), int_8)*my_b_size(ispin))
            external_ab = 0.0_dp

            offset = int(virtual(ispin), int_8)*my_b_size(ispin)

            ! send_ab lives behind external_ab in the same scratch buffer
            send_ab(1:send_b_size, 1:virtual(ispin)) => buffer_1d(offset + 1:offset + int(send_b_size, int_8)*virtual(ispin))
            send_ab = 0.0_dp

            CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_b_size, virtual(ispin), my_b_size(ispin), 1.0_dp, &
                                             t_ab(send_b_virtual_start:send_b_virtual_end, :), send_b_size, &
                                             local_ab, virtual(ispin), 0.0_dp, send_ab, send_b_size)
            CALL para_env_sub%sendrecv(send_ab, proc_send, &
                                       external_ab, proc_receive)

            mp2_env%ri_grad%P_ab(ispin)%array(:, :) = mp2_env%ri_grad%P_ab(ispin)%array + external_ab
         END IF

      END DO
      ! flop bookkeeping for the P_ab phase
      IF (.NOT. alpha_beta) THEN
         IF (my_i /= my_j) THEN
            CALL dgemm_counter_stop(dgemm_counter, 2*my_b_size(ispin), virtual(ispin), virtual(ispin))
         ELSE
            CALL dgemm_counter_stop(dgemm_counter, my_b_size(ispin), virtual(ispin), virtual(ispin))
         END IF
      ELSE
         CALL dgemm_counter_stop(dgemm_counter, sum(my_b_size), virtual(ispin), virtual(jspin))
      END IF
      CALL timestop(handle)

      ! Now, Gamma_P_ia (made of Y_ia_P)

      CALL timeset(routinen//"_Gamma", handle)
      CALL dgemm_counter_start(dgemm_counter)
      IF (.NOT. alpha_beta) THEN
         ! Alpha-alpha, beta-beta and closed shell
         CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_b_size(ispin), dimen_ri, my_b_size(ispin), 1.0_dp, &
                                          t_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          my_local_j_al, dimen_ri, 1.0_dp, y_i_ap, my_b_size(ispin))
      ELSE ! Alpha-beta
         CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_b_size(ispin), dimen_ri, my_b_size(jspin), mp2_env%scale_S, &
                                          local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          my_local_j_al, dimen_ri, 1.0_dp, y_i_ap, my_b_size(ispin))
         CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_b_size(jspin), dimen_ri, my_b_size(ispin), mp2_env%scale_S, &
                                          local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          my_local_i_al, dimen_ri, 1.0_dp, y_j_ap, my_b_size(jspin))
      END IF

      ! set up receive area (front of buffer_1D) for the Y_i_aP ring exchange
      IF (para_env_sub%num_pe > 1) THEN
         external_ab(1:my_b_size(ispin), 1:dimen_ri) => buffer_1d(1:int(my_b_size(ispin), int_8)*dimen_ri)
         external_ab = 0.0_dp

         offset = int(my_b_size(ispin), int_8)*dimen_ri
      END IF
      !
      DO proc_shift = 1, para_env_sub%num_pe - 1
         proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
         proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

         CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
         CALL get_group_dist(gd_b_virtual(ispin), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)

         send_ab(1:send_b_size, 1:dimen_ri) => buffer_1d(offset + 1:offset + int(dimen_ri, int_8)*send_b_size)
         send_ab = 0.0_dp
         IF (.NOT. alpha_beta) THEN
            CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_b_size, dimen_ri, my_b_size(ispin), 1.0_dp, &
                                             t_ab(send_b_virtual_start:send_b_virtual_end, :), send_b_size, &
                                             my_local_j_al, dimen_ri, 0.0_dp, send_ab, send_b_size)
            CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)

            y_i_ap(:, :) = y_i_ap + external_ab

         ELSE ! Alpha-beta case
            ! Alpha-alpha part
            CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_b_size, dimen_ri, my_b_size(jspin), mp2_env%scale_S, &
                                             local_ab(send_b_virtual_start:send_b_virtual_end, :), send_b_size, &
                                             my_local_j_al, dimen_ri, 0.0_dp, send_ab, send_b_size)
            CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
            y_i_ap(:, :) = y_i_ap + external_ab
         END IF
      END DO

      IF (alpha_beta) THEN
         ! For beta-beta part (in alpha-beta case) we need a new parallel code
         IF (para_env_sub%num_pe > 1) THEN
            external_ab(1:my_b_size(jspin), 1:dimen_ri) => buffer_1d(1:int(my_b_size(jspin), int_8)*dimen_ri)
            external_ab = 0.0_dp

            offset = int(my_b_size(jspin), int_8)*dimen_ri
         END IF
         DO proc_shift = 1, para_env_sub%num_pe - 1
            proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
            proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

            CALL get_group_dist(gd_b_virtual(jspin), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)
            send_ab(1:send_b_size, 1:dimen_ri) => buffer_1d(offset + 1:offset + int(dimen_ri, int_8)*send_b_size)
            send_ab = 0.0_dp
            CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_b_size, dimen_ri, my_b_size(ispin), mp2_env%scale_S, &
                                             local_ba(send_b_virtual_start:send_b_virtual_end, :), send_b_size, &
                                             my_local_i_al, dimen_ri, 0.0_dp, send_ab, send_b_size)
            CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
            y_j_ap(:, :) = y_j_ap + external_ab

         END DO

         ! Here, we just use approximate bounds. For large systems virtual(ispin) is approx virtual(jspin), same for B_size
         CALL dgemm_counter_stop(dgemm_counter, 3*virtual(ispin), dimen_ri, my_b_size(jspin))
      ELSE
         CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), dimen_ri, my_b_size(ispin))
      END IF

      IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
         ! Alpha-alpha, beta-beta and closed shell
         ! off-diagonal same-spin pair also contributes to Y_j_aP, with the
         ! i-side RI vectors rotated around the ring this time
         CALL dgemm_counter_start(dgemm_counter)
         CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_b_size(ispin), dimen_ri, my_b_size(ispin), 1.0_dp, &
                                          t_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), :), my_b_size(ispin), &
                                          my_local_i_al, dimen_ri, 1.0_dp, y_j_ap, my_b_size(ispin))
         DO proc_shift = 1, para_env_sub%num_pe - 1
            proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
            proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)

            CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)

            external_ab(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, int_8)*rec_b_size)
            external_ab = 0.0_dp

            CALL para_env_sub%sendrecv(my_local_i_al, proc_send, &
                                       external_ab, proc_receive)

            ! Alpha-alpha, beta-beta and closed shell
            CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_b_size(ispin), dimen_ri, rec_b_size, 1.0_dp, &
                                             t_ab(rec_b_virtual_start:rec_b_virtual_end, :), rec_b_size, &
                                             external_ab, dimen_ri, 1.0_dp, y_j_ap, my_b_size(ispin))
         END DO

         CALL dgemm_counter_stop(dgemm_counter, my_b_size(ispin), dimen_ri, virtual(ispin))
      END IF

      CALL timestop(handle)
   END SUBROUTINE mp2_update_p_gamma
1881
1882! **************************************************************************************************
1883!> \brief ...
1884!> \param Gamma_P_ia ...
1885!> \param ij_index ...
1886!> \param my_B_size ...
1887!> \param my_block_size ...
1888!> \param my_group_L_size ...
1889!> \param my_i ...
1890!> \param my_ij_pairs ...
1891!> \param ngroup ...
1892!> \param num_integ_group ...
1893!> \param integ_group_pos2color_sub ...
1894!> \param num_ij_pairs ...
1895!> \param ij_map ...
1896!> \param ranges_info_array ...
1897!> \param Y_i_aP ...
1898!> \param comm_exchange ...
1899!> \param sizes_array ...
1900!> \param spin ...
1901!> \param buffer_1D ...
1902! **************************************************************************************************
   SUBROUTINE mp2_redistribute_gamma(Gamma_P_ia, ij_index, my_B_size, &
                                     my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
                                     num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
                                     ij_map, ranges_info_array, Y_i_aP, comm_exchange, &
                                     sizes_array, spin, buffer_1D)

      REAL(kind=dp), DIMENSION(:, :, :), INTENT(INOUT) :: gamma_p_ia
      INTEGER, INTENT(IN) :: ij_index, my_b_size, my_block_size, &
                             my_group_l_size, my_i, my_ij_pairs, &
                             ngroup, num_integ_group
      INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub, num_ij_pairs
      INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(IN) :: ij_map
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
         INTENT(IN) :: ranges_info_array
      REAL(kind=dp), DIMENSION(:, :, :), INTENT(IN) :: y_i_ap
      TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
      INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
      INTEGER, INTENT(IN) :: spin
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1d

      CHARACTER(LEN=*), PARAMETER :: routinen = 'mp2_redistribute_gamma'

      INTEGER :: end_point, handle, handle2, iib, ij_counter_rec, irep, kkk, lll, lstart_pos, &
         proc_receive, proc_send, proc_shift, rec_i, rec_ij_index, send_l_size, start_point, tag
      INTEGER(KIND=int_8) :: offset
      ! send/receive staging areas; both are views into buffer_1D
      REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :, :), &
         POINTER :: bi_c_rec, bi_c_send

! In alpha-beta case Y_i_aP_beta is sent as Y_j_aP

      ! Scatter the per-pair intermediate Y_i_aP back onto the RI-distributed
      ! Gamma_P_ia array: each rank first adds its own slice, then exchanges
      ! reordered slices with every other rank of comm_exchange.  Ranks that
      ! ran out of ij pairs (ij_index > my_ij_pairs) only receive.
      CALL timeset(routinen//"_comm2", handle)

      tag = 43

      IF (ij_index <= my_ij_pairs) THEN
         ! something to send
         ! start with myself
         CALL timeset(routinen//"_comm2_w", handle2)
         ! ranges_info_array maps each replication step irep to the local
         ! RI-index window (start_point:end_point) and its origin Lstart_pos
         DO irep = 0, num_integ_group - 1
            lstart_pos = ranges_info_array(1, irep, comm_exchange%mepos)
            start_point = ranges_info_array(3, irep, comm_exchange%mepos)
            end_point = ranges_info_array(4, irep, comm_exchange%mepos)
!$OMP PARALLEL DO DEFAULT(NONE) &
!$OMP PRIVATE(kkk,lll,iiB) &
!$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
!$OMP Gamma_P_ia,my_i,my_B_size,Y_i_aP)
            DO kkk = start_point, end_point
               lll = kkk - start_point + lstart_pos
               DO iib = 1, my_block_size
                  gamma_p_ia(1:my_b_size, my_i + iib - 1, kkk) = &
                     gamma_p_ia(1:my_b_size, my_i + iib - 1, kkk) + &
                     y_i_ap(1:my_b_size, lll, iib)
               END DO
            END DO
!$OMP END PARALLEL DO
         END DO
         CALL timestop(handle2)

         ! Y_i_aP(my_B_size,dimen_RI,block_size)

         DO proc_shift = 1, comm_exchange%num_pe - 1
            proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
            proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)

            send_l_size = sizes_array(proc_send)
            bi_c_send(1:my_b_size, 1:my_block_size, 1:send_l_size) => &
               buffer_1d(1:int(my_b_size, int_8)*my_block_size*send_l_size)

            ! receive area starts right after the send area in buffer_1D
            offset = int(my_b_size, int_8)*my_block_size*send_l_size

            CALL timeset(routinen//"_comm2_w", handle2)
            ! pack the slice of Y_i_aP that belongs to proc_send, reordered
            ! into that rank's local RI layout
            bi_c_send = 0.0_dp
            DO irep = 0, num_integ_group - 1
               lstart_pos = ranges_info_array(1, irep, proc_send)
               start_point = ranges_info_array(3, irep, proc_send)
               end_point = ranges_info_array(4, irep, proc_send)
!$OMP PARALLEL DO DEFAULT(NONE) &
!$OMP PRIVATE(kkk,lll,iiB) &
!$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
!$OMP BI_C_send,my_B_size,Y_i_aP)
               DO kkk = start_point, end_point
                  lll = kkk - start_point + lstart_pos
                  DO iib = 1, my_block_size
                     bi_c_send(1:my_b_size, iib, kkk) = y_i_ap(1:my_b_size, lll, iib)
                  END DO
               END DO
!$OMP END PARALLEL DO
            END DO
            CALL timestop(handle2)

            rec_ij_index = num_ij_pairs(proc_receive)

            IF (ij_index <= rec_ij_index) THEN
               ! we know that proc_receive has something to send for us, let's see what
               ! reconstruct which occupied index i the partner is working on
               ij_counter_rec = &
                  (ij_index - min(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)

               rec_i = ij_map(spin, ij_counter_rec)

               bi_c_rec(1:my_b_size, 1:my_block_size, 1:my_group_l_size) => &
                  buffer_1d(offset + 1:offset + int(my_b_size, int_8)*my_block_size*my_group_l_size)
               bi_c_rec = 0.0_dp

               CALL comm_exchange%sendrecv(bi_c_send, proc_send, &
                                           bi_c_rec, proc_receive, tag)

               CALL timeset(routinen//"_comm2_w", handle2)
               DO irep = 0, num_integ_group - 1
                  start_point = ranges_info_array(3, irep, comm_exchange%mepos)
                  end_point = ranges_info_array(4, irep, comm_exchange%mepos)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
!$OMP SHARED(start_point,end_point,my_block_size,&
!$OMP Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
                  gamma_p_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
                     gamma_p_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
                     bi_c_rec(1:my_b_size, :, start_point:end_point)
!$OMP END PARALLEL WORKSHARE
               END DO
               CALL timestop(handle2)

            ELSE
               ! we have something to send but nothing to receive
               CALL comm_exchange%send(bi_c_send, proc_send, tag)

            END IF

         END DO

      ELSE
         ! nothing to send; check if we have to receive
         DO proc_shift = 1, comm_exchange%num_pe - 1
            proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
            proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
            rec_ij_index = num_ij_pairs(proc_receive)

            IF (ij_index <= rec_ij_index) THEN
               ! we know that proc_receive has something to send for us, let's see what
               ij_counter_rec = &
                  (ij_index - min(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)

               rec_i = ij_map(spin, ij_counter_rec)

               ! no send area needed here, receive straight into buffer_1D
               bi_c_rec(1:my_b_size, 1:my_block_size, 1:my_group_l_size) => &
                  buffer_1d(1:int(my_b_size, int_8)*my_block_size*my_group_l_size)

               bi_c_rec = 0.0_dp

               CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)

               CALL timeset(routinen//"_comm2_w", handle2)
               DO irep = 0, num_integ_group - 1
                  start_point = ranges_info_array(3, irep, comm_exchange%mepos)
                  end_point = ranges_info_array(4, irep, comm_exchange%mepos)
#if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
!$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
!$OMP SHARED(start_point,end_point,my_block_size,&
!$OMP Gamma_P_ia,rec_i,my_B_size,BI_C_rec)
#endif
                  gamma_p_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
                     gamma_p_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
                     bi_c_rec(1:my_b_size, :, start_point:end_point)
#if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
!$OMP END PARALLEL WORKSHARE
#endif
               END DO
               CALL timestop(handle2)

            END IF
         END DO

      END IF
      CALL timestop(handle)

   END SUBROUTINE mp2_redistribute_gamma
2077
2078! **************************************************************************************************
2079!> \brief ...
2080!> \param mp2_env ...
2081!> \param Eigenval ...
2082!> \param homo ...
2083!> \param virtual ...
2084!> \param open_shell ...
2085!> \param beta_beta ...
2086!> \param Bib_C ...
2087!> \param unit_nr ...
2088!> \param dimen_RI ...
2089!> \param my_B_size ...
2090!> \param ngroup ...
2091!> \param my_group_L_size ...
2092!> \param color_sub ...
2093!> \param ranges_info_array ...
2094!> \param comm_exchange ...
2095!> \param para_env_sub ...
2096!> \param para_env ...
2097!> \param my_B_virtual_start ...
2098!> \param my_B_virtual_end ...
2099!> \param sizes_array ...
2100!> \param gd_B_virtual ...
2101!> \param integ_group_pos2color_sub ...
2102!> \param dgemm_counter ...
2103!> \param buffer_1D ...
2104! **************************************************************************************************
2105 SUBROUTINE quasi_degenerate_p_ij(mp2_env, Eigenval, homo, virtual, open_shell, &
2106 beta_beta, Bib_C, unit_nr, dimen_RI, &
2107 my_B_size, ngroup, my_group_L_size, &
2108 color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
2109 my_B_virtual_start, my_B_virtual_end, sizes_array, gd_B_virtual, &
2110 integ_group_pos2color_sub, dgemm_counter, buffer_1D)
2111 TYPE(mp2_type) :: mp2_env
2112 REAL(kind=dp), DIMENSION(:, :), INTENT(IN) :: eigenval
2113 INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2114 LOGICAL, INTENT(IN) :: open_shell, beta_beta
2115 TYPE(three_dim_real_array), DIMENSION(:), &
2116 INTENT(IN) :: bib_c
2117 INTEGER, INTENT(IN) :: unit_nr, dimen_ri
2118 INTEGER, DIMENSION(:), INTENT(IN) :: my_b_size
2119 INTEGER, INTENT(IN) :: ngroup, my_group_l_size, color_sub
2120 INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
2121 INTENT(IN) :: ranges_info_array
2122 TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2123 TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub, para_env
2124 INTEGER, DIMENSION(:), INTENT(IN) :: my_b_virtual_start, my_b_virtual_end
2125 INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
2126 TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_b_virtual
2127 INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub
2128 TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
2129 REAL(kind=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1d
2130
2131 CHARACTER(LEN=*), PARAMETER :: routinen = 'quasi_degenerate_P_ij'
2132
2133 INTEGER :: a, a_global, b, b_global, block_size, decil, handle, handle2, ijk_counter, &
2134 ijk_counter_send, ijk_index, ispin, kkb, kspin, max_block_size, max_ijk, my_i, my_ijk, &
2135 my_j, my_k, my_last_k(2), my_virtual, nspins, proc_receive, proc_send, proc_shift, &
2136 rec_b_size, rec_b_virtual_end, rec_b_virtual_start, rec_l_size, send_b_size, &
2137 send_b_virtual_end, send_b_virtual_start, send_i, send_ijk_index, send_j, send_k, &
2138 size_b_i, size_b_k, tag, tag2
2139 INTEGER, ALLOCATABLE, DIMENSION(:) :: num_ijk
2140 INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ijk_map, send_last_k
2141 LOGICAL :: alpha_beta, do_recv_i, do_recv_j, &
2142 do_recv_k, do_send_i, do_send_j, &
2143 do_send_k
2144 REAL(kind=dp) :: amp_fac, p_ij_elem, t_new, t_start
2145 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :), &
2146 TARGET :: local_ab, local_al_i, local_al_j, t_ab
2147 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: local_al_k
2148 REAL(kind=dp), DIMENSION(:, :), POINTER :: bi_c_rec, external_ab, external_al
2149 REAL(kind=dp), DIMENSION(:, :, :), POINTER :: bi_c_rec_3d
2150
2151 CALL timeset(routinen//"_ij_sing", handle)
2152
2153 tag = 44
2154 tag2 = 45
2155
2156 nspins = SIZE(bib_c)
2157 alpha_beta = (nspins == 2)
2158
2159 ! Set amplitude factor
2160 amp_fac = mp2_env%scale_S + mp2_env%scale_T
2161 IF (open_shell) amp_fac = mp2_env%scale_T
2162
2163 ALLOCATE (send_last_k(2, comm_exchange%num_pe - 1))
2164
2165 ! Loop(s) over orbital triplets
2166 DO ispin = 1, nspins
2167 size_b_i = my_b_size(ispin)
2168 IF (ispin == 1 .AND. alpha_beta) THEN
2169 kspin = 2
2170 ELSE
2171 kspin = 1
2172 END IF
2173 size_b_k = my_b_size(kspin)
2174
2175 ! Find the number of quasi-degenerate orbitals and orbital triplets
2176
2177 CALL find_quasi_degenerate_ij(my_ijk, homo(ispin), homo(kspin), eigenval(:, ispin), mp2_env, ijk_map, unit_nr, ngroup, &
2178 .NOT. beta_beta .AND. ispin /= 2, comm_exchange, num_ijk, max_ijk, color_sub, &
2179 SIZE(buffer_1d), my_group_l_size, size_b_k, para_env, virtual(ispin), size_b_i)
2180
2181 my_virtual = virtual(ispin)
2182 IF (SIZE(ijk_map, 2) > 0) THEN
2183 max_block_size = ijk_map(4, 1)
2184 ELSE
2185 max_block_size = 1
2186 END IF
2187
2188 ALLOCATE (local_al_i(dimen_ri, size_b_i))
2189 ALLOCATE (local_al_j(dimen_ri, size_b_i))
2190 ALLOCATE (local_al_k(dimen_ri, size_b_k, max_block_size))
2191 ALLOCATE (t_ab(my_virtual, size_b_k))
2192
2193 my_last_k = -1
2194 send_last_k = -1
2195
2196 t_start = m_walltime()
2197 DO ijk_index = 1, max_ijk
2198
2199 ! Prediction is unreliable if we are in the first step of the loop
2200 IF (unit_nr > 0 .AND. ijk_index > 1) THEN
2201 decil = ijk_index*10/max_ijk
2202 IF (decil /= (ijk_index - 1)*10/max_ijk) THEN
2203 t_new = m_walltime()
2204 t_new = (t_new - t_start)/60.0_dp*(max_ijk - ijk_index + 1)/(ijk_index - 1)
2205 WRITE (unit_nr, fmt="(T3,A)") "Percentage of finished loop: "// &
2206 cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
2207 CALL m_flush(unit_nr)
2208 END IF
2209 END IF
2210
2211 IF (ijk_index <= my_ijk) THEN
2212 ! work to be done
2213 ijk_counter = (ijk_index - min(1, color_sub))*ngroup + color_sub
2214 my_i = ijk_map(1, ijk_counter)
2215 my_j = ijk_map(2, ijk_counter)
2216 my_k = ijk_map(3, ijk_counter)
2217 block_size = ijk_map(4, ijk_counter)
2218
2219 do_recv_i = (ispin /= kspin) .OR. my_i < my_k .OR. my_i > my_k + block_size - 1
2220 do_recv_j = (ispin /= kspin) .OR. my_j < my_k .OR. my_j > my_k + block_size - 1
2221 do_recv_k = my_k /= my_last_k(1) .OR. my_k + block_size - 1 /= my_last_k(2)
2222 my_last_k(1) = my_k
2223 my_last_k(2) = my_k + block_size - 1
2224
2225 local_al_i = 0.0_dp
2226 IF (do_recv_i) THEN
2227 CALL fill_local_i_al_2d(local_al_i, ranges_info_array(:, :, comm_exchange%mepos), &
2228 bib_c(ispin)%array(:, :, my_i))
2229 END IF
2230
2231 local_al_j = 0.0_dp
2232 IF (do_recv_j) THEN
2233 CALL fill_local_i_al_2d(local_al_j, ranges_info_array(:, :, comm_exchange%mepos), &
2234 bib_c(ispin)%array(:, :, my_j))
2235 END IF
2236
2237 IF (do_recv_k) THEN
2238 local_al_k = 0.0_dp
2239 CALL fill_local_i_al(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, comm_exchange%mepos), &
2240 bib_c(kspin)%array(:, :, my_k:my_k + block_size - 1))
2241 END IF
2242
2243 CALL timeset(routinen//"_comm", handle2)
2244 DO proc_shift = 1, comm_exchange%num_pe - 1
2245 proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2246 proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2247
2248 send_ijk_index = num_ijk(proc_send)
2249
2250 rec_l_size = sizes_array(proc_receive)
2251 bi_c_rec(1:rec_l_size, 1:size_b_i) => buffer_1d(1:int(rec_l_size, kind=int_8)*size_b_i)
2252
2253 do_send_i = .false.
2254 do_send_j = .false.
2255 do_send_k = .false.
2256 IF (ijk_index <= send_ijk_index) THEN
2257 ! something to send
2258 ijk_counter_send = (ijk_index - min(1, integ_group_pos2color_sub(proc_send)))* &
2259 ngroup + integ_group_pos2color_sub(proc_send)
2260 send_i = ijk_map(1, ijk_counter_send)
2261 send_j = ijk_map(2, ijk_counter_send)
2262 send_k = ijk_map(3, ijk_counter_send)
2263
2264 do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2265 do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2266 do_send_k = send_k /= send_last_k(1, proc_shift) .OR. send_k + block_size - 1 /= send_last_k(2, proc_shift)
2267 send_last_k(1, proc_shift) = send_k
2268 send_last_k(2, proc_shift) = send_k + block_size - 1
2269 END IF
2270
2271 ! occupied i
2272 bi_c_rec = 0.0_dp
2273 IF (do_send_i) THEN
2274 IF (do_recv_i) THEN
2275 CALL comm_exchange%sendrecv(bib_c(ispin)%array(:, :, send_i), proc_send, &
2276 bi_c_rec, proc_receive, tag)
2277 ELSE
2278 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_i), proc_send, tag)
2279 END IF
2280 ELSE IF (do_recv_i) THEN
2281 CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)
2282 END IF
2283 IF (do_recv_i) THEN
2284 CALL fill_local_i_al_2d(local_al_i, ranges_info_array(:, :, proc_receive), bi_c_rec)
2285 END IF
2286
2287 ! occupied j
2288 bi_c_rec = 0.0_dp
2289 IF (do_send_j) THEN
2290 IF (do_recv_j) THEN
2291 CALL comm_exchange%sendrecv(bib_c(ispin)%array(:, :, send_j), proc_send, &
2292 bi_c_rec, proc_receive, tag)
2293 ELSE
2294 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_j), proc_send, tag)
2295 END IF
2296 ELSE IF (do_recv_j) THEN
2297 CALL comm_exchange%recv(bi_c_rec, proc_receive, tag)
2298 END IF
2299 IF (do_recv_j) THEN
2300 CALL fill_local_i_al_2d(local_al_j, ranges_info_array(:, :, proc_receive), bi_c_rec)
2301 END IF
2302
2303 ! occupied k
2304 bi_c_rec_3d(1:rec_l_size, 1:size_b_k, 1:block_size) => &
2305 buffer_1d(1:int(rec_l_size, kind=int_8)*size_b_k*block_size)
2306 IF (do_send_k) THEN
2307 IF (do_recv_k) THEN
2308 CALL comm_exchange%sendrecv(bib_c(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, &
2309 bi_c_rec_3d, proc_receive, tag)
2310 ELSE
2311 CALL comm_exchange%send(bi_c_rec, proc_receive, tag)
2312 END IF
2313 ELSE IF (do_recv_k) THEN
2314 CALL comm_exchange%recv(bi_c_rec_3d, proc_receive, tag)
2315 END IF
2316 IF (do_recv_k) THEN
2317 CALL fill_local_i_al(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, proc_receive), bi_c_rec_3d)
2318 END IF
2319 END DO
2320
2321 IF (.NOT. do_recv_i) local_al_i(:, :) = local_al_k(:, :, my_i - my_k + 1)
2322 IF (.NOT. do_recv_j) local_al_j(:, :) = local_al_k(:, :, my_j - my_k + 1)
2323 CALL timestop(handle2)
2324
2325 ! expand integrals
2326 DO kkb = 1, block_size
2327 CALL timeset(routinen//"_exp_ik", handle2)
2328 CALL dgemm_counter_start(dgemm_counter)
2329 ALLOCATE (local_ab(my_virtual, size_b_k))
2330 local_ab = 0.0_dp
2331 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_b_i, size_b_k, dimen_ri, 1.0_dp, &
2332 local_al_i, dimen_ri, local_al_k(:, :, kkb), dimen_ri, &
2333 0.0_dp, local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), 1:size_b_k), size_b_i)
2334 DO proc_shift = 1, para_env_sub%num_pe - 1
2335 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2336 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2337
2338 CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
2339
2340 external_al(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, kind=int_8)*rec_b_size)
2341
2342 CALL comm_exchange%sendrecv(local_al_i, proc_send, &
2343 external_al, proc_receive, tag)
2344
2345 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_b_size, size_b_k, dimen_ri, 1.0_dp, &
2346 external_al, dimen_ri, local_al_k(:, :, kkb), dimen_ri, &
2347 0.0_dp, local_ab(rec_b_virtual_start:rec_b_virtual_end, 1:size_b_k), rec_b_size)
2348 END DO
2349 CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_b_k, dimen_ri)
2350 CALL timestop(handle2)
2351
2352 ! Amplitudes
2353 CALL timeset(routinen//"_tab", handle2)
2354 t_ab = 0.0_dp
2355 ! Alpha-alpha, beta-beta and closed shell
2356 IF (.NOT. alpha_beta) THEN
2357 DO b = 1, size_b_k
2358 b_global = b + my_b_virtual_start(1) - 1
2359 DO a = 1, my_b_size(1)
2360 a_global = a + my_b_virtual_start(1) - 1
2361 t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
2362 (eigenval(my_i, 1) + eigenval(my_k + kkb - 1, 1) &
2363 - eigenval(homo(1) + a_global, 1) - eigenval(homo(1) + b_global, 1))
2364 END DO
2365 END DO
2366 ELSE
2367 DO b = 1, size_b_k
2368 b_global = b + my_b_virtual_start(kspin) - 1
2369 DO a = 1, my_b_size(ispin)
2370 a_global = a + my_b_virtual_start(ispin) - 1
2371 t_ab(a_global, b) = mp2_env%scale_S*local_ab(a_global, b)/ &
2372 (eigenval(my_i, ispin) + eigenval(my_k + kkb - 1, kspin) &
2373 - eigenval(homo(ispin) + a_global, ispin) - eigenval(homo(kspin) + b_global, kspin))
2374 END DO
2375 END DO
2376 END IF
2377
2378 IF (.NOT. alpha_beta) THEN
2379 DO proc_shift = 1, para_env_sub%num_pe - 1
2380 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2381 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2382 CALL get_group_dist(gd_b_virtual(1), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
2383 CALL get_group_dist(gd_b_virtual(1), proc_send, send_b_virtual_start, send_b_virtual_end, send_b_size)
2384
2385 external_ab(1:size_b_i, 1:rec_b_size) => buffer_1d(1:int(size_b_i, kind=int_8)*rec_b_size)
2386 CALL para_env_sub%sendrecv(local_ab(send_b_virtual_start:send_b_virtual_end, 1:size_b_k), proc_send, &
2387 external_ab(1:size_b_i, 1:rec_b_size), proc_receive, tag)
2388
2389 DO b = 1, my_b_size(1)
2390 b_global = b + my_b_virtual_start(1) - 1
2391 DO a = 1, rec_b_size
2392 a_global = a + rec_b_virtual_start - 1
2393 t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
2394 (eigenval(my_i, 1) + eigenval(my_k + kkb - 1, 1) &
2395 - eigenval(homo(1) + a_global, 1) - eigenval(homo(1) + b_global, 1))
2396 END DO
2397 END DO
2398 END DO
2399 END IF
2400 CALL timestop(handle2)
2401
2402 ! Expand the second set of integrals
2403 CALL timeset(routinen//"_exp_jk", handle2)
2404 local_ab = 0.0_dp
2405 CALL dgemm_counter_start(dgemm_counter)
2406 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_b_i, size_b_k, dimen_ri, 1.0_dp, &
2407 local_al_j, dimen_ri, local_al_k(:, :, kkb), dimen_ri, &
2408 0.0_dp, local_ab(my_b_virtual_start(ispin):my_b_virtual_end(ispin), 1:size_b_k), size_b_i)
2409 DO proc_shift = 1, para_env_sub%num_pe - 1
2410 proc_send = modulo(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2411 proc_receive = modulo(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2412
2413 CALL get_group_dist(gd_b_virtual(ispin), proc_receive, rec_b_virtual_start, rec_b_virtual_end, rec_b_size)
2414
2415 external_al(1:dimen_ri, 1:rec_b_size) => buffer_1d(1:int(dimen_ri, kind=int_8)*rec_b_size)
2416
2417 CALL comm_exchange%sendrecv(local_al_j, proc_send, &
2418 external_al, proc_receive, tag)
2419 CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_b_size, size_b_k, dimen_ri, 1.0_dp, &
2420 external_al, dimen_ri, local_al_k(:, :, kkb), dimen_ri, &
2421 0.0_dp, local_ab(rec_b_virtual_start:rec_b_virtual_end, 1:size_b_k), rec_b_size)
2422 END DO
2423 CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_b_k, dimen_ri)
2424 CALL timestop(handle2)
2425
2426 CALL timeset(routinen//"_Pij", handle2)
2427 DO b = 1, size_b_k
2428 b_global = b + my_b_virtual_start(kspin) - 1
2429 DO a = 1, my_b_size(ispin)
2430 a_global = a + my_b_virtual_start(ispin) - 1
2431 local_ab(a_global, b) = &
2432 local_ab(a_global, b)/(eigenval(my_j, ispin) + eigenval(my_k + kkb - 1, kspin) &
2433 - eigenval(homo(ispin) + a_global, ispin) - eigenval(homo(kspin) + b_global, kspin))
2434 END DO
2435 END DO
2436 !
2437 p_ij_elem = sum(local_ab*t_ab)
2438 DEALLOCATE (local_ab)
2439 IF ((.NOT. open_shell) .AND. (.NOT. alpha_beta)) THEN
2440 p_ij_elem = p_ij_elem*2.0_dp
2441 END IF
2442 IF (beta_beta) THEN
2443 mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) - p_ij_elem
2444 mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) - p_ij_elem
2445 ELSE
2446 mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) - p_ij_elem
2447 mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) - p_ij_elem
2448 END IF
2449 CALL timestop(handle2)
2450 END DO
2451 ELSE
2452 CALL timeset(routinen//"_comm", handle2)
2453            ! no work to be done, possible messages to be exchanged
2454 DO proc_shift = 1, comm_exchange%num_pe - 1
2455 proc_send = modulo(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2456 proc_receive = modulo(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2457
2458 send_ijk_index = num_ijk(proc_send)
2459
2460 IF (ijk_index <= send_ijk_index) THEN
2461               ! something to send
2462 ijk_counter_send = (ijk_index - min(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
2463 integ_group_pos2color_sub(proc_send)
2464 send_i = ijk_map(1, ijk_counter_send)
2465 send_j = ijk_map(2, ijk_counter_send)
2466 send_k = ijk_map(3, ijk_counter_send)
2467 block_size = ijk_map(4, ijk_counter_send)
2468
2469 do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2470 do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2471 ! occupied i
2472 IF (do_send_i) THEN
2473 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_i), proc_send, tag)
2474 END IF
2475 ! occupied j
2476 IF (do_send_j) THEN
2477 CALL comm_exchange%send(bib_c(ispin)%array(:, :, send_j), proc_send, tag)
2478 END IF
2479 ! occupied k
2480 CALL comm_exchange%send(bib_c(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, tag)
2481 END IF
2482
2483 END DO ! proc loop
2484 CALL timestop(handle2)
2485 END IF
2486 END DO ! ijk_index loop
2487 DEALLOCATE (local_al_i)
2488 DEALLOCATE (local_al_j)
2489 DEALLOCATE (local_al_k)
2490 DEALLOCATE (t_ab)
2491 DEALLOCATE (ijk_map)
2492 END DO ! over number of loops (ispin)
2493 CALL timestop(handle)
2494
2495 END SUBROUTINE quasi_degenerate_p_ij
2496
2497! **************************************************************************************************
2498!> \brief ...
2499!> \param my_ijk ...
2500!> \param homo ...
2501!> \param homo_beta ...
2502!> \param Eigenval ...
2503!> \param mp2_env ...
2504!> \param ijk_map ...
2505!> \param unit_nr ...
2506!> \param ngroup ...
2507!> \param do_print_alpha ...
2508!> \param comm_exchange ...
2509!> \param num_ijk ...
2510!> \param max_ijk ...
2511!> \param color_sub ...
2512!> \param buffer_size ...
2513!> \param my_group_L_size ...
2514!> \param B_size_k ...
2515!> \param para_env ...
2516!> \param virtual ...
2517!> \param B_size_i ...
2518! **************************************************************************************************
2519 SUBROUTINE find_quasi_degenerate_ij(my_ijk, homo, homo_beta, Eigenval, mp2_env, ijk_map, unit_nr, ngroup, &
2520 do_print_alpha, comm_exchange, num_ijk, max_ijk, color_sub, &
2521 buffer_size, my_group_L_size, B_size_k, para_env, virtual, B_size_i)
2522
2523 INTEGER, INTENT(OUT) :: my_ijk
2524 INTEGER, INTENT(IN) :: homo, homo_beta
2525 REAL(kind=dp), DIMENSION(:), INTENT(IN) :: eigenval
2526 TYPE(mp2_type), INTENT(IN) :: mp2_env
2527 INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ijk_map
2528 INTEGER, INTENT(IN) :: unit_nr, ngroup
2529 LOGICAL, INTENT(IN) :: do_print_alpha
2530 TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2531 INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: num_ijk
2532 INTEGER, INTENT(OUT) :: max_ijk
2533 INTEGER, INTENT(IN) :: color_sub, buffer_size, my_group_l_size, &
2534 b_size_k
2535 TYPE(mp_para_env_type), INTENT(IN) :: para_env
2536 INTEGER, INTENT(IN) :: virtual, b_size_i
2537
2538 INTEGER :: block_size, communication_steps, communication_volume, iib, ij_counter, &
2539 ijk_counter, jjb, kkb, max_block_size, max_num_k_blocks, min_communication_volume, &
2540 my_steps, num_k_blocks, num_sing_ij, total_ijk
2541 INTEGER(KIND=int_8) :: mem
2542 LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ijk_marker
2543
2544 ALLOCATE (num_ijk(0:comm_exchange%num_pe - 1))
2545
2546 num_sing_ij = 0
2547 DO iib = 1, homo
2548 ! diagonal elements already updated
2549 DO jjb = iib + 1, homo
2550 IF (abs(eigenval(jjb) - eigenval(iib)) < mp2_env%ri_grad%eps_canonical) &
2551 num_sing_ij = num_sing_ij + 1
2552 END DO
2553 END DO
2554
2555 IF (unit_nr > 0) THEN
2556 IF (do_print_alpha) THEN
2557 WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
2558 "MO_INFO| Number of ij pairs below EPS_CANONICAL:", num_sing_ij
2559 ELSE
2560 WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
2561 "MO_INFO| Number of ij pairs (spin beta) below EPS_CANONICAL:", num_sing_ij
2562 END IF
2563 END IF
2564
2565 ! Determine the block size, first guess: use available buffer
2566 max_block_size = buffer_size/(my_group_l_size*b_size_k)
2567
2568 ! Second limit: memory
2569 CALL m_memory(mem)
2570 ! Convert to number of doubles
2571 mem = mem/8
2572 ! Remove local_ab (2x) and local_aL_i (2x)
2573 mem = mem - 2_int_8*(virtual*b_size_k + b_size_i*my_group_l_size)
2574 max_block_size = min(max_block_size, max(1, int(mem/(my_group_l_size*b_size_k), kind(max_block_size))))
2575
2576 ! Exchange the limit
2577 CALL para_env%min(max_block_size)
2578
2579 ! Find now the block size which minimizes the communication volume and then the number of communication steps
2580 block_size = 1
2581 min_communication_volume = 3*homo_beta*num_sing_ij
2582 communication_steps = 3*homo_beta*num_sing_ij
2583 DO iib = max_block_size, 2, -1
2584 max_num_k_blocks = homo_beta/iib*num_sing_ij
2585 num_k_blocks = max_num_k_blocks - mod(max_num_k_blocks, ngroup)
2586 communication_volume = num_k_blocks*(2 + iib) + 3*(homo_beta*num_sing_ij - iib*num_k_blocks)
2587 my_steps = num_k_blocks + homo_beta*num_sing_ij - iib*num_k_blocks
2588 IF (communication_volume < min_communication_volume) THEN
2589 block_size = iib
2590 min_communication_volume = communication_volume
2591 communication_steps = my_steps
2592 ELSE IF (communication_volume == min_communication_volume .AND. my_steps < communication_steps) THEN
2593 block_size = iib
2594 communication_steps = my_steps
2595 END IF
2596 END DO
2597
2598 IF (unit_nr > 0) THEN
2599 WRITE (unit=unit_nr, fmt="(T3,A,T75,i6)") &
2600 "MO_INFO| Block size:", block_size
2601 CALL m_flush(unit_nr)
2602 END IF
2603
2604 ! Calculate number of large blocks
2605 max_num_k_blocks = homo_beta/block_size*num_sing_ij
2606 num_k_blocks = max_num_k_blocks - mod(max_num_k_blocks, ngroup)
2607
2608 total_ijk = num_k_blocks + homo_beta*num_sing_ij - num_k_blocks*block_size
2609 ALLOCATE (ijk_map(4, total_ijk))
2610 ijk_map = 0
2611 ALLOCATE (ijk_marker(homo_beta, num_sing_ij))
2612 ijk_marker = .true.
2613
2614 my_ijk = 0
2615 ijk_counter = 0
2616 ij_counter = 0
2617 DO iib = 1, homo
2618 ! diagonal elements already updated
2619 DO jjb = iib + 1, homo
2620 IF (abs(eigenval(jjb) - eigenval(iib)) >= mp2_env%ri_grad%eps_canonical) cycle
2621 ij_counter = ij_counter + 1
2622 DO kkb = 1, homo_beta - mod(homo_beta, block_size), block_size
2623 IF (ijk_counter + 1 > num_k_blocks) EXIT
2624 ijk_counter = ijk_counter + 1
2625 ijk_marker(kkb:kkb + block_size - 1, ij_counter) = .false.
2626 ijk_map(1, ijk_counter) = iib
2627 ijk_map(2, ijk_counter) = jjb
2628 ijk_map(3, ijk_counter) = kkb
2629 ijk_map(4, ijk_counter) = block_size
2630 IF (mod(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2631 END DO
2632 END DO
2633 END DO
2634 ij_counter = 0
2635 DO iib = 1, homo
2636 ! diagonal elements already updated
2637 DO jjb = iib + 1, homo
2638 IF (abs(eigenval(jjb) - eigenval(iib)) >= mp2_env%ri_grad%eps_canonical) cycle
2639 ij_counter = ij_counter + 1
2640 DO kkb = 1, homo_beta
2641 IF (ijk_marker(kkb, ij_counter)) THEN
2642 ijk_counter = ijk_counter + 1
2643 ijk_map(1, ijk_counter) = iib
2644 ijk_map(2, ijk_counter) = jjb
2645 ijk_map(3, ijk_counter) = kkb
2646 ijk_map(4, ijk_counter) = 1
2647 IF (mod(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2648 END IF
2649 END DO
2650 END DO
2651 END DO
2652
2653 DEALLOCATE (ijk_marker)
2654
2655 CALL comm_exchange%allgather(my_ijk, num_ijk)
2656 max_ijk = maxval(num_ijk)
2657
2658 END SUBROUTINE find_quasi_degenerate_ij
2659
2660END MODULE mp2_ri_gpw
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
various routines to log and control the output. The idea is that decisions about where to log should ...
Counters to determine the performance of parallel DGEMMs.
elemental subroutine, public dgemm_counter_init(dgemm_counter, unit_nr, print_info)
Initialize a dgemm_counter.
subroutine, public dgemm_counter_write(dgemm_counter, para_env)
calculate and print flop rates
subroutine, public dgemm_counter_start(dgemm_counter)
start timer of the counter
subroutine, public dgemm_counter_stop(dgemm_counter, size1, size2, size3)
stop timer of the counter and provide matrix sizes
Types to describe group distributions.
elemental integer function, public maxsize(this)
...
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public int_8
Definition kinds.F:54
integer, parameter, public dp
Definition kinds.F:34
2- and 3-center electron repulsion integral routines based on libint2 Currently available operators: ...
pure logical function, public compare_potential_types(potential1, potential2)
Helper function to compare libint_potential_types.
integer, parameter, public local_gemm_pu_gpu
Machine interface based on Fortran 2003 and POSIX.
Definition machine.F:17
subroutine, public m_memory(mem)
Returns the total amount of memory [bytes] in use, if known, zero otherwise.
Definition machine.F:452
subroutine, public m_flush(lunit)
flushes units if the &GLOBAL flag is set accordingly
Definition machine.F:136
real(kind=dp) function, public m_walltime()
returns time from a real-time clock, protected against rolling early/easily
Definition machine.F:153
Interface to the message passing library MPI.
Routines to calculate RI-GPW-MP2 energy using pw.
Definition mp2_ri_gpw.F:14
subroutine, public mp2_ri_gpw_compute_en(emp2_cou, emp2_ex, emp2_s, emp2_t, bib_c, mp2_env, para_env, para_env_sub, color_sub, gd_array, gd_b_virtual, eigenval, nmo, homo, dimen_ri, unit_nr, calc_forces, calc_ex)
...
Definition mp2_ri_gpw.F:75
Routines for calculating RI-MP2 gradients.
subroutine, public complete_gamma(mp2_env, b_ia_q, dimen_ri, homo, virtual, para_env, para_env_sub, ngroup, my_group_l_size, my_group_l_start, my_group_l_end, my_b_size, my_b_virtual_start, gd_array, gd_b_virtual, kspin)
complete the calculation of the Gamma matrices
Types needed for MP2 calculations.
Definition mp2_types.F:14
stores all the informations relevant to an mpi environment