(git:374b731)
Loading...
Searching...
No Matches
ps_wavelet_kernel.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Creates the wavelet kernel for the wavelet based poisson solver.
10!> \author Florian Schiffmann (09.2007,fschiff)
11! **************************************************************************************************
13
14 USE kinds, ONLY: dp
17 USE ps_wavelet_fft3d, ONLY: ctrig,&
19 fftstp
24#include "../base/base_uses.f90"
25
26 IMPLICIT NONE
27
28 PRIVATE
29
30 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'ps_wavelet_kernel'
31
32! *** Public data types ***
33
34 PUBLIC :: createkernel
35
36CONTAINS
37
38! **************************************************************************************************
39!> \brief Allocate a pointer which corresponds to the zero-padded FFT slice needed for
40!> calculating the convolution with the kernel expressed in the interpolating scaling
41!> function basis. The kernel pointer is unallocated on input, allocated on output.
42!> \param geocode Indicates the boundary conditions (BC) of the problem:
43!> 'F' free BC, isolated systems.
44!> The program calculates the solution as if the given density is
45!> "alone" in R^3 space.
46!> 'S' surface BC, isolated in y direction, periodic in xz plane
47!> The given density is supposed to be periodic in the xz plane,
48!> so the dimensions in these direction mus be compatible with the FFT
49!> Beware of the fact that the isolated direction is y!
50!> 'P' periodic BC.
51!> The density is supposed to be periodic in all the three directions,
52!> then all the dimensions must be compatible with the FFT.
53!> No need for setting up the kernel.
54!> \param n01 dimensions of the real space grid to be hit with the Poisson Solver
55!> \param n02 dimensions of the real space grid to be hit with the Poisson Solver
56!> \param n03 dimensions of the real space grid to be hit with the Poisson Solver
57!> \param hx grid spacings. For the isolated BC case for the moment they are supposed to
58!> be equal in the three directions
59!> \param hy grid spacings. For the isolated BC case for the moment they are supposed to
60!> be equal in the three directions
61!> \param hz grid spacings. For the isolated BC case for the moment they are supposed to
62!> be equal in the three directions
63!> \param itype_scf order of the interpolating scaling functions used in the decomposition
64!> \param iproc ,nproc: number of process, number of processes
65!> \param nproc ...
66!> \param kernel pointer for the kernel FFT. Unallocated on input, allocated on output.
67!> Its dimensions are equivalent to the region of the FFT space for which the
68!> kernel is injective. This will divide by two each direction,
69!> since the kernel for the zero-padded convolution is real and symmetric.
70!> \param mpi_group ...
71!> \date February 2007
72!> \author Luigi Genovese
73!> \note Due to the fact that the kernel dimensions are unknown before the calling, the kernel
74!> must be declared as pointer in input of this routine.
75!> To avoid that, one can properly define the kernel dimensions by adding
76!> the nd1,nd2,nd3 arguments to the PS_dim4allocation routine, then eliminating the pointer
77!> declaration.
78! **************************************************************************************************
79 SUBROUTINE createkernel(geocode, n01, n02, n03, hx, hy, hz, itype_scf, iproc, nproc, kernel, mpi_group)
80
81 CHARACTER(len=1), INTENT(in) :: geocode
82 INTEGER, INTENT(in) :: n01, n02, n03
83 REAL(kind=dp), INTENT(in) :: hx, hy, hz
84 INTEGER, INTENT(in) :: itype_scf, iproc, nproc
85 REAL(kind=dp), POINTER :: kernel(:)
86
87 CLASS(mp_comm_type), INTENT(in) :: mpi_group
88
89 INTEGER :: m1, m2, m3, md1, md2, md3, n1, n2, n3, &
90 nd1, nd2, nd3, nlimd, nlimk
91 REAL(kind=dp) :: hgrid
92
93 hgrid = max(hx, hy, hz)
94
95 IF (geocode == 'P') THEN
96
97 CALL f_fft_dimensions(n01, n02, n03, m1, m2, m3, n1, n2, n3, &
98 md1, md2, md3, nd1, nd2, nd3, nproc)
99
100 ALLOCATE (kernel(1))
101 nlimd = n2
102 nlimk = 0
103
104 ELSE IF (geocode == 'S') THEN
105
106 CALL s_fft_dimensions(n01, n02, n03, m1, m2, m3, n1, n2, n3, &
107 md1, md2, md3, nd1, nd2, nd3, nproc)
108
109 ALLOCATE (kernel(nd1*nd2*nd3/nproc))
110
111 !the kernel must be built and scattered to all the processes
112
113 CALL surfaces_kernel(n1, n2, n3, m3, nd1, nd2, nd3, hx, hz, hy, &
114 itype_scf, kernel, iproc, nproc, mpi_group)
115 !last plane calculated for the density and the kernel
116
117 nlimd = n2
118 nlimk = n3/2 + 1
119 ELSE IF (geocode == 'F') THEN
120
121 !Build the Kernel
122
123 CALL f_fft_dimensions(n01, n02, n03, m1, m2, m3, n1, n2, n3, &
124 md1, md2, md3, nd1, nd2, nd3, nproc)
125 ALLOCATE (kernel(nd1*nd2*nd3/nproc))
126
127 !the kernel must be built and scattered to all the processes
128 CALL free_kernel(n01, n02, n03, n1, n2, n3, nd1, nd2, nd3, hgrid, &
129 itype_scf, iproc, nproc, kernel, mpi_group)
130
131 !last plane calculated for the density and the kernel
132 nlimd = n2/2
133 nlimk = n3/2 + 1
134
135 ELSE
136
137 cpabort("No wavelet based poisson solver for given geometry")
138
139 END IF
140!!! IF (iproc==0) THEN
141!!! write(*,*)'done.'
142!!! write(*,'(1x,a,i0)') 'Allocate words for kernel ',nd1*nd2*nd3/nproc
143!!! !print the load balancing of the different dimensions on screen
144!!! write(*,'(1x,a,3(i5))')'Grid Dimensions:',n01,n02,n03
145!!! if (nproc > 1) then
146!!! write(*,'(1x,a,3(i5),a,3(i5),a,3(i5))')&
147!!! 'Memory occ. per proc. Density',md1,md3,md2/nproc,' Kernel',nd1,nd2,nd3/nproc
148!!! write(*,'(1x,a)')'Load Balancing--------------------------------------------'
149!!! jhd=1000
150!!! jzd=1000
151!!! npd=0
152!!! load_balancing: do jproc=0,nproc-1
153!!! !print *,'jproc,jfull=',jproc,jproc*md2/nproc,(jproc+1)*md2/nproc
154!!! if ((jproc+1)*md2/nproc <= nlimd) then
155!!! jfd=jproc
156!!! else if (jproc*md2/nproc <= nlimd) then
157!!! jhd=jproc
158!!! npd=nint(real(nlimd-(jproc)*md2/nproc,KIND=dp)/real(md2/nproc,KIND=dp)*100._dp)
159!!! else
160!!! jzd=jproc
161!!! exit load_balancing
162!!! end if
163!!! end do load_balancing
164!!! write(*,'(1x,a,i3,a)')'LB_den : processors 0 -',jfd,' work at 100%'
165!!! if (jfd < nproc-1) write(*,'(1x,a,i3,a,i3,1a)')' processor ',jhd,&
166!!! ' work at ',npd,'%'
167!!! if (jhd < nproc-1) write(*,'(1x,a,i3,1a,i3,a)')' processors ',&
168!!! jzd,' -',nproc-1,' work at 0%'
169!!! jhk=1000
170!!! jzk=1000
171!!! npk=0
172!!! if (geocode /= 'P') then
173!!! load_balancingk: do jproc=0,nproc-1
174!!! !print *,'jproc,jfull=',jproc,jproc*nd3/nproc,(jproc+1)*nd3/nproc
175!!! if ((jproc+1)*nd3/nproc <= nlimk) then
176!!! jfk=jproc
177!!! else if (jproc*nd3/nproc <= nlimk) then
178!!! jhk=jproc
179!!! npk=nint(real(nlimk-(jproc)*nd3/nproc,KIND=dp)/real(nd3/nproc,KIND=dp)*100._dp)
180!!! else
181!!! jzk=jproc
182!!! exit load_balancingk
183!!! end if
184!!! end do load_balancingk
185!!! write(*,'(1x,a,i3,a)')'LB_ker : processors 0 -',jfk,' work at 100%'
186!!! if (jfk < nproc-1) write(*,'(1x,a,i3,a,i3,1a)')' processor ',jhk,&
187!!! ' work at ',npk,'%'
188!!! if (jhk < nproc-1) write(*,'(1x,a,i3,1a,i3,a)')' processors ',jzk,' -',nproc-1,&
189!!! ' work at 0%'
190!!! end if
191!!! write(*,'(1x,a)')'The LB per processor is 1/3 LB_den + 2/3 LB_ker-----------'
192!!! end if
193!!!
194!!! END IF
195 END SUBROUTINE createkernel
196
197! **************************************************************************************************
198!> \brief Build the kernel of the Poisson operator with
199!> surfaces Boundary conditions
200!> in an interpolating scaling functions basis.
201!> Beware of the fact that the nonperiodic direction is y!
202!> \param n1 Dimensions for the FFT
203!> \param n2 Dimensions for the FFT
204!> \param n3 Dimensions for the FFT
205!> \param m3 Actual dimension in non-periodic direction
206!> \param nker1 Dimensions of the kernel (nker3=n3/2+1) nker(1,2)=n(1,2)/2+1
207!> \param nker2 Dimensions of the kernel (nker3=n3/2+1) nker(1,2)=n(1,2)/2+1
208!> \param nker3 Dimensions of the kernel (nker3=n3/2+1) nker(1,2)=n(1,2)/2+1
209!> \param h1 Mesh steps in the three dimensions
210!> \param h2 Mesh steps in the three dimensions
211!> \param h3 Mesh steps in the three dimensions
212!> \param itype_scf Order of the scaling function
213!> \param karray output array
214!> \param iproc Number of process
215!> \param nproc number of processes
216!> \param mpi_group ...
217!> \date October 2006
218!> \author L. Genovese
219! **************************************************************************************************
220 SUBROUTINE surfaces_kernel(n1, n2, n3, m3, nker1, nker2, nker3, h1, h2, h3, &
221 itype_scf, karray, iproc, nproc, mpi_group)
222
223 INTEGER, INTENT(in) :: n1, n2, n3, m3, nker1, nker2, nker3
224 REAL(kind=dp), INTENT(in) :: h1, h2, h3
225 INTEGER, INTENT(in) :: itype_scf, nproc, iproc
226 REAL(kind=dp), &
227 DIMENSION(nker1, nker2, nker3/nproc), &
228 INTENT(out) :: karray
229 TYPE(mp_comm_type), INTENT(in) :: mpi_group
230
231 INTEGER, PARAMETER :: n_points = 2**6, ncache_optimal = 8*1024
232
233 INTEGER :: i, i1, i2, i3, ic, iend, imu, ind1, ind2, inzee, ipolyord, ireim, istart, j2, &
234 j2nd, j2st, jnd1, jp2, jreim, n_cell, n_range, n_scf, nact2, ncache, nfft, num_of_mus, &
235 shift
236 INTEGER, ALLOCATABLE, DIMENSION(:) :: after, before, now
237 REAL(kind=dp) :: a, b, c, cp, d, diff, dx, fei, fer, foi, &
238 for, fr, mu1, pi, pion, ponx, pony, &
239 sp, value, x
240 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: kernel_scf, x_scf, y_scf
241 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: btrig, cossinarr
242 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: halfft_cache, kernel
243 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: kernel_mpi
244 REAL(kind=dp), DIMENSION(9, 8) :: cpol
245
246!Better if higher (1024 points are enough 10^{-14}: 2*itype_scf*n_points)
247! include "perfdata.inc"
248!FFT arrays
249!coefficients for the polynomial interpolation
250!assign the values of the coefficients
251
252 karray = 0.0_dp
253 cpol(:, :) = 1._dp
254
255 cpol(1, 2) = .25_dp
256
257 cpol(1, 3) = 1._dp/3._dp
258
259 cpol(1, 4) = 7._dp/12._dp
260 cpol(2, 4) = 8._dp/3._dp
261
262 cpol(1, 5) = 19._dp/50._dp
263 cpol(2, 5) = 3._dp/2._dp
264
265 cpol(1, 6) = 41._dp/272._dp
266 cpol(2, 6) = 27._dp/34._dp
267 cpol(3, 6) = 27._dp/272._dp
268
269 cpol(1, 7) = 751._dp/2989._dp
270 cpol(2, 7) = 73._dp/61._dp
271 cpol(3, 7) = 27._dp/61._dp
272
273 cpol(1, 8) = -989._dp/4540._dp
274 cpol(2, 8) = -1472._dp/1135._dp
275 cpol(3, 8) = 232._dp/1135._dp
276 cpol(4, 8) = -2624._dp/1135._dp
277
278 !renormalize values
279 cpol(1, 1) = .5_dp*cpol(1, 1)
280 cpol(1:2, 2) = 2._dp/3._dp*cpol(1:2, 2)
281 cpol(1:2, 3) = 3._dp/8._dp*cpol(1:2, 3)
282 cpol(1:3, 4) = 2._dp/15._dp*cpol(1:3, 4)
283 cpol(1:3, 5) = 25._dp/144._dp*cpol(1:3, 5)
284 cpol(1:4, 6) = 34._dp/105._dp*cpol(1:4, 6)
285 cpol(1:4, 7) = 2989._dp/17280._dp*cpol(1:4, 7)
286 cpol(1:5, 8) = -454._dp/2835._dp*cpol(1:5, 8)
287
288 !assign the complete values
289 cpol(2, 1) = cpol(1, 1)
290
291 cpol(3, 2) = cpol(1, 2)
292
293 cpol(3, 3) = cpol(2, 3)
294 cpol(4, 3) = cpol(1, 3)
295
296 cpol(4, 4) = cpol(2, 4)
297 cpol(5, 4) = cpol(1, 4)
298
299 cpol(4, 5) = cpol(3, 5)
300 cpol(5, 5) = cpol(2, 5)
301 cpol(6, 5) = cpol(1, 5)
302
303 cpol(5, 6) = cpol(3, 6)
304 cpol(6, 6) = cpol(2, 6)
305 cpol(7, 6) = cpol(1, 6)
306
307 cpol(5, 7) = cpol(4, 7)
308 cpol(6, 7) = cpol(3, 7)
309 cpol(7, 7) = cpol(2, 7)
310 cpol(8, 7) = cpol(1, 7)
311
312 cpol(6, 8) = cpol(4, 8)
313 cpol(7, 8) = cpol(3, 8)
314 cpol(8, 8) = cpol(2, 8)
315 cpol(9, 8) = cpol(1, 8)
316
317 !Number of integration points : 2*itype_scf*n_points
318 n_scf = 2*itype_scf*n_points
319 !Allocations
320 ALLOCATE (x_scf(0:n_scf))
321 ALLOCATE (y_scf(0:n_scf))
322
323 !Build the scaling function
324 CALL scaling_function(itype_scf, n_scf, n_range, x_scf, y_scf)
325 !Step grid for the integration
326 dx = real(n_range, kind=dp)/real(n_scf, kind=dp)
327 !Extend the range (no more calculations because fill in by 0._dp)
328 n_cell = m3
329 n_range = max(n_cell, n_range)
330
331 !Allocations
332 ncache = ncache_optimal
333 !the HalFFT must be performed only in the third dimension,
334 !and nker3=n3/2+1, hence
335 IF (ncache <= (nker3 - 1)*4) ncache = nker3 - 1*4
336
337 !enlarge the second dimension of the kernel to be compatible with nproc
338 nact2 = nker2
339 enlarge_ydim: DO
340 IF (nproc*(nact2/nproc) /= nact2) THEN
341 nact2 = nact2 + 1
342 ELSE
343 EXIT enlarge_ydim
344 END IF
345 END DO enlarge_ydim
346
347 !array for the MPI procedure
348 ALLOCATE (kernel(nker1, nact2/nproc, nker3))
349 ALLOCATE (kernel_mpi(nker1, nact2/nproc, nker3/nproc, nproc))
350 ALLOCATE (kernel_scf(n_range))
351 ALLOCATE (halfft_cache(2, ncache/4, 2))
352 ALLOCATE (cossinarr(2, n3/2 - 1))
353 ALLOCATE (btrig(2, ctrig_length))
354 ALLOCATE (after(7))
355 ALLOCATE (now(7))
356 ALLOCATE (before(7))
357
358 !constants
359 pi = 4._dp*atan(1._dp)
360
361 !arrays for the halFFT
362 CALL ctrig(n3/2, btrig, after, before, now, 1, ic)
363
364 !build the phases for the HalFFT reconstruction
365 pion = 2._dp*pi/real(n3, kind=dp)
366 DO i3 = 2, n3/2
367 x = real(i3 - 1, kind=dp)*pion
368 cossinarr(1, i3 - 1) = cos(x)
369 cossinarr(2, i3 - 1) = -sin(x)
370 END DO
371
372 ! satisfy valgrind, init arrays to large value, even if the offending bit is (likely?) padding
373 kernel = huge(0._dp)
374 kernel_mpi = huge(0._dp)
375
376 !calculate the limits of the FFT calculations
377 !that can be performed in a row remaining inside the cache
378 num_of_mus = ncache/(2*n3)
379
380 diff = 0._dp
381 !order of the polynomial to be used for integration (must be a power of two)
382 ipolyord = 8 !this part should be incorporated inside the numerical integration
383 !here we have to choice the piece of the x-y grid to cover
384
385 !let us now calculate the fraction of mu that will be considered
386 j2st = iproc*(nact2/nproc)
387 j2nd = min((iproc + 1)*(nact2/nproc), n2/2 + 1)
388
389 DO ind2 = (n1/2 + 1)*j2st + 1, (n1/2 + 1)*j2nd, num_of_mus
390 istart = ind2
391 iend = min(ind2 + (num_of_mus - 1), (n1/2 + 1)*j2nd)
392 nfft = iend - istart + 1
393 shift = 0
394
395 !initialization of the interesting part of the cache array
396 halfft_cache(:, :, :) = 0._dp
397
398 IF (istart == 1) THEN
399 !i2=1
400 shift = 1
401
402 CALL calculates_green_opt_muzero(n_range, n_scf, ipolyord, x_scf, y_scf, &
403 cpol(1, ipolyord), dx, kernel_scf)
404
405 !copy of the first zero value
406 halfft_cache(1, 1, 1) = 0._dp
407
408 DO i3 = 1, m3
409
410 value = 0.5_dp*h3*kernel_scf(i3)
411 !index in where to copy the value of the kernel
412 CALL indices(ireim, num_of_mus, n3/2 + i3, 1, ind1)
413 !index in where to copy the symmetric value
414 CALL indices(jreim, num_of_mus, n3/2 + 2 - i3, 1, jnd1)
415 halfft_cache(ireim, ind1, 1) = value
416 halfft_cache(jreim, jnd1, 1) = value
417
418 END DO
419
420 END IF
421
422 loopimpulses: DO imu = istart + shift, iend
423
424 !here there is the value of mu associated to hgrid
425 !note that we have multiplicated mu for hgrid to be comparable
426 !with mu0ref
427
428 !calculate the proper value of mu taking into account the periodic dimensions
429 !corresponding value of i1 and i2
430 i1 = mod(imu, n1/2 + 1)
431 IF (i1 == 0) i1 = n1/2 + 1
432 i2 = (imu - i1)/(n1/2 + 1) + 1
433 ponx = real(i1 - 1, kind=dp)/real(n1, kind=dp)
434 pony = real(i2 - 1, kind=dp)/real(n2, kind=dp)
435
436 mu1 = 2._dp*pi*sqrt((ponx/h1)**2 + (pony/h2)**2)*h3
437
438 CALL calculates_green_opt(n_range, n_scf, itype_scf, ipolyord, x_scf, y_scf, &
439 cpol(1, ipolyord), mu1, dx, kernel_scf)
440
441 !readjust the coefficient and define the final kernel
442
443 !copy of the first zero value
444 halfft_cache(1, imu - istart + 1, 1) = 0._dp
445 DO i3 = 1, m3
446 value = -0.5_dp*h3/mu1*kernel_scf(i3)
447 !write(80,*)mu1,i3,kernel_scf(i03)
448 !index in where to copy the value of the kernel
449 CALL indices(ireim, num_of_mus, n3/2 + i3, imu - istart + 1, ind1)
450 !index in where to copy the symmetric value
451 CALL indices(jreim, num_of_mus, n3/2 + 2 - i3, imu - istart + 1, jnd1)
452 halfft_cache(ireim, ind1, 1) = value
453 halfft_cache(jreim, jnd1, 1) = value
454 END DO
455
456 END DO loopimpulses
457
458 !now perform the FFT of the array in cache
459 inzee = 1
460 DO i = 1, ic
461 CALL fftstp(num_of_mus, nfft, n3/2, num_of_mus, n3/2, &
462 halfft_cache(1, 1, inzee), halfft_cache(1, 1, 3 - inzee), &
463 btrig, after(i), now(i), before(i), 1)
464 inzee = 3 - inzee
465 END DO
466 !assign the values of the FFT array
467 !and compare with the good results
468 DO imu = istart, iend
469
470 !corresponding value of i1 and i2
471 i1 = mod(imu, n1/2 + 1)
472 IF (i1 == 0) i1 = n1/2 + 1
473 i2 = (imu - i1)/(n1/2 + 1) + 1
474
475 j2 = i2 - j2st
476
477 a = halfft_cache(1, imu - istart + 1, inzee)
478 b = halfft_cache(2, imu - istart + 1, inzee)
479 kernel(i1, j2, 1) = a + b
480 kernel(i1, j2, n3/2 + 1) = a - b
481
482 DO i3 = 2, n3/2
483 ind1 = imu - istart + 1 + num_of_mus*(i3 - 1)
484 jnd1 = imu - istart + 1 + num_of_mus*(n3/2 + 2 - i3 - 1)
485 cp = cossinarr(1, i3 - 1)
486 sp = cossinarr(2, i3 - 1)
487 a = halfft_cache(1, ind1, inzee)
488 b = halfft_cache(2, ind1, inzee)
489 c = halfft_cache(1, jnd1, inzee)
490 d = halfft_cache(2, jnd1, inzee)
491 fer = .5_dp*(a + c)
492 fei = .5_dp*(b - d)
493 for = .5_dp*(a - c)
494 foi = .5_dp*(b + d)
495 fr = fer + cp*foi - sp*for
496 kernel(i1, j2, i3) = fr
497 END DO
498 END DO
499
500 END DO
501
502 !give to each processor a slice of the third dimension
503 IF (nproc > 1) THEN
504 CALL mpi_group%alltoall(kernel, &!nker1*(nact2/nproc)*(nker3/nproc), &
505 kernel_mpi, nker1*(nact2/nproc)*(nker3/nproc))
506
507 DO jp2 = 1, nproc
508 DO i3 = 1, nker3/nproc
509 DO i2 = 1, nact2/nproc
510 j2 = i2 + (jp2 - 1)*(nact2/nproc)
511 IF (j2 <= nker2) THEN
512 DO i1 = 1, nker1
513 karray(i1, j2, i3) = &
514 kernel_mpi(i1, i2, i3, jp2)
515 END DO
516 END IF
517 END DO
518 END DO
519 END DO
520
521 ELSE
522 karray(1:nker1, 1:nker2, 1:nker3) = kernel(1:nker1, 1:nker2, 1:nker3)
523 END IF
524
525 !De-allocations
526 DEALLOCATE (kernel)
527 DEALLOCATE (kernel_mpi)
528 DEALLOCATE (btrig)
529 DEALLOCATE (after)
530 DEALLOCATE (now)
531 DEALLOCATE (before)
532 DEALLOCATE (halfft_cache)
533 DEALLOCATE (kernel_scf)
534 DEALLOCATE (x_scf)
535 DEALLOCATE (y_scf)
536
537 END SUBROUTINE surfaces_kernel
538
539! **************************************************************************************************
540!> \brief ...
541!> \param n ...
542!> \param n_scf ...
543!> \param itype_scf ...
544!> \param intorder ...
545!> \param xval ...
546!> \param yval ...
547!> \param c ...
548!> \param mu ...
549!> \param hres ...
550!> \param g_mu ...
551! **************************************************************************************************
552 SUBROUTINE calculates_green_opt(n, n_scf, itype_scf, intorder, xval, yval, c, mu, hres, g_mu)
553 INTEGER, INTENT(in) :: n, n_scf, itype_scf, intorder
554 REAL(kind=dp), DIMENSION(0:n_scf), INTENT(in) :: xval, yval
555 REAL(kind=dp), DIMENSION(intorder+1), INTENT(in) :: c
556 REAL(kind=dp), INTENT(in) :: mu, hres
557 REAL(kind=dp), DIMENSION(n), INTENT(out) :: g_mu
558
559 REAL(kind=dp), PARAMETER :: mu_max = 0.2_dp
560
561 INTEGER :: i, iend, ikern, ivalue, izero, n_iter, &
562 nrec
563 REAL(kind=dp) :: f, filter, fl, fr, gleft, gltmp, gright, &
564 grtmp, mu0, ratio, x, x0, x1
565 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: green, green1
566
567 g_mu = 0.0_dp
568 !We calculate the number of iterations to go from mu0 to mu0_ref
569 IF (mu <= mu_max) THEN
570 n_iter = 0
571 mu0 = mu
572 ELSE
573 n_iter = 1
574 loop_iter: DO
575 ratio = real(2**n_iter, kind=dp)
576 mu0 = mu/ratio
577 IF (mu0 <= mu_max) THEN
578 EXIT loop_iter
579 END IF
580 n_iter = n_iter + 1
581 END DO loop_iter
582 END IF
583
584 !dimension needed for the correct calculation of the recursion
585 nrec = 2**n_iter*n
586
587 ALLOCATE (green(-nrec:nrec))
588
589 !initialization of the branching value
590 ikern = 0
591 izero = 0
592 initialization: DO
593 IF (xval(izero) >= real(ikern, kind=dp) .OR. izero == n_scf) EXIT initialization
594 izero = izero + 1
595 END DO initialization
596 green = 0._dp
597 !now perform the interpolation in right direction
598 ivalue = izero
599 gright = 0._dp
600 loop_right: DO
601 IF (ivalue >= n_scf - intorder - 1) EXIT loop_right
602 DO i = 1, intorder + 1
603 x = xval(ivalue) - real(ikern, kind=dp)
604 f = yval(ivalue)*exp(-mu0*x)
605 filter = intorder*c(i)
606 gright = gright + filter*f
607 ivalue = ivalue + 1
608 END DO
609 ivalue = ivalue - 1
610 END DO loop_right
611 iend = n_scf - ivalue
612 DO i = 1, iend
613 x = xval(ivalue) - real(ikern, kind=dp)
614 f = yval(ivalue)*exp(-mu0*x)
615 filter = intorder*c(i)
616 gright = gright + filter*f
617 ivalue = ivalue + 1
618 END DO
619 gright = hres*gright
620
621 !the scaling function is symmetric, so the same for the other direction
622 gleft = gright
623
624 green(ikern) = gleft + gright
625
626 !now the loop until the last value
627 DO ikern = 1, nrec
628 gltmp = 0._dp
629 grtmp = 0._dp
630 ivalue = izero
631 x0 = xval(izero)
632 loop_integration: DO
633 IF (izero == n_scf) EXIT loop_integration
634 DO i = 1, intorder + 1
635 x = xval(ivalue)
636 fl = yval(ivalue)*exp(mu0*x)
637 fr = yval(ivalue)*exp(-mu0*x)
638 filter = intorder*c(i)
639 gltmp = gltmp + filter*fl
640 grtmp = grtmp + filter*fr
641 ivalue = ivalue + 1
642 IF (xval(izero) >= real(ikern, kind=dp) .OR. izero == n_scf) THEN
643 x1 = xval(izero)
644 EXIT loop_integration
645 END IF
646 izero = izero + 1
647 END DO
648 ivalue = ivalue - 1
649 izero = izero - 1
650 END DO loop_integration
651 gleft = exp(-mu0)*(gleft + hres*exp(-mu0*real(ikern - 1, kind=dp))*gltmp)
652 IF (izero == n_scf) THEN
653 gright = 0._dp
654 ELSE
655 gright = exp(mu0)*(gright - hres*exp(mu0*real(ikern - 1, kind=dp))*grtmp)
656 END IF
657 green(ikern) = gleft + gright
658 green(-ikern) = gleft + gright
659 IF (abs(green(ikern)) <= 1.e-20_dp) THEN
660 nrec = ikern
661 EXIT
662 END IF
663 !print *,ikern,izero,n_scf,gltmp,grtmp,gleft,gright,x0,x1,green(ikern)
664 END DO
665 !now we must calculate the recursion
666 ALLOCATE (green1(-nrec:nrec))
667
668 !Start the iteration to go from mu0 to mu
669 CALL scf_recursion(itype_scf, n_iter, nrec, green(-nrec), green1(-nrec))
670
671 DO i = 1, min(n, nrec)
672 g_mu(i) = green(i - 1)
673 END DO
674 DO i = min(n, nrec) + 1, n
675 g_mu(i) = 0._dp
676 END DO
677
678 DEALLOCATE (green, green1)
679
680 END SUBROUTINE calculates_green_opt
681
682! **************************************************************************************************
683!> \brief ...
684!> \param n ...
685!> \param n_scf ...
686!> \param intorder ...
687!> \param xval ...
688!> \param yval ...
689!> \param c ...
690!> \param hres ...
691!> \param green ...
692! **************************************************************************************************
693 SUBROUTINE calculates_green_opt_muzero(n, n_scf, intorder, xval, yval, c, hres, green)
694 INTEGER, INTENT(in) :: n, n_scf, intorder
695 REAL(kind=dp), DIMENSION(0:n_scf), INTENT(in) :: xval, yval
696 REAL(kind=dp), DIMENSION(intorder+1), INTENT(in) :: c
697 REAL(kind=dp), INTENT(in) :: hres
698 REAL(kind=dp), DIMENSION(n), INTENT(out) :: green
699
700 INTEGER :: i, iend, ikern, ivalue, izero
701 REAL(kind=dp) :: c0, c1, filter, gl0, gl1, gr0, gr1, x, y
702
703!initialization of the branching value
704
705 ikern = 0
706 izero = 0
707 initialization: DO
708 IF (xval(izero) >= real(ikern, kind=dp) .OR. izero == n_scf) EXIT initialization
709 izero = izero + 1
710 END DO initialization
711 green = 0._dp
712 !first case, ikern=0
713 !now perform the interpolation in right direction
714 ivalue = izero
715 gr1 = 0._dp
716 loop_right: DO
717 IF (ivalue >= n_scf - intorder - 1) EXIT loop_right
718 DO i = 1, intorder + 1
719 x = xval(ivalue)
720 y = yval(ivalue)
721 filter = intorder*c(i)
722 gr1 = gr1 + filter*x*y
723 ivalue = ivalue + 1
724 END DO
725 ivalue = ivalue - 1
726 END DO loop_right
727 iend = n_scf - ivalue
728 DO i = 1, iend
729 x = xval(ivalue)
730 y = yval(ivalue)
731 filter = intorder*c(i)
732 gr1 = gr1 + filter*x*y
733 ivalue = ivalue + 1
734 END DO
735 gr1 = hres*gr1
736 !the scaling function is symmetric
737 gl1 = -gr1
738 gl0 = 0.5_dp
739 gr0 = 0.5_dp
740
741 green(1) = 2._dp*gr1
742
743 !now the loop until the last value
744 DO ikern = 1, n - 1
745 c0 = 0._dp
746 c1 = 0._dp
747 ivalue = izero
748 loop_integration: DO
749 IF (izero == n_scf) EXIT loop_integration
750 DO i = 1, intorder + 1
751 x = xval(ivalue)
752 y = yval(ivalue)
753 filter = intorder*c(i)
754 c0 = c0 + filter*y
755 c1 = c1 + filter*y*x
756 ivalue = ivalue + 1
757 IF (xval(izero) >= real(ikern, kind=dp) .OR. izero == n_scf) THEN
758 EXIT loop_integration
759 END IF
760 izero = izero + 1
761 END DO
762 ivalue = ivalue - 1
763 izero = izero - 1
764 END DO loop_integration
765 c0 = hres*c0
766 c1 = hres*c1
767
768 gl0 = gl0 + c0
769 gl1 = gl1 + c1
770 gr0 = gr0 - c0
771 gr1 = gr1 - c1
772 !general case
773 green(ikern + 1) = real(ikern, kind=dp)*(gl0 - gr0) + gr1 - gl1
774 !print *,ikern,izero,n_scf,gltmp,grtmp,gleft,gright,x0,x1,green(ikern)
775 END DO
776
777 END SUBROUTINE calculates_green_opt_muzero
778
779! **************************************************************************************************
780!> \brief ...
781!> \param var_realimag ...
782!> \param nelem ...
783!> \param intrn ...
784!> \param extrn ...
785!> \param index ...
786! **************************************************************************************************
787 SUBROUTINE indices(var_realimag, nelem, intrn, extrn, index)
788
789 INTEGER, INTENT(out) :: var_realimag
790 INTEGER, INTENT(in) :: nelem, intrn, extrn
791 INTEGER, INTENT(out) :: index
792
793 INTEGER :: i
794
795!real or imaginary part
796
797 var_realimag = 2 - mod(intrn, 2)
798!actual index over half the length
799
800 i = (intrn + 1)/2
801 !check
802 IF (2*(i - 1) + var_realimag /= intrn) THEN
803 print *, 'error, index=', intrn, 'var_realimag=', var_realimag, 'i=', i
804 END IF
805 !complete index to be assigned
806 index = extrn + nelem*(i - 1)
807
808 END SUBROUTINE indices
809
810! **************************************************************************************************
811!> \brief Build the kernel of a gaussian function
812!> for interpolating scaling functions.
813!> Do the parallel HalFFT of the symmetrized function and stores into
814!> memory only 1/8 of the grid divided by the number of processes nproc
815!>
816!> Build the kernel (karray) of a gaussian function
817!> for interpolating scaling functions
818!> $$ K(j) = \sum_k \omega_k \int \int \phi(x) g_k(x'-x) \delta(x'- j) dx dx' $$
819!> \param n01 Mesh dimensions of the density
820!> \param n02 Mesh dimensions of the density
821!> \param n03 Mesh dimensions of the density
822!> \param nfft1 Dimensions of the FFT grid (HalFFT in the third direction)
823!> \param nfft2 Dimensions of the FFT grid (HalFFT in the third direction)
824!> \param nfft3 Dimensions of the FFT grid (HalFFT in the third direction)
825!> \param n1k Dimensions of the kernel FFT
826!> \param n2k Dimensions of the kernel FFT
827!> \param n3k Dimensions of the kernel FFT
828!> \param hgrid Mesh step
829!> \param itype_scf Order of the scaling function (8,14,16)
830!> \param iproc ...
831!> \param nproc ...
832!> \param karray ...
833!> \param mpi_group ...
834!> \date February 2006
835!> \author T. Deutsch, L. Genovese
836! **************************************************************************************************
837 SUBROUTINE free_kernel(n01, n02, n03, nfft1, nfft2, nfft3, n1k, n2k, n3k, &
838 hgrid, itype_scf, iproc, nproc, karray, mpi_group)
839
840 INTEGER, INTENT(in) :: n01, n02, n03, nfft1, nfft2, nfft3, n1k, &
841 n2k, n3k
842 REAL(kind=dp), INTENT(in) :: hgrid
843 INTEGER, INTENT(in) :: itype_scf, iproc, nproc
844 REAL(kind=dp), DIMENSION(n1k, n2k, n3k/nproc), &
845 INTENT(out) :: karray
846 TYPE(mp_comm_type), INTENT(in) :: mpi_group
847
848 INTEGER, PARAMETER :: n_gauss = 89, n_points = 2**6
849 REAL(kind=dp), PARAMETER :: p0_ref = 1._dp
850
851 INTEGER :: i, i01, i02, i03, i1, i2, i3, i_gauss, &
852 i_kern, iend, istart, istart1, n1h, &
853 n2h, n3h, n_cell, n_iter, n_range, &
854 n_scf, nker1, nker2, nker3
855 REAL(kind=dp) :: a1, a2, a3, a_range, absci, acc_gauss, &
856 dr_gauss, dx, factor, factor2, kern, &
857 p0_cell, p0gauss, pgauss, ur_gauss
858 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: kern_1_scf, kernel_scf, x_scf, y_scf
859 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: kp
860 REAL(kind=dp), DIMENSION(n_gauss) :: p_gauss, w_gauss
861
862!Do not touch !!!!
863!Better if higher (1024 points are enough 10^{-14}: 2*itype_scf*n_points)
864!Better p_gauss for calculation
865!(the support of the exponential should be inside [-n_range/2,n_range/2])
866!Number of integration points : 2*itype_scf*n_points
867
868 n_scf = 2*itype_scf*n_points
869 !Set karray
870 karray = 0.0_dp
871 !here we must set the dimensions for the fft part, starting from the nfft
872 !remember that actually nfft2 is associated to n03 and viceversa
873
874 !dimensions that define the center of symmetry
875 n1h = nfft1/2
876 n2h = nfft2/2
877 n3h = nfft3/2
878
879 !Auxiliary dimensions only for building the FFT part
880 nker1 = nfft1
881 nker2 = nfft2
882 nker3 = nfft3/2 + 1
883
884 !adjusting the last two dimensions to be multiples of nproc
885 DO
886 IF (modulo(nker2, nproc) == 0) EXIT
887 nker2 = nker2 + 1
888 END DO
889 DO
890 IF (modulo(nker3, nproc) == 0) EXIT
891 nker3 = nker3 + 1
892 END DO
893
894 !this will be the array of the kernel in the real space
895 ALLOCATE (kp(n1h + 1, n3h + 1, nker2/nproc))
896
897 !defining proper extremes for the calculation of the
898 !local part of the kernel
899
900 istart = iproc*nker2/nproc + 1
901 iend = min((iproc + 1)*nker2/nproc, n2h + n03)
902
903 istart1 = istart
904 IF (iproc .EQ. 0) istart1 = n2h - n03 + 2
905
906 !Allocations
907 ALLOCATE (x_scf(0:n_scf))
908 ALLOCATE (y_scf(0:n_scf))
909
910 !Build the scaling function
911 CALL scaling_function(itype_scf, n_scf, n_range, x_scf, y_scf)
912 !Step grid for the integration
913 dx = real(n_range, kind=dp)/real(n_scf, kind=dp)
914 !Extend the range (no more calculations because fill in by 0._dp)
915 n_cell = max(n01, n02, n03)
916 n_range = max(n_cell, n_range)
917
918 !Allocations
919 ALLOCATE (kernel_scf(-n_range:n_range))
920 ALLOCATE (kern_1_scf(-n_range:n_range))
921
922 !Lengthes of the box (use FFT dimension)
923 a1 = hgrid*real(n01, kind=dp)
924 a2 = hgrid*real(n02, kind=dp)
925 a3 = hgrid*real(n03, kind=dp)
926
927 x_scf(:) = hgrid*x_scf(:)
928 y_scf(:) = 1._dp/hgrid*y_scf(:)
929 dx = hgrid*dx
930 !To have a correct integration
931 p0_cell = p0_ref/(hgrid*hgrid)
932
933 !Initialization of the gaussian (Beylkin)
934 CALL gequad(p_gauss, w_gauss, ur_gauss, dr_gauss, acc_gauss)
935 !In order to have a range from a_range=sqrt(a1*a1+a2*a2+a3*a3)
936 !(biggest length in the cube)
937 !We divide the p_gauss by a_range**2 and a_gauss by a_range
938 a_range = sqrt(a1*a1 + a2*a2 + a3*a3)
939 factor = 1._dp/a_range
940 !factor2 = factor*factor
941 factor2 = 1._dp/(a1*a1 + a2*a2 + a3*a3)
942 DO i_gauss = 1, n_gauss
943 p_gauss(i_gauss) = factor2*p_gauss(i_gauss)
944 END DO
945 DO i_gauss = 1, n_gauss
946 w_gauss(i_gauss) = factor*w_gauss(i_gauss)
947 END DO
948
949 kp(:, :, :) = 0._dp
950 !Use in this order (better for accuracy).
951 loop_gauss: DO i_gauss = n_gauss, 1, -1
952 !Gaussian
953 pgauss = p_gauss(i_gauss)
954
955 !We calculate the number of iterations to go from pgauss to p0_ref
956 n_iter = nint((log(pgauss) - log(p0_cell))/log(4._dp))
957 IF (n_iter <= 0) THEN
958 n_iter = 0
959 p0gauss = pgauss
960 ELSE
961 p0gauss = pgauss/4._dp**n_iter
962 END IF
963
964 !Stupid integration
965 !Do the integration with the exponential centered in i_kern
966 kernel_scf(:) = 0._dp
967 DO i_kern = 0, n_range
968 kern = 0._dp
969 DO i = 0, n_scf
970 absci = x_scf(i) - real(i_kern, kind=dp)*hgrid
971 absci = absci*absci
972 kern = kern + y_scf(i)*exp(-p0gauss*absci)*dx
973 END DO
974 kernel_scf(i_kern) = kern
975 kernel_scf(-i_kern) = kern
976 IF (abs(kern) < 1.e-18_dp) THEN
977 !Too small not useful to calculate
978 EXIT
979 END IF
980 END DO
981
982 !Start the iteration to go from p0gauss to pgauss
983 CALL scf_recursion(itype_scf, n_iter, n_range, kernel_scf, kern_1_scf)
984
985 !Add to the kernel (only the local part)
986
987 DO i3 = istart1, iend
988 i03 = i3 - n2h - 1
989 ! Crash if index out of range
990 ! Without compiler bounds checking, the calculation might run successfully but
991 ! it is also possible that the Hartree energy will blow up
992 ! This seems to happen with large MPI processor counts if the size of the
993 ! RS grid is not directly compatible with the allowed FFT dimensions in
994 ! subroutine fourier_dim (ps_wavelet_fft3d.F)
995 IF (i03 .LT. -n_range .OR. i03 .GT. n_range) THEN
996 CALL cp_abort(__location__, "Index out of range in wavelet solver. "// &
997 "Try decreasing the number of MPI processors, or adjust the PW_CUTOFF or cell size "// &
998 "so that 2*(number of RS grid points) matches the allowed FFT dimensions "// &
999 "(see ps_wavelet_fft3d.F) exactly.")
1000 END IF
1001 DO i2 = 1, n02
1002 i02 = i2 - 1
1003 DO i1 = 1, n01
1004 i01 = i1 - 1
1005 kp(i1, i2, i3 - istart + 1) = kp(i1, i2, i3 - istart + 1) + w_gauss(i_gauss)* &
1006 kernel_scf(i01)*kernel_scf(i02)*kernel_scf(i03)
1007 END DO
1008 END DO
1009 END DO
1010
1011 END DO loop_gauss
1012
1013 !De-allocations
1014 DEALLOCATE (kernel_scf)
1015 DEALLOCATE (kern_1_scf)
1016 DEALLOCATE (x_scf)
1017 DEALLOCATE (y_scf)
1018
1019!!!!END KERNEL CONSTRUCTION
1020
1021!!$ if(iproc .eq. 0) print *,"Do a 3D PHalFFT for the kernel"
1022
1023 CALL kernelfft(nfft1, nfft2, nfft3, nker1, nker2, nker3, n1k, n2k, n3k, nproc, iproc, &
1024 kp, karray, mpi_group)
1025
1026 !De-allocations
1027 DEALLOCATE (kp)
1028
1029 END SUBROUTINE free_kernel
1030
1031! **************************************************************************************************
1032!> \brief ...
1033!> \param n1 ...
1034!> \param n3 ...
1035!> \param lot ...
1036!> \param nfft ...
1037!> \param i1 ...
1038!> \param zf ...
1039!> \param zw ...
1040! **************************************************************************************************
1041 SUBROUTINE inserthalf(n1, n3, lot, nfft, i1, zf, zw)
1042 INTEGER, INTENT(in) :: n1, n3, lot, nfft, i1
1043 REAL(kind=dp), DIMENSION(n1/2+1, n3/2+1), &
1044 INTENT(in) :: zf
1045 REAL(kind=dp), DIMENSION(2, lot, n3/2), &
1046 INTENT(out) :: zw
1047
1048 INTEGER :: i01, i03i, i03r, i3, l1, l3
1049
1050 zw = 0.0_dp
1051 i3 = 0
1052 DO l3 = 1, n3, 2
1053 i3 = i3 + 1
1054 i03r = abs(l3 - n3/2 - 1) + 1
1055 i03i = abs(l3 - n3/2) + 1
1056 DO l1 = 1, nfft
1057 i01 = abs(l1 - 1 + i1 - n1/2 - 1) + 1
1058 zw(1, l1, i3) = zf(i01, i03r)
1059 zw(2, l1, i3) = zf(i01, i03i)
1060 END DO
1061 END DO
1062
1063 END SUBROUTINE inserthalf
1064
1065! **************************************************************************************************
1066!> \brief (Based on suitable modifications of S.Goedecker routines)
1067!> Calculates the FFT of the distributed kernel
1068!> \param n1 logical dimension of the transform.
1069!> \param n2 logical dimension of the transform.
1070!> \param n3 logical dimension of the transform.
1071!> \param nd1 Dimensions of work arrays
1072!> \param nd2 Dimensions of work arrays
1073!> \param nd3 Dimensions of work arrays
1074!> \param nk1 ...
1075!> \param nk2 ...
1076!> \param nk3 ...
1077!> \param nproc number of processors used as returned by MPI_COMM_SIZE
1078!> \param iproc [0:nproc-1] number of processor as returned by MPI_COMM_RANK
1079!> \param zf Real kernel (input)
1080!> zf(i1,i2,i3)
1081!> \param zr Distributed Kernel FFT
1082!> zr(2,i1,i2,i3)
1083!> \param mpi_group ...
1084!> \date February 2006
1085!> \par Restrictions
1086!> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1087!> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1088!> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1089!> This file is distributed under the terms of the
1090!> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1091!> \author S. Goedecker, L. Genovese
1092!> \note As transform lengths
1093!> most products of the prime factors 2,3,5 are allowed.
1094!> The detailed table with allowed transform lengths can
1095!> be found in subroutine CTRIG
1096! **************************************************************************************************
1097 SUBROUTINE kernelfft(n1, n2, n3, nd1, nd2, nd3, nk1, nk2, nk3, nproc, iproc, zf, zr, mpi_group)
1098
1099 INTEGER, INTENT(in) :: n1, n2, n3, nd1, nd2, nd3, nk1, nk2, &
1100 nk3, nproc, iproc
1101 REAL(kind=dp), &
1102 DIMENSION(n1/2+1, n3/2+1, nd2/nproc), &
1103 INTENT(in) :: zf
1104 REAL(kind=dp), DIMENSION(nk1, nk2, nk3/nproc), &
1105 INTENT(inout) :: zr
1106 TYPE(mp_comm_type), INTENT(in) :: mpi_group
1107
1108 INTEGER, PARAMETER :: ncache_optimal = 8*1024
1109
1110 INTEGER :: i, i1, i3, ic1, ic2, ic3, inzee, j, j2, &
1111 j2st, j3, jp2st, lot, lzt, ma, mb, &
1112 ncache, nfft
1113 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1, after2, after3, before1, &
1114 before2, before3, now1, now2, now3
1115 REAL(kind=dp) :: twopion
1116 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: cosinarr, trig1, trig2, trig3
1117 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :) :: zt, zw
1118 REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: zmpi2
1119 REAL(kind=dp), ALLOCATABLE, &
1120 DIMENSION(:, :, :, :, :) :: zmpi1
1121
1122! include "perfdata.inc"
1123!work arrays for transpositions
1124!work arrays for MPI
1125!cache work array
1126!FFT work arrays
1127!Body
1128!check input
1129
1130 cpassert(nd1 .GE. n1)
1131 cpassert(nd2 .GE. n2)
1132 cpassert(nd3 .GE. n3/2 + 1)
1133 cpassert(mod(nd3, nproc) .EQ. 0)
1134 cpassert(mod(nd2, nproc) .EQ. 0)
1135 mark_used(nd1)
1136
1137 !defining work arrays dimensions
1138 ncache = ncache_optimal
1139 IF (ncache <= max(n1, n2, n3/2)*4) ncache = max(n1, n2, n3/2)*4
1140 lzt = n2
1141 IF (mod(n2, 2) .EQ. 0) lzt = lzt + 1
1142 IF (mod(n2, 4) .EQ. 0) lzt = lzt + 1
1143
1144 !Allocations
1145 ALLOCATE (trig1(2, ctrig_length))
1146 ALLOCATE (after1(7))
1147 ALLOCATE (now1(7))
1148 ALLOCATE (before1(7))
1149 ALLOCATE (trig2(2, ctrig_length))
1150 ALLOCATE (after2(7))
1151 ALLOCATE (now2(7))
1152 ALLOCATE (before2(7))
1153 ALLOCATE (trig3(2, ctrig_length))
1154 ALLOCATE (after3(7))
1155 ALLOCATE (now3(7))
1156 ALLOCATE (before3(7))
1157 ALLOCATE (zw(2, ncache/4, 2))
1158 ALLOCATE (zt(2, lzt, n1))
1159 ALLOCATE (zmpi2(2, n1, nd2/nproc, nd3))
1160 ALLOCATE (cosinarr(2, n3/2))
1161 IF (nproc .GT. 1) THEN
1162 ALLOCATE (zmpi1(2, n1, nd2/nproc, nd3/nproc, nproc))
1163 zmpi1 = 0.0_dp
1164 END IF
1165
1166 zmpi2 = 0.0_dp
1167 !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
1168 CALL ctrig(n3/2, trig3, after3, before3, now3, 1, ic3)
1169 CALL ctrig(n1, trig1, after1, before1, now1, 1, ic1)
1170 CALL ctrig(n2, trig2, after2, before2, now2, 1, ic2)
1171
1172 !Calculating array of phases for HalFFT decoding
1173 twopion = 8._dp*atan(1._dp)/real(n3, kind=dp)
1174 DO i3 = 1, n3/2
1175 cosinarr(1, i3) = cos(twopion*(i3 - 1))
1176 cosinarr(2, i3) = -sin(twopion*(i3 - 1))
1177 END DO
1178
1179 !transform along z axis
1180
1181 lot = ncache/(2*n3)
1182 cpassert(lot .GE. 1)
1183
1184 DO j2 = 1, nd2/nproc
1185 !this condition ensures that we manage only the interesting part for the FFT
1186 IF (iproc*(nd2/nproc) + j2 .LE. n2) THEN
1187 DO i1 = 1, n1, lot
1188 ma = i1
1189 mb = min(i1 + (lot - 1), n1)
1190 nfft = mb - ma + 1
1191
1192 !inserting real data into complex array of half length
1193 !input: I1,I3,J2,(Jp2)
1194
1195 CALL inserthalf(n1, n3, lot, nfft, i1, zf(1, 1, j2), zw(1, 1, 1))
1196
1197 !performing FFT
1198 inzee = 1
1199 DO i = 1, ic3
1200 CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1201 trig3, after3(i), now3(i), before3(i), 1)
1202 inzee = 3 - inzee
1203 END DO
1204 !output: I1,i3,J2,(Jp2)
1205
1206 !unpacking FFT in order to restore correct result,
1207 !while exchanging components
1208 !input: I1,i3,J2,(Jp2)
1209 CALL scramble_unpack(i1, j2, lot, nfft, n1, n3, nd2, nproc, nd3, zw(1, 1, inzee), zmpi2, cosinarr)
1210 !output: I1,J2,i3,(Jp2)
1211 END DO
1212 END IF
1213 END DO
1214
1215 !Interprocessor data transposition
1216 !input: I1,J2,j3,jp3,(Jp2)
1217 IF (nproc .GT. 1) THEN
1218 !communication scheduling
1219 CALL mpi_group%alltoall(zmpi2, &!2*n1*(nd2/nproc)*(nd3/nproc), &
1220 zmpi1, 2*n1*(nd2/nproc)*(nd3/nproc))
1221 ! output: I1,J2,j3,Jp2,(jp3)
1222 END IF
1223
1224 DO j3 = 1, nd3/nproc
1225 !this condition ensures that we manage only the interesting part for the FFT
1226 IF (iproc*(nd3/nproc) + j3 .LE. n3/2 + 1) THEN
1227 jp2st = 1
1228 j2st = 1
1229
1230 !transform along x axis
1231 lot = ncache/(4*n1)
1232 cpassert(lot .GE. 1)
1233
1234 DO j = 1, n2, lot
1235 ma = j
1236 mb = min(j + (lot - 1), n2)
1237 nfft = mb - ma + 1
1238
1239 !reverse ordering
1240 !input: I1,J2,j3,Jp2,(jp3)
1241 IF (nproc .EQ. 1) THEN
1242 CALL mpiswitch(j3, nfft, jp2st, j2st, lot, n1, nd2, nd3, nproc, zmpi2, zw(1, 1, 1))
1243 ELSE
1244 CALL mpiswitch(j3, nfft, jp2st, j2st, lot, n1, nd2, nd3, nproc, zmpi1, zw(1, 1, 1))
1245 END IF
1246 !output: J2,Jp2,I1,j3,(jp3)
1247
1248 !performing FFT
1249 !input: I2,I1,j3,(jp3)
1250 inzee = 1
1251 DO i = 1, ic1 - 1
1252 CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1253 trig1, after1(i), now1(i), before1(i), 1)
1254 inzee = 3 - inzee
1255 END DO
1256 !storing the last step into zt
1257 i = ic1
1258 CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
1259 trig1, after1(i), now1(i), before1(i), 1)
1260 !output: I2,i1,j3,(jp3)
1261 END DO
1262
1263 !transform along y axis, and taking only the first half
1264 lot = ncache/(4*n2)
1265 cpassert(lot .GE. 1)
1266
1267 DO j = 1, nk1, lot
1268 ma = j
1269 mb = min(j + (lot - 1), nk1)
1270 nfft = mb - ma + 1
1271
1272 !reverse ordering
1273 !input: I2,i1,j3,(jp3)
1274 CALL switch(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
1275 !output: i1,I2,j3,(jp3)
1276
1277 !performing FFT
1278 !input: i1,I2,j3,(jp3)
1279 inzee = 1
1280 DO i = 1, ic2
1281 CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1282 trig2, after2(i), now2(i), before2(i), 1)
1283 inzee = 3 - inzee
1284 END DO
1285
1286 CALL realcopy(lot, nfft, n2, nk1, nk2, zw(1, 1, inzee), zr(j, 1, j3))
1287
1288 END DO
1289 !output: i1,i2,j3,(jp3)
1290 END IF
1291 END DO
1292
1293 !De-allocations
1294 DEALLOCATE (trig1)
1295 DEALLOCATE (after1)
1296 DEALLOCATE (now1)
1297 DEALLOCATE (before1)
1298 DEALLOCATE (trig2)
1299 DEALLOCATE (after2)
1300 DEALLOCATE (now2)
1301 DEALLOCATE (before2)
1302 DEALLOCATE (trig3)
1303 DEALLOCATE (after3)
1304 DEALLOCATE (now3)
1305 DEALLOCATE (before3)
1306 DEALLOCATE (zmpi2)
1307 DEALLOCATE (zw)
1308 DEALLOCATE (zt)
1309 DEALLOCATE (cosinarr)
1310 IF (nproc .GT. 1) DEALLOCATE (zmpi1)
1311
1312 END SUBROUTINE kernelfft
1313
1314! **************************************************************************************************
1315!> \brief ...
1316!> \param lot ...
1317!> \param nfft ...
1318!> \param n2 ...
1319!> \param nk1 ...
1320!> \param nk2 ...
1321!> \param zin ...
1322!> \param zout ...
1323! **************************************************************************************************
1324 SUBROUTINE realcopy(lot, nfft, n2, nk1, nk2, zin, zout)
1325 INTEGER, INTENT(in) :: lot, nfft, n2, nk1, nk2
1326 REAL(kind=dp), DIMENSION(2, lot, n2), INTENT(in) :: zin
1327 REAL(kind=dp), DIMENSION(nk1, nk2), INTENT(inout) :: zout
1328
1329 INTEGER :: i, j
1330
1331 DO i = 1, nk2
1332 DO j = 1, nfft
1333 zout(j, i) = zin(1, j, i)
1334 END DO
1335 END DO
1336
1337 END SUBROUTINE realcopy
1338
1339! **************************************************************************************************
1340!> \brief ...
1341!> \param nfft ...
1342!> \param n2 ...
1343!> \param lot ...
1344!> \param n1 ...
1345!> \param lzt ...
1346!> \param zt ...
1347!> \param zw ...
1348! **************************************************************************************************
1349 SUBROUTINE switch(nfft, n2, lot, n1, lzt, zt, zw)
1350 INTEGER :: nfft, n2, lot, n1, lzt
1351 REAL(kind=dp) :: zt(2, lzt, n1), zw(2, lot, n2)
1352
1353 INTEGER :: i, j
1354
1355 DO 200, j = 1, nfft
1356 DO 100, i = 1, n2
1357 zw(1, j, i) = zt(1, i, j)
1358 zw(2, j, i) = zt(2, i, j)
1359100 CONTINUE
1360200 CONTINUE
1361 RETURN
1362 END SUBROUTINE switch
1363
1364! **************************************************************************************************
1365!> \brief ...
1366!> \param j3 ...
1367!> \param nfft ...
1368!> \param Jp2st ...
1369!> \param J2st ...
1370!> \param lot ...
1371!> \param n1 ...
1372!> \param nd2 ...
1373!> \param nd3 ...
1374!> \param nproc ...
1375!> \param zmpi1 ...
1376!> \param zw ...
1377! **************************************************************************************************
1378 SUBROUTINE mpiswitch(j3, nfft, Jp2st, J2st, lot, n1, nd2, nd3, nproc, zmpi1, zw)
1379 INTEGER :: j3, nfft, jp2st, j2st, lot, n1, nd2, &
1380 nd3, nproc
1381 REAL(kind=dp) :: zmpi1(2, n1, nd2/nproc, nd3/nproc, nproc), zw(2, lot, n1)
1382
1383 INTEGER :: i1, j2, jp2, mfft
1384
1385 mfft = 0
1386 DO 300, jp2 = jp2st, nproc
1387 DO 200, j2 = j2st, nd2/nproc
1388 mfft = mfft + 1
1389 IF (mfft .GT. nfft) THEN
1390 jp2st = jp2
1391 j2st = j2
1392 RETURN
1393 END IF
1394 DO 100, i1 = 1, n1
1395 zw(1, mfft, i1) = zmpi1(1, i1, j2, j3, jp2)
1396 zw(2, mfft, i1) = zmpi1(2, i1, j2, j3, jp2)
1397100 CONTINUE
1398200 CONTINUE
1399 j2st = 1
1400300 CONTINUE
1401 END SUBROUTINE mpiswitch
1402
1403! **************************************************************************************************
1404!> \brief ...
1405!> \param p ...
1406!> \param w ...
1407!> \param urange ...
1408!> \param drange ...
1409!> \param acc ...
1410! **************************************************************************************************
1411 SUBROUTINE gequad(p, w, urange, drange, acc)
1412!
1413 REAL(kind=dp) :: p(*), w(*), urange, drange, acc
1414
1415!
1416!
1417! range [10^(-9),1] and accuracy ~10^(-8);
1418!
1419!
1420
1421 p(1) = 4.96142640560223544e19_dp
1422 p(2) = 1.37454269147978052e19_dp
1423 p(3) = 7.58610013441204679e18_dp
1424 p(4) = 4.42040691347806996e18_dp
1425 p(5) = 2.61986077948367892e18_dp
1426 p(6) = 1.56320138155496681e18_dp
1427 p(7) = 9.35645215863028402e17_dp
1428 p(8) = 5.60962910452691703e17_dp
1429 p(9) = 3.3666225119686761e17_dp
1430 p(10) = 2.0218253197947866e17_dp
1431 p(11) = 1.21477756091902017e17_dp
1432 p(12) = 7.3012982513608503e16_dp
1433 p(13) = 4.38951893556421099e16_dp
1434 p(14) = 2.63949482512262325e16_dp
1435 p(15) = 1.58742054072786174e16_dp
1436 p(16) = 9.54806587737665531e15_dp
1437 p(17) = 5.74353712364571709e15_dp
1438 p(18) = 3.455214877389445e15_dp
1439 p(19) = 2.07871658520326804e15_dp
1440 p(20) = 1.25064667315629928e15_dp
1441 p(21) = 7.52469429541933745e14_dp
1442 p(22) = 4.5274603337253175e14_dp
1443 p(23) = 2.72414006900059548e14_dp
1444 p(24) = 1.63912168349216752e14_dp
1445 p(25) = 9.86275802590865738e13_dp
1446 p(26) = 5.93457701624974985e13_dp
1447 p(27) = 3.5709554322296296e13_dp
1448 p(28) = 2.14872890367310454e13_dp
1449 p(29) = 1.29294719957726902e13_dp
1450 p(30) = 7.78003375426361016e12_dp
1451 p(31) = 4.68148199759876704e12_dp
1452 p(32) = 2.8169955024829868e12_dp
1453 p(33) = 1.69507790481958464e12_dp
1454 p(34) = 1.01998486064607581e12_dp
1455 p(35) = 6.13759486539856459e11_dp
1456 p(36) = 3.69320183828682544e11_dp
1457 p(37) = 2.22232783898905102e11_dp
1458 p(38) = 1.33725247623668682e11_dp
1459 p(39) = 8.0467192739036288e10_dp
1460 p(40) = 4.84199582415144143e10_dp
1461 p(41) = 2.91360091170559564e10_dp
1462 p(42) = 1.75321747475309216e10_dp
1463 p(43) = 1.0549735552210995e10_dp
1464 p(44) = 6.34815321079006586e9_dp
1465 p(45) = 3.81991113733594231e9_dp
1466 p(46) = 2.29857747533101109e9_dp
1467 p(47) = 1.38313653595483694e9_dp
1468 p(48) = 8.32282908580025358e8_dp
1469 p(49) = 5.00814519374587467e8_dp
1470 p(50) = 3.01358090773319025e8_dp
1471 p(51) = 1.81337994217503535e8_dp
1472 p(52) = 1.09117589961086823e8_dp
1473 p(53) = 6.56599771718640323e7_dp
1474 p(54) = 3.95099693638497164e7_dp
1475 p(55) = 2.37745694710665991e7_dp
1476 p(56) = 1.43060135285912813e7_dp
1477 p(57) = 8.60844290313506695e6_dp
1478 p(58) = 5.18000974075383424e6_dp
1479 p(59) = 3.116998193057466e6_dp
1480 p(60) = 1.87560993870024029e6_dp
1481 p(61) = 1.12862197183979562e6_dp
1482 p(62) = 679132.441326077231_dp
1483 p(63) = 408658.421279877969_dp
1484 p(64) = 245904.473450669789_dp
1485 p(65) = 147969.568088321005_dp
1486 p(66) = 89038.612357311147_dp
1487 p(67) = 53577.7362552358895_dp
1488 p(68) = 32239.6513926914668_dp
1489 p(69) = 19399.7580852362791_dp
1490 p(70) = 11673.5323603058634_dp
1491 p(71) = 7024.38438577707758_dp
1492 p(72) = 4226.82479307685999_dp
1493 p(73) = 2543.43254175354295_dp
1494 p(74) = 1530.47486269122675_dp
1495 p(75) = 920.941785160749482_dp
1496 p(76) = 554.163803906291646_dp
1497 p(77) = 333.46029740785694_dp
1498 p(78) = 200.6550575335041_dp
1499 p(79) = 120.741366914147284_dp
1500 p(80) = 72.6544243200329916_dp
1501 p(81) = 43.7187810415471025_dp
1502 p(82) = 26.3071631447061043_dp
1503 p(83) = 15.8299486353816329_dp
1504 p(84) = 9.52493152341244004_dp
1505 p(85) = 5.72200417067776041_dp
1506 p(86) = 3.36242234070940928_dp
1507 p(87) = 1.75371394604499472_dp
1508 p(88) = 0.64705932650658966_dp
1509 p(89) = 0.072765905943708247_dp
1510 !
1511 w(1) = 47.67445484528304247e10_dp
1512 w(2) = 11.37485774750442175e9_dp
1513 w(3) = 78.64340976880190239e8_dp
1514 w(4) = 46.27335788759590498e8_dp
1515 w(5) = 24.7380464827152951e8_dp
1516 w(6) = 13.62904116438987719e8_dp
1517 w(7) = 92.79560029045882433e8_dp
1518 w(8) = 52.15931216254660251e8_dp
1519 w(9) = 31.67018011061666244e8_dp
1520 w(10) = 1.29291036801493046e8_dp
1521 w(11) = 1.00139319988015862e8_dp
1522 w(12) = 7.75892350510188341e7_dp
1523 w(13) = 6.01333567950731271e7_dp
1524 w(14) = 4.66141178654796875e7_dp
1525 w(15) = 3.61398903394911448e7_dp
1526 w(16) = 2.80225846672956389e7_dp
1527 w(17) = 2.1730509180930247e7_dp
1528 w(18) = 1.68524482625876965e7_dp
1529 w(19) = 1.30701489345870338e7_dp
1530 w(20) = 1.01371784832269282e7_dp
1531 w(21) = 7.86264116300379329e6_dp
1532 w(22) = 6.09861667912273717e6_dp
1533 w(23) = 4.73045784039455683e6_dp
1534 w(24) = 3.66928949951594161e6_dp
1535 w(25) = 2.8462050836230259e6_dp
1536 w(26) = 2.20777394798527011e6_dp
1537 w(27) = 1.71256191589205524e6_dp
1538 w(28) = 1.32843556197737076e6_dp
1539 w(29) = 1.0304731275955989e6_dp
1540 w(30) = 799345.206572271448_dp
1541 w(31) = 620059.354143595343_dp
1542 w(32) = 480986.704107449333_dp
1543 w(33) = 373107.167700228515_dp
1544 w(34) = 289424.08337412132_dp
1545 w(35) = 224510.248231581788_dp
1546 w(36) = 174155.825690028966_dp
1547 w(37) = 135095.256919654065_dp
1548 w(38) = 104795.442776800312_dp
1549 w(39) = 81291.4458222430418_dp
1550 w(40) = 63059.0493649328682_dp
1551 w(41) = 48915.9040455329689_dp
1552 w(42) = 37944.8484018048756_dp
1553 w(43) = 29434.4290473253969_dp
1554 w(44) = 22832.7622054490044_dp
1555 w(45) = 17711.743950151233_dp
1556 w(46) = 13739.287867104177_dp
1557 w(47) = 10657.7895710752585_dp
1558 w(48) = 8267.42141053961834_dp
1559 w(49) = 6413.17397520136448_dp
1560 w(50) = 4974.80402838654277_dp
1561 w(51) = 3859.03698188553047_dp
1562 w(52) = 2993.51824493299154_dp
1563 w(53) = 2322.1211966811754_dp
1564 w(54) = 1801.30750964719641_dp
1565 w(55) = 1397.30379659817038_dp
1566 w(56) = 1083.91149143250697_dp
1567 w(57) = 840.807939169209188_dp
1568 w(58) = 652.228524366749422_dp
1569 w(59) = 505.944376983506128_dp
1570 w(60) = 392.469362317941064_dp
1571 w(61) = 304.444930257324312_dp
1572 w(62) = 236.162932842453601_dp
1573 w(63) = 183.195466078603525_dp
1574 w(64) = 142.107732186551471_dp
1575 w(65) = 110.23530215723992_dp
1576 w(66) = 85.5113346705382257_dp
1577 w(67) = 66.3325469806696621_dp
1578 w(68) = 51.4552463353841373_dp
1579 w(69) = 39.9146798429449273_dp
1580 w(70) = 30.9624728409162095_dp
1581 w(71) = 24.018098812215013_dp
1582 w(72) = 18.6312338024296588_dp
1583 w(73) = 14.4525541233150501_dp
1584 w(74) = 11.2110836519105938_dp
1585 w(75) = 8.69662175848497178_dp
1586 w(76) = 6.74611236165731961_dp
1587 w(77) = 5.23307018057529994_dp
1588 w(78) = 4.05937850501539556_dp
1589 w(79) = 3.14892659076635714_dp
1590 w(80) = 2.44267408211071604_dp
1591 w(81) = 1.89482240522855261_dp
1592 w(82) = 1.46984505907050079_dp
1593 w(83) = 1.14019261330527007_dp
1594 w(84) = 0.884791217422925293_dp
1595 w(85) = 0.692686387080616483_dp
1596 w(86) = 0.585244576897023282_dp
1597 w(87) = 0.576182522545327589_dp
1598 w(88) = 0.596688817388997178_dp
1599 w(89) = 0.607879901151108771_dp
1600 !
1601 !
1602 urange = 1._dp
1603 drange = 1e-08_dp
1604 acc = 1e-08_dp
1605 !
1606 RETURN
1607 END SUBROUTINE gequad
1608
1609 END MODULE ps_wavelet_kernel
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
for(int lxp=0;lxp<=lp;lxp++)
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public dp
Definition kinds.F:34
integer, parameter, public sp
Definition kinds.F:33
Interface to the message passing library MPI.
Creates the wavelet kernel for the wavelet based poisson solver.
subroutine, public scramble_unpack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw, zmpi2, cosinarr)
(Based on suitable modifications of S.Goedecker routines) Assign the correct planes to the work array...
integer, parameter, public ctrig_length
subroutine, public fftstp(mm, nfft, m, nn, n, zin, zout, trig, after, now, before, isign)
...
subroutine, public ctrig(n, trig, after, before, now, isign, ic)
...
Creates the wavelet kernel for the wavelet based poisson solver.
subroutine, public createkernel(geocode, n01, n02, n03, hx, hy, hz, itype_scf, iproc, nproc, kernel, mpi_group)
Allocate a pointer which corresponds to the zero-padded FFT slice needed for calculating the convolut...
Creates the wavelet kernel for the wavelet based poisson solver.
subroutine, public scf_recursion(itype, n_iter, n_range, kernel_scf, kern_1_scf)
Do iterations to go from p0gauss to pgauss order interpolating scaling function.
subroutine, public scaling_function(itype, nd, nrange, a, x)
Calculate the values of a scaling function in real uniform grid.
Performs a wavelet based solution of the Poisson equation.
subroutine, public s_fft_dimensions(n01, n02, n03, m1, m2, m3, n1, n2, n3, md1, md2, md3, nd1, nd2, nd3, nproc)
Calculate four sets of dimension needed for the calculation of the convolution for the surface system...
subroutine, public f_fft_dimensions(n01, n02, n03, m1, m2, m3, n1, n2, n3, md1, md2, md3, nd1, nd2, nd3, nproc)
Calculate four sets of dimension needed for the calculation of the zero-padded convolution.