(git:e7e05ae)
pw_gpu.F
Go to the documentation of this file.
1 !--------------------------------------------------------------------------------------------------!
2 ! CP2K: A general program to perform molecular dynamics simulations !
3 ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 ! !
5 ! SPDX-License-Identifier: GPL-2.0-or-later !
6 !--------------------------------------------------------------------------------------------------!
7 
8 ! **************************************************************************************************
9 !> \note
10 !> This module contains routines necessary to operate on plane waves on GPUs
11 ! > independently of the GPU platform.
12 !> \par History
13 !> BGL (06-Mar-2008) : Created
14 !> AG (18-May-2012) : Refacturing:
15 !> - added explicit interfaces to C routines
16 !> - enable double precision complex transformations
17 !> AG (11-Sept-2012) : Modifications:
18 !> - use pointers if precision mapping is not required
19 !> - use OMP for mapping
20 !> MT (Jan 2022) : Modifications
21 !> - use a generic interface for fft calls to GPUs
22 !> - Support both Nvidia and AMD GPUs. Other GPUs manufacturers
23 !> can be added easily.
24 !> \author Benjamin G. Levine
25 ! **************************************************************************************************
26 MODULE pw_gpu
27  USE iso_c_binding, ONLY: c_double,&
28  c_int,&
29  c_loc,&
30  c_ptr
31  USE fft_tools, ONLY: &
34  USE kinds, ONLY: dp
35  USE mathconstants, ONLY: z_zero
36  USE message_passing, ONLY: mp_cart_type,&
37  mp_comm_type
38  USE pw_grid_types, ONLY: fullspace
39  USE pw_types, ONLY: pw_c1d_gs_type,&
40  pw_r3d_rs_type
41 #include "../base/base_uses.f90"
42 
43  IMPLICIT NONE
44 
45  PRIVATE
46 
47  PUBLIC :: pw_gpu_r3dc1d_3d
48  PUBLIC :: pw_gpu_c1dr3d_3d
49  PUBLIC :: pw_gpu_r3dc1d_3d_ps
50  PUBLIC :: pw_gpu_c1dr3d_3d_ps
51  PUBLIC :: pw_gpu_init, pw_gpu_finalize
52 
53  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_gpu'
54  LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .false.
55 
56 CONTAINS
57 
58 ! **************************************************************************************************
59 !> \brief Allocates resources on the gpu device for gpu fft acceleration
60 !> \author Ole Schuett
61 ! **************************************************************************************************
62  SUBROUTINE pw_gpu_init()
63  INTEGER :: dummy
64  INTERFACE
65  SUBROUTINE pw_gpu_init_c() BIND(C, name="pw_gpu_init")
66  END SUBROUTINE pw_gpu_init_c
67  END INTERFACE
68 
69  mark_used(dummy) ! TODO: fix fpretty
70 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
71  CALL pw_gpu_init_c()
72 #else
73  ! Nothing to do.
74 #endif
75  END SUBROUTINE pw_gpu_init
76 
77 ! **************************************************************************************************
78 !> \brief Releases resources on the gpu device for gpu fft acceleration
79 !> \author Ole Schuett
80 ! **************************************************************************************************
81  SUBROUTINE pw_gpu_finalize()
82  INTEGER :: dummy
83  INTERFACE
84  SUBROUTINE pw_gpu_finalize_c() BIND(C, name="pw_gpu_finalize")
85  END SUBROUTINE pw_gpu_finalize_c
86  END INTERFACE
87 
88  mark_used(dummy) ! TODO: fix fpretty
89 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
90  CALL pw_gpu_finalize_c()
91 #else
92  ! Nothing to do.
93 #endif
94  END SUBROUTINE pw_gpu_finalize
95 
96 ! **************************************************************************************************
97 !> \brief perform an fft followed by a gather on the gpu
98 !> \param pw1 ...
99 !> \param pw2 ...
100 !> \param scale ...
101 !> \author Benjamin G Levine
102 ! **************************************************************************************************
103  SUBROUTINE pw_gpu_r3dc1d_3d(pw1, pw2, scale)
104  TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
105  TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
106  REAL(kind=dp), INTENT(IN) :: scale
107 
108  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_r3dc1d_3d'
109 
110  COMPLEX(KIND=dp), POINTER :: ptr_pwout
111  INTEGER :: handle, l1, l2, l3, ngpts
112  INTEGER, DIMENSION(:), POINTER :: npts
113  INTEGER, POINTER :: ptr_ghatmap
114  REAL(kind=dp), POINTER :: ptr_pwin
115  INTERFACE
116  SUBROUTINE pw_gpu_cfffg_c(din, zout, ghatmap, npts, ngpts, scale) BIND(C, name="pw_gpu_cfffg")
117  IMPORT
118  TYPE(c_ptr), INTENT(IN), VALUE :: din
119  TYPE(c_ptr), VALUE :: zout
120  TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
121  INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
122  INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts
123  REAL(kind=c_double), INTENT(IN), VALUE :: scale
124 
125  END SUBROUTINE pw_gpu_cfffg_c
126  END INTERFACE
127 
128  CALL timeset(routinen, handle)
129 
130  ngpts = SIZE(pw2%pw_grid%gsq)
131  l1 = lbound(pw1%array, 1)
132  l2 = lbound(pw1%array, 2)
133  l3 = lbound(pw1%array, 3)
134  npts => pw1%pw_grid%npts
135 
136  ! pointers to data arrays
137  ptr_pwin => pw1%array(l1, l2, l3)
138  ptr_pwout => pw2%array(1)
139 
140  ! pointer to map array
141  ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
142 
143  ! invoke the combined transformation
144 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
145  CALL pw_gpu_cfffg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, scale)
146 #else
147  mark_used(scale)
148  cpabort("Compiled without pw offloading.")
149 #endif
150 
151  CALL timestop(handle)
152  END SUBROUTINE pw_gpu_r3dc1d_3d
153 
154 ! **************************************************************************************************
155 !> \brief perform an scatter followed by a fft on the gpu
156 !> \param pw1 ...
157 !> \param pw2 ...
158 !> \param scale ...
159 !> \author Benjamin G Levine
160 ! **************************************************************************************************
161  SUBROUTINE pw_gpu_c1dr3d_3d(pw1, pw2, scale)
162  TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
163  TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
164  REAL(kind=dp), INTENT(IN) :: scale
165 
166  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_c1dr3d_3d'
167 
168  COMPLEX(KIND=dp), POINTER :: ptr_pwin
169  INTEGER :: handle, l1, l2, l3, ngpts, nmaps
170  INTEGER, DIMENSION(:), POINTER :: npts
171  INTEGER, POINTER :: ptr_ghatmap
172  REAL(kind=dp), POINTER :: ptr_pwout
173  INTERFACE
174  SUBROUTINE pw_gpu_sfffc_c(zin, dout, ghatmap, npts, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sfffc")
175  IMPORT
176  TYPE(c_ptr), INTENT(IN), VALUE :: zin
177  TYPE(c_ptr), VALUE :: dout
178  TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
179  INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
180  INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts, nmaps
181  REAL(kind=c_double), INTENT(IN), VALUE :: scale
182  END SUBROUTINE pw_gpu_sfffc_c
183  END INTERFACE
184 
185  CALL timeset(routinen, handle)
186 
187  ngpts = SIZE(pw1%pw_grid%gsq)
188  l1 = lbound(pw2%array, 1)
189  l2 = lbound(pw2%array, 2)
190  l3 = lbound(pw2%array, 3)
191  npts => pw1%pw_grid%npts
192 
193  ! pointers to data arrays
194  ptr_pwin => pw1%array(1)
195  ptr_pwout => pw2%array(l1, l2, l3)
196 
197  ! pointer to map array
198  nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
199  ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
200 
201  ! invoke the combined transformation
202 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
203  CALL pw_gpu_sfffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, nmaps, scale)
204 #else
205  mark_used(scale)
206  cpabort("Compiled without pw offloading")
207 #endif
208 
209  CALL timestop(handle)
210  END SUBROUTINE pw_gpu_c1dr3d_3d
211 
212 ! **************************************************************************************************
213 !> \brief perform an parallel fft followed by a gather on the gpu
214 !> \param pw1 ...
215 !> \param pw2 ...
216 !> \param scale ...
217 !> \author Andreas Gloess
218 ! **************************************************************************************************
219  SUBROUTINE pw_gpu_r3dc1d_3d_ps(pw1, pw2, scale)
220  TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
221  TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
222  REAL(kind=dp), INTENT(IN) :: scale
223 
224  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_r3dc1d_3d_ps'
225 
226  COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
227  COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
228  INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
229  mz2, n1, n2, ngpts, nmax, numtask, &
230  numtask_g, numtask_r, rp
231  INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
232  INTEGER, DIMENSION(2) :: r_dim, r_pos
233  INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
234  INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
235  TYPE(fft_scratch_sizes) :: fft_scratch_size
236  TYPE(fft_scratch_type), POINTER :: fft_scratch
237  TYPE(mp_cart_type) :: rs_group
238  TYPE(mp_comm_type) :: gs_group
239 
240  CALL timeset(routinen, handle)
241 
242  ! dimensions
243  n => pw1%pw_grid%npts
244  nloc => pw1%pw_grid%npts_local
245  grays => pw1%pw_grid%grays
246  ngpts = nloc(1)*nloc(2)*nloc(3)
247 
248  !..transform
249  IF (pw1%pw_grid%para%ray_distribution) THEN
250  gs_group = pw1%pw_grid%para%group
251  rs_group = pw1%pw_grid%para%rs_group
252  nyzray => pw1%pw_grid%para%nyzray
253  bo => pw1%pw_grid%para%bo
254 
255  numtask_g = gs_group%num_pe
256  g_pos = gs_group%mepos
257  numtask_r = rs_group%num_pe
258  r_dim = rs_group%num_pe_cart
259  r_pos = rs_group%mepos_cart
260  IF (numtask_g /= numtask_r) THEN
261  cpabort("Real space and G space groups are different.")
262  END IF
263  numtask = numtask_r
264 
265  lg = SIZE(grays, 1)
266  mg = SIZE(grays, 2)
267  mmax = max(mg, 1)
268  lmax = max(lg, (ngpts/mmax + 1))
269 
270  ALLOCATE (p2p(0:numtask - 1))
271 
272  CALL gs_group%rank_compare(rs_group, p2p)
273 
274  rp = p2p(g_pos)
275  mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
276  mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
277  n1 = maxval(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
278  n2 = maxval(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
279  nmax = max((2*n2)/numtask, 2)*mx2*mz2
280  nmax = max(nmax, n1*maxval(nyzray))
281 
282  fft_scratch_size%nx = nloc(1)
283  fft_scratch_size%ny = nloc(2)
284  fft_scratch_size%nz = nloc(3)
285  fft_scratch_size%lmax = lmax
286  fft_scratch_size%mmax = mmax
287  fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
288  fft_scratch_size%mx2 = mx2
289  fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
290  fft_scratch_size%mz2 = mz2
291  fft_scratch_size%lg = lg
292  fft_scratch_size%mg = mg
293  fft_scratch_size%nbx = maxval(bo(2, 1, :, 2))
294  fft_scratch_size%nbz = maxval(bo(2, 3, :, 2))
295  fft_scratch_size%mcz1 = maxval(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
296  fft_scratch_size%mcx2 = maxval(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
297  fft_scratch_size%mcz2 = maxval(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
298  fft_scratch_size%nmax = nmax
299  fft_scratch_size%nmray = maxval(nyzray)
300  fft_scratch_size%nyzray = nyzray(g_pos)
301  fft_scratch_size%gs_group = gs_group
302  fft_scratch_size%rs_group = rs_group
303  fft_scratch_size%g_pos = g_pos
304  fft_scratch_size%r_pos = r_pos
305  fft_scratch_size%r_dim = r_dim
306  fft_scratch_size%numtask = numtask
307 
308  IF (r_dim(2) > 1) THEN
309  !
310  ! real space is distributed over x and y coordinate
311  ! we have two stages of communication
312  !
313  IF (r_dim(1) == 1) &
314  cpabort("This processor distribution is not supported.")
315 
316  CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
317 
318  ! assign buffers
319  qbuf => fft_scratch%p2buf
320  rbuf => fft_scratch%p3buf
321  pbuf => fft_scratch%p4buf
322  sbuf => fft_scratch%p5buf
323 
324  ! FFT along z
325  CALL pw_gpu_cf(pw1, qbuf)
326 
327  ! Exchange data ( transpose of matrix )
328  CALL cube_transpose_2(qbuf, bo(:, :, :, 1), bo(:, :, :, 2), rbuf, fft_scratch)
329 
330  ! FFT along y
331  ! use the inbuild fft-lib
332  ! CALL fft_1dm(fft_scratch%fft_plan(2), rbuf, pbuf, 1.0_dp, stat)
333  ! or cufft (works faster, but is only faster if plans are stored)
334  CALL pw_gpu_f(rbuf, pbuf, +1, n(2), mx2*mz2)
335 
336  ! Exchange data ( transpose of matrix ) and sort
337  CALL xz_to_yz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
338  bo(:, :, :, 2), sbuf, fft_scratch)
339 
340  ! FFT along x
341  CALL pw_gpu_fg(sbuf, pw2, scale)
342 
343  CALL release_fft_scratch(fft_scratch)
344 
345  ELSE
346  !
347  ! real space is only distributed over x coordinate
348  ! we have one stage of communication, after the transform of
349  ! direction x
350  !
351 
352  CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
353 
354  ! assign buffers
355  tbuf => fft_scratch%tbuf
356  sbuf => fft_scratch%r1buf
357 
358  ! FFT along y and z
359  CALL pw_gpu_cff(pw1, tbuf)
360 
361  ! Exchange data ( transpose of matrix ) and sort
362  CALL yz_to_x(tbuf, gs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
363  bo(:, :, :, 2), sbuf, fft_scratch)
364 
365  ! FFT along x
366  CALL pw_gpu_fg(sbuf, pw2, scale)
367 
368  CALL release_fft_scratch(fft_scratch)
369 
370  END IF
371 
372  DEALLOCATE (p2p)
373 
374 !--------------------------------------------------------------------------
375  ELSE
376  cpabort("Not implemented (no ray_distr.) in: pw_gpu_r3dc1d_3d_ps.")
377  !CALL fft3d ( dir, n, pwin, grays, pw1%pw_grid%para%rs_group, &
378  ! pw1%pw_grid%para%bo, scale = scale, debug=test )
379  END IF
380 
381  CALL timestop(handle)
382  END SUBROUTINE pw_gpu_r3dc1d_3d_ps
383 
384 ! **************************************************************************************************
385 !> \brief perform an parallel scatter followed by a fft on the gpu
386 !> \param pw1 ...
387 !> \param pw2 ...
388 !> \param scale ...
389 !> \author Andreas Gloess
390 ! **************************************************************************************************
391  SUBROUTINE pw_gpu_c1dr3d_3d_ps(pw1, pw2, scale)
392  TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
393  TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
394  REAL(kind=dp), INTENT(IN) :: scale
395 
396  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_c1dr3d_3d_ps'
397 
398  COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
399  COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
400  INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
401  mz2, n1, n2, ngpts, nmax, numtask, &
402  numtask_g, numtask_r, rp
403  INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
404  INTEGER, DIMENSION(2) :: r_dim, r_pos
405  INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
406  INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
407  TYPE(fft_scratch_sizes) :: fft_scratch_size
408  TYPE(fft_scratch_type), POINTER :: fft_scratch
409  TYPE(mp_cart_type) :: rs_group
410  TYPE(mp_comm_type) :: gs_group
411 
412  CALL timeset(routinen, handle)
413 
414  ! dimensions
415  n => pw1%pw_grid%npts
416  nloc => pw1%pw_grid%npts_local
417  grays => pw1%pw_grid%grays
418  ngpts = nloc(1)*nloc(2)*nloc(3)
419 
420  !..transform
421  IF (pw1%pw_grid%para%ray_distribution) THEN
422  gs_group = pw1%pw_grid%para%group
423  rs_group = pw1%pw_grid%para%rs_group
424  nyzray => pw1%pw_grid%para%nyzray
425  bo => pw1%pw_grid%para%bo
426 
427  numtask_g = gs_group%num_pe
428  g_pos = gs_group%mepos
429  numtask_r = rs_group%num_pe
430  r_dim = rs_group%num_pe_cart
431  r_pos = rs_group%mepos_cart
432  IF (numtask_g /= numtask_r) THEN
433  cpabort("Real space and G space groups are different.")
434  END IF
435  numtask = numtask_r
436 
437  lg = SIZE(grays, 1)
438  mg = SIZE(grays, 2)
439  mmax = max(mg, 1)
440  lmax = max(lg, (ngpts/mmax + 1))
441 
442  ALLOCATE (p2p(0:numtask - 1))
443 
444  CALL gs_group%rank_compare(rs_group, p2p)
445 
446  rp = p2p(g_pos)
447  mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
448  mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
449  n1 = maxval(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
450  n2 = maxval(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
451  nmax = max((2*n2)/numtask, 2)*mx2*mz2
452  nmax = max(nmax, n1*maxval(nyzray))
453 
454  fft_scratch_size%nx = nloc(1)
455  fft_scratch_size%ny = nloc(2)
456  fft_scratch_size%nz = nloc(3)
457  fft_scratch_size%lmax = lmax
458  fft_scratch_size%mmax = mmax
459  fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
460  fft_scratch_size%mx2 = mx2
461  fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
462  fft_scratch_size%mz2 = mz2
463  fft_scratch_size%lg = lg
464  fft_scratch_size%mg = mg
465  fft_scratch_size%nbx = maxval(bo(2, 1, :, 2))
466  fft_scratch_size%nbz = maxval(bo(2, 3, :, 2))
467  fft_scratch_size%mcz1 = maxval(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
468  fft_scratch_size%mcx2 = maxval(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
469  fft_scratch_size%mcz2 = maxval(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
470  fft_scratch_size%nmax = nmax
471  fft_scratch_size%nmray = maxval(nyzray)
472  fft_scratch_size%nyzray = nyzray(g_pos)
473  fft_scratch_size%gs_group = gs_group
474  fft_scratch_size%rs_group = rs_group
475  fft_scratch_size%g_pos = g_pos
476  fft_scratch_size%r_pos = r_pos
477  fft_scratch_size%r_dim = r_dim
478  fft_scratch_size%numtask = numtask
479 
480  IF (r_dim(2) > 1) THEN
481  !
482  ! real space is distributed over x and y coordinate
483  ! we have two stages of communication
484  !
485  IF (r_dim(1) == 1) &
486  cpabort("This processor distribution is not supported.")
487 
488  CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
489 
490  ! assign buffers
491  pbuf => fft_scratch%p7buf
492  qbuf => fft_scratch%p4buf
493  rbuf => fft_scratch%p3buf
494  sbuf => fft_scratch%p2buf
495 
496  ! FFT along x
497  CALL pw_gpu_sf(pw1, pbuf, scale)
498 
499  ! Exchange data ( transpose of matrix ) and sort
500  IF (pw1%pw_grid%grid_span /= fullspace) qbuf = z_zero
501  CALL yz_to_xz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
502  bo(:, :, :, 2), qbuf, fft_scratch)
503 
504  ! FFT along y
505  ! use the inbuild fft-lib
506  ! CALL fft_1dm(fft_scratch%fft_plan(5), qbuf, rbuf, 1.0_dp, stat)
507  ! or cufft (works faster, but is only faster if plans are stored)
508  CALL pw_gpu_f(qbuf, rbuf, -1, n(2), mx2*mz2)
509 
510  ! Exchange data ( transpose of matrix )
511  IF (pw1%pw_grid%grid_span /= fullspace) sbuf = z_zero
512 
513  CALL cube_transpose_1(rbuf, bo(:, :, :, 2), bo(:, :, :, 1), sbuf, fft_scratch)
514 
515  ! FFT along z
516  CALL pw_gpu_fc(sbuf, pw2)
517 
518  CALL release_fft_scratch(fft_scratch)
519 
520  ELSE
521  !
522  ! real space is only distributed over x coordinate
523  ! we have one stage of communication, after the transform of
524  ! direction x
525  !
526 
527  CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
528 
529  ! assign buffers
530  sbuf => fft_scratch%r1buf
531  tbuf => fft_scratch%tbuf
532 
533  ! FFT along x
534  CALL pw_gpu_sf(pw1, sbuf, scale)
535 
536  ! Exchange data ( transpose of matrix ) and sort
537  IF (pw1%pw_grid%grid_span /= fullspace) tbuf = z_zero
538  CALL x_to_yz(sbuf, gs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
539  bo(:, :, :, 2), tbuf, fft_scratch)
540 
541  ! FFT along y and z
542  CALL pw_gpu_ffc(tbuf, pw2)
543 
544  CALL release_fft_scratch(fft_scratch)
545 
546  END IF
547 
548  DEALLOCATE (p2p)
549 
550 !--------------------------------------------------------------------------
551  ELSE
552  cpabort("Not implemented (no ray_distr.) in: pw_gpu_c1dr3d_3d_ps.")
553  !CALL fft3d ( dir, n, pwin, grays, pw1%pw_grid%para%rs_group, &
554  ! pw1%pw_grid%para%bo, scale = scale, debug=test )
555  END IF
556 
557  CALL timestop(handle)
558  END SUBROUTINE pw_gpu_c1dr3d_3d_ps
559 
560 ! **************************************************************************************************
561 !> \brief perform a parallel real_to_complex copy followed by a 2D-FFT on the gpu
562 !> \param pw1 ...
563 !> \param pwbuf ...
564 !> \author Andreas Gloess
565 ! **************************************************************************************************
566  SUBROUTINE pw_gpu_cff(pw1, pwbuf)
567  TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
568  COMPLEX(KIND=dp), DIMENSION(:, :, :), &
569  INTENT(INOUT), TARGET :: pwbuf
570 
571  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_cff'
572 
573  COMPLEX(KIND=dp), POINTER :: ptr_pwout
574  INTEGER :: handle, l1, l2, l3
575  INTEGER, DIMENSION(:), POINTER :: npts
576  REAL(kind=dp), POINTER :: ptr_pwin
577  INTERFACE
578  SUBROUTINE pw_gpu_cff_c(din, zout, npts) BIND(C, name="pw_gpu_cff")
579  IMPORT
580  TYPE(c_ptr), INTENT(IN), VALUE :: din
581  TYPE(c_ptr), VALUE :: zout
582  INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
583  END SUBROUTINE pw_gpu_cff_c
584  END INTERFACE
585 
586  CALL timeset(routinen, handle)
587 
588  ! dimensions
589  npts => pw1%pw_grid%npts_local
590  l1 = lbound(pw1%array, 1)
591  l2 = lbound(pw1%array, 2)
592  l3 = lbound(pw1%array, 3)
593 
594  ! pointers to data arrays
595  ptr_pwin => pw1%array(l1, l2, l3)
596  ptr_pwout => pwbuf(1, 1, 1)
597 
598  ! invoke the combined transformation
599 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
600  CALL pw_gpu_cff_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
601 #else
602  cpabort("Compiled without pw offloading")
603 #endif
604 
605  CALL timestop(handle)
606  END SUBROUTINE pw_gpu_cff
607 
608 ! **************************************************************************************************
609 !> \brief perform a parallel 2D-FFT followed by a complex_to_real copy on the gpu
610 !> \param pwbuf ...
611 !> \param pw2 ...
612 !> \author Andreas Gloess
613 ! **************************************************************************************************
614  SUBROUTINE pw_gpu_ffc(pwbuf, pw2)
615  COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
616  TARGET :: pwbuf
617  TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
618 
619  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_ffc'
620 
621  COMPLEX(KIND=dp), POINTER :: ptr_pwin
622  INTEGER :: handle, l1, l2, l3
623  INTEGER, DIMENSION(:), POINTER :: npts
624  REAL(kind=dp), POINTER :: ptr_pwout
625  INTERFACE
626  SUBROUTINE pw_gpu_ffc_c(zin, dout, npts) BIND(C, name="pw_gpu_ffc")
627  IMPORT
628  TYPE(c_ptr), INTENT(IN), VALUE :: zin
629  TYPE(c_ptr), VALUE :: dout
630  INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
631  END SUBROUTINE pw_gpu_ffc_c
632  END INTERFACE
633 
634  CALL timeset(routinen, handle)
635 
636  ! dimensions
637  npts => pw2%pw_grid%npts_local
638  l1 = lbound(pw2%array, 1)
639  l2 = lbound(pw2%array, 2)
640  l3 = lbound(pw2%array, 3)
641 
642  ! pointers to data arrays
643  ptr_pwin => pwbuf(1, 1, 1)
644  ptr_pwout => pw2%array(l1, l2, l3)
645 
646  ! invoke the combined transformation
647 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
648  CALL pw_gpu_ffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
649 #else
650  cpabort("Compiled without pw offloading")
651 #endif
652 
653  CALL timestop(handle)
654  END SUBROUTINE pw_gpu_ffc
655 
656 ! **************************************************************************************************
657 !> \brief perform a parallel real_to_complex copy followed by a 1D-FFT on the gpu
658 !> \param pw1 ...
659 !> \param pwbuf ...
660 !> \author Andreas Gloess
661 ! **************************************************************************************************
662  SUBROUTINE pw_gpu_cf(pw1, pwbuf)
663  TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
664  COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
665  TARGET :: pwbuf
666 
667  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_cf'
668 
669  COMPLEX(KIND=dp), POINTER :: ptr_pwout
670  INTEGER :: handle, l1, l2, l3
671  INTEGER, DIMENSION(:), POINTER :: npts
672  REAL(kind=dp), POINTER :: ptr_pwin
673  INTERFACE
674  SUBROUTINE pw_gpu_cf_c(din, zout, npts) BIND(C, name="pw_gpu_cf")
675  IMPORT
676  TYPE(c_ptr), INTENT(IN), VALUE :: din
677  TYPE(c_ptr), VALUE :: zout
678  INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
679  END SUBROUTINE pw_gpu_cf_c
680  END INTERFACE
681 
682  CALL timeset(routinen, handle)
683 
684  ! dimensions
685  npts => pw1%pw_grid%npts_local
686  l1 = lbound(pw1%array, 1)
687  l2 = lbound(pw1%array, 2)
688  l3 = lbound(pw1%array, 3)
689 
690  ! pointers to data arrays
691  ptr_pwin => pw1%array(l1, l2, l3)
692  ptr_pwout => pwbuf(1, 1)
693 
694  ! invoke the combined transformation
695 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
696  CALL pw_gpu_cf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
697 #else
698  cpabort("Compiled without pw offloading")
699 #endif
700  CALL timestop(handle)
701  END SUBROUTINE pw_gpu_cf
702 
703 ! **************************************************************************************************
704 !> \brief perform a parallel 1D-FFT followed by a complex_to_real copy on the gpu
705 !> \param pwbuf ...
706 !> \param pw2 ...
707 !> \author Andreas Gloess
708 ! **************************************************************************************************
709  SUBROUTINE pw_gpu_fc(pwbuf, pw2)
710  COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
711  TARGET :: pwbuf
712  TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
713 
714  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_fc'
715 
716  COMPLEX(KIND=dp), POINTER :: ptr_pwin
717  INTEGER :: handle, l1, l2, l3
718  INTEGER, DIMENSION(:), POINTER :: npts
719  REAL(kind=dp), POINTER :: ptr_pwout
720  INTERFACE
721  SUBROUTINE pw_gpu_fc_c(zin, dout, npts) BIND(C, name="pw_gpu_fc")
722  IMPORT
723  TYPE(c_ptr), INTENT(IN), VALUE :: zin
724  TYPE(c_ptr), VALUE :: dout
725  INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
726  END SUBROUTINE pw_gpu_fc_c
727  END INTERFACE
728 
729  CALL timeset(routinen, handle)
730 
731  npts => pw2%pw_grid%npts_local
732  l1 = lbound(pw2%array, 1)
733  l2 = lbound(pw2%array, 2)
734  l3 = lbound(pw2%array, 3)
735 
736  ! pointers to data arrays
737  ptr_pwin => pwbuf(1, 1)
738  ptr_pwout => pw2%array(l1, l2, l3)
739 
740  ! invoke the combined transformation
741 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
742  CALL pw_gpu_fc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
743 #else
744  cpabort("Compiled without pw offloading")
745 #endif
746 
747  CALL timestop(handle)
748  END SUBROUTINE pw_gpu_fc
749 
750 ! **************************************************************************************************
751 !> \brief perform a parallel 1D-FFT on the gpu
752 !> \param pwbuf1 ...
753 !> \param pwbuf2 ...
754 !> \param dir ...
755 !> \param n ...
756 !> \param m ...
757 !> \author Andreas Gloess
758 ! **************************************************************************************************
759  SUBROUTINE pw_gpu_f(pwbuf1, pwbuf2, dir, n, m)
760  COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
761  TARGET :: pwbuf1
762  COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
763  TARGET :: pwbuf2
764  INTEGER, INTENT(IN) :: dir, n, m
765 
766  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_f'
767 
768  COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
769  INTEGER :: handle
770  INTERFACE
771  SUBROUTINE pw_gpu_f_c(zin, zout, dir, n, m) BIND(C, name="pw_gpu_f")
772  IMPORT
773  TYPE(c_ptr), INTENT(IN), VALUE :: zin
774  TYPE(c_ptr), VALUE :: zout
775  INTEGER(KIND=C_INT), INTENT(IN), VALUE :: dir, n, m
776  END SUBROUTINE pw_gpu_f_c
777  END INTERFACE
778 
779  CALL timeset(routinen, handle)
780 
781  IF (n*m /= 0) THEN
782  ! pointers to data arrays
783  ptr_pwin => pwbuf1(1, 1)
784  ptr_pwout => pwbuf2(1, 1)
785 
786  ! invoke the combined transformation
787 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
788  CALL pw_gpu_f_c(c_loc(ptr_pwin), c_loc(ptr_pwout), dir, n, m)
789 #else
790  mark_used(dir)
791  cpabort("Compiled without pw offloading")
792 #endif
793  END IF
794 
795  CALL timestop(handle)
796  END SUBROUTINE pw_gpu_f
797 ! **************************************************************************************************
798 !> \brief perform a parallel 1D-FFT followed by a gather on the gpu
799 !> \param pwbuf ...
800 !> \param pw2 ...
801 !> \param scale ...
802 !> \author Andreas Gloess
803 ! **************************************************************************************************
804  SUBROUTINE pw_gpu_fg(pwbuf, pw2, scale)
805  COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
806  TARGET :: pwbuf
807  TYPE(pw_c1d_gs_type), INTENT(IN) :: pw2
808  REAL(kind=dp), INTENT(IN) :: scale
809 
810  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_fg'
811 
812  COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
813  INTEGER :: handle, mg, mmax, ngpts
814  INTEGER, DIMENSION(:), POINTER :: npts
815  INTEGER, POINTER :: ptr_ghatmap
816  INTERFACE
817  SUBROUTINE pw_gpu_fg_c(zin, zout, ghatmap, npts, mmax, ngpts, scale) BIND(C, name="pw_gpu_fg")
818  IMPORT
819  TYPE(c_ptr), INTENT(IN), VALUE :: zin
820  TYPE(c_ptr), VALUE :: zout
821  TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
822  INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
823  INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts
824  REAL(kind=c_double), INTENT(IN), VALUE :: scale
825 
826  END SUBROUTINE pw_gpu_fg_c
827  END INTERFACE
828 
829  CALL timeset(routinen, handle)
830 
831  ngpts = SIZE(pw2%pw_grid%gsq)
832  npts => pw2%pw_grid%npts
833 
834  IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
835  mg = SIZE(pw2%pw_grid%grays, 2)
836  mmax = max(mg, 1)
837 
838  ! pointers to data arrays
839  ptr_pwin => pwbuf(1, 1)
840  ptr_pwout => pw2%array(1)
841 
842  ! pointer to map array
843  ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
844 
845  ! invoke the combined transformation
846 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
847  CALL pw_gpu_fg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, scale)
848 #else
849  mark_used(scale)
850  cpabort("Compiled without pw offloading")
851 #endif
852  END IF
853 
854  CALL timestop(handle)
855  END SUBROUTINE pw_gpu_fg
856 
857 ! **************************************************************************************************
858 !> \brief perform a parallel scatter followed by a 1D-FFT on the gpu
859 !> \param pw1 ...
860 !> \param pwbuf ...
861 !> \param scale ...
862 !> \author Andreas Gloess
863 ! **************************************************************************************************
864  SUBROUTINE pw_gpu_sf(pw1, pwbuf, scale)
865  TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
866  COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
867  TARGET :: pwbuf
868  REAL(kind=dp), INTENT(IN) :: scale
869 
870  CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_sf'
871 
872  COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
873  INTEGER :: handle, mg, mmax, ngpts, nmaps
874  INTEGER, DIMENSION(:), POINTER :: npts
875  INTEGER, POINTER :: ptr_ghatmap
876  INTERFACE
877  SUBROUTINE pw_gpu_sf_c(zin, zout, ghatmap, npts, mmax, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sf")
878  IMPORT
879  TYPE(c_ptr), INTENT(IN), VALUE :: zin
880  TYPE(c_ptr), VALUE :: zout
881  TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
882  INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
883  INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts, nmaps
884  REAL(kind=c_double), INTENT(IN), VALUE :: scale
885 
886  END SUBROUTINE pw_gpu_sf_c
887  END INTERFACE
888 
889  CALL timeset(routinen, handle)
890 
891  ngpts = SIZE(pw1%pw_grid%gsq)
892  npts => pw1%pw_grid%npts
893 
894  IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
895  mg = SIZE(pw1%pw_grid%grays, 2)
896  mmax = max(mg, 1)
897 
898  ! pointers to data arrays
899  ptr_pwin => pw1%array(1)
900  ptr_pwout => pwbuf(1, 1)
901 
902  ! pointer to map array
903  nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
904  ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
905 
906  ! invoke the combined transformation
907 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
908  CALL pw_gpu_sf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, nmaps, scale)
909 #else
910  mark_used(scale)
911  cpabort("Compiled without pw offloading")
912 #endif
913  END IF
914 
915  CALL timestop(handle)
916  END SUBROUTINE pw_gpu_sf
917 
918 END MODULE pw_gpu
919 
subroutine, public yz_to_x(tb, group, my_pos, p2p, yzp, nray, bo, sb, fft_scratch)
...
Definition: fft_tools.F:1497
subroutine, public yz_to_xz(sb, group, dims, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition: fft_tools.F:1613
subroutine, public cube_transpose_1(cin, boin, boout, sout, fft_scratch)
...
Definition: fft_tools.F:2017
subroutine, public release_fft_scratch(fft_scratch)
...
Definition: fft_tools.F:3254
subroutine, public x_to_yz(sb, group, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition: fft_tools.F:1382
subroutine, public get_fft_scratch(fft_scratch, tf_type, n, fft_sizes)
...
Definition: fft_tools.F:2876
subroutine, public xz_to_yz(sb, group, dims, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition: fft_tools.F:1823
subroutine, public cube_transpose_2(cin, boin, boout, sout, fft_scratch)
...
Definition: fft_tools.F:2107
Defines the basic variable types.
Definition: kinds.F:23
integer, parameter, public dp
Definition: kinds.F:34
Definition of mathematical constants and functions.
Definition: mathconstants.F:16
complex(kind=dp), parameter, public z_zero
Interface to the message passing library MPI.
Definition: pw_gpu.F:26
subroutine, public pw_gpu_c1dr3d_3d(pw1, pw2, scale)
perform an scatter followed by a fft on the gpu
Definition: pw_gpu.F:162
subroutine, public pw_gpu_c1dr3d_3d_ps(pw1, pw2, scale)
perform an parallel scatter followed by a fft on the gpu
Definition: pw_gpu.F:392
subroutine, public pw_gpu_r3dc1d_3d(pw1, pw2, scale)
perform an fft followed by a gather on the gpu
Definition: pw_gpu.F:104
subroutine, public pw_gpu_init()
Allocates resources on the gpu device for gpu fft acceleration.
Definition: pw_gpu.F:63
subroutine, public pw_gpu_finalize()
Releases resources on the gpu device for gpu fft acceleration.
Definition: pw_gpu.F:82
subroutine, public pw_gpu_r3dc1d_3d_ps(pw1, pw2, scale)
perform an parallel fft followed by a gather on the gpu
Definition: pw_gpu.F:220
integer, parameter, public fullspace
Definition: pw_grid_types.F:28