(git:33f85d8)
Loading...
Searching...
No Matches
pw_gpu.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \note
10!> This module contains routines necessary to operate on plane waves on GPUs
11! > independently of the GPU platform.
12!> \par History
13!> BGL (06-Mar-2008) : Created
14!> AG (18-May-2012) : Refacturing:
15!> - added explicit interfaces to C routines
16!> - enable double precision complex transformations
17!> AG (11-Sept-2012) : Modifications:
18!> - use pointers if precision mapping is not required
19!> - use OMP for mapping
20!> MT (Jan 2022) : Modifications
21!> - use a generic interface for fft calls to GPUs
22!> - Support both Nvidia and AMD GPUs. Other GPUs manufacturers
23!> can be added easily.
24!> \author Benjamin G. Levine
25! **************************************************************************************************
26MODULE pw_gpu
27 USE iso_c_binding, ONLY: c_double,&
28 c_int,&
29 c_loc,&
30 c_ptr
31 USE fft_tools, ONLY: &
34 USE kinds, ONLY: dp
35 USE mathconstants, ONLY: z_zero
37 USE pw_grid_types, ONLY: fullspace
38 USE pw_types, ONLY: pw_c1d_gs_type,&
40#include "../base/base_uses.f90"
41
42 IMPLICIT NONE
43
44 PRIVATE
45
46 PUBLIC :: pw_gpu_r3dc1d_3d
47 PUBLIC :: pw_gpu_c1dr3d_3d
48 PUBLIC :: pw_gpu_r3dc1d_3d_ps
49 PUBLIC :: pw_gpu_c1dr3d_3d_ps
51
52 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_gpu'
53 LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .false.
54
55CONTAINS
56
57! **************************************************************************************************
58!> \brief Allocates resources on the gpu device for gpu fft acceleration
59!> \author Ole Schuett
60! **************************************************************************************************
61 SUBROUTINE pw_gpu_init()
62 INTEGER :: dummy
63 INTERFACE
64 SUBROUTINE pw_gpu_init_c() BIND(C, name="pw_gpu_init")
65 END SUBROUTINE pw_gpu_init_c
66 END INTERFACE
67
68 mark_used(dummy) ! TODO: fix fpretty
69#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
70 CALL pw_gpu_init_c()
71#else
72 ! Nothing to do.
73#endif
74 END SUBROUTINE pw_gpu_init
75
76! **************************************************************************************************
77!> \brief Releases resources on the gpu device for gpu fft acceleration
78!> \author Ole Schuett
79! **************************************************************************************************
80 SUBROUTINE pw_gpu_finalize()
81 INTEGER :: dummy
82 INTERFACE
83 SUBROUTINE pw_gpu_finalize_c() BIND(C, name="pw_gpu_finalize")
84 END SUBROUTINE pw_gpu_finalize_c
85 END INTERFACE
86
87 mark_used(dummy) ! TODO: fix fpretty
88#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
89 CALL pw_gpu_finalize_c()
90#else
91 ! Nothing to do.
92#endif
93 END SUBROUTINE pw_gpu_finalize
94
95! **************************************************************************************************
96!> \brief perform an fft followed by a gather on the gpu
97!> \param pw1 ...
98!> \param pw2 ...
99!> \author Benjamin G Levine
100! **************************************************************************************************
101 SUBROUTINE pw_gpu_r3dc1d_3d(pw1, pw2)
102 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
103 TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
104
105 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_r3dc1d_3d'
106
107 COMPLEX(KIND=dp), POINTER :: ptr_pwout
108 INTEGER :: handle, l1, l2, l3, ngpts
109 INTEGER, DIMENSION(:), POINTER :: npts
110 INTEGER, POINTER :: ptr_ghatmap
111 REAL(kind=dp) :: scale
112 REAL(kind=dp), POINTER :: ptr_pwin
113 INTERFACE
114 SUBROUTINE pw_gpu_cfffg_c(din, zout, ghatmap, npts, ngpts, scale) BIND(C, name="pw_gpu_cfffg")
115 IMPORT
116 TYPE(c_ptr), INTENT(IN), VALUE :: din
117 TYPE(c_ptr), VALUE :: zout
118 TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
119 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
120 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts
121 REAL(kind=c_double), INTENT(IN), VALUE :: scale
122
123 END SUBROUTINE pw_gpu_cfffg_c
124 END INTERFACE
125
126 CALL timeset(routinen, handle)
127
128 scale = 1.0_dp/real(pw1%pw_grid%ngpts, kind=dp)
129
130 ngpts = SIZE(pw2%pw_grid%gsq)
131 l1 = lbound(pw1%array, 1)
132 l2 = lbound(pw1%array, 2)
133 l3 = lbound(pw1%array, 3)
134 npts => pw1%pw_grid%npts
135
136 ! pointers to data arrays
137 ptr_pwin => pw1%array(l1, l2, l3)
138 ptr_pwout => pw2%array(1)
139
140 ! pointer to map array
141 ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
142
143 ! invoke the combined transformation
144#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
145 CALL pw_gpu_cfffg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, scale)
146#else
147 cpabort("Compiled without pw offloading.")
148#endif
149
150 CALL timestop(handle)
151 END SUBROUTINE pw_gpu_r3dc1d_3d
152
153! **************************************************************************************************
154!> \brief perform an scatter followed by a fft on the gpu
155!> \param pw1 ...
156!> \param pw2 ...
157!> \author Benjamin G Levine
158! **************************************************************************************************
159 SUBROUTINE pw_gpu_c1dr3d_3d(pw1, pw2)
160 TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
161 TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
162
163 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_c1dr3d_3d'
164
165 COMPLEX(KIND=dp), POINTER :: ptr_pwin
166 INTEGER :: handle, l1, l2, l3, ngpts, nmaps
167 INTEGER, DIMENSION(:), POINTER :: npts
168 INTEGER, POINTER :: ptr_ghatmap
169 REAL(kind=dp) :: scale
170 REAL(kind=dp), POINTER :: ptr_pwout
171 INTERFACE
172 SUBROUTINE pw_gpu_sfffc_c(zin, dout, ghatmap, npts, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sfffc")
173 IMPORT
174 TYPE(c_ptr), INTENT(IN), VALUE :: zin
175 TYPE(c_ptr), VALUE :: dout
176 TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
177 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
178 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts, nmaps
179 REAL(kind=c_double), INTENT(IN), VALUE :: scale
180 END SUBROUTINE pw_gpu_sfffc_c
181 END INTERFACE
182
183 CALL timeset(routinen, handle)
184
185 scale = 1.0_dp
186
187 ngpts = SIZE(pw1%pw_grid%gsq)
188 l1 = lbound(pw2%array, 1)
189 l2 = lbound(pw2%array, 2)
190 l3 = lbound(pw2%array, 3)
191 npts => pw1%pw_grid%npts
192
193 ! pointers to data arrays
194 ptr_pwin => pw1%array(1)
195 ptr_pwout => pw2%array(l1, l2, l3)
196
197 ! pointer to map array
198 nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
199 ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
200
201 ! invoke the combined transformation
202#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
203 CALL pw_gpu_sfffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, nmaps, scale)
204#else
205 cpabort("Compiled without pw offloading")
206#endif
207
208 CALL timestop(handle)
209 END SUBROUTINE pw_gpu_c1dr3d_3d
210
211! **************************************************************************************************
212!> \brief perform an parallel fft followed by a gather on the gpu
213!> \param pw1 ...
214!> \param pw2 ...
215!> \author Andreas Gloess
216! **************************************************************************************************
217 SUBROUTINE pw_gpu_r3dc1d_3d_ps(pw1, pw2)
218 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
219 TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
220
221 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_r3dc1d_3d_ps'
222
223 COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
224 COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
225 INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
226 mz2, n1, n2, ngpts, nmax, numtask, rp
227 INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
228 INTEGER, DIMENSION(2) :: r_dim, r_pos
229 INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
230 INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
231 REAL(kind=dp) :: scale
232 TYPE(fft_scratch_sizes) :: fft_scratch_size
233 TYPE(fft_scratch_type), POINTER :: fft_scratch
234 TYPE(mp_cart_type) :: rs_group
235
236 CALL timeset(routinen, handle)
237
238 scale = 1.0_dp/real(pw1%pw_grid%ngpts, kind=dp)
239
240 ! dimensions
241 n => pw1%pw_grid%npts
242 nloc => pw1%pw_grid%npts_local
243 grays => pw1%pw_grid%grays
244 ngpts = nloc(1)*nloc(2)*nloc(3)
245
246 !..transform
247 IF (pw1%pw_grid%para%ray_distribution) THEN
248 rs_group = pw1%pw_grid%para%group
249 nyzray => pw1%pw_grid%para%nyzray
250 bo => pw1%pw_grid%para%bo
251
252 g_pos = rs_group%mepos
253 numtask = rs_group%num_pe
254 r_dim = rs_group%num_pe_cart
255 r_pos = rs_group%mepos_cart
256
257 lg = SIZE(grays, 1)
258 mg = SIZE(grays, 2)
259 mmax = max(mg, 1)
260 lmax = max(lg, (ngpts/mmax + 1))
261
262 ALLOCATE (p2p(0:numtask - 1))
263
264 CALL rs_group%rank_compare(rs_group, p2p)
265
266 rp = p2p(g_pos)
267 mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
268 mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
269 n1 = maxval(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
270 n2 = maxval(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
271 nmax = max((2*n2)/numtask, 2)*mx2*mz2
272 nmax = max(nmax, n1*maxval(nyzray))
273
274 fft_scratch_size%nx = nloc(1)
275 fft_scratch_size%ny = nloc(2)
276 fft_scratch_size%nz = nloc(3)
277 fft_scratch_size%lmax = lmax
278 fft_scratch_size%mmax = mmax
279 fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
280 fft_scratch_size%mx2 = mx2
281 fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
282 fft_scratch_size%mz2 = mz2
283 fft_scratch_size%lg = lg
284 fft_scratch_size%mg = mg
285 fft_scratch_size%nbx = maxval(bo(2, 1, :, 2))
286 fft_scratch_size%nbz = maxval(bo(2, 3, :, 2))
287 fft_scratch_size%mcz1 = maxval(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
288 fft_scratch_size%mcx2 = maxval(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
289 fft_scratch_size%mcz2 = maxval(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
290 fft_scratch_size%nmax = nmax
291 fft_scratch_size%nmray = maxval(nyzray)
292 fft_scratch_size%nyzray = nyzray(g_pos)
293 fft_scratch_size%rs_group = rs_group
294 fft_scratch_size%g_pos = g_pos
295 fft_scratch_size%r_pos = r_pos
296 fft_scratch_size%r_dim = r_dim
297 fft_scratch_size%numtask = numtask
298
299 IF (r_dim(2) > 1) THEN
300 !
301 ! real space is distributed over x and y coordinate
302 ! we have two stages of communication
303 !
304 IF (r_dim(1) == 1) &
305 cpabort("This processor distribution is not supported.")
306
307 CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
308
309 ! assign buffers
310 qbuf => fft_scratch%p2buf
311 rbuf => fft_scratch%p3buf
312 pbuf => fft_scratch%p4buf
313 sbuf => fft_scratch%p5buf
314
315 ! FFT along z
316 CALL pw_gpu_cf(pw1, qbuf)
317
318 ! Exchange data ( transpose of matrix )
319 CALL cube_transpose_2(qbuf, bo(:, :, :, 1), bo(:, :, :, 2), rbuf, fft_scratch)
320
321 ! FFT along y
322 ! use the inbuild fft-lib
323 ! CALL fft_1dm(fft_scratch%fft_plan(2), rbuf, pbuf, 1.0_dp, stat)
324 ! or cufft (works faster, but is only faster if plans are stored)
325 CALL pw_gpu_f(rbuf, pbuf, +1, n(2), mx2*mz2)
326
327 ! Exchange data ( transpose of matrix ) and sort
328 CALL xz_to_yz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
329 bo(:, :, :, 2), sbuf, fft_scratch)
330
331 ! FFT along x
332 CALL pw_gpu_fg(sbuf, pw2, scale)
333
334 CALL release_fft_scratch(fft_scratch)
335
336 ELSE
337 !
338 ! real space is only distributed over x coordinate
339 ! we have one stage of communication, after the transform of
340 ! direction x
341 !
342
343 CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
344
345 ! assign buffers
346 tbuf => fft_scratch%tbuf
347 sbuf => fft_scratch%r1buf
348
349 ! FFT along y and z
350 CALL pw_gpu_cff(pw1, tbuf)
351
352 ! Exchange data ( transpose of matrix ) and sort
353 CALL yz_to_x(tbuf, rs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
354 bo(:, :, :, 2), sbuf, fft_scratch)
355
356 ! FFT along x
357 CALL pw_gpu_fg(sbuf, pw2, scale)
358
359 CALL release_fft_scratch(fft_scratch)
360
361 END IF
362
363 DEALLOCATE (p2p)
364
365!--------------------------------------------------------------------------
366 ELSE
367 cpabort("Not implemented (no ray_distr.) in: pw_gpu_r3dc1d_3d_ps.")
368 END IF
369
370 CALL timestop(handle)
371 END SUBROUTINE pw_gpu_r3dc1d_3d_ps
372
373! **************************************************************************************************
374!> \brief perform an parallel scatter followed by a fft on the gpu
375!> \param pw1 ...
376!> \param pw2 ...
377!> \author Andreas Gloess
378! **************************************************************************************************
379 SUBROUTINE pw_gpu_c1dr3d_3d_ps(pw1, pw2)
380 TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
381 TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
382
383 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_c1dr3d_3d_ps'
384
385 COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
386 COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
387 INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
388 mz2, n1, n2, ngpts, nmax, numtask, rp
389 INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
390 INTEGER, DIMENSION(2) :: r_dim, r_pos
391 INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
392 INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
393 REAL(kind=dp) :: scale
394 TYPE(fft_scratch_sizes) :: fft_scratch_size
395 TYPE(fft_scratch_type), POINTER :: fft_scratch
396 TYPE(mp_cart_type) :: rs_group
397
398 CALL timeset(routinen, handle)
399
400 scale = 1.0_dp
401
402 ! dimensions
403 n => pw1%pw_grid%npts
404 nloc => pw1%pw_grid%npts_local
405 grays => pw1%pw_grid%grays
406 ngpts = nloc(1)*nloc(2)*nloc(3)
407
408 !..transform
409 IF (pw1%pw_grid%para%ray_distribution) THEN
410 rs_group = pw1%pw_grid%para%group
411 nyzray => pw1%pw_grid%para%nyzray
412 bo => pw1%pw_grid%para%bo
413
414 g_pos = rs_group%mepos
415 numtask = rs_group%num_pe
416 r_dim = rs_group%num_pe_cart
417 r_pos = rs_group%mepos_cart
418
419 lg = SIZE(grays, 1)
420 mg = SIZE(grays, 2)
421 mmax = max(mg, 1)
422 lmax = max(lg, (ngpts/mmax + 1))
423
424 ALLOCATE (p2p(0:numtask - 1))
425
426 CALL rs_group%rank_compare(rs_group, p2p)
427
428 rp = p2p(g_pos)
429 mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
430 mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
431 n1 = maxval(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
432 n2 = maxval(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
433 nmax = max((2*n2)/numtask, 2)*mx2*mz2
434 nmax = max(nmax, n1*maxval(nyzray))
435
436 fft_scratch_size%nx = nloc(1)
437 fft_scratch_size%ny = nloc(2)
438 fft_scratch_size%nz = nloc(3)
439 fft_scratch_size%lmax = lmax
440 fft_scratch_size%mmax = mmax
441 fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
442 fft_scratch_size%mx2 = mx2
443 fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
444 fft_scratch_size%mz2 = mz2
445 fft_scratch_size%lg = lg
446 fft_scratch_size%mg = mg
447 fft_scratch_size%nbx = maxval(bo(2, 1, :, 2))
448 fft_scratch_size%nbz = maxval(bo(2, 3, :, 2))
449 fft_scratch_size%mcz1 = maxval(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
450 fft_scratch_size%mcx2 = maxval(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
451 fft_scratch_size%mcz2 = maxval(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
452 fft_scratch_size%nmax = nmax
453 fft_scratch_size%nmray = maxval(nyzray)
454 fft_scratch_size%nyzray = nyzray(g_pos)
455 fft_scratch_size%rs_group = rs_group
456 fft_scratch_size%g_pos = g_pos
457 fft_scratch_size%r_pos = r_pos
458 fft_scratch_size%r_dim = r_dim
459 fft_scratch_size%numtask = numtask
460
461 IF (r_dim(2) > 1) THEN
462 !
463 ! real space is distributed over x and y coordinate
464 ! we have two stages of communication
465 !
466 IF (r_dim(1) == 1) &
467 cpabort("This processor distribution is not supported.")
468
469 CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
470
471 ! assign buffers
472 pbuf => fft_scratch%p7buf
473 qbuf => fft_scratch%p4buf
474 rbuf => fft_scratch%p3buf
475 sbuf => fft_scratch%p2buf
476
477 ! FFT along x
478 CALL pw_gpu_sf(pw1, pbuf, scale)
479
480 ! Exchange data ( transpose of matrix ) and sort
481 IF (pw1%pw_grid%grid_span /= fullspace) qbuf = z_zero
482 CALL yz_to_xz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
483 bo(:, :, :, 2), qbuf, fft_scratch)
484
485 ! FFT along y
486 ! use the inbuild fft-lib
487 ! CALL fft_1dm(fft_scratch%fft_plan(5), qbuf, rbuf, 1.0_dp, stat)
488 ! or cufft (works faster, but is only faster if plans are stored)
489 CALL pw_gpu_f(qbuf, rbuf, -1, n(2), mx2*mz2)
490
491 ! Exchange data ( transpose of matrix )
492 IF (pw1%pw_grid%grid_span /= fullspace) sbuf = z_zero
493
494 CALL cube_transpose_1(rbuf, bo(:, :, :, 2), bo(:, :, :, 1), sbuf, fft_scratch)
495
496 ! FFT along z
497 CALL pw_gpu_fc(sbuf, pw2)
498
499 CALL release_fft_scratch(fft_scratch)
500
501 ELSE
502 !
503 ! real space is only distributed over x coordinate
504 ! we have one stage of communication, after the transform of
505 ! direction x
506 !
507
508 CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
509
510 ! assign buffers
511 sbuf => fft_scratch%r1buf
512 tbuf => fft_scratch%tbuf
513
514 ! FFT along x
515 CALL pw_gpu_sf(pw1, sbuf, scale)
516
517 ! Exchange data ( transpose of matrix ) and sort
518 IF (pw1%pw_grid%grid_span /= fullspace) tbuf = z_zero
519 CALL x_to_yz(sbuf, rs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
520 bo(:, :, :, 2), tbuf, fft_scratch)
521
522 ! FFT along y and z
523 CALL pw_gpu_ffc(tbuf, pw2)
524
525 CALL release_fft_scratch(fft_scratch)
526
527 END IF
528
529 DEALLOCATE (p2p)
530
531!--------------------------------------------------------------------------
532 ELSE
533 cpabort("Not implemented (no ray_distr.) in: pw_gpu_c1dr3d_3d_ps.")
534 END IF
535
536 CALL timestop(handle)
537 END SUBROUTINE pw_gpu_c1dr3d_3d_ps
538
539! **************************************************************************************************
540!> \brief perform a parallel real_to_complex copy followed by a 2D-FFT on the gpu
541!> \param pw1 ...
542!> \param pwbuf ...
543!> \author Andreas Gloess
544! **************************************************************************************************
545 SUBROUTINE pw_gpu_cff(pw1, pwbuf)
546 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
547 COMPLEX(KIND=dp), DIMENSION(:, :, :), &
548 INTENT(INOUT), TARGET :: pwbuf
549
550 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_cff'
551
552 COMPLEX(KIND=dp), POINTER :: ptr_pwout
553 INTEGER :: handle, l1, l2, l3
554 INTEGER, DIMENSION(:), POINTER :: npts
555 REAL(kind=dp), POINTER :: ptr_pwin
556 INTERFACE
557 SUBROUTINE pw_gpu_cff_c(din, zout, npts) BIND(C, name="pw_gpu_cff")
558 IMPORT
559 TYPE(c_ptr), INTENT(IN), VALUE :: din
560 TYPE(c_ptr), VALUE :: zout
561 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
562 END SUBROUTINE pw_gpu_cff_c
563 END INTERFACE
564
565 CALL timeset(routinen, handle)
566
567 ! dimensions
568 npts => pw1%pw_grid%npts_local
569 l1 = lbound(pw1%array, 1)
570 l2 = lbound(pw1%array, 2)
571 l3 = lbound(pw1%array, 3)
572
573 ! pointers to data arrays
574 ptr_pwin => pw1%array(l1, l2, l3)
575 ptr_pwout => pwbuf(1, 1, 1)
576
577 ! invoke the combined transformation
578#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
579 CALL pw_gpu_cff_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
580#else
581 cpabort("Compiled without pw offloading")
582#endif
583
584 CALL timestop(handle)
585 END SUBROUTINE pw_gpu_cff
586
587! **************************************************************************************************
588!> \brief perform a parallel 2D-FFT followed by a complex_to_real copy on the gpu
589!> \param pwbuf ...
590!> \param pw2 ...
591!> \author Andreas Gloess
592! **************************************************************************************************
593 SUBROUTINE pw_gpu_ffc(pwbuf, pw2)
594 COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
595 TARGET :: pwbuf
596 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
597
598 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_ffc'
599
600 COMPLEX(KIND=dp), POINTER :: ptr_pwin
601 INTEGER :: handle, l1, l2, l3
602 INTEGER, DIMENSION(:), POINTER :: npts
603 REAL(kind=dp), POINTER :: ptr_pwout
604 INTERFACE
605 SUBROUTINE pw_gpu_ffc_c(zin, dout, npts) BIND(C, name="pw_gpu_ffc")
606 IMPORT
607 TYPE(c_ptr), INTENT(IN), VALUE :: zin
608 TYPE(c_ptr), VALUE :: dout
609 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
610 END SUBROUTINE pw_gpu_ffc_c
611 END INTERFACE
612
613 CALL timeset(routinen, handle)
614
615 ! dimensions
616 npts => pw2%pw_grid%npts_local
617 l1 = lbound(pw2%array, 1)
618 l2 = lbound(pw2%array, 2)
619 l3 = lbound(pw2%array, 3)
620
621 ! pointers to data arrays
622 ptr_pwin => pwbuf(1, 1, 1)
623 ptr_pwout => pw2%array(l1, l2, l3)
624
625 ! invoke the combined transformation
626#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
627 CALL pw_gpu_ffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
628#else
629 cpabort("Compiled without pw offloading")
630#endif
631
632 CALL timestop(handle)
633 END SUBROUTINE pw_gpu_ffc
634
635! **************************************************************************************************
636!> \brief perform a parallel real_to_complex copy followed by a 1D-FFT on the gpu
637!> \param pw1 ...
638!> \param pwbuf ...
639!> \author Andreas Gloess
640! **************************************************************************************************
641 SUBROUTINE pw_gpu_cf(pw1, pwbuf)
642 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
643 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
644 TARGET :: pwbuf
645
646 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_cf'
647
648 COMPLEX(KIND=dp), POINTER :: ptr_pwout
649 INTEGER :: handle, l1, l2, l3
650 INTEGER, DIMENSION(:), POINTER :: npts
651 REAL(kind=dp), POINTER :: ptr_pwin
652 INTERFACE
653 SUBROUTINE pw_gpu_cf_c(din, zout, npts) BIND(C, name="pw_gpu_cf")
654 IMPORT
655 TYPE(c_ptr), INTENT(IN), VALUE :: din
656 TYPE(c_ptr), VALUE :: zout
657 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
658 END SUBROUTINE pw_gpu_cf_c
659 END INTERFACE
660
661 CALL timeset(routinen, handle)
662
663 ! dimensions
664 npts => pw1%pw_grid%npts_local
665 l1 = lbound(pw1%array, 1)
666 l2 = lbound(pw1%array, 2)
667 l3 = lbound(pw1%array, 3)
668
669 ! pointers to data arrays
670 ptr_pwin => pw1%array(l1, l2, l3)
671 ptr_pwout => pwbuf(1, 1)
672
673 ! invoke the combined transformation
674#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
675 CALL pw_gpu_cf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
676#else
677 cpabort("Compiled without pw offloading")
678#endif
679 CALL timestop(handle)
680 END SUBROUTINE pw_gpu_cf
681
682! **************************************************************************************************
683!> \brief perform a parallel 1D-FFT followed by a complex_to_real copy on the gpu
684!> \param pwbuf ...
685!> \param pw2 ...
686!> \author Andreas Gloess
687! **************************************************************************************************
688 SUBROUTINE pw_gpu_fc(pwbuf, pw2)
689 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
690 TARGET :: pwbuf
691 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
692
693 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_fc'
694
695 COMPLEX(KIND=dp), POINTER :: ptr_pwin
696 INTEGER :: handle, l1, l2, l3
697 INTEGER, DIMENSION(:), POINTER :: npts
698 REAL(kind=dp), POINTER :: ptr_pwout
699 INTERFACE
700 SUBROUTINE pw_gpu_fc_c(zin, dout, npts) BIND(C, name="pw_gpu_fc")
701 IMPORT
702 TYPE(c_ptr), INTENT(IN), VALUE :: zin
703 TYPE(c_ptr), VALUE :: dout
704 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
705 END SUBROUTINE pw_gpu_fc_c
706 END INTERFACE
707
708 CALL timeset(routinen, handle)
709
710 npts => pw2%pw_grid%npts_local
711 l1 = lbound(pw2%array, 1)
712 l2 = lbound(pw2%array, 2)
713 l3 = lbound(pw2%array, 3)
714
715 ! pointers to data arrays
716 ptr_pwin => pwbuf(1, 1)
717 ptr_pwout => pw2%array(l1, l2, l3)
718
719 ! invoke the combined transformation
720#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
721 CALL pw_gpu_fc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
722#else
723 cpabort("Compiled without pw offloading")
724#endif
725
726 CALL timestop(handle)
727 END SUBROUTINE pw_gpu_fc
728
729! **************************************************************************************************
730!> \brief perform a parallel 1D-FFT on the gpu
731!> \param pwbuf1 ...
732!> \param pwbuf2 ...
733!> \param dir ...
734!> \param n ...
735!> \param m ...
736!> \author Andreas Gloess
737! **************************************************************************************************
738 SUBROUTINE pw_gpu_f(pwbuf1, pwbuf2, dir, n, m)
739 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
740 TARGET :: pwbuf1
741 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
742 TARGET :: pwbuf2
743 INTEGER, INTENT(IN) :: dir, n, m
744
745 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_f'
746
747 COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
748 INTEGER :: handle
749 INTERFACE
750 SUBROUTINE pw_gpu_f_c(zin, zout, dir, n, m) BIND(C, name="pw_gpu_f")
751 IMPORT
752 TYPE(c_ptr), INTENT(IN), VALUE :: zin
753 TYPE(c_ptr), VALUE :: zout
754 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: dir, n, m
755 END SUBROUTINE pw_gpu_f_c
756 END INTERFACE
757
758 CALL timeset(routinen, handle)
759
760 IF (n*m /= 0) THEN
761 ! pointers to data arrays
762 ptr_pwin => pwbuf1(1, 1)
763 ptr_pwout => pwbuf2(1, 1)
764
765 ! invoke the combined transformation
766#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
767 CALL pw_gpu_f_c(c_loc(ptr_pwin), c_loc(ptr_pwout), dir, n, m)
768#else
769 mark_used(dir)
770 cpabort("Compiled without pw offloading")
771#endif
772 END IF
773
774 CALL timestop(handle)
775 END SUBROUTINE pw_gpu_f
776! **************************************************************************************************
777!> \brief perform a parallel 1D-FFT followed by a gather on the gpu
778!> \param pwbuf ...
779!> \param pw2 ...
780!> \param scale ...
781!> \author Andreas Gloess
782! **************************************************************************************************
783 SUBROUTINE pw_gpu_fg(pwbuf, pw2, scale)
784 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
785 TARGET :: pwbuf
786 TYPE(pw_c1d_gs_type), INTENT(IN) :: pw2
787 REAL(kind=dp), INTENT(IN) :: scale
788
789 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_fg'
790
791 COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
792 INTEGER :: handle, mg, mmax, ngpts
793 INTEGER, DIMENSION(:), POINTER :: npts
794 INTEGER, POINTER :: ptr_ghatmap
795 INTERFACE
796 SUBROUTINE pw_gpu_fg_c(zin, zout, ghatmap, npts, mmax, ngpts, scale) BIND(C, name="pw_gpu_fg")
797 IMPORT
798 TYPE(c_ptr), INTENT(IN), VALUE :: zin
799 TYPE(c_ptr), VALUE :: zout
800 TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
801 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
802 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts
803 REAL(kind=c_double), INTENT(IN), VALUE :: scale
804
805 END SUBROUTINE pw_gpu_fg_c
806 END INTERFACE
807
808 CALL timeset(routinen, handle)
809
810 ngpts = SIZE(pw2%pw_grid%gsq)
811 npts => pw2%pw_grid%npts
812
813 IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
814 mg = SIZE(pw2%pw_grid%grays, 2)
815 mmax = max(mg, 1)
816
817 ! pointers to data arrays
818 ptr_pwin => pwbuf(1, 1)
819 ptr_pwout => pw2%array(1)
820
821 ! pointer to map array
822 ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
823
824 ! invoke the combined transformation
825#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
826 CALL pw_gpu_fg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, scale)
827#else
828 mark_used(scale)
829 cpabort("Compiled without pw offloading")
830#endif
831 END IF
832
833 CALL timestop(handle)
834 END SUBROUTINE pw_gpu_fg
835
836! **************************************************************************************************
837!> \brief perform a parallel scatter followed by a 1D-FFT on the gpu
838!> \param pw1 ...
839!> \param pwbuf ...
840!> \param scale ...
841!> \author Andreas Gloess
842! **************************************************************************************************
843 SUBROUTINE pw_gpu_sf(pw1, pwbuf, scale)
844 TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
845 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
846 TARGET :: pwbuf
847 REAL(kind=dp), INTENT(IN) :: scale
848
849 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_sf'
850
851 COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
852 INTEGER :: handle, mg, mmax, ngpts, nmaps
853 INTEGER, DIMENSION(:), POINTER :: npts
854 INTEGER, POINTER :: ptr_ghatmap
855 INTERFACE
856 SUBROUTINE pw_gpu_sf_c(zin, zout, ghatmap, npts, mmax, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sf")
857 IMPORT
858 TYPE(c_ptr), INTENT(IN), VALUE :: zin
859 TYPE(c_ptr), VALUE :: zout
860 TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
861 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
862 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts, nmaps
863 REAL(kind=c_double), INTENT(IN), VALUE :: scale
864
865 END SUBROUTINE pw_gpu_sf_c
866 END INTERFACE
867
868 CALL timeset(routinen, handle)
869
870 ngpts = SIZE(pw1%pw_grid%gsq)
871 npts => pw1%pw_grid%npts
872
873 IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
874 mg = SIZE(pw1%pw_grid%grays, 2)
875 mmax = max(mg, 1)
876
877 ! pointers to data arrays
878 ptr_pwin => pw1%array(1)
879 ptr_pwout => pwbuf(1, 1)
880
881 ! pointer to map array
882 nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
883 ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
884
885 ! invoke the combined transformation
886#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
887 CALL pw_gpu_sf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, nmaps, scale)
888#else
889 mark_used(scale)
890 cpabort("Compiled without pw offloading")
891#endif
892 END IF
893
894 CALL timestop(handle)
895 END SUBROUTINE pw_gpu_sf
896
897END MODULE pw_gpu
898
subroutine, public yz_to_x(tb, group, my_pos, p2p, yzp, nray, bo, sb, fft_scratch)
...
Definition fft_tools.F:1488
subroutine, public yz_to_xz(sb, group, dims, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition fft_tools.F:1605
subroutine, public cube_transpose_1(cin, boin, boout, sout, fft_scratch)
...
Definition fft_tools.F:2009
subroutine, public release_fft_scratch(fft_scratch)
...
Definition fft_tools.F:3253
subroutine, public x_to_yz(sb, group, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition fft_tools.F:1372
subroutine, public get_fft_scratch(fft_scratch, tf_type, n, fft_sizes)
...
Definition fft_tools.F:2869
subroutine, public xz_to_yz(sb, group, dims, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition fft_tools.F:1815
subroutine, public cube_transpose_2(cin, boin, boout, sout, fft_scratch)
...
Definition fft_tools.F:2099
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public dp
Definition kinds.F:34
Definition of mathematical constants and functions.
complex(kind=dp), parameter, public z_zero
Interface to the message passing library MPI.
subroutine, public pw_gpu_c1dr3d_3d_ps(pw1, pw2)
perform an parallel scatter followed by a fft on the gpu
Definition pw_gpu.F:380
subroutine, public pw_gpu_c1dr3d_3d(pw1, pw2)
perform an scatter followed by a fft on the gpu
Definition pw_gpu.F:160
subroutine, public pw_gpu_init()
Allocates resources on the gpu device for gpu fft acceleration.
Definition pw_gpu.F:62
subroutine, public pw_gpu_r3dc1d_3d(pw1, pw2)
perform an fft followed by a gather on the gpu
Definition pw_gpu.F:102
subroutine, public pw_gpu_finalize()
Releases resources on the gpu device for gpu fft acceleration.
Definition pw_gpu.F:81
subroutine, public pw_gpu_r3dc1d_3d_ps(pw1, pw2)
perform an parallel fft followed by a gather on the gpu
Definition pw_gpu.F:218
integer, parameter, public fullspace