(git:374b731)
Loading...
Searching...
No Matches
pw_gpu.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \note
10!> This module contains routines necessary to operate on plane waves on GPUs
11! > independently of the GPU platform.
12!> \par History
13!> BGL (06-Mar-2008) : Created
14!> AG (18-May-2012) : Refacturing:
15!> - added explicit interfaces to C routines
16!> - enable double precision complex transformations
17!> AG (11-Sept-2012) : Modifications:
18!> - use pointers if precision mapping is not required
19!> - use OMP for mapping
20!> MT (Jan 2022) : Modifications
21!> - use a generic interface for fft calls to GPUs
22!> - Support both Nvidia and AMD GPUs. Other GPUs manufacturers
23!> can be added easily.
24!> \author Benjamin G. Levine
25! **************************************************************************************************
26MODULE pw_gpu
27 USE iso_c_binding, ONLY: c_double,&
28 c_int,&
29 c_loc,&
30 c_ptr
31 USE fft_tools, ONLY: &
34 USE kinds, ONLY: dp
35 USE mathconstants, ONLY: z_zero
36 USE message_passing, ONLY: mp_cart_type,&
38 USE pw_grid_types, ONLY: fullspace
39 USE pw_types, ONLY: pw_c1d_gs_type,&
41#include "../base/base_uses.f90"
42
43 IMPLICIT NONE
44
45 PRIVATE
46
47 PUBLIC :: pw_gpu_r3dc1d_3d
48 PUBLIC :: pw_gpu_c1dr3d_3d
49 PUBLIC :: pw_gpu_r3dc1d_3d_ps
50 PUBLIC :: pw_gpu_c1dr3d_3d_ps
52
53 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_gpu'
54 LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .false.
55
56CONTAINS
57
58! **************************************************************************************************
59!> \brief Allocates resources on the gpu device for gpu fft acceleration
60!> \author Ole Schuett
61! **************************************************************************************************
62 SUBROUTINE pw_gpu_init()
63 INTEGER :: dummy
64 INTERFACE
65 SUBROUTINE pw_gpu_init_c() BIND(C, name="pw_gpu_init")
66 END SUBROUTINE pw_gpu_init_c
67 END INTERFACE
68
69 mark_used(dummy) ! TODO: fix fpretty
70#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
71 CALL pw_gpu_init_c()
72#else
73 ! Nothing to do.
74#endif
75 END SUBROUTINE pw_gpu_init
76
77! **************************************************************************************************
78!> \brief Releases resources on the gpu device for gpu fft acceleration
79!> \author Ole Schuett
80! **************************************************************************************************
81 SUBROUTINE pw_gpu_finalize()
82 INTEGER :: dummy
83 INTERFACE
84 SUBROUTINE pw_gpu_finalize_c() BIND(C, name="pw_gpu_finalize")
85 END SUBROUTINE pw_gpu_finalize_c
86 END INTERFACE
87
88 mark_used(dummy) ! TODO: fix fpretty
89#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
90 CALL pw_gpu_finalize_c()
91#else
92 ! Nothing to do.
93#endif
94 END SUBROUTINE pw_gpu_finalize
95
96! **************************************************************************************************
97!> \brief perform an fft followed by a gather on the gpu
98!> \param pw1 ...
99!> \param pw2 ...
100!> \param scale ...
101!> \author Benjamin G Levine
102! **************************************************************************************************
103 SUBROUTINE pw_gpu_r3dc1d_3d(pw1, pw2, scale)
104 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
105 TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
106 REAL(kind=dp), INTENT(IN) :: scale
107
108 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_r3dc1d_3d'
109
110 COMPLEX(KIND=dp), POINTER :: ptr_pwout
111 INTEGER :: handle, l1, l2, l3, ngpts
112 INTEGER, DIMENSION(:), POINTER :: npts
113 INTEGER, POINTER :: ptr_ghatmap
114 REAL(kind=dp), POINTER :: ptr_pwin
115 INTERFACE
116 SUBROUTINE pw_gpu_cfffg_c(din, zout, ghatmap, npts, ngpts, scale) BIND(C, name="pw_gpu_cfffg")
117 IMPORT
118 TYPE(c_ptr), INTENT(IN), VALUE :: din
119 TYPE(c_ptr), VALUE :: zout
120 TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
121 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
122 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts
123 REAL(kind=c_double), INTENT(IN), VALUE :: scale
124
125 END SUBROUTINE pw_gpu_cfffg_c
126 END INTERFACE
127
128 CALL timeset(routinen, handle)
129
130 ngpts = SIZE(pw2%pw_grid%gsq)
131 l1 = lbound(pw1%array, 1)
132 l2 = lbound(pw1%array, 2)
133 l3 = lbound(pw1%array, 3)
134 npts => pw1%pw_grid%npts
135
136 ! pointers to data arrays
137 ptr_pwin => pw1%array(l1, l2, l3)
138 ptr_pwout => pw2%array(1)
139
140 ! pointer to map array
141 ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
142
143 ! invoke the combined transformation
144#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
145 CALL pw_gpu_cfffg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, scale)
146#else
147 mark_used(scale)
148 cpabort("Compiled without pw offloading.")
149#endif
150
151 CALL timestop(handle)
152 END SUBROUTINE pw_gpu_r3dc1d_3d
153
154! **************************************************************************************************
155!> \brief perform an scatter followed by a fft on the gpu
156!> \param pw1 ...
157!> \param pw2 ...
158!> \param scale ...
159!> \author Benjamin G Levine
160! **************************************************************************************************
161 SUBROUTINE pw_gpu_c1dr3d_3d(pw1, pw2, scale)
162 TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
163 TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
164 REAL(kind=dp), INTENT(IN) :: scale
165
166 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_c1dr3d_3d'
167
168 COMPLEX(KIND=dp), POINTER :: ptr_pwin
169 INTEGER :: handle, l1, l2, l3, ngpts, nmaps
170 INTEGER, DIMENSION(:), POINTER :: npts
171 INTEGER, POINTER :: ptr_ghatmap
172 REAL(kind=dp), POINTER :: ptr_pwout
173 INTERFACE
174 SUBROUTINE pw_gpu_sfffc_c(zin, dout, ghatmap, npts, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sfffc")
175 IMPORT
176 TYPE(c_ptr), INTENT(IN), VALUE :: zin
177 TYPE(c_ptr), VALUE :: dout
178 TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
179 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
180 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts, nmaps
181 REAL(kind=c_double), INTENT(IN), VALUE :: scale
182 END SUBROUTINE pw_gpu_sfffc_c
183 END INTERFACE
184
185 CALL timeset(routinen, handle)
186
187 ngpts = SIZE(pw1%pw_grid%gsq)
188 l1 = lbound(pw2%array, 1)
189 l2 = lbound(pw2%array, 2)
190 l3 = lbound(pw2%array, 3)
191 npts => pw1%pw_grid%npts
192
193 ! pointers to data arrays
194 ptr_pwin => pw1%array(1)
195 ptr_pwout => pw2%array(l1, l2, l3)
196
197 ! pointer to map array
198 nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
199 ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
200
201 ! invoke the combined transformation
202#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
203 CALL pw_gpu_sfffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, nmaps, scale)
204#else
205 mark_used(scale)
206 cpabort("Compiled without pw offloading")
207#endif
208
209 CALL timestop(handle)
210 END SUBROUTINE pw_gpu_c1dr3d_3d
211
212! **************************************************************************************************
213!> \brief perform an parallel fft followed by a gather on the gpu
214!> \param pw1 ...
215!> \param pw2 ...
216!> \param scale ...
217!> \author Andreas Gloess
218! **************************************************************************************************
219 SUBROUTINE pw_gpu_r3dc1d_3d_ps(pw1, pw2, scale)
220 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
221 TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
222 REAL(kind=dp), INTENT(IN) :: scale
223
224 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_r3dc1d_3d_ps'
225
226 COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
227 COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
228 INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
229 mz2, n1, n2, ngpts, nmax, numtask, &
230 numtask_g, numtask_r, rp
231 INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
232 INTEGER, DIMENSION(2) :: r_dim, r_pos
233 INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
234 INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
235 TYPE(fft_scratch_sizes) :: fft_scratch_size
236 TYPE(fft_scratch_type), POINTER :: fft_scratch
237 TYPE(mp_cart_type) :: rs_group
238 TYPE(mp_comm_type) :: gs_group
239
240 CALL timeset(routinen, handle)
241
242 ! dimensions
243 n => pw1%pw_grid%npts
244 nloc => pw1%pw_grid%npts_local
245 grays => pw1%pw_grid%grays
246 ngpts = nloc(1)*nloc(2)*nloc(3)
247
248 !..transform
249 IF (pw1%pw_grid%para%ray_distribution) THEN
250 gs_group = pw1%pw_grid%para%group
251 rs_group = pw1%pw_grid%para%rs_group
252 nyzray => pw1%pw_grid%para%nyzray
253 bo => pw1%pw_grid%para%bo
254
255 numtask_g = gs_group%num_pe
256 g_pos = gs_group%mepos
257 numtask_r = rs_group%num_pe
258 r_dim = rs_group%num_pe_cart
259 r_pos = rs_group%mepos_cart
260 IF (numtask_g /= numtask_r) THEN
261 cpabort("Real space and G space groups are different.")
262 END IF
263 numtask = numtask_r
264
265 lg = SIZE(grays, 1)
266 mg = SIZE(grays, 2)
267 mmax = max(mg, 1)
268 lmax = max(lg, (ngpts/mmax + 1))
269
270 ALLOCATE (p2p(0:numtask - 1))
271
272 CALL gs_group%rank_compare(rs_group, p2p)
273
274 rp = p2p(g_pos)
275 mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
276 mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
277 n1 = maxval(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
278 n2 = maxval(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
279 nmax = max((2*n2)/numtask, 2)*mx2*mz2
280 nmax = max(nmax, n1*maxval(nyzray))
281
282 fft_scratch_size%nx = nloc(1)
283 fft_scratch_size%ny = nloc(2)
284 fft_scratch_size%nz = nloc(3)
285 fft_scratch_size%lmax = lmax
286 fft_scratch_size%mmax = mmax
287 fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
288 fft_scratch_size%mx2 = mx2
289 fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
290 fft_scratch_size%mz2 = mz2
291 fft_scratch_size%lg = lg
292 fft_scratch_size%mg = mg
293 fft_scratch_size%nbx = maxval(bo(2, 1, :, 2))
294 fft_scratch_size%nbz = maxval(bo(2, 3, :, 2))
295 fft_scratch_size%mcz1 = maxval(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
296 fft_scratch_size%mcx2 = maxval(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
297 fft_scratch_size%mcz2 = maxval(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
298 fft_scratch_size%nmax = nmax
299 fft_scratch_size%nmray = maxval(nyzray)
300 fft_scratch_size%nyzray = nyzray(g_pos)
301 fft_scratch_size%gs_group = gs_group
302 fft_scratch_size%rs_group = rs_group
303 fft_scratch_size%g_pos = g_pos
304 fft_scratch_size%r_pos = r_pos
305 fft_scratch_size%r_dim = r_dim
306 fft_scratch_size%numtask = numtask
307
308 IF (r_dim(2) > 1) THEN
309 !
310 ! real space is distributed over x and y coordinate
311 ! we have two stages of communication
312 !
313 IF (r_dim(1) == 1) &
314 cpabort("This processor distribution is not supported.")
315
316 CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
317
318 ! assign buffers
319 qbuf => fft_scratch%p2buf
320 rbuf => fft_scratch%p3buf
321 pbuf => fft_scratch%p4buf
322 sbuf => fft_scratch%p5buf
323
324 ! FFT along z
325 CALL pw_gpu_cf(pw1, qbuf)
326
327 ! Exchange data ( transpose of matrix )
328 CALL cube_transpose_2(qbuf, bo(:, :, :, 1), bo(:, :, :, 2), rbuf, fft_scratch)
329
330 ! FFT along y
331 ! use the inbuild fft-lib
332 ! CALL fft_1dm(fft_scratch%fft_plan(2), rbuf, pbuf, 1.0_dp, stat)
333 ! or cufft (works faster, but is only faster if plans are stored)
334 CALL pw_gpu_f(rbuf, pbuf, +1, n(2), mx2*mz2)
335
336 ! Exchange data ( transpose of matrix ) and sort
337 CALL xz_to_yz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
338 bo(:, :, :, 2), sbuf, fft_scratch)
339
340 ! FFT along x
341 CALL pw_gpu_fg(sbuf, pw2, scale)
342
343 CALL release_fft_scratch(fft_scratch)
344
345 ELSE
346 !
347 ! real space is only distributed over x coordinate
348 ! we have one stage of communication, after the transform of
349 ! direction x
350 !
351
352 CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
353
354 ! assign buffers
355 tbuf => fft_scratch%tbuf
356 sbuf => fft_scratch%r1buf
357
358 ! FFT along y and z
359 CALL pw_gpu_cff(pw1, tbuf)
360
361 ! Exchange data ( transpose of matrix ) and sort
362 CALL yz_to_x(tbuf, gs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
363 bo(:, :, :, 2), sbuf, fft_scratch)
364
365 ! FFT along x
366 CALL pw_gpu_fg(sbuf, pw2, scale)
367
368 CALL release_fft_scratch(fft_scratch)
369
370 END IF
371
372 DEALLOCATE (p2p)
373
374!--------------------------------------------------------------------------
375 ELSE
376 cpabort("Not implemented (no ray_distr.) in: pw_gpu_r3dc1d_3d_ps.")
377 !CALL fft3d ( dir, n, pwin, grays, pw1%pw_grid%para%rs_group, &
378 ! pw1%pw_grid%para%bo, scale = scale, debug=test )
379 END IF
380
381 CALL timestop(handle)
382 END SUBROUTINE pw_gpu_r3dc1d_3d_ps
383
384! **************************************************************************************************
385!> \brief perform an parallel scatter followed by a fft on the gpu
386!> \param pw1 ...
387!> \param pw2 ...
388!> \param scale ...
389!> \author Andreas Gloess
390! **************************************************************************************************
391 SUBROUTINE pw_gpu_c1dr3d_3d_ps(pw1, pw2, scale)
392 TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
393 TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
394 REAL(kind=dp), INTENT(IN) :: scale
395
396 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_c1dr3d_3d_ps'
397
398 COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
399 COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
400 INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
401 mz2, n1, n2, ngpts, nmax, numtask, &
402 numtask_g, numtask_r, rp
403 INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
404 INTEGER, DIMENSION(2) :: r_dim, r_pos
405 INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
406 INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
407 TYPE(fft_scratch_sizes) :: fft_scratch_size
408 TYPE(fft_scratch_type), POINTER :: fft_scratch
409 TYPE(mp_cart_type) :: rs_group
410 TYPE(mp_comm_type) :: gs_group
411
412 CALL timeset(routinen, handle)
413
414 ! dimensions
415 n => pw1%pw_grid%npts
416 nloc => pw1%pw_grid%npts_local
417 grays => pw1%pw_grid%grays
418 ngpts = nloc(1)*nloc(2)*nloc(3)
419
420 !..transform
421 IF (pw1%pw_grid%para%ray_distribution) THEN
422 gs_group = pw1%pw_grid%para%group
423 rs_group = pw1%pw_grid%para%rs_group
424 nyzray => pw1%pw_grid%para%nyzray
425 bo => pw1%pw_grid%para%bo
426
427 numtask_g = gs_group%num_pe
428 g_pos = gs_group%mepos
429 numtask_r = rs_group%num_pe
430 r_dim = rs_group%num_pe_cart
431 r_pos = rs_group%mepos_cart
432 IF (numtask_g /= numtask_r) THEN
433 cpabort("Real space and G space groups are different.")
434 END IF
435 numtask = numtask_r
436
437 lg = SIZE(grays, 1)
438 mg = SIZE(grays, 2)
439 mmax = max(mg, 1)
440 lmax = max(lg, (ngpts/mmax + 1))
441
442 ALLOCATE (p2p(0:numtask - 1))
443
444 CALL gs_group%rank_compare(rs_group, p2p)
445
446 rp = p2p(g_pos)
447 mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
448 mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
449 n1 = maxval(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
450 n2 = maxval(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
451 nmax = max((2*n2)/numtask, 2)*mx2*mz2
452 nmax = max(nmax, n1*maxval(nyzray))
453
454 fft_scratch_size%nx = nloc(1)
455 fft_scratch_size%ny = nloc(2)
456 fft_scratch_size%nz = nloc(3)
457 fft_scratch_size%lmax = lmax
458 fft_scratch_size%mmax = mmax
459 fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
460 fft_scratch_size%mx2 = mx2
461 fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
462 fft_scratch_size%mz2 = mz2
463 fft_scratch_size%lg = lg
464 fft_scratch_size%mg = mg
465 fft_scratch_size%nbx = maxval(bo(2, 1, :, 2))
466 fft_scratch_size%nbz = maxval(bo(2, 3, :, 2))
467 fft_scratch_size%mcz1 = maxval(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
468 fft_scratch_size%mcx2 = maxval(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
469 fft_scratch_size%mcz2 = maxval(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
470 fft_scratch_size%nmax = nmax
471 fft_scratch_size%nmray = maxval(nyzray)
472 fft_scratch_size%nyzray = nyzray(g_pos)
473 fft_scratch_size%gs_group = gs_group
474 fft_scratch_size%rs_group = rs_group
475 fft_scratch_size%g_pos = g_pos
476 fft_scratch_size%r_pos = r_pos
477 fft_scratch_size%r_dim = r_dim
478 fft_scratch_size%numtask = numtask
479
480 IF (r_dim(2) > 1) THEN
481 !
482 ! real space is distributed over x and y coordinate
483 ! we have two stages of communication
484 !
485 IF (r_dim(1) == 1) &
486 cpabort("This processor distribution is not supported.")
487
488 CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
489
490 ! assign buffers
491 pbuf => fft_scratch%p7buf
492 qbuf => fft_scratch%p4buf
493 rbuf => fft_scratch%p3buf
494 sbuf => fft_scratch%p2buf
495
496 ! FFT along x
497 CALL pw_gpu_sf(pw1, pbuf, scale)
498
499 ! Exchange data ( transpose of matrix ) and sort
500 IF (pw1%pw_grid%grid_span /= fullspace) qbuf = z_zero
501 CALL yz_to_xz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
502 bo(:, :, :, 2), qbuf, fft_scratch)
503
504 ! FFT along y
505 ! use the inbuild fft-lib
506 ! CALL fft_1dm(fft_scratch%fft_plan(5), qbuf, rbuf, 1.0_dp, stat)
507 ! or cufft (works faster, but is only faster if plans are stored)
508 CALL pw_gpu_f(qbuf, rbuf, -1, n(2), mx2*mz2)
509
510 ! Exchange data ( transpose of matrix )
511 IF (pw1%pw_grid%grid_span /= fullspace) sbuf = z_zero
512
513 CALL cube_transpose_1(rbuf, bo(:, :, :, 2), bo(:, :, :, 1), sbuf, fft_scratch)
514
515 ! FFT along z
516 CALL pw_gpu_fc(sbuf, pw2)
517
518 CALL release_fft_scratch(fft_scratch)
519
520 ELSE
521 !
522 ! real space is only distributed over x coordinate
523 ! we have one stage of communication, after the transform of
524 ! direction x
525 !
526
527 CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
528
529 ! assign buffers
530 sbuf => fft_scratch%r1buf
531 tbuf => fft_scratch%tbuf
532
533 ! FFT along x
534 CALL pw_gpu_sf(pw1, sbuf, scale)
535
536 ! Exchange data ( transpose of matrix ) and sort
537 IF (pw1%pw_grid%grid_span /= fullspace) tbuf = z_zero
538 CALL x_to_yz(sbuf, gs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
539 bo(:, :, :, 2), tbuf, fft_scratch)
540
541 ! FFT along y and z
542 CALL pw_gpu_ffc(tbuf, pw2)
543
544 CALL release_fft_scratch(fft_scratch)
545
546 END IF
547
548 DEALLOCATE (p2p)
549
550!--------------------------------------------------------------------------
551 ELSE
552 cpabort("Not implemented (no ray_distr.) in: pw_gpu_c1dr3d_3d_ps.")
553 !CALL fft3d ( dir, n, pwin, grays, pw1%pw_grid%para%rs_group, &
554 ! pw1%pw_grid%para%bo, scale = scale, debug=test )
555 END IF
556
557 CALL timestop(handle)
558 END SUBROUTINE pw_gpu_c1dr3d_3d_ps
559
560! **************************************************************************************************
561!> \brief perform a parallel real_to_complex copy followed by a 2D-FFT on the gpu
562!> \param pw1 ...
563!> \param pwbuf ...
564!> \author Andreas Gloess
565! **************************************************************************************************
566 SUBROUTINE pw_gpu_cff(pw1, pwbuf)
567 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
568 COMPLEX(KIND=dp), DIMENSION(:, :, :), &
569 INTENT(INOUT), TARGET :: pwbuf
570
571 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_cff'
572
573 COMPLEX(KIND=dp), POINTER :: ptr_pwout
574 INTEGER :: handle, l1, l2, l3
575 INTEGER, DIMENSION(:), POINTER :: npts
576 REAL(kind=dp), POINTER :: ptr_pwin
577 INTERFACE
578 SUBROUTINE pw_gpu_cff_c(din, zout, npts) BIND(C, name="pw_gpu_cff")
579 IMPORT
580 TYPE(c_ptr), INTENT(IN), VALUE :: din
581 TYPE(c_ptr), VALUE :: zout
582 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
583 END SUBROUTINE pw_gpu_cff_c
584 END INTERFACE
585
586 CALL timeset(routinen, handle)
587
588 ! dimensions
589 npts => pw1%pw_grid%npts_local
590 l1 = lbound(pw1%array, 1)
591 l2 = lbound(pw1%array, 2)
592 l3 = lbound(pw1%array, 3)
593
594 ! pointers to data arrays
595 ptr_pwin => pw1%array(l1, l2, l3)
596 ptr_pwout => pwbuf(1, 1, 1)
597
598 ! invoke the combined transformation
599#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
600 CALL pw_gpu_cff_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
601#else
602 cpabort("Compiled without pw offloading")
603#endif
604
605 CALL timestop(handle)
606 END SUBROUTINE pw_gpu_cff
607
608! **************************************************************************************************
609!> \brief perform a parallel 2D-FFT followed by a complex_to_real copy on the gpu
610!> \param pwbuf ...
611!> \param pw2 ...
612!> \author Andreas Gloess
613! **************************************************************************************************
614 SUBROUTINE pw_gpu_ffc(pwbuf, pw2)
615 COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
616 TARGET :: pwbuf
617 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
618
619 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_ffc'
620
621 COMPLEX(KIND=dp), POINTER :: ptr_pwin
622 INTEGER :: handle, l1, l2, l3
623 INTEGER, DIMENSION(:), POINTER :: npts
624 REAL(kind=dp), POINTER :: ptr_pwout
625 INTERFACE
626 SUBROUTINE pw_gpu_ffc_c(zin, dout, npts) BIND(C, name="pw_gpu_ffc")
627 IMPORT
628 TYPE(c_ptr), INTENT(IN), VALUE :: zin
629 TYPE(c_ptr), VALUE :: dout
630 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
631 END SUBROUTINE pw_gpu_ffc_c
632 END INTERFACE
633
634 CALL timeset(routinen, handle)
635
636 ! dimensions
637 npts => pw2%pw_grid%npts_local
638 l1 = lbound(pw2%array, 1)
639 l2 = lbound(pw2%array, 2)
640 l3 = lbound(pw2%array, 3)
641
642 ! pointers to data arrays
643 ptr_pwin => pwbuf(1, 1, 1)
644 ptr_pwout => pw2%array(l1, l2, l3)
645
646 ! invoke the combined transformation
647#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
648 CALL pw_gpu_ffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
649#else
650 cpabort("Compiled without pw offloading")
651#endif
652
653 CALL timestop(handle)
654 END SUBROUTINE pw_gpu_ffc
655
656! **************************************************************************************************
657!> \brief perform a parallel real_to_complex copy followed by a 1D-FFT on the gpu
658!> \param pw1 ...
659!> \param pwbuf ...
660!> \author Andreas Gloess
661! **************************************************************************************************
662 SUBROUTINE pw_gpu_cf(pw1, pwbuf)
663 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
664 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
665 TARGET :: pwbuf
666
667 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_cf'
668
669 COMPLEX(KIND=dp), POINTER :: ptr_pwout
670 INTEGER :: handle, l1, l2, l3
671 INTEGER, DIMENSION(:), POINTER :: npts
672 REAL(kind=dp), POINTER :: ptr_pwin
673 INTERFACE
674 SUBROUTINE pw_gpu_cf_c(din, zout, npts) BIND(C, name="pw_gpu_cf")
675 IMPORT
676 TYPE(c_ptr), INTENT(IN), VALUE :: din
677 TYPE(c_ptr), VALUE :: zout
678 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
679 END SUBROUTINE pw_gpu_cf_c
680 END INTERFACE
681
682 CALL timeset(routinen, handle)
683
684 ! dimensions
685 npts => pw1%pw_grid%npts_local
686 l1 = lbound(pw1%array, 1)
687 l2 = lbound(pw1%array, 2)
688 l3 = lbound(pw1%array, 3)
689
690 ! pointers to data arrays
691 ptr_pwin => pw1%array(l1, l2, l3)
692 ptr_pwout => pwbuf(1, 1)
693
694 ! invoke the combined transformation
695#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
696 CALL pw_gpu_cf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
697#else
698 cpabort("Compiled without pw offloading")
699#endif
700 CALL timestop(handle)
701 END SUBROUTINE pw_gpu_cf
702
703! **************************************************************************************************
704!> \brief perform a parallel 1D-FFT followed by a complex_to_real copy on the gpu
705!> \param pwbuf ...
706!> \param pw2 ...
707!> \author Andreas Gloess
708! **************************************************************************************************
709 SUBROUTINE pw_gpu_fc(pwbuf, pw2)
710 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
711 TARGET :: pwbuf
712 TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
713
714 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_fc'
715
716 COMPLEX(KIND=dp), POINTER :: ptr_pwin
717 INTEGER :: handle, l1, l2, l3
718 INTEGER, DIMENSION(:), POINTER :: npts
719 REAL(kind=dp), POINTER :: ptr_pwout
720 INTERFACE
721 SUBROUTINE pw_gpu_fc_c(zin, dout, npts) BIND(C, name="pw_gpu_fc")
722 IMPORT
723 TYPE(c_ptr), INTENT(IN), VALUE :: zin
724 TYPE(c_ptr), VALUE :: dout
725 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
726 END SUBROUTINE pw_gpu_fc_c
727 END INTERFACE
728
729 CALL timeset(routinen, handle)
730
731 npts => pw2%pw_grid%npts_local
732 l1 = lbound(pw2%array, 1)
733 l2 = lbound(pw2%array, 2)
734 l3 = lbound(pw2%array, 3)
735
736 ! pointers to data arrays
737 ptr_pwin => pwbuf(1, 1)
738 ptr_pwout => pw2%array(l1, l2, l3)
739
740 ! invoke the combined transformation
741#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
742 CALL pw_gpu_fc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
743#else
744 cpabort("Compiled without pw offloading")
745#endif
746
747 CALL timestop(handle)
748 END SUBROUTINE pw_gpu_fc
749
750! **************************************************************************************************
751!> \brief perform a parallel 1D-FFT on the gpu
752!> \param pwbuf1 ...
753!> \param pwbuf2 ...
754!> \param dir ...
755!> \param n ...
756!> \param m ...
757!> \author Andreas Gloess
758! **************************************************************************************************
759 SUBROUTINE pw_gpu_f(pwbuf1, pwbuf2, dir, n, m)
760 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
761 TARGET :: pwbuf1
762 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
763 TARGET :: pwbuf2
764 INTEGER, INTENT(IN) :: dir, n, m
765
766 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_f'
767
768 COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
769 INTEGER :: handle
770 INTERFACE
771 SUBROUTINE pw_gpu_f_c(zin, zout, dir, n, m) BIND(C, name="pw_gpu_f")
772 IMPORT
773 TYPE(c_ptr), INTENT(IN), VALUE :: zin
774 TYPE(c_ptr), VALUE :: zout
775 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: dir, n, m
776 END SUBROUTINE pw_gpu_f_c
777 END INTERFACE
778
779 CALL timeset(routinen, handle)
780
781 IF (n*m /= 0) THEN
782 ! pointers to data arrays
783 ptr_pwin => pwbuf1(1, 1)
784 ptr_pwout => pwbuf2(1, 1)
785
786 ! invoke the combined transformation
787#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
788 CALL pw_gpu_f_c(c_loc(ptr_pwin), c_loc(ptr_pwout), dir, n, m)
789#else
790 mark_used(dir)
791 cpabort("Compiled without pw offloading")
792#endif
793 END IF
794
795 CALL timestop(handle)
796 END SUBROUTINE pw_gpu_f
797! **************************************************************************************************
798!> \brief perform a parallel 1D-FFT followed by a gather on the gpu
799!> \param pwbuf ...
800!> \param pw2 ...
801!> \param scale ...
802!> \author Andreas Gloess
803! **************************************************************************************************
804 SUBROUTINE pw_gpu_fg(pwbuf, pw2, scale)
805 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
806 TARGET :: pwbuf
807 TYPE(pw_c1d_gs_type), INTENT(IN) :: pw2
808 REAL(kind=dp), INTENT(IN) :: scale
809
810 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_fg'
811
812 COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
813 INTEGER :: handle, mg, mmax, ngpts
814 INTEGER, DIMENSION(:), POINTER :: npts
815 INTEGER, POINTER :: ptr_ghatmap
816 INTERFACE
817 SUBROUTINE pw_gpu_fg_c(zin, zout, ghatmap, npts, mmax, ngpts, scale) BIND(C, name="pw_gpu_fg")
818 IMPORT
819 TYPE(c_ptr), INTENT(IN), VALUE :: zin
820 TYPE(c_ptr), VALUE :: zout
821 TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
822 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
823 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts
824 REAL(kind=c_double), INTENT(IN), VALUE :: scale
825
826 END SUBROUTINE pw_gpu_fg_c
827 END INTERFACE
828
829 CALL timeset(routinen, handle)
830
831 ngpts = SIZE(pw2%pw_grid%gsq)
832 npts => pw2%pw_grid%npts
833
834 IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
835 mg = SIZE(pw2%pw_grid%grays, 2)
836 mmax = max(mg, 1)
837
838 ! pointers to data arrays
839 ptr_pwin => pwbuf(1, 1)
840 ptr_pwout => pw2%array(1)
841
842 ! pointer to map array
843 ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
844
845 ! invoke the combined transformation
846#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
847 CALL pw_gpu_fg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, scale)
848#else
849 mark_used(scale)
850 cpabort("Compiled without pw offloading")
851#endif
852 END IF
853
854 CALL timestop(handle)
855 END SUBROUTINE pw_gpu_fg
856
857! **************************************************************************************************
858!> \brief perform a parallel scatter followed by a 1D-FFT on the gpu
859!> \param pw1 ...
860!> \param pwbuf ...
861!> \param scale ...
862!> \author Andreas Gloess
863! **************************************************************************************************
864 SUBROUTINE pw_gpu_sf(pw1, pwbuf, scale)
865 TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
866 COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
867 TARGET :: pwbuf
868 REAL(kind=dp), INTENT(IN) :: scale
869
870 CHARACTER(len=*), PARAMETER :: routinen = 'pw_gpu_sf'
871
872 COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
873 INTEGER :: handle, mg, mmax, ngpts, nmaps
874 INTEGER, DIMENSION(:), POINTER :: npts
875 INTEGER, POINTER :: ptr_ghatmap
876 INTERFACE
877 SUBROUTINE pw_gpu_sf_c(zin, zout, ghatmap, npts, mmax, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sf")
878 IMPORT
879 TYPE(c_ptr), INTENT(IN), VALUE :: zin
880 TYPE(c_ptr), VALUE :: zout
881 TYPE(c_ptr), INTENT(IN), VALUE :: ghatmap
882 INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
883 INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts, nmaps
884 REAL(kind=c_double), INTENT(IN), VALUE :: scale
885
886 END SUBROUTINE pw_gpu_sf_c
887 END INTERFACE
888
889 CALL timeset(routinen, handle)
890
891 ngpts = SIZE(pw1%pw_grid%gsq)
892 npts => pw1%pw_grid%npts
893
894 IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
895 mg = SIZE(pw1%pw_grid%grays, 2)
896 mmax = max(mg, 1)
897
898 ! pointers to data arrays
899 ptr_pwin => pw1%array(1)
900 ptr_pwout => pwbuf(1, 1)
901
902 ! pointer to map array
903 nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
904 ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
905
906 ! invoke the combined transformation
907#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
908 CALL pw_gpu_sf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, nmaps, scale)
909#else
910 mark_used(scale)
911 cpabort("Compiled without pw offloading")
912#endif
913 END IF
914
915 CALL timestop(handle)
916 END SUBROUTINE pw_gpu_sf
917
918END MODULE pw_gpu
919
subroutine, public yz_to_x(tb, group, my_pos, p2p, yzp, nray, bo, sb, fft_scratch)
...
Definition fft_tools.F:1497
subroutine, public yz_to_xz(sb, group, dims, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition fft_tools.F:1613
subroutine, public cube_transpose_1(cin, boin, boout, sout, fft_scratch)
...
Definition fft_tools.F:2017
subroutine, public release_fft_scratch(fft_scratch)
...
Definition fft_tools.F:3254
subroutine, public x_to_yz(sb, group, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition fft_tools.F:1382
subroutine, public get_fft_scratch(fft_scratch, tf_type, n, fft_sizes)
...
Definition fft_tools.F:2876
subroutine, public xz_to_yz(sb, group, dims, my_pos, p2p, yzp, nray, bo, tb, fft_scratch)
...
Definition fft_tools.F:1823
subroutine, public cube_transpose_2(cin, boin, boout, sout, fft_scratch)
...
Definition fft_tools.F:2107
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public dp
Definition kinds.F:34
Definition of mathematical constants and functions.
complex(kind=dp), parameter, public z_zero
Interface to the message passing library MPI.
subroutine, public pw_gpu_c1dr3d_3d(pw1, pw2, scale)
perform an scatter followed by a fft on the gpu
Definition pw_gpu.F:162
subroutine, public pw_gpu_c1dr3d_3d_ps(pw1, pw2, scale)
perform an parallel scatter followed by a fft on the gpu
Definition pw_gpu.F:392
subroutine, public pw_gpu_r3dc1d_3d(pw1, pw2, scale)
perform an fft followed by a gather on the gpu
Definition pw_gpu.F:104
subroutine, public pw_gpu_init()
Allocates resources on the gpu device for gpu fft acceleration.
Definition pw_gpu.F:63
subroutine, public pw_gpu_finalize()
Releases resources on the gpu device for gpu fft acceleration.
Definition pw_gpu.F:82
subroutine, public pw_gpu_r3dc1d_3d_ps(pw1, pw2, scale)
perform an parallel fft followed by a gather on the gpu
Definition pw_gpu.F:220
integer, parameter, public fullspace