(git:34ef472)
pw_gpu_kernels.cu
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: GPL-2.0-or-later */
6 /*----------------------------------------------------------------------------*/
7 
8 #include "../../offload/offload_runtime.h"
9 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
10 
11 #include "pw_gpu_kernels.h"
12 
13 #if defined(_OMP_H)
14 #error "OpenMP should not be used in .cu files to accommodate HIP."
15 #endif
16 
17 /*******************************************************************************
18  * \brief Performs a out-of-place copy of a double precision vector (first
19  * half filled) into a double precision complex vector on the GPU.
20  * It requires a global double precision vector 'zout' of length '2n'.
21  * [memory (shared): none Byte
22  * memory (private): 4 Byte
23  * memory (global): 16*n Byte]
24  * n - size of double precision input vector
25  * \author Andreas Gloess
26  ******************************************************************************/
27 __global__ void pw_real_to_complex(const double *din, double *zout,
28  const int ngpts) {
29  const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
30  if (igpt < ngpts) {
31  zout[2 * igpt] = din[igpt];
32  zout[2 * igpt + 1] = 0.0;
33  }
34 }
35 
36 /*******************************************************************************
37  * \brief Launcher for pw_real_to_complex kernel.
38  * \author Ole Schuett
39  ******************************************************************************/
40 void pw_gpu_launch_real_to_complex(const double *din, double *zout,
41  const int ngpts, offloadStream_t stream) {
42  const int threadsPerBlock = 1024;
43  const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
44  pw_real_to_complex<<<numBlocks, threadsPerBlock, 0, stream>>>(din, zout,
45  ngpts);
46 }
47 
48 /*******************************************************************************
49  * \brief Performs a out-of-place copy of a double precision complex vector
50  * (real part) into a double precision vector on the GPU.
51  * It requires a global double precision vector 'dout' of length 'n'.
52  * [memory (shared): none Byte
53  * memory (private): 4 Byte
54  * memory (global): 16*n Byte]
55  * n - size of double precision output vector
56  * \author Andreas Gloess
57  ******************************************************************************/
58 __global__ void pw_complex_to_real(const double *zin, double *dout,
59  const int ngpts) {
60  const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
61  if (igpt < ngpts) {
62  dout[igpt] = zin[2 * igpt];
63  }
64 }
65 
66 /*******************************************************************************
67  * \brief Launcher for pw_complex_to_real kernel.
68  * \author Ole Schuett
69  ******************************************************************************/
70 void pw_gpu_launch_complex_to_real(const double *zin, double *dout,
71  const int ngpts, offloadStream_t stream) {
72  const int threadsPerBlock = 1024;
73  const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
74  pw_complex_to_real<<<numBlocks, threadsPerBlock, 0, stream>>>(zin, dout,
75  ngpts);
76 }
77 
78 /*******************************************************************************
79  * \brief Performs a (double precision complex) gather and scale on the GPU.
80  * \author Andreas Gloess
81  ******************************************************************************/
82 __global__ void pw_gather(double *pwcc, const double *c, const double scale,
83  const int ngpts, const int *ghatmap) {
84  const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
85  if (igpt < ngpts) {
86  pwcc[2 * igpt] = scale * c[2 * ghatmap[igpt]];
87  pwcc[2 * igpt + 1] = scale * c[2 * ghatmap[igpt] + 1];
88  }
89 }
90 
91 /*******************************************************************************
92  * \brief Launcher for pw_gather kernel.
93  * \author Ole Schuett
94  ******************************************************************************/
95 void pw_gpu_launch_gather(double *pwcc, const double *c, const double scale,
96  const int ngpts, const int *ghatmap,
97  offloadStream_t stream) {
98  const int threadsPerBlock = 32;
99  const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
100  pw_gather<<<numBlocks, threadsPerBlock, 0, stream>>>(pwcc, c, scale, ngpts,
101  ghatmap);
102 }
103 
104 /*******************************************************************************
105  * \brief Performs a (double precision complex) scatter and scale on the GPU.
106  * \author Andreas Gloess
107  ******************************************************************************/
108 __global__ void pw_scatter(double *c, const double *pwcc, const double scale,
109  const int ngpts, const int nmaps,
110  const int *ghatmap) {
111  const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
112  if (igpt < ngpts) {
113  c[2 * ghatmap[igpt]] = scale * pwcc[2 * igpt];
114  c[2 * ghatmap[igpt] + 1] = scale * pwcc[2 * igpt + 1];
115  if (nmaps == 2) {
116  c[2 * ghatmap[igpt + ngpts]] = scale * pwcc[2 * igpt];
117  c[2 * ghatmap[igpt + ngpts] + 1] = -scale * pwcc[2 * igpt + 1];
118  }
119  }
120 }
121 
122 /*******************************************************************************
123  * \brief Launcher for pw_scatter kernel.
124  * \author Ole Schuett
125  ******************************************************************************/
126 void pw_gpu_launch_scatter(double *c, const double *pwcc, const double scale,
127  const int ngpts, const int nmaps, const int *ghatmap,
128  offloadStream_t stream) {
129  const int threadsPerBlock = 32;
130  const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
131  pw_scatter<<<numBlocks, threadsPerBlock, 0, stream>>>(c, pwcc, scale, ngpts,
132  nmaps, ghatmap);
133 }
134 
135 #endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
136 
137 // EOF
real(dp), dimension(3) c
Definition: ai_eri_debug.F:31