(git:374b731)
Loading...
Searching...
No Matches
pw_gpu_kernels.cu
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: GPL-2.0-or-later */
6/*----------------------------------------------------------------------------*/
7
8#include "../../offload/offload_runtime.h"
9#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
10
11#include "pw_gpu_kernels.h"
12
13#if defined(_OMP_H)
14#error "OpenMP should not be used in .cu files to accommodate HIP."
15#endif
16
17/*******************************************************************************
18 * \brief Performs a out-of-place copy of a double precision vector (first
19 * half filled) into a double precision complex vector on the GPU.
20 * It requires a global double precision vector 'zout' of length '2n'.
21 * [memory (shared): none Byte
22 * memory (private): 4 Byte
23 * memory (global): 16*n Byte]
24 * n - size of double precision input vector
25 * \author Andreas Gloess
26 ******************************************************************************/
27__global__ void pw_real_to_complex(const double *din, double *zout,
28 const int ngpts) {
29 const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
30 if (igpt < ngpts) {
31 zout[2 * igpt] = din[igpt];
32 zout[2 * igpt + 1] = 0.0;
33 }
34}
35
36/*******************************************************************************
37 * \brief Launcher for pw_real_to_complex kernel.
38 * \author Ole Schuett
39 ******************************************************************************/
40void pw_gpu_launch_real_to_complex(const double *din, double *zout,
41 const int ngpts, offloadStream_t stream) {
42 const int threadsPerBlock = 1024;
43 const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
44 pw_real_to_complex<<<numBlocks, threadsPerBlock, 0, stream>>>(din, zout,
45 ngpts);
46}
47
48/*******************************************************************************
49 * \brief Performs a out-of-place copy of a double precision complex vector
50 * (real part) into a double precision vector on the GPU.
51 * It requires a global double precision vector 'dout' of length 'n'.
52 * [memory (shared): none Byte
53 * memory (private): 4 Byte
54 * memory (global): 16*n Byte]
55 * n - size of double precision output vector
56 * \author Andreas Gloess
57 ******************************************************************************/
58__global__ void pw_complex_to_real(const double *zin, double *dout,
59 const int ngpts) {
60 const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
61 if (igpt < ngpts) {
62 dout[igpt] = zin[2 * igpt];
63 }
64}
65
66/*******************************************************************************
67 * \brief Launcher for pw_complex_to_real kernel.
68 * \author Ole Schuett
69 ******************************************************************************/
70void pw_gpu_launch_complex_to_real(const double *zin, double *dout,
71 const int ngpts, offloadStream_t stream) {
72 const int threadsPerBlock = 1024;
73 const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
74 pw_complex_to_real<<<numBlocks, threadsPerBlock, 0, stream>>>(zin, dout,
75 ngpts);
76}
77
78/*******************************************************************************
79 * \brief Performs a (double precision complex) gather and scale on the GPU.
80 * \author Andreas Gloess
81 ******************************************************************************/
82__global__ void pw_gather(double *pwcc, const double *c, const double scale,
83 const int ngpts, const int *ghatmap) {
84 const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
85 if (igpt < ngpts) {
86 pwcc[2 * igpt] = scale * c[2 * ghatmap[igpt]];
87 pwcc[2 * igpt + 1] = scale * c[2 * ghatmap[igpt] + 1];
88 }
89}
90
91/*******************************************************************************
92 * \brief Launcher for pw_gather kernel.
93 * \author Ole Schuett
94 ******************************************************************************/
95void pw_gpu_launch_gather(double *pwcc, const double *c, const double scale,
96 const int ngpts, const int *ghatmap,
97 offloadStream_t stream) {
98 const int threadsPerBlock = 32;
99 const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
100 pw_gather<<<numBlocks, threadsPerBlock, 0, stream>>>(pwcc, c, scale, ngpts,
101 ghatmap);
102}
103
104/*******************************************************************************
105 * \brief Performs a (double precision complex) scatter and scale on the GPU.
106 * \author Andreas Gloess
107 ******************************************************************************/
108__global__ void pw_scatter(double *c, const double *pwcc, const double scale,
109 const int ngpts, const int nmaps,
110 const int *ghatmap) {
111 const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
112 if (igpt < ngpts) {
113 c[2 * ghatmap[igpt]] = scale * pwcc[2 * igpt];
114 c[2 * ghatmap[igpt] + 1] = scale * pwcc[2 * igpt + 1];
115 if (nmaps == 2) {
116 c[2 * ghatmap[igpt + ngpts]] = scale * pwcc[2 * igpt];
117 c[2 * ghatmap[igpt + ngpts] + 1] = -scale * pwcc[2 * igpt + 1];
118 }
119 }
120}
121
122/*******************************************************************************
123 * \brief Launcher for pw_scatter kernel.
124 * \author Ole Schuett
125 ******************************************************************************/
126void pw_gpu_launch_scatter(double *c, const double *pwcc, const double scale,
127 const int ngpts, const int nmaps, const int *ghatmap,
128 offloadStream_t stream) {
129 const int threadsPerBlock = 32;
130 const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
131 pw_scatter<<<numBlocks, threadsPerBlock, 0, stream>>>(c, pwcc, scale, ngpts,
132 nmaps, ghatmap);
133}
134
135#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
136
137// EOF
real(dp), dimension(3) c