8 #include "../../offload/offload_runtime.h"
9 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
14 #error "OpenMP should not be used in .cu files to accommodate HIP."
27 __global__
void pw_real_to_complex(
const double *din,
double *zout,
29 const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
31 zout[2 * igpt] = din[igpt];
32 zout[2 * igpt + 1] = 0.0;
40 void pw_gpu_launch_real_to_complex(
const double *din,
double *zout,
41 const int ngpts, offloadStream_t stream) {
42 const int threadsPerBlock = 1024;
43 const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
44 pw_real_to_complex<<<numBlocks, threadsPerBlock, 0, stream>>>(din, zout,
58 __global__
void pw_complex_to_real(
const double *zin,
double *dout,
60 const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
62 dout[igpt] = zin[2 * igpt];
70 void pw_gpu_launch_complex_to_real(
const double *zin,
double *dout,
71 const int ngpts, offloadStream_t stream) {
72 const int threadsPerBlock = 1024;
73 const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
74 pw_complex_to_real<<<numBlocks, threadsPerBlock, 0, stream>>>(zin, dout,
82 __global__
void pw_gather(
double *pwcc,
const double *
c,
const double scale,
83 const int ngpts,
const int *ghatmap) {
84 const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
86 pwcc[2 * igpt] = scale *
c[2 * ghatmap[igpt]];
87 pwcc[2 * igpt + 1] = scale *
c[2 * ghatmap[igpt] + 1];
95 void pw_gpu_launch_gather(
double *pwcc,
const double *
c,
const double scale,
96 const int ngpts,
const int *ghatmap,
97 offloadStream_t stream) {
98 const int threadsPerBlock = 32;
99 const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
100 pw_gather<<<numBlocks, threadsPerBlock, 0, stream>>>(pwcc,
c, scale, ngpts,
108 __global__
void pw_scatter(
double *
c,
const double *pwcc,
const double scale,
109 const int ngpts,
const int nmaps,
110 const int *ghatmap) {
111 const int igpt = blockIdx.x * blockDim.x + threadIdx.x;
113 c[2 * ghatmap[igpt]] = scale * pwcc[2 * igpt];
114 c[2 * ghatmap[igpt] + 1] = scale * pwcc[2 * igpt + 1];
116 c[2 * ghatmap[igpt + ngpts]] = scale * pwcc[2 * igpt];
117 c[2 * ghatmap[igpt + ngpts] + 1] = -scale * pwcc[2 * igpt + 1];
126 void pw_gpu_launch_scatter(
double *
c,
const double *pwcc,
const double scale,
127 const int ngpts,
const int nmaps,
const int *ghatmap,
128 offloadStream_t stream) {
129 const int threadsPerBlock = 32;
130 const int numBlocks = (ngpts + threadsPerBlock - 1) / threadsPerBlock;
131 pw_scatter<<<numBlocks, threadsPerBlock, 0, stream>>>(
c, pwcc, scale, ngpts,