7#include "../../offload/offload_runtime.h"
8#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
10#include "../../offload/offload_fft.h"
11#include "../../offload/offload_library.h"
26 offload_fftHandle *plan;
29#define PW_GPU_CACHE_SIZE 32
30static cache_entry cache[PW_GPU_CACHE_SIZE];
31static int cache_oldest_entry = 0;
33static double *buffer_dev_1, *buffer_dev_2;
34static int *ghatmap_dev;
35static size_t allocated_buffer_size, allocated_map_size;
37static offloadStream_t stream;
38static bool is_initialized =
false;
45 assert(omp_get_num_threads() == 1);
51 memset(cache, 0,
sizeof(cache_entry) * PW_GPU_CACHE_SIZE);
52 cache_oldest_entry = 0;
54 allocated_buffer_size = 1;
55 allocated_map_size = 1;
57 offloadMalloc((
void **)&buffer_dev_1, allocated_buffer_size);
58 offloadMalloc((
void **)&buffer_dev_2, allocated_buffer_size);
59 offloadMalloc((
void **)&ghatmap_dev, allocated_map_size);
61 offloadStreamCreate(&stream);
62 is_initialized =
true;
70 assert(omp_get_num_threads() == 1);
71 if (!is_initialized) {
76 for (
int i = 0;
i < PW_GPU_CACHE_SIZE;
i++) {
77 if (cache[
i].plan != NULL) {
78 offload_fftDestroy(*cache[
i].plan);
82 offloadFree(buffer_dev_1);
83 offloadFree(buffer_dev_2);
84 offloadFree(ghatmap_dev);
85 offloadStreamDestroy(stream);
86 is_initialized =
false;
93static void ensure_memory_sizes(
const size_t requested_buffer_size,
94 const size_t requested_map_size) {
95 assert(is_initialized);
96 if (requested_buffer_size > allocated_buffer_size) {
97 offloadFree(buffer_dev_1);
98 offloadFree(buffer_dev_2);
99 offloadMalloc((
void **)&buffer_dev_1, requested_buffer_size);
100 offloadMalloc((
void **)&buffer_dev_2, requested_buffer_size);
101 allocated_buffer_size = requested_buffer_size;
103 if (requested_map_size > allocated_map_size) {
104 offloadFree(ghatmap_dev);
105 offloadMalloc((
void **)&ghatmap_dev, requested_map_size);
106 allocated_map_size = requested_map_size;
114static offload_fftHandle *lookup_plan_from_cache(
const int key[4]) {
115 assert(is_initialized);
116 for (
int i = 0;
i < PW_GPU_CACHE_SIZE;
i++) {
117 const int *x = cache[
i].key;
118 if (x[0] == key[0] && x[1] == key[1] && x[2] == key[2] && x[3] == key[3]) {
119 return cache[
i].plan;
129static void add_plan_to_cache(
const int key[4], offload_fftHandle *plan) {
130 const int i = cache_oldest_entry;
131 cache_oldest_entry = (cache_oldest_entry + 1) % PW_GPU_CACHE_SIZE;
132 if (cache[
i].plan != NULL) {
133 offload_fftDestroy(*cache[
i].plan);
136 cache[
i].key[0] = key[0];
137 cache[
i].key[1] = key[1];
138 cache[
i].key[2] = key[2];
139 cache[
i].key[3] = key[3];
140 cache[
i].plan = plan;
149static void fft_1d(
const int direction,
const int n,
const int m,
150 const double *data_in,
double *data_out) {
151 const int key[4] = {1, direction, n, m};
152 offload_fftHandle *plan = lookup_plan_from_cache(key);
156 int inembed[1] = {0};
157 int onembed[1] = {0};
159 int istride, idist, ostride, odist;
160 if (direction == OFFLOAD_FFT_FORWARD) {
171 plan = malloc(
sizeof(cache_entry));
172 offload_fftPlanMany(plan, 1, nsize, inembed, istride, idist, onembed,
173 ostride, odist, OFFLOAD_FFT_Z2Z, batch);
174 offload_fftSetStream(*plan, stream);
175 add_plan_to_cache(key, plan);
178 offload_fftExecZ2Z(*plan, data_in, data_out, direction);
186static void fft_3d(
const int direction,
const int nx,
const int ny,
187 const int nz,
double *data) {
188 const int key[4] = {3, nx, ny, nz};
189 offload_fftHandle *plan = lookup_plan_from_cache(key);
192 plan = malloc(
sizeof(cache_entry));
193 offload_fftPlan3d(plan, nx, ny, nz, OFFLOAD_FFT_Z2Z);
194 offload_fftSetStream(*plan, stream);
195 add_plan_to_cache(key, plan);
198 offload_fftExecZ2Z(*plan, data, data, direction);
206void pw_gpu_cfffg(
const double *din,
double *zout,
const int *ghatmap,
207 const int *npts,
const int ngpts,
const double scale) {
209 assert(omp_get_num_threads() == 1);
210 const int nrpts = npts[0] * npts[1] * npts[2];
211 assert(ngpts <= nrpts);
212 if (nrpts == 0 || ngpts == 0) {
218 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
219 const size_t map_size =
sizeof(int) * ngpts;
220 ensure_memory_sizes(buffer_size, map_size);
223 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
224 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
227 fft_3d(OFFLOAD_FFT_FORWARD, npts[2], npts[1], npts[0], buffer_dev_2);
230 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
231 pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
235 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 *
sizeof(
double) * ngpts,
237 offloadStreamSynchronize(stream);
245void pw_gpu_sfffc(
const double *zin,
double *dout,
const int *ghatmap,
246 const int *npts,
const int ngpts,
const int nmaps,
247 const double scale) {
249 assert(omp_get_num_threads() == 1);
250 const int nrpts = npts[0] * npts[1] * npts[2];
251 assert(ngpts <= nrpts);
252 if (nrpts == 0 || ngpts == 0) {
258 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
259 const size_t map_size =
sizeof(int) * nmaps * ngpts;
260 ensure_memory_sizes(buffer_size, map_size);
263 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 *
sizeof(
double) * ngpts, stream);
266 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
267 offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
268 pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
269 ghatmap_dev, stream);
272 fft_3d(OFFLOAD_FFT_INVERSE, npts[2], npts[1], npts[0], buffer_dev_2);
275 pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
276 offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
277 offloadStreamSynchronize(stream);
285void pw_gpu_cff(
const double *din,
double *zout,
const int *npts) {
287 assert(omp_get_num_threads() == 1);
288 const int nrpts = npts[0] * npts[1] * npts[2];
295 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
296 ensure_memory_sizes(buffer_size, 0);
299 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
300 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
304 fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
306 fft_1d(OFFLOAD_FFT_FORWARD, npts[1], npts[0] * npts[2], buffer_dev_1,
310 offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
311 offloadStreamSynchronize(stream);
319void pw_gpu_ffc(
const double *zin,
double *dout,
const int *npts) {
321 assert(omp_get_num_threads() == 1);
322 const int nrpts = npts[0] * npts[1] * npts[2];
329 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
330 ensure_memory_sizes(buffer_size, 0);
333 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
337 fft_1d(OFFLOAD_FFT_INVERSE, npts[1], npts[0] * npts[2], buffer_dev_1,
339 fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_2,
341 pw_gpu_launch_complex_to_real(buffer_dev_1, buffer_dev_2, nrpts, stream);
344 offloadMemcpyAsyncDtoH(dout, buffer_dev_2, buffer_size / 2, stream);
345 offloadStreamSynchronize(stream);
353void pw_gpu_cf(
const double *din,
double *zout,
const int *npts) {
355 assert(omp_get_num_threads() == 1);
356 const int nrpts = npts[0] * npts[1] * npts[2];
363 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
364 ensure_memory_sizes(buffer_size, 0);
367 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
368 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
371 fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
375 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
376 offloadStreamSynchronize(stream);
384void pw_gpu_fc(
const double *zin,
double *dout,
const int *npts) {
386 assert(omp_get_num_threads() == 1);
387 const int nrpts = npts[0] * npts[1] * npts[2];
394 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
395 ensure_memory_sizes(buffer_size, 0);
398 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
401 fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_1,
405 pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
406 offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
407 offloadStreamSynchronize(stream);
414void pw_gpu_f(
const double *zin,
double *zout,
const int dir,
const int n,
417 assert(omp_get_num_threads() == 1);
418 const int nrpts = n * m;
425 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
426 ensure_memory_sizes(buffer_size, 0);
429 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
433 fft_1d(OFFLOAD_FFT_FORWARD, n, m, buffer_dev_1, buffer_dev_2);
435 fft_1d(OFFLOAD_FFT_INVERSE, n, m, buffer_dev_1, buffer_dev_2);
439 offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
440 offloadStreamSynchronize(stream);
448void pw_gpu_fg(
const double *zin,
double *zout,
const int *ghatmap,
449 const int *npts,
const int mmax,
const int ngpts,
450 const double scale) {
452 assert(omp_get_num_threads() == 1);
453 const int nrpts = npts[0] * mmax;
454 assert(ngpts <= nrpts);
455 if (nrpts == 0 || ngpts == 0) {
461 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
462 const size_t map_size =
sizeof(int) * ngpts;
463 ensure_memory_sizes(buffer_size, map_size);
466 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
469 fft_1d(OFFLOAD_FFT_FORWARD, npts[0], mmax, buffer_dev_1, buffer_dev_2);
472 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
473 pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
477 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 *
sizeof(
double) * ngpts,
479 offloadStreamSynchronize(stream);
487void pw_gpu_sf(
const double *zin,
double *zout,
const int *ghatmap,
488 const int *npts,
const int mmax,
const int ngpts,
489 const int nmaps,
const double scale) {
491 assert(omp_get_num_threads() == 1);
492 const int nrpts = npts[0] * mmax;
493 assert(ngpts <= nrpts);
494 if (nrpts == 0 || ngpts == 0) {
500 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
501 const size_t map_size =
sizeof(int) * nmaps * ngpts;
502 ensure_memory_sizes(buffer_size, map_size);
505 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 *
sizeof(
double) * ngpts, stream);
508 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
509 offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
510 pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
511 ghatmap_dev, stream);
514 fft_1d(OFFLOAD_FFT_INVERSE, npts[0], mmax, buffer_dev_2, buffer_dev_1);
517 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
518 offloadStreamSynchronize(stream);
// NOTE(review): the following trailing text is documentation-site residue
// (Fortran interface summaries) that was fused onto this C file during
// extraction. It is preserved here as a comment so it cannot break the build:
//   subroutine, public fft_3d(plan, scale, zin, zout, stat)
//   subroutine, public offload_activate_chosen_device()
//     Activates the device selected via offload_set_chosen_device().
//   subroutine, public pw_gpu_init()
//     Allocates resources on the GPU device for GPU FFT acceleration.
//   subroutine, public pw_gpu_finalize()
//     Releases resources on the GPU device for GPU FFT acceleration.