#include "../../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)

#include <assert.h>
#include <omp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "../../offload/offload_fft.h"
#include "../../offload/offload_library.h"
#include "pw_gpu_kernels.h"
27 offload_fftHandle *plan;
30#define PW_GPU_CACHE_SIZE 32
31static cache_entry cache[PW_GPU_CACHE_SIZE];
32static int cache_oldest_entry = 0;
34static double *buffer_dev_1, *buffer_dev_2;
35static int *ghatmap_dev;
36static size_t allocated_buffer_size, allocated_map_size;
38static offloadStream_t stream;
39static bool is_initialized =
false;
46 assert(omp_get_num_threads() == 1);
52 memset(cache, 0,
sizeof(cache_entry) * PW_GPU_CACHE_SIZE);
53 cache_oldest_entry = 0;
55 allocated_buffer_size = 1;
56 allocated_map_size = 1;
58 offloadMalloc((
void **)&buffer_dev_1, allocated_buffer_size);
59 offloadMalloc((
void **)&buffer_dev_2, allocated_buffer_size);
60 offloadMalloc((
void **)&ghatmap_dev, allocated_map_size);
62 offloadStreamCreate(&stream);
63 is_initialized =
true;
71 assert(omp_get_num_threads() == 1);
72 if (!is_initialized) {
77 for (
int i = 0;
i < PW_GPU_CACHE_SIZE;
i++) {
78 if (cache[
i].plan != NULL) {
79 offload_fftDestroy(*cache[
i].plan);
83 offloadFree(buffer_dev_1);
84 offloadFree(buffer_dev_2);
85 offloadFree(ghatmap_dev);
86 offloadStreamDestroy(stream);
87 is_initialized =
false;
94static void ensure_memory_sizes(
const size_t requested_buffer_size,
95 const size_t requested_map_size) {
96 assert(is_initialized);
97 if (requested_buffer_size > allocated_buffer_size) {
98 offloadFree(buffer_dev_1);
99 offloadFree(buffer_dev_2);
100 offloadMalloc((
void **)&buffer_dev_1, requested_buffer_size);
101 offloadMalloc((
void **)&buffer_dev_2, requested_buffer_size);
102 allocated_buffer_size = requested_buffer_size;
104 if (requested_map_size > allocated_map_size) {
105 offloadFree(ghatmap_dev);
106 offloadMalloc((
void **)&ghatmap_dev, requested_map_size);
107 allocated_map_size = requested_map_size;
115static offload_fftHandle *lookup_plan_from_cache(
const int key[4]) {
116 assert(is_initialized);
117 for (
int i = 0;
i < PW_GPU_CACHE_SIZE;
i++) {
118 const int *x = cache[
i].key;
119 if (x[0] == key[0] && x[1] == key[1] && x[2] == key[2] && x[3] == key[3]) {
120 return cache[
i].plan;
130static void add_plan_to_cache(
const int key[4], offload_fftHandle *plan) {
131 const int i = cache_oldest_entry;
132 cache_oldest_entry = (cache_oldest_entry + 1) % PW_GPU_CACHE_SIZE;
133 if (cache[
i].plan != NULL) {
134 offload_fftDestroy(*cache[
i].plan);
137 cache[
i].key[0] = key[0];
138 cache[
i].key[1] = key[1];
139 cache[
i].key[2] = key[2];
140 cache[
i].key[3] = key[3];
141 cache[
i].plan = plan;
150static void fft_1d(
const int direction,
const int n,
const int m,
151 const double *data_in,
double *data_out) {
152 const int key[4] = {1, direction, n, m};
153 offload_fftHandle *plan = lookup_plan_from_cache(key);
157 int inembed[1] = {0};
158 int onembed[1] = {0};
160 int istride, idist, ostride, odist;
161 if (direction == OFFLOAD_FFT_FORWARD) {
172 plan = malloc(
sizeof(cache_entry));
173 offload_fftPlanMany(plan, 1, nsize, inembed, istride, idist, onembed,
174 ostride, odist, OFFLOAD_FFT_Z2Z, batch);
175 offload_fftSetStream(*plan, stream);
176 add_plan_to_cache(key, plan);
179 offload_fftExecZ2Z(*plan, data_in, data_out, direction);
187static void fft_3d(
const int direction,
const int nx,
const int ny,
188 const int nz,
double *data) {
189 const int key[4] = {3, nx, ny, nz};
190 offload_fftHandle *plan = lookup_plan_from_cache(key);
193 plan = malloc(
sizeof(cache_entry));
194 offload_fftPlan3d(plan, nx, ny, nz, OFFLOAD_FFT_Z2Z);
195 offload_fftSetStream(*plan, stream);
196 add_plan_to_cache(key, plan);
199 offload_fftExecZ2Z(*plan, data, data, direction);
207void pw_gpu_cfffg(
const double *din,
double *zout,
const int *ghatmap,
208 const int *npts,
const int ngpts,
const double scale) {
210 assert(omp_get_num_threads() == 1);
211 const int nrpts = npts[0] * npts[1] * npts[2];
212 assert(ngpts <= nrpts);
213 if (nrpts == 0 || ngpts == 0) {
219 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
220 const size_t map_size =
sizeof(int) * ngpts;
221 ensure_memory_sizes(buffer_size, map_size);
224 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
225 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
228 fft_3d(OFFLOAD_FFT_FORWARD, npts[2], npts[1], npts[0], buffer_dev_2);
231 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
232 pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
236 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 *
sizeof(
double) * ngpts,
238 offloadStreamSynchronize(stream);
246void pw_gpu_sfffc(
const double *zin,
double *dout,
const int *ghatmap,
247 const int *npts,
const int ngpts,
const int nmaps,
248 const double scale) {
250 assert(omp_get_num_threads() == 1);
251 const int nrpts = npts[0] * npts[1] * npts[2];
252 assert(ngpts <= nrpts);
253 if (nrpts == 0 || ngpts == 0) {
259 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
260 const size_t map_size =
sizeof(int) * nmaps * ngpts;
261 ensure_memory_sizes(buffer_size, map_size);
264 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 *
sizeof(
double) * ngpts, stream);
267 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
268 offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
269 pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
270 ghatmap_dev, stream);
273 fft_3d(OFFLOAD_FFT_INVERSE, npts[2], npts[1], npts[0], buffer_dev_2);
276 pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
277 offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
278 offloadStreamSynchronize(stream);
286void pw_gpu_cff(
const double *din,
double *zout,
const int *npts) {
288 assert(omp_get_num_threads() == 1);
289 const int nrpts = npts[0] * npts[1] * npts[2];
296 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
297 ensure_memory_sizes(buffer_size, 0);
300 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
301 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
305 fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
307 fft_1d(OFFLOAD_FFT_FORWARD, npts[1], npts[0] * npts[2], buffer_dev_1,
311 offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
312 offloadStreamSynchronize(stream);
320void pw_gpu_ffc(
const double *zin,
double *dout,
const int *npts) {
322 assert(omp_get_num_threads() == 1);
323 const int nrpts = npts[0] * npts[1] * npts[2];
330 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
331 ensure_memory_sizes(buffer_size, 0);
334 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
338 fft_1d(OFFLOAD_FFT_INVERSE, npts[1], npts[0] * npts[2], buffer_dev_1,
340 fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_2,
342 pw_gpu_launch_complex_to_real(buffer_dev_1, buffer_dev_2, nrpts, stream);
345 offloadMemcpyAsyncDtoH(dout, buffer_dev_2, buffer_size / 2, stream);
346 offloadStreamSynchronize(stream);
354void pw_gpu_cf(
const double *din,
double *zout,
const int *npts) {
356 assert(omp_get_num_threads() == 1);
357 const int nrpts = npts[0] * npts[1] * npts[2];
364 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
365 ensure_memory_sizes(buffer_size, 0);
368 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
369 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
372 fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
376 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
377 offloadStreamSynchronize(stream);
385void pw_gpu_fc(
const double *zin,
double *dout,
const int *npts) {
387 assert(omp_get_num_threads() == 1);
388 const int nrpts = npts[0] * npts[1] * npts[2];
395 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
396 ensure_memory_sizes(buffer_size, 0);
399 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
402 fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_1,
406 pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
407 offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
408 offloadStreamSynchronize(stream);
415void pw_gpu_f(
const double *zin,
double *zout,
const int dir,
const int n,
418 assert(omp_get_num_threads() == 1);
419 const int nrpts = n * m;
426 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
427 ensure_memory_sizes(buffer_size, 0);
430 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
434 fft_1d(OFFLOAD_FFT_FORWARD, n, m, buffer_dev_1, buffer_dev_2);
436 fft_1d(OFFLOAD_FFT_INVERSE, n, m, buffer_dev_1, buffer_dev_2);
440 offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
441 offloadStreamSynchronize(stream);
449void pw_gpu_fg(
const double *zin,
double *zout,
const int *ghatmap,
450 const int *npts,
const int mmax,
const int ngpts,
451 const double scale) {
453 assert(omp_get_num_threads() == 1);
454 const int nrpts = npts[0] * mmax;
455 assert(ngpts <= nrpts);
456 if (nrpts == 0 || ngpts == 0) {
462 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
463 const size_t map_size =
sizeof(int) * ngpts;
464 ensure_memory_sizes(buffer_size, map_size);
467 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
470 fft_1d(OFFLOAD_FFT_FORWARD, npts[0], mmax, buffer_dev_1, buffer_dev_2);
473 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
474 pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
478 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 *
sizeof(
double) * ngpts,
480 offloadStreamSynchronize(stream);
488void pw_gpu_sf(
const double *zin,
double *zout,
const int *ghatmap,
489 const int *npts,
const int mmax,
const int ngpts,
490 const int nmaps,
const double scale) {
492 assert(omp_get_num_threads() == 1);
493 const int nrpts = npts[0] * mmax;
494 assert(ngpts <= nrpts);
495 if (nrpts == 0 || ngpts == 0) {
501 const size_t buffer_size = 2 *
sizeof(double) * nrpts;
502 const size_t map_size =
sizeof(int) * nmaps * ngpts;
503 ensure_memory_sizes(buffer_size, map_size);
506 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 *
sizeof(
double) * ngpts, stream);
509 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
510 offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
511 pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
512 ghatmap_dev, stream);
515 fft_1d(OFFLOAD_FFT_INVERSE, npts[0], mmax, buffer_dev_2, buffer_dev_1);
518 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
519 offloadStreamSynchronize(stream);
// NOTE(review): the following lines are documentation residue (apparently an
// index of the related Fortran API from pw_gpu.F) that was appended to this
// file during extraction; preserved here as a comment:
//
//   subroutine, public fft_3d(plan, scale, zin, zout, stat)
//   subroutine, public offload_activate_chosen_device()
//     Activates the device selected via offload_set_chosen_device().
//   subroutine, public pw_gpu_init()
//     Allocates resources on the GPU device for GPU FFT acceleration.
//   subroutine, public pw_gpu_finalize()
//     Releases resources on the GPU device for GPU FFT acceleration.