(git:0de0cc2)
pw_gpu_internal.c
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: GPL-2.0-or-later */
6 /*----------------------------------------------------------------------------*/
7 
8 #include "../../offload/offload_runtime.h"
9 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
10 
11 #include <assert.h>
12 #include <omp.h>
13 #include <stdbool.h>
14 #include <stddef.h>
15 #include <string.h>
16 
17 #include "../../offload/offload_fft.h"
18 #include "../../offload/offload_library.h"
19 #include "pw_gpu_kernels.h"
20 
21 /*******************************************************************************
22  * \brief Static variables for retaining objects that are expensive to create.
23  * \author Ole Schuett
24  ******************************************************************************/
25 typedef struct {
26  int key[4];
27  offload_fftHandle *plan;
28 } cache_entry;
29 
30 #define PW_GPU_CACHE_SIZE 32
31 static cache_entry cache[PW_GPU_CACHE_SIZE];
32 static int cache_oldest_entry = 0; // used for LRU eviction
33 
34 static double *buffer_dev_1, *buffer_dev_2;
35 static int *ghatmap_dev;
36 static size_t allocated_buffer_size, allocated_map_size;
37 
38 static offloadStream_t stream;
39 static bool is_initialized = false;
40 
41 /*******************************************************************************
42  * \brief Initializes the pw_gpu library.
43  * \author Ole Schuett
44  ******************************************************************************/
45 void pw_gpu_init(void) {
46  assert(omp_get_num_threads() == 1);
47  if (is_initialized) {
48  // fprintf(stderr, "Error: pw_gpu was already initialized.\n");
49  // TODO abort();
50  return;
51  }
52  memset(cache, 0, sizeof(cache_entry) * PW_GPU_CACHE_SIZE);
53  cache_oldest_entry = 0;
54 
55  allocated_buffer_size = 1; // start small
56  allocated_map_size = 1;
58  offloadMalloc((void **)&buffer_dev_1, allocated_buffer_size);
59  offloadMalloc((void **)&buffer_dev_2, allocated_buffer_size);
60  offloadMalloc((void **)&ghatmap_dev, allocated_map_size);
61 
62  offloadStreamCreate(&stream);
63  is_initialized = true;
64 }
65 
66 /*******************************************************************************
67  * \brief Releases resources held by the pw_gpu library.
68  * \author Ole Schuett
69  ******************************************************************************/
70 void pw_gpu_finalize(void) {
71  assert(omp_get_num_threads() == 1);
72  if (!is_initialized) {
73  // fprintf(stderr, "Error: pw_gpu is not initialized.\n");
74  // TODO abort();
75  return;
76  }
77  for (int i = 0; i < PW_GPU_CACHE_SIZE; i++) {
78  if (cache[i].plan != NULL) {
79  offload_fftDestroy(*cache[i].plan);
80  free(cache[i].plan);
81  }
82  }
83  offloadFree(buffer_dev_1);
84  offloadFree(buffer_dev_2);
85  offloadFree(ghatmap_dev);
86  offloadStreamDestroy(stream);
87  is_initialized = false;
88 }
89 
90 /*******************************************************************************
91  * \brief Checks size of device buffers and re-allocates them if necessary.
92  * \author Ole Schuett
93  ******************************************************************************/
94 static void ensure_memory_sizes(const size_t requested_buffer_size,
95  const size_t requested_map_size) {
96  assert(is_initialized);
97  if (requested_buffer_size > allocated_buffer_size) {
98  offloadFree(buffer_dev_1);
99  offloadFree(buffer_dev_2);
100  offloadMalloc((void **)&buffer_dev_1, requested_buffer_size);
101  offloadMalloc((void **)&buffer_dev_2, requested_buffer_size);
102  allocated_buffer_size = requested_buffer_size;
103  }
104  if (requested_map_size > allocated_map_size) {
105  offloadFree(ghatmap_dev);
106  offloadMalloc((void **)&ghatmap_dev, requested_map_size);
107  allocated_map_size = requested_map_size;
108  }
109 }
110 
111 /*******************************************************************************
112  * \brief Fetches an fft plan from the cache. Returns NULL if not found.
113  * \author Ole Schuett
114  ******************************************************************************/
115 static offload_fftHandle *lookup_plan_from_cache(const int key[4]) {
116  assert(is_initialized);
117  for (int i = 0; i < PW_GPU_CACHE_SIZE; i++) {
118  const int *x = cache[i].key;
119  if (x[0] == key[0] && x[1] == key[1] && x[2] == key[2] && x[3] == key[3]) {
120  return cache[i].plan;
121  }
122  }
123  return NULL;
124 }
125 
126 /*******************************************************************************
127  * \brief Adds an fft plan to the cache. Assumes ownership of plan's memory.
128  * \author Ole Schuett
129  ******************************************************************************/
130 static void add_plan_to_cache(const int key[4], offload_fftHandle *plan) {
131  const int i = cache_oldest_entry;
132  cache_oldest_entry = (cache_oldest_entry + 1) % PW_GPU_CACHE_SIZE;
133  if (cache[i].plan != NULL) {
134  offload_fftDestroy(*cache[i].plan);
135  free(cache[i].plan);
136  }
137  cache[i].key[0] = key[0];
138  cache[i].key[1] = key[1];
139  cache[i].key[2] = key[2];
140  cache[i].key[3] = key[3];
141  cache[i].plan = plan;
142 }
143 
144 /*******************************************************************************
145  * \brief Performs a scaled double precision complex 1D-FFT many times on
146  * the GPU.
147  * Input/output are DEVICE pointers (data_in, date_out).
148  * \author Andreas Gloess, Ole Schuett
149  ******************************************************************************/
150 static void fft_1d(const int direction, const int n, const int m,
151  const double *data_in, double *data_out) {
152  const int key[4] = {1, direction, n, m}; // first key entry is dimensions
153  offload_fftHandle *plan = lookup_plan_from_cache(key);
154 
155  if (plan == NULL) {
156  int nsize[1] = {n};
157  int inembed[1] = {0}; // Is ignored, but is not allowed to be NULL.
158  int onembed[1] = {0}; // Is ignored, but is not allowed to be NULL.
159  int batch = m;
160  int istride, idist, ostride, odist;
161  if (direction == OFFLOAD_FFT_FORWARD) {
162  istride = m;
163  idist = 1;
164  ostride = 1;
165  odist = n;
166  } else {
167  istride = 1;
168  idist = n;
169  ostride = m;
170  odist = 1;
171  }
172  plan = malloc(sizeof(cache_entry));
173  offload_fftPlanMany(plan, 1, nsize, inembed, istride, idist, onembed,
174  ostride, odist, OFFLOAD_FFT_Z2Z, batch);
175  offload_fftSetStream(*plan, stream);
176  add_plan_to_cache(key, plan);
177  }
178 
179  offload_fftExecZ2Z(*plan, data_in, data_out, direction);
180 }
181 
182 /*******************************************************************************
183  * \brief Performs a scaled double precision complex 3D-FFT on the GPU.
184  * Input/output is a DEVICE pointer (data).
185  * \author Andreas Gloess, Ole Schuett
186  ******************************************************************************/
187 static void fft_3d(const int direction, const int nx, const int ny,
188  const int nz, double *data) {
189  const int key[4] = {3, nx, ny, nz}; // first key entry is dimensions
190  offload_fftHandle *plan = lookup_plan_from_cache(key);
191 
192  if (plan == NULL) {
193  plan = malloc(sizeof(cache_entry));
194  offload_fftPlan3d(plan, nx, ny, nz, OFFLOAD_FFT_Z2Z);
195  offload_fftSetStream(*plan, stream);
196  add_plan_to_cache(key, plan);
197  }
198 
199  offload_fftExecZ2Z(*plan, data, data, direction);
200 }
201 
202 /*******************************************************************************
203  * \brief Performs a (double precision complex) FFT, followed by a (double
204  * precision complex) gather, on the GPU.
205  * \author Andreas Gloess, Ole Schuett
206  ******************************************************************************/
207 void pw_gpu_cfffg(const double *din, double *zout, const int *ghatmap,
208  const int *npts, const int ngpts, const double scale) {
209  // Check inputs.
210  assert(omp_get_num_threads() == 1);
211  const int nrpts = npts[0] * npts[1] * npts[2];
212  assert(ngpts <= nrpts);
213  if (nrpts == 0 || ngpts == 0) {
214  return; // Nothing to do.
215  }
216 
217  // Allocate device memory.
219  const size_t buffer_size = 2 * sizeof(double) * nrpts;
220  const size_t map_size = sizeof(int) * ngpts;
221  ensure_memory_sizes(buffer_size, map_size);
222 
223  // Upload REAL input and convert to COMPLEX on device.
224  offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
225  pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
226 
227  // Run FFT on the device.
228  fft_3d(OFFLOAD_FFT_FORWARD, npts[2], npts[1], npts[0], buffer_dev_2);
229 
230  // Upload map and run gather on the device.
231  offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
232  pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
233  stream);
234 
235  // Download COMPLEX results to host.
236  offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 * sizeof(double) * ngpts,
237  stream);
238  offloadStreamSynchronize(stream);
239 }
240 
241 /*******************************************************************************
242  * \brief Performs a (double precision complex) scatter, followed by a
243  * (double precision complex) FFT, on the GPU.
244  * \author Andreas Gloess, Ole Schuett
245  ******************************************************************************/
246 void pw_gpu_sfffc(const double *zin, double *dout, const int *ghatmap,
247  const int *npts, const int ngpts, const int nmaps,
248  const double scale) {
249  // Check inputs.
250  assert(omp_get_num_threads() == 1);
251  const int nrpts = npts[0] * npts[1] * npts[2];
252  assert(ngpts <= nrpts);
253  if (nrpts == 0 || ngpts == 0) {
254  return; // Nothing to do.
255  }
256 
257  // Allocate device memory.
259  const size_t buffer_size = 2 * sizeof(double) * nrpts;
260  const size_t map_size = sizeof(int) * nmaps * ngpts;
261  ensure_memory_sizes(buffer_size, map_size);
262 
263  // Upload COMPLEX inputs to device.
264  offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 * sizeof(double) * ngpts, stream);
265 
266  // Upload map and run scatter on the device.
267  offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
268  offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
269  pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
270  ghatmap_dev, stream);
271 
272  // Run FFT on the device.
273  fft_3d(OFFLOAD_FFT_INVERSE, npts[2], npts[1], npts[0], buffer_dev_2);
274 
275  // Convert COMPLEX results to REAL and download to host.
276  pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
277  offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
278  offloadStreamSynchronize(stream);
279 }
280 
281 /*******************************************************************************
282  * \brief Performs a (double to complex double) blow-up and a (double
283  * precision complex) 2D-FFT on the GPU.
284  * \author Andreas Gloess, Ole Schuett
285  ******************************************************************************/
286 void pw_gpu_cff(const double *din, double *zout, const int *npts) {
287  // Check inputs.
288  assert(omp_get_num_threads() == 1);
289  const int nrpts = npts[0] * npts[1] * npts[2];
290  if (nrpts == 0) {
291  return; // Nothing to do.
292  }
293 
294  // Allocate device memory.
296  const size_t buffer_size = 2 * sizeof(double) * nrpts;
297  ensure_memory_sizes(buffer_size, 0);
298 
299  // Upload REAL input and convert to COMPLEX on device.
300  offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
301  pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
302 
303  // Run FFT on the device.
304  // NOTE: Could use 2D-FFT, but CUDA does them C-shaped which is not optimal.
305  fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
306  buffer_dev_1);
307  fft_1d(OFFLOAD_FFT_FORWARD, npts[1], npts[0] * npts[2], buffer_dev_1,
308  buffer_dev_2);
309 
310  // Download COMPLEX results to host.
311  offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
312  offloadStreamSynchronize(stream);
313 }
314 
315 /*******************************************************************************
316  * \brief Performs a (double precision complex) 2D-FFT and a (double complex
317  * to double) shrink-down on the GPU.
318  * \author Andreas Gloess, Ole Schuett
319  ******************************************************************************/
320 void pw_gpu_ffc(const double *zin, double *dout, const int *npts) {
321  // Check inputs.
322  assert(omp_get_num_threads() == 1);
323  const int nrpts = npts[0] * npts[1] * npts[2];
324  if (nrpts == 0) {
325  return; // Nothing to do.
326  }
327 
328  // Allocate device memory.
330  const size_t buffer_size = 2 * sizeof(double) * nrpts;
331  ensure_memory_sizes(buffer_size, 0);
332 
333  // Upload COMPLEX input to device.
334  offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
335 
336  // Run FFT on the device.
337  // NOTE: Could use 2D-FFT, but CUDA does them C-shaped which is not optimal.
338  fft_1d(OFFLOAD_FFT_INVERSE, npts[1], npts[0] * npts[2], buffer_dev_1,
339  buffer_dev_2);
340  fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_2,
341  buffer_dev_1);
342  pw_gpu_launch_complex_to_real(buffer_dev_1, buffer_dev_2, nrpts, stream);
343 
344  // Download REAL results to host.
345  offloadMemcpyAsyncDtoH(dout, buffer_dev_2, buffer_size / 2, stream);
346  offloadStreamSynchronize(stream);
347 }
348 
349 /*******************************************************************************
350  * \brief Performs a (double to complex double) blow-up and a (double
351  * precision complex) 1D-FFT on the GPU.
352  * \author Andreas Gloess, Ole Schuett
353  ******************************************************************************/
354 void pw_gpu_cf(const double *din, double *zout, const int *npts) {
355  // Check inputs.
356  assert(omp_get_num_threads() == 1);
357  const int nrpts = npts[0] * npts[1] * npts[2];
358  if (nrpts == 0) {
359  return; // Nothing to do.
360  }
361 
362  // Allocate device memory.
364  const size_t buffer_size = 2 * sizeof(double) * nrpts;
365  ensure_memory_sizes(buffer_size, 0);
366 
367  // Upload REAL input and convert to COMPLEX on device.
368  offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
369  pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
370 
371  // Run FFT on the device.
372  fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
373  buffer_dev_1);
374 
375  // Download COMPLEX results from device.
376  offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
377  offloadStreamSynchronize(stream);
378 }
379 
380 /*******************************************************************************
381  * \brief Performs a (double precision complex) 1D-FFT and a (double complex
382  * to double) shrink-down on the GPU.
383  * \author Andreas Gloess, Ole Schuett
384  ******************************************************************************/
385 void pw_gpu_fc(const double *zin, double *dout, const int *npts) {
386  // Check inputs.
387  assert(omp_get_num_threads() == 1);
388  const int nrpts = npts[0] * npts[1] * npts[2];
389  if (nrpts == 0) {
390  return; // Nothing to do.
391  }
392 
393  // Allocate device memory.
395  const size_t buffer_size = 2 * sizeof(double) * nrpts;
396  ensure_memory_sizes(buffer_size, 0);
397 
398  // Upload COMPLEX input to device.
399  offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
400 
401  // Run FFT on the device.
402  fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_1,
403  buffer_dev_2);
404 
405  // Convert COMPLEX results to REAL and download to host.
406  pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
407  offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
408  offloadStreamSynchronize(stream);
409 }
410 
411 /*******************************************************************************
412  * \brief Performs a (double precision complex) 1D-FFT on the GPU.
413  * \author Andreas Gloess, Ole Schuett
414  ******************************************************************************/
415 void pw_gpu_f(const double *zin, double *zout, const int dir, const int n,
416  const int m) {
417  // Check inputs.
418  assert(omp_get_num_threads() == 1);
419  const int nrpts = n * m;
420  if (nrpts == 0) {
421  return; // Nothing to do.
422  }
423 
424  // Allocate device memory.
426  const size_t buffer_size = 2 * sizeof(double) * nrpts;
427  ensure_memory_sizes(buffer_size, 0);
428 
429  // Upload COMPLEX input to device.
430  offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
431 
432  // Run FFT on the device.
433  if (dir > 0) {
434  fft_1d(OFFLOAD_FFT_FORWARD, n, m, buffer_dev_1, buffer_dev_2);
435  } else {
436  fft_1d(OFFLOAD_FFT_INVERSE, n, m, buffer_dev_1, buffer_dev_2);
437  }
438 
439  // Download COMPLEX results from device.
440  offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
441  offloadStreamSynchronize(stream);
442 }
443 
444 /*******************************************************************************
445  * \brief Performs a (double precision complex) 1D-FFT, followed by a (double
446  * precision complex) gather, on the GPU.
447  * \author Andreas Gloess, Ole Schuett
448  ******************************************************************************/
449 void pw_gpu_fg(const double *zin, double *zout, const int *ghatmap,
450  const int *npts, const int mmax, const int ngpts,
451  const double scale) {
452  // Check inputs.
453  assert(omp_get_num_threads() == 1);
454  const int nrpts = npts[0] * mmax;
455  assert(ngpts <= nrpts);
456  if (nrpts == 0 || ngpts == 0) {
457  return; // Nothing to do.
458  }
459 
460  // Allocate device memory.
462  const size_t buffer_size = 2 * sizeof(double) * nrpts;
463  const size_t map_size = sizeof(int) * ngpts;
464  ensure_memory_sizes(buffer_size, map_size);
465 
466  // Upload COMPLEX inputs to device.
467  offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
468 
469  // Run FFT on the device.
470  fft_1d(OFFLOAD_FFT_FORWARD, npts[0], mmax, buffer_dev_1, buffer_dev_2);
471 
472  // Upload map and run gather on the device.
473  offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
474  pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
475  stream);
476 
477  // Download COMPLEX results from device.
478  offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 * sizeof(double) * ngpts,
479  stream);
480  offloadStreamSynchronize(stream);
481 }
482 
483 /*******************************************************************************
484  * \brief Performs a (double precision complex) scatter, followed by a
485  * (double precision complex) 1D-FFT, on the GPU.
486  * \author Andreas Gloess, Ole Schuett
487  ******************************************************************************/
488 void pw_gpu_sf(const double *zin, double *zout, const int *ghatmap,
489  const int *npts, const int mmax, const int ngpts,
490  const int nmaps, const double scale) {
491  // Check inputs.
492  assert(omp_get_num_threads() == 1);
493  const int nrpts = npts[0] * mmax;
494  assert(ngpts <= nrpts);
495  if (nrpts == 0 || ngpts == 0) {
496  return; // Nothing to do.
497  }
498 
499  // Allocate device memory.
501  const size_t buffer_size = 2 * sizeof(double) * nrpts;
502  const size_t map_size = sizeof(int) * nmaps * ngpts;
503  ensure_memory_sizes(buffer_size, map_size);
504 
505  // Upload COMPLEX inputs to device.
506  offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 * sizeof(double) * ngpts, stream);
507 
508  // Upload map and run scatter on the device.
509  offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
510  offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
511  pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
512  ghatmap_dev, stream);
513 
514  // Run FFT on the device.
515  fft_1d(OFFLOAD_FFT_INVERSE, npts[0], mmax, buffer_dev_2, buffer_dev_1);
516 
517  // Download COMPLEX results from device.
518  offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
519  offloadStreamSynchronize(stream);
520 }
521 
522 #endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
523 
524 // EOF
static void const int const int i
subroutine, public fft_3d(plan, scale, zin, zout, stat)
...
Definition: fft_lib.F:172
subroutine, public offload_activate_chosen_device()
Activates the device selected via offload_set_chosen_device()
Definition: offload_api.F:174
subroutine, public pw_gpu_init()
Allocates resources on the gpu device for gpu fft acceleration.
Definition: pw_gpu.F:63
subroutine, public pw_gpu_finalize()
Releases resources on the gpu device for gpu fft acceleration.
Definition: pw_gpu.F:82