(git:ed6f26b)
Loading...
Searching...
No Matches
pw_gpu_internal.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: GPL-2.0-or-later */
6/*----------------------------------------------------------------------------*/
7
8#include "../../offload/offload_runtime.h"
9#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
10
11#include <assert.h>
12#include <omp.h>
13#include <stdbool.h>
14#include <stddef.h>
15#include <string.h>
16
17#include "../../offload/offload_fft.h"
18#include "../../offload/offload_library.h"
19#include "pw_gpu_kernels.h"
20
21/*******************************************************************************
22 * \brief Static variables for retaining objects that are expensive to create.
23 * \author Ole Schuett
24 ******************************************************************************/
25typedef struct {
26 int key[4];
27 offload_fftHandle *plan;
28} cache_entry;
29
30#define PW_GPU_CACHE_SIZE 32
31static cache_entry cache[PW_GPU_CACHE_SIZE];
32static int cache_oldest_entry = 0; // used for LRU eviction
33
34static double *buffer_dev_1, *buffer_dev_2;
35static int *ghatmap_dev;
36static size_t allocated_buffer_size, allocated_map_size;
37
38static offloadStream_t stream;
39static bool is_initialized = false;
40
41/*******************************************************************************
42 * \brief Initializes the pw_gpu library.
43 * \author Ole Schuett
44 ******************************************************************************/
45void pw_gpu_init(void) {
46 assert(omp_get_num_threads() == 1);
47 if (is_initialized) {
48 // fprintf(stderr, "Error: pw_gpu was already initialized.\n");
49 // TODO abort();
50 return;
51 }
52 memset(cache, 0, sizeof(cache_entry) * PW_GPU_CACHE_SIZE);
53 cache_oldest_entry = 0;
54
55 allocated_buffer_size = 1; // start small
56 allocated_map_size = 1;
58 offloadMalloc((void **)&buffer_dev_1, allocated_buffer_size);
59 offloadMalloc((void **)&buffer_dev_2, allocated_buffer_size);
60 offloadMalloc((void **)&ghatmap_dev, allocated_map_size);
61
62 offloadStreamCreate(&stream);
63 is_initialized = true;
64}
65
66/*******************************************************************************
67 * \brief Releases resources held by the pw_gpu library.
68 * \author Ole Schuett
69 ******************************************************************************/
70void pw_gpu_finalize(void) {
71 assert(omp_get_num_threads() == 1);
72 if (!is_initialized) {
73 // fprintf(stderr, "Error: pw_gpu is not initialized.\n");
74 // TODO abort();
75 return;
76 }
77 for (int i = 0; i < PW_GPU_CACHE_SIZE; i++) {
78 if (cache[i].plan != NULL) {
79 offload_fftDestroy(*cache[i].plan);
80 free(cache[i].plan);
81 }
82 }
83 offloadFree(buffer_dev_1);
84 offloadFree(buffer_dev_2);
85 offloadFree(ghatmap_dev);
86 offloadStreamDestroy(stream);
87 is_initialized = false;
88}
89
90/*******************************************************************************
91 * \brief Checks size of device buffers and re-allocates them if necessary.
92 * \author Ole Schuett
93 ******************************************************************************/
94static void ensure_memory_sizes(const size_t requested_buffer_size,
95 const size_t requested_map_size) {
96 assert(is_initialized);
97 if (requested_buffer_size > allocated_buffer_size) {
98 offloadFree(buffer_dev_1);
99 offloadFree(buffer_dev_2);
100 offloadMalloc((void **)&buffer_dev_1, requested_buffer_size);
101 offloadMalloc((void **)&buffer_dev_2, requested_buffer_size);
102 allocated_buffer_size = requested_buffer_size;
103 }
104 if (requested_map_size > allocated_map_size) {
105 offloadFree(ghatmap_dev);
106 offloadMalloc((void **)&ghatmap_dev, requested_map_size);
107 allocated_map_size = requested_map_size;
108 }
109}
110
111/*******************************************************************************
112 * \brief Fetches an fft plan from the cache. Returns NULL if not found.
113 * \author Ole Schuett
114 ******************************************************************************/
115static offload_fftHandle *lookup_plan_from_cache(const int key[4]) {
116 assert(is_initialized);
117 for (int i = 0; i < PW_GPU_CACHE_SIZE; i++) {
118 const int *x = cache[i].key;
119 if (x[0] == key[0] && x[1] == key[1] && x[2] == key[2] && x[3] == key[3]) {
120 return cache[i].plan;
121 }
122 }
123 return NULL;
124}
125
126/*******************************************************************************
127 * \brief Adds an fft plan to the cache. Assumes ownership of plan's memory.
128 * \author Ole Schuett
129 ******************************************************************************/
130static void add_plan_to_cache(const int key[4], offload_fftHandle *plan) {
131 const int i = cache_oldest_entry;
132 cache_oldest_entry = (cache_oldest_entry + 1) % PW_GPU_CACHE_SIZE;
133 if (cache[i].plan != NULL) {
134 offload_fftDestroy(*cache[i].plan);
135 free(cache[i].plan);
136 }
137 cache[i].key[0] = key[0];
138 cache[i].key[1] = key[1];
139 cache[i].key[2] = key[2];
140 cache[i].key[3] = key[3];
141 cache[i].plan = plan;
142}
143
144/*******************************************************************************
145 * \brief Performs a scaled double precision complex 1D-FFT many times on
146 * the GPU.
147 * Input/output are DEVICE pointers (data_in, date_out).
148 * \author Andreas Gloess, Ole Schuett
149 ******************************************************************************/
150static void fft_1d(const int direction, const int n, const int m,
151 const double *data_in, double *data_out) {
152 const int key[4] = {1, direction, n, m}; // first key entry is dimensions
153 offload_fftHandle *plan = lookup_plan_from_cache(key);
154
155 if (plan == NULL) {
156 int nsize[1] = {n};
157 int inembed[1] = {0}; // Is ignored, but is not allowed to be NULL.
158 int onembed[1] = {0}; // Is ignored, but is not allowed to be NULL.
159 int batch = m;
160 int istride, idist, ostride, odist;
161 if (direction == OFFLOAD_FFT_FORWARD) {
162 istride = m;
163 idist = 1;
164 ostride = 1;
165 odist = n;
166 } else {
167 istride = 1;
168 idist = n;
169 ostride = m;
170 odist = 1;
171 }
172 plan = malloc(sizeof(cache_entry));
173 offload_fftPlanMany(plan, 1, nsize, inembed, istride, idist, onembed,
174 ostride, odist, OFFLOAD_FFT_Z2Z, batch);
175 offload_fftSetStream(*plan, stream);
176 add_plan_to_cache(key, plan);
177 }
178
179 offload_fftExecZ2Z(*plan, data_in, data_out, direction);
180}
181
182/*******************************************************************************
183 * \brief Performs a scaled double precision complex 3D-FFT on the GPU.
184 * Input/output is a DEVICE pointer (data).
185 * \author Andreas Gloess, Ole Schuett
186 ******************************************************************************/
187static void fft_3d(const int direction, const int nx, const int ny,
188 const int nz, double *data) {
189 const int key[4] = {3, nx, ny, nz}; // first key entry is dimensions
190 offload_fftHandle *plan = lookup_plan_from_cache(key);
191
192 if (plan == NULL) {
193 plan = malloc(sizeof(cache_entry));
194 offload_fftPlan3d(plan, nx, ny, nz, OFFLOAD_FFT_Z2Z);
195 offload_fftSetStream(*plan, stream);
196 add_plan_to_cache(key, plan);
197 }
198
199 offload_fftExecZ2Z(*plan, data, data, direction);
200}
201
202/*******************************************************************************
203 * \brief Performs a (double precision complex) FFT, followed by a (double
204 * precision complex) gather, on the GPU.
205 * \author Andreas Gloess, Ole Schuett
206 ******************************************************************************/
207void pw_gpu_cfffg(const double *din, double *zout, const int *ghatmap,
208 const int *npts, const int ngpts, const double scale) {
209 // Check inputs.
210 assert(omp_get_num_threads() == 1);
211 const int nrpts = npts[0] * npts[1] * npts[2];
212 assert(ngpts <= nrpts);
213 if (nrpts == 0 || ngpts == 0) {
214 return; // Nothing to do.
215 }
216
217 // Allocate device memory.
219 const size_t buffer_size = 2 * sizeof(double) * nrpts;
220 const size_t map_size = sizeof(int) * ngpts;
221 ensure_memory_sizes(buffer_size, map_size);
222
223 // Upload REAL input and convert to COMPLEX on device.
224 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
225 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
226
227 // Run FFT on the device.
228 fft_3d(OFFLOAD_FFT_FORWARD, npts[2], npts[1], npts[0], buffer_dev_2);
229
230 // Upload map and run gather on the device.
231 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
232 pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
233 stream);
234
235 // Download COMPLEX results to host.
236 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 * sizeof(double) * ngpts,
237 stream);
238 offloadStreamSynchronize(stream);
239}
240
241/*******************************************************************************
242 * \brief Performs a (double precision complex) scatter, followed by a
243 * (double precision complex) FFT, on the GPU.
244 * \author Andreas Gloess, Ole Schuett
245 ******************************************************************************/
246void pw_gpu_sfffc(const double *zin, double *dout, const int *ghatmap,
247 const int *npts, const int ngpts, const int nmaps,
248 const double scale) {
249 // Check inputs.
250 assert(omp_get_num_threads() == 1);
251 const int nrpts = npts[0] * npts[1] * npts[2];
252 assert(ngpts <= nrpts);
253 if (nrpts == 0 || ngpts == 0) {
254 return; // Nothing to do.
255 }
256
257 // Allocate device memory.
259 const size_t buffer_size = 2 * sizeof(double) * nrpts;
260 const size_t map_size = sizeof(int) * nmaps * ngpts;
261 ensure_memory_sizes(buffer_size, map_size);
262
263 // Upload COMPLEX inputs to device.
264 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 * sizeof(double) * ngpts, stream);
265
266 // Upload map and run scatter on the device.
267 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
268 offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
269 pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
270 ghatmap_dev, stream);
271
272 // Run FFT on the device.
273 fft_3d(OFFLOAD_FFT_INVERSE, npts[2], npts[1], npts[0], buffer_dev_2);
274
275 // Convert COMPLEX results to REAL and download to host.
276 pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
277 offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
278 offloadStreamSynchronize(stream);
279}
280
281/*******************************************************************************
282 * \brief Performs a (double to complex double) blow-up and a (double
283 * precision complex) 2D-FFT on the GPU.
284 * \author Andreas Gloess, Ole Schuett
285 ******************************************************************************/
286void pw_gpu_cff(const double *din, double *zout, const int *npts) {
287 // Check inputs.
288 assert(omp_get_num_threads() == 1);
289 const int nrpts = npts[0] * npts[1] * npts[2];
290 if (nrpts == 0) {
291 return; // Nothing to do.
292 }
293
294 // Allocate device memory.
296 const size_t buffer_size = 2 * sizeof(double) * nrpts;
297 ensure_memory_sizes(buffer_size, 0);
298
299 // Upload REAL input and convert to COMPLEX on device.
300 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
301 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
302
303 // Run FFT on the device.
304 // NOTE: Could use 2D-FFT, but CUDA does them C-shaped which is not optimal.
305 fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
306 buffer_dev_1);
307 fft_1d(OFFLOAD_FFT_FORWARD, npts[1], npts[0] * npts[2], buffer_dev_1,
308 buffer_dev_2);
309
310 // Download COMPLEX results to host.
311 offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
312 offloadStreamSynchronize(stream);
313}
314
315/*******************************************************************************
316 * \brief Performs a (double precision complex) 2D-FFT and a (double complex
317 * to double) shrink-down on the GPU.
318 * \author Andreas Gloess, Ole Schuett
319 ******************************************************************************/
320void pw_gpu_ffc(const double *zin, double *dout, const int *npts) {
321 // Check inputs.
322 assert(omp_get_num_threads() == 1);
323 const int nrpts = npts[0] * npts[1] * npts[2];
324 if (nrpts == 0) {
325 return; // Nothing to do.
326 }
327
328 // Allocate device memory.
330 const size_t buffer_size = 2 * sizeof(double) * nrpts;
331 ensure_memory_sizes(buffer_size, 0);
332
333 // Upload COMPLEX input to device.
334 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
335
336 // Run FFT on the device.
337 // NOTE: Could use 2D-FFT, but CUDA does them C-shaped which is not optimal.
338 fft_1d(OFFLOAD_FFT_INVERSE, npts[1], npts[0] * npts[2], buffer_dev_1,
339 buffer_dev_2);
340 fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_2,
341 buffer_dev_1);
342 pw_gpu_launch_complex_to_real(buffer_dev_1, buffer_dev_2, nrpts, stream);
343
344 // Download REAL results to host.
345 offloadMemcpyAsyncDtoH(dout, buffer_dev_2, buffer_size / 2, stream);
346 offloadStreamSynchronize(stream);
347}
348
349/*******************************************************************************
350 * \brief Performs a (double to complex double) blow-up and a (double
351 * precision complex) 1D-FFT on the GPU.
352 * \author Andreas Gloess, Ole Schuett
353 ******************************************************************************/
354void pw_gpu_cf(const double *din, double *zout, const int *npts) {
355 // Check inputs.
356 assert(omp_get_num_threads() == 1);
357 const int nrpts = npts[0] * npts[1] * npts[2];
358 if (nrpts == 0) {
359 return; // Nothing to do.
360 }
361
362 // Allocate device memory.
364 const size_t buffer_size = 2 * sizeof(double) * nrpts;
365 ensure_memory_sizes(buffer_size, 0);
366
367 // Upload REAL input and convert to COMPLEX on device.
368 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
369 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
370
371 // Run FFT on the device.
372 fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
373 buffer_dev_1);
374
375 // Download COMPLEX results from device.
376 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
377 offloadStreamSynchronize(stream);
378}
379
380/*******************************************************************************
381 * \brief Performs a (double precision complex) 1D-FFT and a (double complex
382 * to double) shrink-down on the GPU.
383 * \author Andreas Gloess, Ole Schuett
384 ******************************************************************************/
385void pw_gpu_fc(const double *zin, double *dout, const int *npts) {
386 // Check inputs.
387 assert(omp_get_num_threads() == 1);
388 const int nrpts = npts[0] * npts[1] * npts[2];
389 if (nrpts == 0) {
390 return; // Nothing to do.
391 }
392
393 // Allocate device memory.
395 const size_t buffer_size = 2 * sizeof(double) * nrpts;
396 ensure_memory_sizes(buffer_size, 0);
397
398 // Upload COMPLEX input to device.
399 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
400
401 // Run FFT on the device.
402 fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_1,
403 buffer_dev_2);
404
405 // Convert COMPLEX results to REAL and download to host.
406 pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
407 offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
408 offloadStreamSynchronize(stream);
409}
410
411/*******************************************************************************
412 * \brief Performs a (double precision complex) 1D-FFT on the GPU.
413 * \author Andreas Gloess, Ole Schuett
414 ******************************************************************************/
415void pw_gpu_f(const double *zin, double *zout, const int dir, const int n,
416 const int m) {
417 // Check inputs.
418 assert(omp_get_num_threads() == 1);
419 const int nrpts = n * m;
420 if (nrpts == 0) {
421 return; // Nothing to do.
422 }
423
424 // Allocate device memory.
426 const size_t buffer_size = 2 * sizeof(double) * nrpts;
427 ensure_memory_sizes(buffer_size, 0);
428
429 // Upload COMPLEX input to device.
430 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
431
432 // Run FFT on the device.
433 if (dir > 0) {
434 fft_1d(OFFLOAD_FFT_FORWARD, n, m, buffer_dev_1, buffer_dev_2);
435 } else {
436 fft_1d(OFFLOAD_FFT_INVERSE, n, m, buffer_dev_1, buffer_dev_2);
437 }
438
439 // Download COMPLEX results from device.
440 offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
441 offloadStreamSynchronize(stream);
442}
443
444/*******************************************************************************
445 * \brief Performs a (double precision complex) 1D-FFT, followed by a (double
446 * precision complex) gather, on the GPU.
447 * \author Andreas Gloess, Ole Schuett
448 ******************************************************************************/
449void pw_gpu_fg(const double *zin, double *zout, const int *ghatmap,
450 const int *npts, const int mmax, const int ngpts,
451 const double scale) {
452 // Check inputs.
453 assert(omp_get_num_threads() == 1);
454 const int nrpts = npts[0] * mmax;
455 assert(ngpts <= nrpts);
456 if (nrpts == 0 || ngpts == 0) {
457 return; // Nothing to do.
458 }
459
460 // Allocate device memory.
462 const size_t buffer_size = 2 * sizeof(double) * nrpts;
463 const size_t map_size = sizeof(int) * ngpts;
464 ensure_memory_sizes(buffer_size, map_size);
465
466 // Upload COMPLEX inputs to device.
467 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
468
469 // Run FFT on the device.
470 fft_1d(OFFLOAD_FFT_FORWARD, npts[0], mmax, buffer_dev_1, buffer_dev_2);
471
472 // Upload map and run gather on the device.
473 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
474 pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
475 stream);
476
477 // Download COMPLEX results from device.
478 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 * sizeof(double) * ngpts,
479 stream);
480 offloadStreamSynchronize(stream);
481}
482
483/*******************************************************************************
484 * \brief Performs a (double precision complex) scatter, followed by a
485 * (double precision complex) 1D-FFT, on the GPU.
486 * \author Andreas Gloess, Ole Schuett
487 ******************************************************************************/
488void pw_gpu_sf(const double *zin, double *zout, const int *ghatmap,
489 const int *npts, const int mmax, const int ngpts,
490 const int nmaps, const double scale) {
491 // Check inputs.
492 assert(omp_get_num_threads() == 1);
493 const int nrpts = npts[0] * mmax;
494 assert(ngpts <= nrpts);
495 if (nrpts == 0 || ngpts == 0) {
496 return; // Nothing to do.
497 }
498
499 // Allocate device memory.
501 const size_t buffer_size = 2 * sizeof(double) * nrpts;
502 const size_t map_size = sizeof(int) * nmaps * ngpts;
503 ensure_memory_sizes(buffer_size, map_size);
504
505 // Upload COMPLEX inputs to device.
506 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 * sizeof(double) * ngpts, stream);
507
508 // Upload map and run scatter on the device.
509 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
510 offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
511 pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
512 ghatmap_dev, stream);
513
514 // Run FFT on the device.
515 fft_1d(OFFLOAD_FFT_INVERSE, npts[0], mmax, buffer_dev_2, buffer_dev_1);
516
517 // Download COMPLEX results from device.
518 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
519 offloadStreamSynchronize(stream);
520}
521
522#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
523
524// EOF
static void const int const int i
subroutine, public fft_3d(plan, scale, zin, zout, stat)
...
Definition fft_lib.F:172
subroutine, public offload_activate_chosen_device()
Activates the device selected via offload_set_chosen_device()
subroutine, public pw_gpu_init()
Allocates resources on the gpu device for gpu fft acceleration.
Definition pw_gpu.F:62
subroutine, public pw_gpu_finalize()
Releases resources on the gpu device for gpu fft acceleration.
Definition pw_gpu.F:81