(git:5fbb678)
Loading...
Searching...
No Matches
pw_gpu_internal.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: GPL-2.0-or-later */
6/*----------------------------------------------------------------------------*/
7#include "../../offload/offload_runtime.h"
8#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
9
10#include "../../offload/offload_fft.h"
11#include "../../offload/offload_library.h"
12#include "pw_gpu_kernels.h"
13
14#include <assert.h>
15#include <omp.h>
16#include <stdbool.h>
17#include <stddef.h>
18#include <string.h>
19
20/*******************************************************************************
21 * \brief Static variables for retaining objects that are expensive to create.
22 * \author Ole Schuett
23 ******************************************************************************/
24typedef struct {
25 int key[4];
26 offload_fftHandle *plan;
27} cache_entry;
28
29#define PW_GPU_CACHE_SIZE 32
30static cache_entry cache[PW_GPU_CACHE_SIZE];
31static int cache_oldest_entry = 0; // used for LRU eviction
32
33static double *buffer_dev_1, *buffer_dev_2;
34static int *ghatmap_dev;
35static size_t allocated_buffer_size, allocated_map_size;
36
37static offloadStream_t stream;
38static bool is_initialized = false;
39
40/*******************************************************************************
41 * \brief Initializes the pw_gpu library.
42 * \author Ole Schuett
43 ******************************************************************************/
44void pw_gpu_init(void) {
45 assert(omp_get_num_threads() == 1);
46 if (is_initialized) {
47 // fprintf(stderr, "Error: pw_gpu was already initialized.\n");
48 // TODO abort();
49 return;
50 }
51 memset(cache, 0, sizeof(cache_entry) * PW_GPU_CACHE_SIZE);
52 cache_oldest_entry = 0;
53
54 allocated_buffer_size = 1; // start small
55 allocated_map_size = 1;
57 offloadMalloc((void **)&buffer_dev_1, allocated_buffer_size);
58 offloadMalloc((void **)&buffer_dev_2, allocated_buffer_size);
59 offloadMalloc((void **)&ghatmap_dev, allocated_map_size);
60
61 offloadStreamCreate(&stream);
62 is_initialized = true;
63}
64
65/*******************************************************************************
66 * \brief Releases resources held by the pw_gpu library.
67 * \author Ole Schuett
68 ******************************************************************************/
69void pw_gpu_finalize(void) {
70 assert(omp_get_num_threads() == 1);
71 if (!is_initialized) {
72 // fprintf(stderr, "Error: pw_gpu is not initialized.\n");
73 // TODO abort();
74 return;
75 }
76 for (int i = 0; i < PW_GPU_CACHE_SIZE; i++) {
77 if (cache[i].plan != NULL) {
78 offload_fftDestroy(*cache[i].plan);
79 free(cache[i].plan);
80 }
81 }
82 offloadFree(buffer_dev_1);
83 offloadFree(buffer_dev_2);
84 offloadFree(ghatmap_dev);
85 offloadStreamDestroy(stream);
86 is_initialized = false;
87}
88
89/*******************************************************************************
90 * \brief Checks size of device buffers and re-allocates them if necessary.
91 * \author Ole Schuett
92 ******************************************************************************/
93static void ensure_memory_sizes(const size_t requested_buffer_size,
94 const size_t requested_map_size) {
95 assert(is_initialized);
96 if (requested_buffer_size > allocated_buffer_size) {
97 offloadFree(buffer_dev_1);
98 offloadFree(buffer_dev_2);
99 offloadMalloc((void **)&buffer_dev_1, requested_buffer_size);
100 offloadMalloc((void **)&buffer_dev_2, requested_buffer_size);
101 allocated_buffer_size = requested_buffer_size;
102 }
103 if (requested_map_size > allocated_map_size) {
104 offloadFree(ghatmap_dev);
105 offloadMalloc((void **)&ghatmap_dev, requested_map_size);
106 allocated_map_size = requested_map_size;
107 }
108}
109
110/*******************************************************************************
111 * \brief Fetches an fft plan from the cache. Returns NULL if not found.
112 * \author Ole Schuett
113 ******************************************************************************/
114static offload_fftHandle *lookup_plan_from_cache(const int key[4]) {
115 assert(is_initialized);
116 for (int i = 0; i < PW_GPU_CACHE_SIZE; i++) {
117 const int *x = cache[i].key;
118 if (x[0] == key[0] && x[1] == key[1] && x[2] == key[2] && x[3] == key[3]) {
119 return cache[i].plan;
120 }
121 }
122 return NULL;
123}
124
125/*******************************************************************************
126 * \brief Adds an fft plan to the cache. Assumes ownership of plan's memory.
127 * \author Ole Schuett
128 ******************************************************************************/
129static void add_plan_to_cache(const int key[4], offload_fftHandle *plan) {
130 const int i = cache_oldest_entry;
131 cache_oldest_entry = (cache_oldest_entry + 1) % PW_GPU_CACHE_SIZE;
132 if (cache[i].plan != NULL) {
133 offload_fftDestroy(*cache[i].plan);
134 free(cache[i].plan);
135 }
136 cache[i].key[0] = key[0];
137 cache[i].key[1] = key[1];
138 cache[i].key[2] = key[2];
139 cache[i].key[3] = key[3];
140 cache[i].plan = plan;
141}
142
143/*******************************************************************************
144 * \brief Performs a scaled double precision complex 1D-FFT many times on
145 * the GPU.
146 * Input/output are DEVICE pointers (data_in, date_out).
147 * \author Andreas Gloess, Ole Schuett
148 ******************************************************************************/
149static void fft_1d(const int direction, const int n, const int m,
150 const double *data_in, double *data_out) {
151 const int key[4] = {1, direction, n, m}; // first key entry is dimensions
152 offload_fftHandle *plan = lookup_plan_from_cache(key);
153
154 if (plan == NULL) {
155 int nsize[1] = {n};
156 int inembed[1] = {0}; // Is ignored, but is not allowed to be NULL.
157 int onembed[1] = {0}; // Is ignored, but is not allowed to be NULL.
158 int batch = m;
159 int istride, idist, ostride, odist;
160 if (direction == OFFLOAD_FFT_FORWARD) {
161 istride = m;
162 idist = 1;
163 ostride = 1;
164 odist = n;
165 } else {
166 istride = 1;
167 idist = n;
168 ostride = m;
169 odist = 1;
170 }
171 plan = malloc(sizeof(cache_entry));
172 offload_fftPlanMany(plan, 1, nsize, inembed, istride, idist, onembed,
173 ostride, odist, OFFLOAD_FFT_Z2Z, batch);
174 offload_fftSetStream(*plan, stream);
175 add_plan_to_cache(key, plan);
176 }
177
178 offload_fftExecZ2Z(*plan, data_in, data_out, direction);
179}
180
181/*******************************************************************************
182 * \brief Performs a scaled double precision complex 3D-FFT on the GPU.
183 * Input/output is a DEVICE pointer (data).
184 * \author Andreas Gloess, Ole Schuett
185 ******************************************************************************/
186static void fft_3d(const int direction, const int nx, const int ny,
187 const int nz, double *data) {
188 const int key[4] = {3, nx, ny, nz}; // first key entry is dimensions
189 offload_fftHandle *plan = lookup_plan_from_cache(key);
190
191 if (plan == NULL) {
192 plan = malloc(sizeof(cache_entry));
193 offload_fftPlan3d(plan, nx, ny, nz, OFFLOAD_FFT_Z2Z);
194 offload_fftSetStream(*plan, stream);
195 add_plan_to_cache(key, plan);
196 }
197
198 offload_fftExecZ2Z(*plan, data, data, direction);
199}
200
201/*******************************************************************************
202 * \brief Performs a (double precision complex) FFT, followed by a (double
203 * precision complex) gather, on the GPU.
204 * \author Andreas Gloess, Ole Schuett
205 ******************************************************************************/
206void pw_gpu_cfffg(const double *din, double *zout, const int *ghatmap,
207 const int *npts, const int ngpts, const double scale) {
208 // Check inputs.
209 assert(omp_get_num_threads() == 1);
210 const int nrpts = npts[0] * npts[1] * npts[2];
211 assert(ngpts <= nrpts);
212 if (nrpts == 0 || ngpts == 0) {
213 return; // Nothing to do.
214 }
215
216 // Allocate device memory.
218 const size_t buffer_size = 2 * sizeof(double) * nrpts;
219 const size_t map_size = sizeof(int) * ngpts;
220 ensure_memory_sizes(buffer_size, map_size);
221
222 // Upload REAL input and convert to COMPLEX on device.
223 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
224 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
225
226 // Run FFT on the device.
227 fft_3d(OFFLOAD_FFT_FORWARD, npts[2], npts[1], npts[0], buffer_dev_2);
228
229 // Upload map and run gather on the device.
230 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
231 pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
232 stream);
233
234 // Download COMPLEX results to host.
235 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 * sizeof(double) * ngpts,
236 stream);
237 offloadStreamSynchronize(stream);
238}
239
240/*******************************************************************************
241 * \brief Performs a (double precision complex) scatter, followed by a
242 * (double precision complex) FFT, on the GPU.
243 * \author Andreas Gloess, Ole Schuett
244 ******************************************************************************/
245void pw_gpu_sfffc(const double *zin, double *dout, const int *ghatmap,
246 const int *npts, const int ngpts, const int nmaps,
247 const double scale) {
248 // Check inputs.
249 assert(omp_get_num_threads() == 1);
250 const int nrpts = npts[0] * npts[1] * npts[2];
251 assert(ngpts <= nrpts);
252 if (nrpts == 0 || ngpts == 0) {
253 return; // Nothing to do.
254 }
255
256 // Allocate device memory.
258 const size_t buffer_size = 2 * sizeof(double) * nrpts;
259 const size_t map_size = sizeof(int) * nmaps * ngpts;
260 ensure_memory_sizes(buffer_size, map_size);
261
262 // Upload COMPLEX inputs to device.
263 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 * sizeof(double) * ngpts, stream);
264
265 // Upload map and run scatter on the device.
266 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
267 offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
268 pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
269 ghatmap_dev, stream);
270
271 // Run FFT on the device.
272 fft_3d(OFFLOAD_FFT_INVERSE, npts[2], npts[1], npts[0], buffer_dev_2);
273
274 // Convert COMPLEX results to REAL and download to host.
275 pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
276 offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
277 offloadStreamSynchronize(stream);
278}
279
280/*******************************************************************************
281 * \brief Performs a (double to complex double) blow-up and a (double
282 * precision complex) 2D-FFT on the GPU.
283 * \author Andreas Gloess, Ole Schuett
284 ******************************************************************************/
285void pw_gpu_cff(const double *din, double *zout, const int *npts) {
286 // Check inputs.
287 assert(omp_get_num_threads() == 1);
288 const int nrpts = npts[0] * npts[1] * npts[2];
289 if (nrpts == 0) {
290 return; // Nothing to do.
291 }
292
293 // Allocate device memory.
295 const size_t buffer_size = 2 * sizeof(double) * nrpts;
296 ensure_memory_sizes(buffer_size, 0);
297
298 // Upload REAL input and convert to COMPLEX on device.
299 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
300 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
301
302 // Run FFT on the device.
303 // NOTE: Could use 2D-FFT, but CUDA does them C-shaped which is not optimal.
304 fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
305 buffer_dev_1);
306 fft_1d(OFFLOAD_FFT_FORWARD, npts[1], npts[0] * npts[2], buffer_dev_1,
307 buffer_dev_2);
308
309 // Download COMPLEX results to host.
310 offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
311 offloadStreamSynchronize(stream);
312}
313
314/*******************************************************************************
315 * \brief Performs a (double precision complex) 2D-FFT and a (double complex
316 * to double) shrink-down on the GPU.
317 * \author Andreas Gloess, Ole Schuett
318 ******************************************************************************/
319void pw_gpu_ffc(const double *zin, double *dout, const int *npts) {
320 // Check inputs.
321 assert(omp_get_num_threads() == 1);
322 const int nrpts = npts[0] * npts[1] * npts[2];
323 if (nrpts == 0) {
324 return; // Nothing to do.
325 }
326
327 // Allocate device memory.
329 const size_t buffer_size = 2 * sizeof(double) * nrpts;
330 ensure_memory_sizes(buffer_size, 0);
331
332 // Upload COMPLEX input to device.
333 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
334
335 // Run FFT on the device.
336 // NOTE: Could use 2D-FFT, but CUDA does them C-shaped which is not optimal.
337 fft_1d(OFFLOAD_FFT_INVERSE, npts[1], npts[0] * npts[2], buffer_dev_1,
338 buffer_dev_2);
339 fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_2,
340 buffer_dev_1);
341 pw_gpu_launch_complex_to_real(buffer_dev_1, buffer_dev_2, nrpts, stream);
342
343 // Download REAL results to host.
344 offloadMemcpyAsyncDtoH(dout, buffer_dev_2, buffer_size / 2, stream);
345 offloadStreamSynchronize(stream);
346}
347
348/*******************************************************************************
349 * \brief Performs a (double to complex double) blow-up and a (double
350 * precision complex) 1D-FFT on the GPU.
351 * \author Andreas Gloess, Ole Schuett
352 ******************************************************************************/
353void pw_gpu_cf(const double *din, double *zout, const int *npts) {
354 // Check inputs.
355 assert(omp_get_num_threads() == 1);
356 const int nrpts = npts[0] * npts[1] * npts[2];
357 if (nrpts == 0) {
358 return; // Nothing to do.
359 }
360
361 // Allocate device memory.
363 const size_t buffer_size = 2 * sizeof(double) * nrpts;
364 ensure_memory_sizes(buffer_size, 0);
365
366 // Upload REAL input and convert to COMPLEX on device.
367 offloadMemcpyAsyncHtoD(buffer_dev_1, din, buffer_size / 2, stream);
368 pw_gpu_launch_real_to_complex(buffer_dev_1, buffer_dev_2, nrpts, stream);
369
370 // Run FFT on the device.
371 fft_1d(OFFLOAD_FFT_FORWARD, npts[2], npts[0] * npts[1], buffer_dev_2,
372 buffer_dev_1);
373
374 // Download COMPLEX results from device.
375 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
376 offloadStreamSynchronize(stream);
377}
378
379/*******************************************************************************
380 * \brief Performs a (double precision complex) 1D-FFT and a (double complex
381 * to double) shrink-down on the GPU.
382 * \author Andreas Gloess, Ole Schuett
383 ******************************************************************************/
384void pw_gpu_fc(const double *zin, double *dout, const int *npts) {
385 // Check inputs.
386 assert(omp_get_num_threads() == 1);
387 const int nrpts = npts[0] * npts[1] * npts[2];
388 if (nrpts == 0) {
389 return; // Nothing to do.
390 }
391
392 // Allocate device memory.
394 const size_t buffer_size = 2 * sizeof(double) * nrpts;
395 ensure_memory_sizes(buffer_size, 0);
396
397 // Upload COMPLEX input to device.
398 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
399
400 // Run FFT on the device.
401 fft_1d(OFFLOAD_FFT_INVERSE, npts[2], npts[0] * npts[1], buffer_dev_1,
402 buffer_dev_2);
403
404 // Convert COMPLEX results to REAL and download to host.
405 pw_gpu_launch_complex_to_real(buffer_dev_2, buffer_dev_1, nrpts, stream);
406 offloadMemcpyAsyncDtoH(dout, buffer_dev_1, buffer_size / 2, stream);
407 offloadStreamSynchronize(stream);
408}
409
410/*******************************************************************************
411 * \brief Performs a (double precision complex) 1D-FFT on the GPU.
412 * \author Andreas Gloess, Ole Schuett
413 ******************************************************************************/
414void pw_gpu_f(const double *zin, double *zout, const int dir, const int n,
415 const int m) {
416 // Check inputs.
417 assert(omp_get_num_threads() == 1);
418 const int nrpts = n * m;
419 if (nrpts == 0) {
420 return; // Nothing to do.
421 }
422
423 // Allocate device memory.
425 const size_t buffer_size = 2 * sizeof(double) * nrpts;
426 ensure_memory_sizes(buffer_size, 0);
427
428 // Upload COMPLEX input to device.
429 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
430
431 // Run FFT on the device.
432 if (dir > 0) {
433 fft_1d(OFFLOAD_FFT_FORWARD, n, m, buffer_dev_1, buffer_dev_2);
434 } else {
435 fft_1d(OFFLOAD_FFT_INVERSE, n, m, buffer_dev_1, buffer_dev_2);
436 }
437
438 // Download COMPLEX results from device.
439 offloadMemcpyAsyncDtoH(zout, buffer_dev_2, buffer_size, stream);
440 offloadStreamSynchronize(stream);
441}
442
443/*******************************************************************************
444 * \brief Performs a (double precision complex) 1D-FFT, followed by a (double
445 * precision complex) gather, on the GPU.
446 * \author Andreas Gloess, Ole Schuett
447 ******************************************************************************/
448void pw_gpu_fg(const double *zin, double *zout, const int *ghatmap,
449 const int *npts, const int mmax, const int ngpts,
450 const double scale) {
451 // Check inputs.
452 assert(omp_get_num_threads() == 1);
453 const int nrpts = npts[0] * mmax;
454 assert(ngpts <= nrpts);
455 if (nrpts == 0 || ngpts == 0) {
456 return; // Nothing to do.
457 }
458
459 // Allocate device memory.
461 const size_t buffer_size = 2 * sizeof(double) * nrpts;
462 const size_t map_size = sizeof(int) * ngpts;
463 ensure_memory_sizes(buffer_size, map_size);
464
465 // Upload COMPLEX inputs to device.
466 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, buffer_size, stream);
467
468 // Run FFT on the device.
469 fft_1d(OFFLOAD_FFT_FORWARD, npts[0], mmax, buffer_dev_1, buffer_dev_2);
470
471 // Upload map and run gather on the device.
472 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
473 pw_gpu_launch_gather(buffer_dev_1, buffer_dev_2, scale, ngpts, ghatmap_dev,
474 stream);
475
476 // Download COMPLEX results from device.
477 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, 2 * sizeof(double) * ngpts,
478 stream);
479 offloadStreamSynchronize(stream);
480}
481
482/*******************************************************************************
483 * \brief Performs a (double precision complex) scatter, followed by a
484 * (double precision complex) 1D-FFT, on the GPU.
485 * \author Andreas Gloess, Ole Schuett
486 ******************************************************************************/
487void pw_gpu_sf(const double *zin, double *zout, const int *ghatmap,
488 const int *npts, const int mmax, const int ngpts,
489 const int nmaps, const double scale) {
490 // Check inputs.
491 assert(omp_get_num_threads() == 1);
492 const int nrpts = npts[0] * mmax;
493 assert(ngpts <= nrpts);
494 if (nrpts == 0 || ngpts == 0) {
495 return; // Nothing to do.
496 }
497
498 // Allocate device memory.
500 const size_t buffer_size = 2 * sizeof(double) * nrpts;
501 const size_t map_size = sizeof(int) * nmaps * ngpts;
502 ensure_memory_sizes(buffer_size, map_size);
503
504 // Upload COMPLEX inputs to device.
505 offloadMemcpyAsyncHtoD(buffer_dev_1, zin, 2 * sizeof(double) * ngpts, stream);
506
507 // Upload map and run scatter on the device.
508 offloadMemcpyAsyncHtoD(ghatmap_dev, ghatmap, map_size, stream);
509 offloadMemsetAsync(buffer_dev_2, 0, buffer_size, stream);
510 pw_gpu_launch_scatter(buffer_dev_2, buffer_dev_1, scale, ngpts, nmaps,
511 ghatmap_dev, stream);
512
513 // Run FFT on the device.
514 fft_1d(OFFLOAD_FFT_INVERSE, npts[0], mmax, buffer_dev_2, buffer_dev_1);
515
516 // Download COMPLEX results from device.
517 offloadMemcpyAsyncDtoH(zout, buffer_dev_1, buffer_size, stream);
518 offloadStreamSynchronize(stream);
519}
520
521#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
522
523// EOF
static void const int const int i
subroutine, public fft_3d(plan, scale, zin, zout, stat)
...
Definition fft_lib.F:172
subroutine, public offload_activate_chosen_device()
Activates the device selected via offload_set_chosen_device()
subroutine, public pw_gpu_init()
Allocates resources on the gpu device for gpu fft acceleration.
Definition pw_gpu.F:62
subroutine, public pw_gpu_finalize()
Releases resources on the gpu device for gpu fft acceleration.
Definition pw_gpu.F:81