(git:374b731)
Loading...
Searching...
No Matches
fft_fpga.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: GPL-2.0-or-later */
6/*----------------------------------------------------------------------------*/
7
8#if defined(__PW_FPGA)
9
10// global dependencies
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14#include <unistd.h>
15
16// common dependencies
17#include "CL/opencl.h"
18
19// local dependencies
20#include "fft_fpga.h"
21#include "opencl_utils.h"
22
23// Function prototypes
24int init();
25void cleanup();
26static void cleanup_program();
27static void init_program(int N[3], char *data_path);
28static void queue_setup();
29void queue_cleanup();
30static void fftfpga_run_3d(int inverse, int N[3], cmplx *c_in);
31
32// --- CODE -------------------------------------------------------------------
33
/*******************************************************************************
 * \brief Entry point for setting up the FPGA environment; delegates to init()
 * \retval the value returned by init()
 ******************************************************************************/
int pw_fpga_initialize_() {
  const int init_status = init();
  return init_status;
}
35
/*******************************************************************************
 * \brief Entry point for tearing down the FPGA environment; delegates to
 *        cleanup()
 ******************************************************************************/
void pw_fpga_final_() {
  cleanup();
}
37
/*******************************************************************************
 * \brief Report whether a 3D FFT of the requested size can run on the FPGA.
 *        Only the cubic sizes 16, 32 and 64 are accepted. For an accepted size
 *        the OpenCL program is set up on the first call, left alone when the
 *        size equals the previously accepted one, and torn down and set up
 *        again when the size differs.
 * \param data_path - path to the data directory (forwarded to init_program)
 * \param N - the three dimensions of the requested FFT3d
 * \retval 1 if the size is supported, 0 otherwise
 ******************************************************************************/
int pw_fpga_check_bitstream_(char *data_path, int N[3]) {
  // size accepted by the most recent set-up; all zeros means "none yet"
  static int fft_size[3] = {0, 0, 0};

  // reject everything that is not one of the listed cubic sizes
  const int is_cubic = (N[0] == N[1]) && (N[1] == N[2]);
  const int is_listed = (N[0] == 16) || (N[0] == 32) || (N[0] == 64);
  if (!is_cubic || !is_listed) {
    return 0;
  }

  const int first_time =
      (fft_size[0] == 0) && (fft_size[1] == 0) && (fft_size[2] == 0);
  const int unchanged = (fft_size[0] == N[0]) && (fft_size[1] == N[1]) &&
                        (fft_size[2] == N[2]);

  // same size as the previous set-up: nothing to do
  if (!first_time && unchanged) {
    return 1;
  }

  // remember the new size before (re)building the program
  for (int dim = 0; dim < 3; dim++) {
    fft_size[dim] = N[dim];
  }

  // a previous set-up for a different size has to be released first
  if (!first_time) {
    cleanup_program();
  }
  init_program(fft_size, data_path);

  return 1;
}
82
83/*******************************************************************************
84 * \brief compute an in-place single precision complex 3D-FFT on the FPGA
85 * \param direction : direction - 1/forward, otherwise/backward FFT3d
86 * \param N : integer pointer to size of FFT3d
87 * \param din : complex input/output single precision data pointer
88 ******************************************************************************/
89void pw_fpga_fft3d_sp_(int direction, int N[3], cmplx *din) {
90 // setup device specific constructs
91 if (direction == 1) {
92 fftfpga_run_3d(0, N, din);
93 } else {
94 fftfpga_run_3d(1, N, din);
95 }
96}
97
98/*******************************************************************************
99 * \brief compute an in-place double precision complex 3D-FFT on the FPGA
100 * \param direction : direction - 1/forward, otherwise/backward FFT3d
101 * \param N : integer pointer to size of FFT3d
102 * \param din : complex input/output single precision data pointer
103 ******************************************************************************/
104void pw_fpga_fft3d_dp_(int direction, int N[3], cmplx *din) {
105 // setup device specific constructs
106 if (direction == 1) {
107 fftfpga_run_3d(0, N, din);
108 } else {
109 fftfpga_run_3d(1, N, din);
110 }
111}
112
113/*******************************************************************************
114 * \brief Execute a single precision complex FFT3d
115 * \param inverse : int
116 * \param N : integer pointer to size of FFT3d
117 * \param din : complex input/output single precision data pointer
118 ******************************************************************************/
119void fftfpga_run_3d(int inverse, int N[3], cmplx *c_in) {
120 cl_int status = 0;
121 int inverse_int = inverse;
122 cl_kernel fft_kernel = NULL, fft_kernel_2 = NULL;
123 cl_kernel fetch_kernel = NULL, transpose_kernel = NULL,
124 transpose_kernel_2 = NULL;
125
126 // Device memory buffers
127 cl_mem d_inData, d_outData;
128
129 // Create the kernel - name passed in here must match kernel name in the
130 // original CL file, that was compiled into an AOCX file using the AOC tool
131 fft_kernel = clCreateKernel(program, "fft3da", &status);
132 checkError(status, "Failed to create fft3da kernel");
133 fft_kernel_2 = clCreateKernel(program, "fft3db", &status);
134 checkError(status, "Failed to create fft3db kernel");
135 fetch_kernel = clCreateKernel(program, "fetch", &status);
136 checkError(status, "Failed to create fetch kernel");
137 transpose_kernel = clCreateKernel(program, "transpose", &status);
138 checkError(status, "Failed to create transpose kernel");
139 transpose_kernel_2 = clCreateKernel(program, "transpose3d", &status);
140 checkError(status, "Failed to create transpose3d kernel");
141
142 d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE,
143 sizeof(cmplx) * N[0] * N[1] * N[2], NULL, &status);
144 checkError(status, "Failed to allocate input device buffer\n");
145 d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE,
146 sizeof(cmplx) * N[0] * N[1] * N[2], NULL, &status);
147 checkError(status, "Failed to allocate output device buffer\n");
148
149 cmplx *h_inData = (cmplx *)alignedMalloc(sizeof(cmplx) * N[0] * N[1] * N[2]);
150 if (h_inData == NULL) {
151 printf("Unable to allocate host memory\n");
152 exit(1);
153 }
154 cmplx *h_outData = (cmplx *)alignedMalloc(sizeof(cmplx) * N[0] * N[1] * N[2]);
155 if (h_outData == NULL) {
156 printf("Unable to allocate host memory\n");
157 exit(1);
158 }
159
160 memcpy(h_inData, c_in, sizeof(cmplx) * N[0] * N[1] * N[2]);
161
162 queue_setup();
163
164 // Copy data from host to device
165 status = clEnqueueWriteBuffer(queue6, d_inData, CL_TRUE, 0,
166 sizeof(cmplx) * N[0] * N[1] * N[2], h_inData, 0,
167 NULL, NULL);
168 checkError(status, "Failed to copy data to device");
169
170 status = clFinish(queue6);
171 checkError(status, "failed to finish");
172
173 status = clSetKernelArg(fetch_kernel, 0, sizeof(cl_mem), (void *)&d_inData);
174 checkError(status, "Failed to set kernel arg 0");
175 status = clSetKernelArg(fft_kernel, 0, sizeof(cl_int), (void *)&inverse_int);
176 checkError(status, "Failed to set kernel arg 1");
177 status =
178 clSetKernelArg(transpose_kernel, 0, sizeof(cl_mem), (void *)&d_outData);
179 checkError(status, "Failed to set kernel arg 2");
180 status =
181 clSetKernelArg(fft_kernel_2, 0, sizeof(cl_int), (void *)&inverse_int);
182 checkError(status, "Failed to set kernel arg 3");
183
184 status = clEnqueueTask(queue1, fetch_kernel, 0, NULL, NULL);
185 checkError(status, "Failed to launch fetch kernel");
186
187 // Launch the fft kernel - we launch a single work item hence enqueue a task
188 status = clEnqueueTask(queue2, fft_kernel, 0, NULL, NULL);
189 checkError(status, "Failed to launch fft kernel");
190
191 status = clEnqueueTask(queue3, transpose_kernel, 0, NULL, NULL);
192 checkError(status, "Failed to launch transpose kernel");
193
194 status = clEnqueueTask(queue4, fft_kernel_2, 0, NULL, NULL);
195 checkError(status, "Failed to launch second fft kernel");
196
197 status = clEnqueueTask(queue5, transpose_kernel_2, 0, NULL, NULL);
198 checkError(status, "Failed to launch second transpose kernel");
199
200 // Wait for all command queues to complete pending events
201 status = clFinish(queue1);
202 checkError(status, "failed to finish");
203 status = clFinish(queue2);
204 checkError(status, "failed to finish");
205 status = clFinish(queue3);
206 checkError(status, "failed to finish");
207 status = clFinish(queue4);
208 checkError(status, "failed to finish");
209 status = clFinish(queue5);
210 checkError(status, "failed to finish");
211
212 // Copy results from device to host
213 status = clEnqueueReadBuffer(queue3, d_outData, CL_TRUE, 0,
214 sizeof(cmplx) * N[0] * N[1] * N[2], h_outData, 0,
215 NULL, NULL);
216 checkError(status, "Failed to read data from device");
217
218 memcpy(c_in, h_outData, sizeof(cmplx) * N[0] * N[1] * N[2]);
219
220 queue_cleanup();
221
222 if (h_outData)
223 free(h_outData);
224 if (h_inData)
225 free(h_inData);
226
227 if (d_inData)
228 clReleaseMemObject(d_inData);
229 if (d_outData)
230 clReleaseMemObject(d_outData);
231
232 if (fetch_kernel)
233 clReleaseKernel(fetch_kernel);
234 if (fft_kernel)
235 clReleaseKernel(fft_kernel);
236 if (fft_kernel_2)
237 clReleaseKernel(fft_kernel_2);
238 if (transpose_kernel)
239 clReleaseKernel(transpose_kernel);
240 if (transpose_kernel_2)
241 clReleaseKernel(transpose_kernel_2);
242}
243
244/*******************************************************************************
245 * \brief Initialize the program - select device, create context and program
246 ******************************************************************************/
247void init_program(int N[3], char *data_path) {
248 cl_int status = 0;
249
250 // use the first device.
251 device = devices[0];
252
253 // Create the context.
254 context = clCreateContext(NULL, 1, &device, &openCLContextCallBackFxn, NULL,
255 &status);
256 checkError(status, "Failed to create context");
257
258 // Create the program.
259 program = getProgramWithBinary(context, &device, 1, N, data_path);
260 if (program == NULL) {
261 printf("Failed to create program");
262 exit(1);
263 }
264 // Build the program that was just created.
265 status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
266 checkError(status, "Failed to build program");
267}
268
269/*******************************************************************************
270 * \brief Free resources allocated during program initialization
271 ******************************************************************************/
272void cleanup_program() {
273 if (program)
274 clReleaseProgram(program);
275 if (context)
276 clReleaseContext(context);
277}
278
279/*******************************************************************************
280 * \brief Initialize the OpenCL FPGA environment - platform and devices
281 * \retval true if error in initialization
282 ******************************************************************************/
283int init() {
284 cl_int status = 0;
285
286 // Get the OpenCL platform.
287 platform = findPlatform("Intel(R) FPGA");
288 if (platform == NULL) {
289 printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform\n");
290 return 1;
291 }
292 // Query the available OpenCL devices.
293 cl_uint num_devices;
294 devices = getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices);
295
296 return 0;
297}
298
299/*******************************************************************************
300 * \brief Free resources allocated during initialization - devices
301 ******************************************************************************/
302void cleanup() {
303 cleanup_program();
304 free(devices);
305}
306
307/*******************************************************************************
308 * \brief Create a command queue for each kernel
309 ******************************************************************************/
310void queue_setup() {
311 cl_int status = 0;
312 // Create one command queue for each kernel.
313 queue1 =
314 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
315 checkError(status, "Failed to create command queue1");
316 queue2 =
317 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
318 checkError(status, "Failed to create command queue2");
319 queue3 =
320 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
321 checkError(status, "Failed to create command queue3");
322 queue4 =
323 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
324 checkError(status, "Failed to create command queue4");
325 queue5 =
326 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
327 checkError(status, "Failed to create command queue5");
328 queue6 =
329 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
330 checkError(status, "Failed to create command queue6");
331}
332
333/*******************************************************************************
334 * \brief Release all command queues
335 ******************************************************************************/
336void queue_cleanup() {
337 if (queue1)
338 clReleaseCommandQueue(queue1);
339 if (queue2)
340 clReleaseCommandQueue(queue2);
341 if (queue3)
342 clReleaseCommandQueue(queue3);
343 if (queue4)
344 clReleaseCommandQueue(queue4);
345 if (queue5)
346 clReleaseCommandQueue(queue5);
347 if (queue6)
348 clReleaseCommandQueue(queue6);
349}
350
351#endif
subroutine, public init(nder, iunit, mepos, group)
...
Definition t_c_g0.F:1357