(git:0de0cc2)
fft_fpga.c
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: GPL-2.0-or-later */
6 /*----------------------------------------------------------------------------*/
7 
8 #if defined(__PW_FPGA)
9 
10 // global dependencies
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 
16 // common dependencies
17 #include "CL/opencl.h"
18 
19 // local dependencies
20 #include "fft_fpga.h"
21 #include "opencl_utils.h"
22 
23 // Function prototypes
24 int init();
25 void cleanup();
26 static void cleanup_program();
27 static void init_program(int N[3], char *data_path);
28 static void queue_setup();
29 void queue_cleanup();
30 static void fftfpga_run_3d(int inverse, int N[3], cmplx *c_in);
31 
32 // --- CODE -------------------------------------------------------------------
33 
/*******************************************************************************
 * \brief Entry point that forwards to init()
 * \retval the value returned by init()
 ******************************************************************************/
int pw_fpga_initialize_() {
  const int init_result = init();
  return init_result;
}
35 
/*******************************************************************************
 * \brief Entry point that forwards to cleanup()
 ******************************************************************************/
void pw_fpga_final_() {
  cleanup();
}
37 
/*******************************************************************************
 * \brief Report whether an FFT3d of the given size can run on the FPGA and
 *        make sure the matching program is loaded. Only the cubic sizes
 *        16^3, 32^3 and 64^3 are accepted. The size of the most recent
 *        accepted request is remembered between calls: the first accepted
 *        request initializes the program, a request of the same size does
 *        nothing, and a request of a different size cleans up the program
 *        and initializes it again.
 * \param data_path - path to the data directory, forwarded to init_program
 * \param N - integer pointer to the size of the FFT3d
 * \retval 1 if the fft3d size is supported, 0 otherwise
 ******************************************************************************/
int pw_fpga_check_bitstream_(char *data_path, int N[3]) {
  // size of the last accepted request; all zeros until the first one arrives
  static int fft_size[3] = {0, 0, 0};

  // supported sizes are cubes with an edge length of 16, 32 or 64
  const int is_cube = (N[0] == N[1]) && (N[1] == N[2]);
  const int edge_ok = (N[0] == 16) || (N[0] == 32) || (N[0] == 64);
  if (!(is_cube && edge_ok)) {
    return 0;
  }

  const int first_time =
      (fft_size[0] == 0) && (fft_size[1] == 0) && (fft_size[2] == 0);
  const int same_as_before = (fft_size[0] == N[0]) && (fft_size[1] == N[1]) &&
                             (fft_size[2] == N[2]);

  // same fft size as the previous request: nothing to set up
  if (!first_time && same_as_before) {
    return 1;
  }

  // remember the new size before (re)initializing the program
  for (int dim = 0; dim < 3; dim++) {
    fft_size[dim] = N[dim];
  }

  // a program for a different size is already loaded: drop it first
  if (!first_time) {
    cleanup_program();
  }
  init_program(fft_size, data_path);

  return 1;
}
82 
83 /*******************************************************************************
84  * \brief compute an in-place single precision complex 3D-FFT on the FPGA
85  * \param direction : direction - 1/forward, otherwise/backward FFT3d
86  * \param N : integer pointer to size of FFT3d
87  * \param din : complex input/output single precision data pointer
88  ******************************************************************************/
89 void pw_fpga_fft3d_sp_(int direction, int N[3], cmplx *din) {
90  // setup device specific constructs
91  if (direction == 1) {
92  fftfpga_run_3d(0, N, din);
93  } else {
94  fftfpga_run_3d(1, N, din);
95  }
96 }
97 
98 /*******************************************************************************
99  * \brief compute an in-place double precision complex 3D-FFT on the FPGA
100  * \param direction : direction - 1/forward, otherwise/backward FFT3d
101  * \param N : integer pointer to size of FFT3d
102  * \param din : complex input/output single precision data pointer
103  ******************************************************************************/
104 void pw_fpga_fft3d_dp_(int direction, int N[3], cmplx *din) {
105  // setup device specific constructs
106  if (direction == 1) {
107  fftfpga_run_3d(0, N, din);
108  } else {
109  fftfpga_run_3d(1, N, din);
110  }
111 }
112 
113 /*******************************************************************************
114  * \brief Execute a single precision complex FFT3d
115  * \param inverse : int
116  * \param N : integer pointer to size of FFT3d
117  * \param din : complex input/output single precision data pointer
118  ******************************************************************************/
119 void fftfpga_run_3d(int inverse, int N[3], cmplx *c_in) {
120  cl_int status = 0;
121  int inverse_int = inverse;
122  cl_kernel fft_kernel = NULL, fft_kernel_2 = NULL;
123  cl_kernel fetch_kernel = NULL, transpose_kernel = NULL,
124  transpose_kernel_2 = NULL;
125 
126  // Device memory buffers
127  cl_mem d_inData, d_outData;
128 
129  // Create the kernel - name passed in here must match kernel name in the
130  // original CL file, that was compiled into an AOCX file using the AOC tool
131  fft_kernel = clCreateKernel(program, "fft3da", &status);
132  checkError(status, "Failed to create fft3da kernel");
133  fft_kernel_2 = clCreateKernel(program, "fft3db", &status);
134  checkError(status, "Failed to create fft3db kernel");
135  fetch_kernel = clCreateKernel(program, "fetch", &status);
136  checkError(status, "Failed to create fetch kernel");
137  transpose_kernel = clCreateKernel(program, "transpose", &status);
138  checkError(status, "Failed to create transpose kernel");
139  transpose_kernel_2 = clCreateKernel(program, "transpose3d", &status);
140  checkError(status, "Failed to create transpose3d kernel");
141 
142  d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE,
143  sizeof(cmplx) * N[0] * N[1] * N[2], NULL, &status);
144  checkError(status, "Failed to allocate input device buffer\n");
145  d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE,
146  sizeof(cmplx) * N[0] * N[1] * N[2], NULL, &status);
147  checkError(status, "Failed to allocate output device buffer\n");
148 
149  cmplx *h_inData = (cmplx *)alignedMalloc(sizeof(cmplx) * N[0] * N[1] * N[2]);
150  if (h_inData == NULL) {
151  printf("Unable to allocate host memory\n");
152  exit(1);
153  }
154  cmplx *h_outData = (cmplx *)alignedMalloc(sizeof(cmplx) * N[0] * N[1] * N[2]);
155  if (h_outData == NULL) {
156  printf("Unable to allocate host memory\n");
157  exit(1);
158  }
159 
160  memcpy(h_inData, c_in, sizeof(cmplx) * N[0] * N[1] * N[2]);
161 
162  queue_setup();
163 
164  // Copy data from host to device
165  status = clEnqueueWriteBuffer(queue6, d_inData, CL_TRUE, 0,
166  sizeof(cmplx) * N[0] * N[1] * N[2], h_inData, 0,
167  NULL, NULL);
168  checkError(status, "Failed to copy data to device");
169 
170  status = clFinish(queue6);
171  checkError(status, "failed to finish");
172 
173  status = clSetKernelArg(fetch_kernel, 0, sizeof(cl_mem), (void *)&d_inData);
174  checkError(status, "Failed to set kernel arg 0");
175  status = clSetKernelArg(fft_kernel, 0, sizeof(cl_int), (void *)&inverse_int);
176  checkError(status, "Failed to set kernel arg 1");
177  status =
178  clSetKernelArg(transpose_kernel, 0, sizeof(cl_mem), (void *)&d_outData);
179  checkError(status, "Failed to set kernel arg 2");
180  status =
181  clSetKernelArg(fft_kernel_2, 0, sizeof(cl_int), (void *)&inverse_int);
182  checkError(status, "Failed to set kernel arg 3");
183 
184  status = clEnqueueTask(queue1, fetch_kernel, 0, NULL, NULL);
185  checkError(status, "Failed to launch fetch kernel");
186 
187  // Launch the fft kernel - we launch a single work item hence enqueue a task
188  status = clEnqueueTask(queue2, fft_kernel, 0, NULL, NULL);
189  checkError(status, "Failed to launch fft kernel");
190 
191  status = clEnqueueTask(queue3, transpose_kernel, 0, NULL, NULL);
192  checkError(status, "Failed to launch transpose kernel");
193 
194  status = clEnqueueTask(queue4, fft_kernel_2, 0, NULL, NULL);
195  checkError(status, "Failed to launch second fft kernel");
196 
197  status = clEnqueueTask(queue5, transpose_kernel_2, 0, NULL, NULL);
198  checkError(status, "Failed to launch second transpose kernel");
199 
200  // Wait for all command queues to complete pending events
201  status = clFinish(queue1);
202  checkError(status, "failed to finish");
203  status = clFinish(queue2);
204  checkError(status, "failed to finish");
205  status = clFinish(queue3);
206  checkError(status, "failed to finish");
207  status = clFinish(queue4);
208  checkError(status, "failed to finish");
209  status = clFinish(queue5);
210  checkError(status, "failed to finish");
211 
212  // Copy results from device to host
213  status = clEnqueueReadBuffer(queue3, d_outData, CL_TRUE, 0,
214  sizeof(cmplx) * N[0] * N[1] * N[2], h_outData, 0,
215  NULL, NULL);
216  checkError(status, "Failed to read data from device");
217 
218  memcpy(c_in, h_outData, sizeof(cmplx) * N[0] * N[1] * N[2]);
219 
220  queue_cleanup();
221 
222  if (h_outData)
223  free(h_outData);
224  if (h_inData)
225  free(h_inData);
226 
227  if (d_inData)
228  clReleaseMemObject(d_inData);
229  if (d_outData)
230  clReleaseMemObject(d_outData);
231 
232  if (fetch_kernel)
233  clReleaseKernel(fetch_kernel);
234  if (fft_kernel)
235  clReleaseKernel(fft_kernel);
236  if (fft_kernel_2)
237  clReleaseKernel(fft_kernel_2);
238  if (transpose_kernel)
239  clReleaseKernel(transpose_kernel);
240  if (transpose_kernel_2)
241  clReleaseKernel(transpose_kernel_2);
242 }
243 
244 /*******************************************************************************
245  * \brief Initialize the program - select device, create context and program
246  ******************************************************************************/
247 void init_program(int N[3], char *data_path) {
248  cl_int status = 0;
249 
250  // use the first device.
251  device = devices[0];
252 
253  // Create the context.
254  context = clCreateContext(NULL, 1, &device, &openCLContextCallBackFxn, NULL,
255  &status);
256  checkError(status, "Failed to create context");
257 
258  // Create the program.
259  program = getProgramWithBinary(context, &device, 1, N, data_path);
260  if (program == NULL) {
261  printf("Failed to create program");
262  exit(1);
263  }
264  // Build the program that was just created.
265  status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
266  checkError(status, "Failed to build program");
267 }
268 
269 /*******************************************************************************
270  * \brief Free resources allocated during program initialization
271  ******************************************************************************/
272 void cleanup_program() {
273  if (program)
274  clReleaseProgram(program);
275  if (context)
276  clReleaseContext(context);
277 }
278 
279 /*******************************************************************************
280  * \brief Initialize the OpenCL FPGA environment - platform and devices
281  * \retval true if error in initialization
282  ******************************************************************************/
283 int init() {
284  cl_int status = 0;
285 
286  // Get the OpenCL platform.
287  platform = findPlatform("Intel(R) FPGA");
288  if (platform == NULL) {
289  printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform\n");
290  return 1;
291  }
292  // Query the available OpenCL devices.
293  cl_uint num_devices;
294  devices = getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices);
295 
296  return 0;
297 }
298 
299 /*******************************************************************************
300  * \brief Free resources allocated during initialization - devices
301  ******************************************************************************/
302 void cleanup() {
303  cleanup_program();
304  free(devices);
305 }
306 
307 /*******************************************************************************
308  * \brief Create a command queue for each kernel
309  ******************************************************************************/
310 void queue_setup() {
311  cl_int status = 0;
312  // Create one command queue for each kernel.
313  queue1 =
314  clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
315  checkError(status, "Failed to create command queue1");
316  queue2 =
317  clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
318  checkError(status, "Failed to create command queue2");
319  queue3 =
320  clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
321  checkError(status, "Failed to create command queue3");
322  queue4 =
323  clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
324  checkError(status, "Failed to create command queue4");
325  queue5 =
326  clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
327  checkError(status, "Failed to create command queue5");
328  queue6 =
329  clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
330  checkError(status, "Failed to create command queue6");
331 }
332 
333 /*******************************************************************************
334  * \brief Release all command queues
335  ******************************************************************************/
336 void queue_cleanup() {
337  if (queue1)
338  clReleaseCommandQueue(queue1);
339  if (queue2)
340  clReleaseCommandQueue(queue2);
341  if (queue3)
342  clReleaseCommandQueue(queue3);
343  if (queue4)
344  clReleaseCommandQueue(queue4);
345  if (queue5)
346  clReleaseCommandQueue(queue5);
347  if (queue6)
348  clReleaseCommandQueue(queue6);
349 }
350 
351 #endif
subroutine, public init(Nder, iunit, mepos, group)
...
Definition: t_c_g0.F:1357