17 #include "CL/opencl.h"
/* Forward declarations of the file-local helpers used by the entry points. */

/* Releases the OpenCL program and context. */
26 static void cleanup_program();
/* Creates the OpenCL context and builds the program for the FFT size N,
 * using data_path to locate the program binary. */
27 static void init_program(
int N[3],
char *data_path);
/* Presumably creates the command queues queue1..queue6 used by
 * fftfpga_run_3d — confirm against its definition. */
28 static void queue_setup();
/* Runs one 3D FFT of size N on the device; c_in is both input and output,
 * and inverse is forwarded to the FFT kernels. */
30 static void fftfpga_run_3d(
int inverse,
int N[3], cmplx *c_in);
/*
 * Initialization entry point.
 *
 * Delegates to init() and returns whatever init() returns.
 */
int pw_fpga_initialize_() {
  int rc = init();
  return rc;
}
/*
 * Finalization entry point.
 *
 * Delegates to cleanup(); nothing is returned.
 */
void pw_fpga_final_() {
  cleanup();
}
/*
 * Checks the requested FFT size and initializes the OpenCL program via
 * init_program() when needed.
 *
 * data_path  forwarded unchanged to init_program().
 * N          requested FFT dimensions; only 16x16x16, 32x32x32 and 64x64x64
 *            pass the size test below.
 */
47 int pw_fpga_check_bitstream_(
char *data_path,
int N[3]) {
/* Static, so the value persists across calls. All zeros is treated as
 * "no program initialized yet" by the check further down.
 * NOTE(review): presumably holds the size of the currently loaded program —
 * confirm where it is assigned. */
48 static int fft_size[3] = {0, 0, 0};
/* Accept only the three supported cubic sizes. */
51 if ((N[0] == 16 && N[1] == 16 && N[2] == 16) ||
52 (N[0] == 32 && N[1] == 32 && N[2] == 32) ||
53 (N[0] == 64 && N[1] == 64 && N[2] == 64)) {
/* fft_size is still all zeros: set up the program. */
56 if (fft_size[0] == 0 && fft_size[1] == 0 && fft_size[2] == 0) {
61 init_program(fft_size, data_path);
62 }
/* The requested size equals the stored fft_size. */
else if (fft_size[0] == N[0] && fft_size[1] == N[1] &&
63 fft_size[2] == N[2]) {
74 init_program(fft_size, data_path);
/*
 * FFT entry point (the `_sp_` variant): forwards to fftfpga_run_3d().
 *
 * direction  selects which of the two calls below is made.
 *            NOTE(review): presumably the same mapping as pw_fpga_fft3d_dp_
 *            (direction == 1 -> inverse flag 0) — confirm.
 * N          FFT dimensions, passed through unchanged.
 * din        complex data, passed through; fftfpga_run_3d() overwrites it
 *            with the result.
 */
89 void pw_fpga_fft3d_sp_(
int direction,
int N[3], cmplx *din) {
/* inverse flag = 0 */
92 fftfpga_run_3d(0, N, din);
/* inverse flag = 1 */
94 fftfpga_run_3d(1, N, din);
/*
 * FFT entry point (the `_dp_` variant): forwards to fftfpga_run_3d().
 *
 * direction  1 runs fftfpga_run_3d() with inverse flag 0.
 *            NOTE(review): the call with inverse flag 1 is presumably the
 *            branch for every other value — confirm.
 * N          FFT dimensions, passed through unchanged.
 * din        complex data, passed through; fftfpga_run_3d() overwrites it
 *            with the result.
 */
104 void pw_fpga_fft3d_dp_(
int direction,
int N[3], cmplx *din) {
106 if (direction == 1) {
/* inverse flag = 0 */
107 fftfpga_run_3d(0, N, din);
/* inverse flag = 1 */
109 fftfpga_run_3d(1, N, din);
/*
 * Runs one 3D FFT on the device and overwrites c_in with the result.
 *
 * inverse  passed to the "fft3da" and "fft3db" kernels as their argument 0.
 * N        FFT dimensions; N[0] * N[1] * N[2] cmplx elements are transferred
 *          in each direction.
 * c_in     host data: copied to the device before the kernels run and
 *          overwritten with the device output afterwards.
 *
 * Uses the program and context set up by init_program() and the command
 * queues queue1..queue6.
 */
119 void fftfpga_run_3d(
int inverse,
int N[3], cmplx *c_in) {
/* Local copy whose address is handed to clSetKernelArg below. */
121 int inverse_int = inverse;
122 cl_kernel fft_kernel = NULL, fft_kernel_2 = NULL;
123 cl_kernel fetch_kernel = NULL, transpose_kernel = NULL,
124 transpose_kernel_2 = NULL;
127 cl_mem d_inData, d_outData;
/* Create one kernel object per kernel name in the program. */
131 fft_kernel = clCreateKernel(program,
"fft3da", &status);
132 checkError(status,
"Failed to create fft3da kernel");
133 fft_kernel_2 = clCreateKernel(program,
"fft3db", &status);
134 checkError(status,
"Failed to create fft3db kernel");
135 fetch_kernel = clCreateKernel(program,
"fetch", &status);
136 checkError(status,
"Failed to create fetch kernel");
137 transpose_kernel = clCreateKernel(program,
"transpose", &status);
138 checkError(status,
"Failed to create transpose kernel");
139 transpose_kernel_2 = clCreateKernel(program,
"transpose3d", &status);
140 checkError(status,
"Failed to create transpose3d kernel");
/* Device buffers, each sized for the full N[0] * N[1] * N[2] volume. */
142 d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE,
143 sizeof(cmplx) * N[0] * N[1] * N[2], NULL, &status);
144 checkError(status,
"Failed to allocate input device buffer\n");
145 d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE,
146 sizeof(cmplx) * N[0] * N[1] * N[2], NULL, &status);
147 checkError(status,
"Failed to allocate output device buffer\n");
/* Host staging buffers of the same size, obtained from alignedMalloc(). */
149 cmplx *h_inData = (cmplx *)alignedMalloc(
sizeof(cmplx) * N[0] * N[1] * N[2]);
150 if (h_inData == NULL) {
151 printf(
"Unable to allocate host memory\n");
154 cmplx *h_outData = (cmplx *)alignedMalloc(
sizeof(cmplx) * N[0] * N[1] * N[2]);
155 if (h_outData == NULL) {
156 printf(
"Unable to allocate host memory\n");
/* Stage the caller's data in h_inData. */
160 memcpy(h_inData, c_in,
sizeof(cmplx) * N[0] * N[1] * N[2]);
/* Blocking write (CL_TRUE) of the staged input to d_inData on queue6. */
165 status = clEnqueueWriteBuffer(queue6, d_inData, CL_TRUE, 0,
166 sizeof(cmplx) * N[0] * N[1] * N[2], h_inData, 0,
168 checkError(status,
"Failed to copy data to device");
170 status = clFinish(queue6);
171 checkError(status,
"failed to finish");
/* Bind argument 0 of each kernel: the input buffer for "fetch", the output
 * buffer for "transpose", and the inverse flag for both FFT kernels. */
173 status = clSetKernelArg(fetch_kernel, 0,
sizeof(cl_mem), (
void *)&d_inData);
174 checkError(status,
"Failed to set kernel arg 0");
175 status = clSetKernelArg(fft_kernel, 0,
sizeof(cl_int), (
void *)&inverse_int);
176 checkError(status,
"Failed to set kernel arg 1");
/* NOTE(review): no `status =` is visible in front of this call; if the
 * result really is not stored, the check below re-tests a stale value —
 * confirm. */
178 clSetKernelArg(transpose_kernel, 0,
sizeof(cl_mem), (
void *)&d_outData);
179 checkError(status,
"Failed to set kernel arg 2");
/* NOTE(review): same as above — confirm the result is stored in status. */
181 clSetKernelArg(fft_kernel_2, 0,
sizeof(cl_int), (
void *)&inverse_int);
182 checkError(status,
"Failed to set kernel arg 3");
/* Launch each kernel as a single task, one kernel per queue
 * (queue1..queue5). */
184 status = clEnqueueTask(queue1, fetch_kernel, 0, NULL, NULL);
185 checkError(status,
"Failed to launch fetch kernel");
188 status = clEnqueueTask(queue2, fft_kernel, 0, NULL, NULL);
189 checkError(status,
"Failed to launch fft kernel");
191 status = clEnqueueTask(queue3, transpose_kernel, 0, NULL, NULL);
192 checkError(status,
"Failed to launch transpose kernel");
194 status = clEnqueueTask(queue4, fft_kernel_2, 0, NULL, NULL);
195 checkError(status,
"Failed to launch second fft kernel");
197 status = clEnqueueTask(queue5, transpose_kernel_2, 0, NULL, NULL);
198 checkError(status,
"Failed to launch second transpose kernel");
/* Wait for all five kernel queues to finish before reading back. */
201 status = clFinish(queue1);
202 checkError(status,
"failed to finish");
203 status = clFinish(queue2);
204 checkError(status,
"failed to finish");
205 status = clFinish(queue3);
206 checkError(status,
"failed to finish");
207 status = clFinish(queue4);
208 checkError(status,
"failed to finish");
209 status = clFinish(queue5);
210 checkError(status,
"failed to finish");
/* Blocking read (CL_TRUE) of d_outData into h_outData, issued on queue3. */
213 status = clEnqueueReadBuffer(queue3, d_outData, CL_TRUE, 0,
214 sizeof(cmplx) * N[0] * N[1] * N[2], h_outData, 0,
216 checkError(status,
"Failed to read data from device");
/* Hand the result back to the caller in place. */
218 memcpy(c_in, h_outData,
sizeof(cmplx) * N[0] * N[1] * N[2]);
/* Release the device buffers and the kernel objects created above. */
228 clReleaseMemObject(d_inData);
230 clReleaseMemObject(d_outData);
233 clReleaseKernel(fetch_kernel);
235 clReleaseKernel(fft_kernel);
237 clReleaseKernel(fft_kernel_2);
238 if (transpose_kernel)
239 clReleaseKernel(transpose_kernel);
240 if (transpose_kernel_2)
241 clReleaseKernel(transpose_kernel_2);
/*
 * Creates the OpenCL context and builds the program for the given FFT size.
 *
 * N          FFT dimensions, forwarded to getProgramWithBinary().
 * data_path  forwarded to getProgramWithBinary().
 *
 * Stores its results in the context and program variables that
 * fftfpga_run_3d() and cleanup_program() use.
 */
247 void init_program(
int N[3],
char *data_path) {
/* Single-device context; openCLContextCallBackFxn is registered as the
 * context's notification callback. */
254 context = clCreateContext(NULL, 1, &device, &openCLContextCallBackFxn, NULL,
256 checkError(status,
"Failed to create context");
/* Obtain the program object for this device, N and data_path. */
259 program = getProgramWithBinary(context, &device, 1, N, data_path);
260 if (program == NULL) {
261 printf(
"Failed to create program");
/* Build with an empty options string and no completion callback. */
265 status = clBuildProgram(program, 0, NULL,
"", NULL, NULL);
266 checkError(status,
"Failed to build program");
/*
 * Releases the OpenCL program and then the context that init_program()
 * stored in program and context.
 */
272 void cleanup_program() {
274 clReleaseProgram(program);
276 clReleaseContext(context);
287 platform = findPlatform(
"Intel(R) FPGA");
288 if (platform == NULL) {
289 printf(
"ERROR: Unable to find Intel(R) FPGA OpenCL platform\n");
294 devices = getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices);
314 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
315 checkError(status,
"Failed to create command queue1");
317 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
318 checkError(status,
"Failed to create command queue2");
320 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
321 checkError(status,
"Failed to create command queue3");
323 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
324 checkError(status,
"Failed to create command queue4");
326 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
327 checkError(status,
"Failed to create command queue5");
329 clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
330 checkError(status,
"Failed to create command queue6");
/*
 * Releases the six command queues queue1..queue6, in order.
 */
336 void queue_cleanup() {
338 clReleaseCommandQueue(queue1);
340 clReleaseCommandQueue(queue2);
342 clReleaseCommandQueue(queue3);
344 clReleaseCommandQueue(queue4);
346 clReleaseCommandQueue(queue5);
348 clReleaseCommandQueue(queue6);
subroutine, public init(Nder, iunit, mepos, group)
...