/* Forward declarations of file-local (static) helpers used below. */
/* Releases the OpenCL program and context objects. */
26static void cleanup_program();
 
/* Creates the OpenCL context, then obtains the program for FFT size N via
 * getProgramWithBinary() using data_path, and builds it. */
   27static void init_program(
int N[3], 
char *data_path);
 
/* NOTE(review): definition header is not visible in this excerpt; presumably
 * creates the command queues queue1..queue6 -- confirm. */
   28static void queue_setup();
 
/* Runs the 3D FFT kernel pipeline on c_in for dimensions N and copies the
 * result back into c_in; `inverse` is passed to both FFT kernels as arg 0. */
   30static void fftfpga_run_3d(
int inverse, 
int N[3], cmplx *c_in);
 
   /* Public entry point: forwards to init() and returns its result unchanged.
    * NOTE(review): the trailing underscore suggests a Fortran-callable symbol
    * -- confirm against the caller. */
   34int pw_fpga_initialize_() { 
return init(); }
 
   /* Public entry point: forwards to cleanup() and returns nothing.
    * NOTE(review): the trailing underscore suggests a Fortran-callable symbol
    * -- confirm against the caller. */
   36void pw_fpga_final_() { cleanup(); }
 
   47int pw_fpga_check_bitstream_(
char *data_path, 
int N[3]) {
 
   48  static int fft_size[3] = {0, 0, 0};
 
   51  if ((N[0] == 16 && N[1] == 16 && N[2] == 16) ||
 
   52      (N[0] == 32 && N[1] == 32 && N[2] == 32) ||
 
   53      (N[0] == 64 && N[1] == 64 && N[2] == 64)) {
 
   56    if (fft_size[0] == 0 && fft_size[1] == 0 && fft_size[2] == 0) {
 
   61      init_program(fft_size, data_path);
 
   62    } 
else if (fft_size[0] == N[0] && fft_size[1] == N[1] &&
 
   63               fft_size[2] == N[2]) {
 
   74      init_program(fft_size, data_path);
 
   89void pw_fpga_fft3d_sp_(
int direction, 
int N[3], cmplx *din) {
 
   92    fftfpga_run_3d(0, N, din);
 
   94    fftfpga_run_3d(1, N, din);
 
  104void pw_fpga_fft3d_dp_(
int direction, 
int N[3], cmplx *din) {
 
  106  if (direction == 1) {
 
  107    fftfpga_run_3d(0, N, din);
 
  109    fftfpga_run_3d(1, N, din);
 
  119void fftfpga_run_3d(
int inverse, 
int N[3], cmplx *c_in) {
 
  121  int inverse_int = inverse;
 
  122  cl_kernel fft_kernel = NULL, fft_kernel_2 = NULL;
 
  123  cl_kernel fetch_kernel = NULL, transpose_kernel = NULL,
 
  124            transpose_kernel_2 = NULL;
 
  127  cl_mem d_inData, d_outData;
 
  131  fft_kernel = clCreateKernel(program, 
"fft3da", &status);
 
  132  checkError(status, 
"Failed to create fft3da kernel");
 
  133  fft_kernel_2 = clCreateKernel(program, 
"fft3db", &status);
 
  134  checkError(status, 
"Failed to create fft3db kernel");
 
  135  fetch_kernel = clCreateKernel(program, 
"fetch", &status);
 
  136  checkError(status, 
"Failed to create fetch kernel");
 
  137  transpose_kernel = clCreateKernel(program, 
"transpose", &status);
 
  138  checkError(status, 
"Failed to create transpose kernel");
 
  139  transpose_kernel_2 = clCreateKernel(program, 
"transpose3d", &status);
 
  140  checkError(status, 
"Failed to create transpose3d kernel");
 
  142  d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE,
 
  143                            sizeof(cmplx) * N[0] * N[1] * N[2], NULL, &status);
 
  144  checkError(status, 
"Failed to allocate input device buffer\n");
 
  145  d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE,
 
  146                             sizeof(cmplx) * N[0] * N[1] * N[2], NULL, &status);
 
  147  checkError(status, 
"Failed to allocate output device buffer\n");
 
  149  cmplx *h_inData = (cmplx *)alignedMalloc(
sizeof(cmplx) * N[0] * N[1] * N[2]);
 
  150  if (h_inData == NULL) {
 
  151    printf(
"Unable to allocate host memory\n");
 
  154  cmplx *h_outData = (cmplx *)alignedMalloc(
sizeof(cmplx) * N[0] * N[1] * N[2]);
 
  155  if (h_outData == NULL) {
 
  156    printf(
"Unable to allocate host memory\n");
 
  160  memcpy(h_inData, c_in, 
sizeof(cmplx) * N[0] * N[1] * N[2]);
 
  165  status = clEnqueueWriteBuffer(queue6, d_inData, CL_TRUE, 0,
 
  166                                sizeof(cmplx) * N[0] * N[1] * N[2], h_inData, 0,
 
  168  checkError(status, 
"Failed to copy data to device");
 
  170  status = clFinish(queue6);
 
  171  checkError(status, 
"failed to finish");
 
  173  status = clSetKernelArg(fetch_kernel, 0, 
sizeof(cl_mem), (
void *)&d_inData);
 
  174  checkError(status, 
"Failed to set kernel arg 0");
 
  175  status = clSetKernelArg(fft_kernel, 0, 
sizeof(cl_int), (
void *)&inverse_int);
 
  176  checkError(status, 
"Failed to set kernel arg 1");
 
  178      clSetKernelArg(transpose_kernel, 0, 
sizeof(cl_mem), (
void *)&d_outData);
 
  179  checkError(status, 
"Failed to set kernel arg 2");
 
  181      clSetKernelArg(fft_kernel_2, 0, 
sizeof(cl_int), (
void *)&inverse_int);
 
  182  checkError(status, 
"Failed to set kernel arg 3");
 
  184  status = clEnqueueTask(queue1, fetch_kernel, 0, NULL, NULL);
 
  185  checkError(status, 
"Failed to launch fetch kernel");
 
  188  status = clEnqueueTask(queue2, fft_kernel, 0, NULL, NULL);
 
  189  checkError(status, 
"Failed to launch fft kernel");
 
  191  status = clEnqueueTask(queue3, transpose_kernel, 0, NULL, NULL);
 
  192  checkError(status, 
"Failed to launch transpose kernel");
 
  194  status = clEnqueueTask(queue4, fft_kernel_2, 0, NULL, NULL);
 
  195  checkError(status, 
"Failed to launch second fft kernel");
 
  197  status = clEnqueueTask(queue5, transpose_kernel_2, 0, NULL, NULL);
 
  198  checkError(status, 
"Failed to launch second transpose kernel");
 
  201  status = clFinish(queue1);
 
  202  checkError(status, 
"failed to finish");
 
  203  status = clFinish(queue2);
 
  204  checkError(status, 
"failed to finish");
 
  205  status = clFinish(queue3);
 
  206  checkError(status, 
"failed to finish");
 
  207  status = clFinish(queue4);
 
  208  checkError(status, 
"failed to finish");
 
  209  status = clFinish(queue5);
 
  210  checkError(status, 
"failed to finish");
 
  213  status = clEnqueueReadBuffer(queue3, d_outData, CL_TRUE, 0,
 
  214                               sizeof(cmplx) * N[0] * N[1] * N[2], h_outData, 0,
 
  216  checkError(status, 
"Failed to read data from device");
 
  218  memcpy(c_in, h_outData, 
sizeof(cmplx) * N[0] * N[1] * N[2]);
 
  228    clReleaseMemObject(d_inData);
 
  230    clReleaseMemObject(d_outData);
 
  233    clReleaseKernel(fetch_kernel);
 
  235    clReleaseKernel(fft_kernel);
 
  237    clReleaseKernel(fft_kernel_2);
 
  238  if (transpose_kernel)
 
  239    clReleaseKernel(transpose_kernel);
 
  240  if (transpose_kernel_2)
 
  241    clReleaseKernel(transpose_kernel_2);
 
  247void init_program(
int N[3], 
char *data_path) {
 
  254  context = clCreateContext(NULL, 1, &device, &openCLContextCallBackFxn, NULL,
 
  256  checkError(status, 
"Failed to create context");
 
  259  program = getProgramWithBinary(context, &device, 1, N, data_path);
 
  260  if (program == NULL) {
 
  261    printf(
"Failed to create program");
 
  265  status = clBuildProgram(program, 0, NULL, 
"", NULL, NULL);
 
  266  checkError(status, 
"Failed to build program");
 
  272void cleanup_program() {
 
  274    clReleaseProgram(program);
 
  276    clReleaseContext(context);
 
  287  platform = findPlatform(
"Intel(R) FPGA");
 
  288  if (platform == NULL) {
 
  289    printf(
"ERROR: Unable to find Intel(R) FPGA OpenCL platform\n");
 
  294  devices = getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices);
 
  314      clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
 
  315  checkError(status, 
"Failed to create command queue1");
 
  317      clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
 
  318  checkError(status, 
"Failed to create command queue2");
 
  320      clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
 
  321  checkError(status, 
"Failed to create command queue3");
 
  323      clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
 
  324  checkError(status, 
"Failed to create command queue4");
 
  326      clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
 
  327  checkError(status, 
"Failed to create command queue5");
 
  329      clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
 
  330  checkError(status, 
"Failed to create command queue6");
 
  336void queue_cleanup() {
 
  338    clReleaseCommandQueue(queue1);
 
  340    clReleaseCommandQueue(queue2);
 
  342    clReleaseCommandQueue(queue3);
 
  344    clReleaseCommandQueue(queue4);
 
  346    clReleaseCommandQueue(queue5);
 
  348    clReleaseCommandQueue(queue6);
 
subroutine, public init(nder, iunit, mepos, group)
...