(git:3add494)
dbm_multiply_opencl.c
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: BSD-3-Clause */
6 /*----------------------------------------------------------------------------*/
7 
8 #include "../offload/offload_runtime.h"
9 #if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
10 
12 #include "dbm_multiply_opencl.cl.h"
13 
14 void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream,
15  const int mnk_range[3][2], double alpha,
16  int ntasks, const dbm_task_t *tasks,
17  const double *pack_a_data,
18  const double *pack_b_data,
19  double *shard_c_data) {
20  static cl_kernel kernel = NULL;
21  static int ndims = 1, split = 0;
22  static size_t wgsize[] = {0, 0, 0};
23  int result = EXIT_SUCCESS, verbosity = c_dbcsr_acc_opencl_config.verbosity;
24  cl_event event, *const perf_event =
25  ((0 <= verbosity && 2 >= verbosity) ? NULL : &event);
26  const c_dbcsr_acc_opencl_stream_t *const str = ACC_OPENCL_STREAM(stream);
27  const size_t max_m = mnk_range[0][1], work_tasks = ntasks;
28  size_t work_size[] = {1, 1, 1}, ibatch = 0, iadata = 0, ibdata = 0,
29  icdata = 0;
30  c_dbcsr_acc_opencl_info_memptr_t adata, bdata, cdata, batch;
31  assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
32  assert(0 < mnk_range[0][0] && 0 < mnk_range[0][1] &&
33  mnk_range[0][0] <= mnk_range[0][1]);
34  assert(0 < mnk_range[1][0] && 0 < mnk_range[1][1] &&
35  mnk_range[1][0] <= mnk_range[1][1]);
36  assert(0 < mnk_range[2][0] && 0 < mnk_range[2][1] &&
37  mnk_range[2][0] <= mnk_range[2][1]);
38  assert(NULL != str && NULL != str->queue);
39  assert(0 < ntasks && NULL != tasks);
40  /* creating/calling kernel must be consistent across threads */
41  ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_main);
42 #if defined(OPENCL_DBM_SOURCE_MULTIPLY)
43  if (NULL == kernel) { /* first-time check if kernel is present */
44  const libxsmm_timer_tickint start = libxsmm_timer_tick();
45  char params[ACC_OPENCL_BUFFERSIZE] =
46  "-cl-fast-relaxed-math -cl-denorms-are-zero";
47  const char *const gen_env = getenv("DBM_MULTIPLY_GEN");
48  const char *const xf_env = getenv("DBM_MULTIPLY_XF");
49  const char *const lu_env = getenv("DBM_MULTIPLY_LU");
50  const char *const bn_env = getenv("DBM_MULTIPLY_BN");
51  const int gpu =
52  (CL_DEVICE_TYPE_GPU == c_dbcsr_acc_opencl_config.device.type);
53  const int gen = (NULL == gen_env ? 0 /*default*/ : atoi(gen_env));
54  const int xf = (NULL == xf_env ? -1 /*default*/ : atoi(xf_env));
55  const int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
56  int bn = (NULL == bn_env ? 8 : atoi(bn_env));
57  const char *extensions[] = {NULL, NULL}, *flags = NULL;
58  size_t nextensions = sizeof(extensions) / sizeof(*extensions);
59  const size_t wgsize0 = c_dbcsr_acc_opencl_config.device.wgsize[0];
60  const size_t wgsize1 = c_dbcsr_acc_opencl_config.device.wgsize[1];
61  size_t wgsize2 = c_dbcsr_acc_opencl_config.device.wgsize[2];
62  size_t offset = (0 == c_dbcsr_acc_opencl_config.debug ? strlen(params) : 0);
63  offset += (size_t)c_dbcsr_acc_opencl_flags_atomics(
64  &c_dbcsr_acc_opencl_config.device, c_dbcsr_acc_opencl_atomic_fp_64,
65  extensions, &nextensions, params + offset, sizeof(params) - offset);
66  if (2 <= gen || (0 != gen && 0 != wgsize2 /*subgroups*/ &&
67  2 <= *c_dbcsr_acc_opencl_config.device.std_level &&
68  NULL != extensions[1] &&
69  NULL != strstr(extensions[1], "cl_ext_float_atomics"))) {
70  offset +=
71  (size_t)LIBXSMM_SNPRINTF(params + offset, sizeof(params) - offset,
72  " -DDBM_MULTIPLY_OPENCL_GEN");
73  if (0 != c_dbcsr_acc_opencl_config.device.intel && 0 != xf) {
74  flags = "-cl-intel-256-GRF-per-thread";
75  }
76  wgsize[1] = wgsize[2] = 1;
77  wgsize[0] = 16;
78  ndims = 3;
79  } else {
80  const char *const split_env = getenv("DBM_MULTIPLY_SPLIT");
81  const char *const wg_env = getenv("DBM_MULTIPLY_WG");
82  split = (NULL == split_env ? 1 /*default*/ : atoi(split_env));
83  wgsize[0] =
84  (NULL == wg_env ? (1 != split ? (wgsize1 * LIBXSMM_ABS(split)) : 0)
85  : strtoul(wg_env, NULL, 10));
86  if (0 != split && 1 != split && (bn * bn) > (int)wgsize[0]) {
87  wgsize[0] = bn * bn;
88  }
89  if (0 != split && 0 != wgsize2 && 0 < wgsize[0]) { /* subgroups */
90  if (LIBXSMM_DELTA(wgsize[0], wgsize1) <=
91  LIBXSMM_DELTA(wgsize[0], wgsize2)) { /* select SG-size */
92  wgsize2 = wgsize1;
93  }
94  wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize2);
95  } else {
96  wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize1);
97  wgsize2 = 0;
98  }
99  wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, wgsize0);
100  if (NULL == bn_env && 0 != split && 1 != split &&
101  (bn * bn) < (int)wgsize[0]) {
102  bn = libxsmm_isqrt2_u32(wgsize[0]);
103  }
104  bn = LIBXSMM_CLMP(bn, 4, 32);
105  offset += (size_t)LIBXSMM_SNPRINTF(
106  params + offset, sizeof(params) - offset,
107  " %s -DSPLIT=%i -DBN=%i -DWG=%i -DSG=%i -DLU=%i",
108  0 != gpu ? "-DGPU" : "", split, bn, (int)wgsize[0], (int)wgsize2, lu);
109  if (0 != c_dbcsr_acc_opencl_config.device.intel && 0 < xf) {
110  flags = "-cl-intel-256-GRF-per-thread";
111  }
112  }
113  result |= (sizeof(params) > offset ? EXIT_SUCCESS : EXIT_FAILURE);
114  result |= c_dbcsr_acc_opencl_kernel(
115  0 /*source_is_file*/, OPENCL_DBM_SOURCE_MULTIPLY, "dbm_multiply",
116  params, flags, NULL /*try*/, NULL /*try_ok*/, extensions, nextensions,
117  &kernel);
118  if (2 <= verbosity || 0 > verbosity) {
119  if (EXIT_SUCCESS == result) {
120  const double d = libxsmm_timer_duration(start, libxsmm_timer_tick());
121  fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu);
122  if (0 == gen) {
123  fprintf(stderr, " split=%i lu=%i bn=%i", split, lu, bn);
124  } else { /* generated kernel */
125  fprintf(stderr, " gen=%i", gen);
126  }
127  fprintf(stderr, " wg=%i sg=%i ms=%.1f\n", (int)wgsize[0], (int)wgsize2,
128  1E3 * d);
129  } else {
130  fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel failed to generate\n");
131  }
132  }
133  }
134 #else
135 #error "OpenCL kernel code not found!"
136 #endif
137  result |= c_dbcsr_acc_opencl_info_devptr_lock(&adata, NULL /*lock*/,
138  pack_a_data, 1 /*esize*/,
139  NULL /*amount*/, &iadata);
140  result |= c_dbcsr_acc_opencl_info_devptr_lock(&bdata, NULL /*lock*/,
141  pack_b_data, 1 /*esize*/,
142  NULL /*amount*/, &ibdata);
143  result |= c_dbcsr_acc_opencl_info_devptr_lock(&cdata, NULL /*lock*/,
144  shard_c_data, 1 /*esize*/,
145  NULL /*amount*/, &icdata);
146  result |= c_dbcsr_acc_opencl_info_devptr_lock(
147  &batch, NULL /*lock*/, tasks /*batch*/, sizeof(dbm_task_t), &work_tasks,
148  &ibatch);
149  assert(0 == iadata && 0 == ibdata && 0 == icdata);
150  result |= clSetKernelArg(kernel, 0, sizeof(cl_double), &alpha);
151  result |= clSetKernelArg(kernel, 1, sizeof(cl_int), &ibatch);
152  if (1 < ndims) { /* generated kernel */
153  const cl_uint zero = 0;
154  assert(0 != wgsize[1] && 0 != wgsize[1] && 0 != wgsize[2]);
155  work_size[0] = 16;
156  assert(1 == work_size[1]);
157  work_size[2] = work_tasks;
158  result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 2, batch.memory);
159  result |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &zero /*shape*/);
160  result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, adata.memory);
161  result |= clSetKernelArg(kernel, 5, sizeof(cl_uint), &zero /*A_shape0*/);
162  result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
163  result |= clSetKernelArg(kernel, 7, sizeof(cl_uint), &zero /*B_shape0*/);
164  result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
165  result |= clSetKernelArg(kernel, 9, sizeof(cl_uint), &zero /*C_shape0*/);
166  } else {
167  result |= clSetKernelArg(kernel, 2, sizeof(cl_int), &ntasks);
168  if (0 != split) {
169  if (1 == split || 0 == wgsize[0]) {
170  work_size[0] = work_tasks * max_m;
171  result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size);
172  if (0 < wgsize[0]) { /* fixup to be a multiple of the WG-size */
173  work_size[0] = LIBXSMM_UP(work_size[0], wgsize[0]);
174  }
175  } else {
176  work_size[0] = work_tasks * wgsize[0];
177  result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size);
178  }
179  } else {
180  work_size[0] = work_tasks;
181  result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size);
182  }
183  result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, batch.memory);
184  result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 5, adata.memory);
185  result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
186  result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 7, cdata.memory);
187  }
188  result |= clEnqueueNDRangeKernel(
189  str->queue, kernel, ndims, NULL, work_size, 0 < wgsize[0] ? wgsize : NULL,
190  0 /*num_wait*/, NULL /*wait_list*/, perf_event);
191  if (NULL != perf_event && EXIT_SUCCESS == result) {
192  cl_ulong begin = 0, end = 0;
193  clWaitForEvents(1, perf_event);
194  result |= clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START,
195  sizeof(cl_ulong), &begin, NULL);
196  result |= clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_END,
197  sizeof(cl_ulong), &end, NULL);
198  if (EXIT_SUCCESS == result) {
199  const double duration_ns = LIBXSMM_DELTA(begin, end);
200  const double gflops =
201  (max_m * mnk_range[1][1] * mnk_range[2][1] * ntasks) / duration_ns;
202  fprintf(stderr,
203  "INFO ACC/LIBDBM: DBM-kernel mnk=%ix%ix%i "
204  "ntasks=%i gflops=%.1f ms=%.2g\n",
205  mnk_range[0][1], mnk_range[1][1], mnk_range[2][1], ntasks, gflops,
206  1E-6 * duration_ns);
207  }
208  }
209  ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_main);
210  OFFLOAD_CHECK(result);
211 }
212 
213 #endif // defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
214 
215 // EOF
real(dp), dimension(3) d
Definition: ai_eri_debug.F:31
Internal struct for storing a task, ie. a single block multiplication.