(git:419edc0)
Loading...
Searching...
No Matches
dbm_multiply_opencl.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include "../offload/offload_runtime.h"
9#if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
10
12#include "dbm_multiply_opencl.cl.h"
13
14#if defined(__DBCSR_ACC)
15#include <smm/opencl_libsmm.h>
16#endif
17
18#define DBM_TIMER_DIFF(A, B) libxsmm_timer_duration(A, B)
19#define DBM_TIMER_TICK() libxsmm_timer_tick()
20#define DBM_TIMER_TICKINT libxsmm_timer_tickint
21
22int dbm_multiply_opencl_launch_kernel(void *stream, double alpha, int ntasks,
23 int param_format, const int *params_host,
24 const int *params,
25 const double *pack_a_data,
26 const double *pack_b_data,
27 double *shard_c_data);
28
29#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
30int dbm_multiply_opencl_initialized /*= 0*/;
31int dbm_multiply_opencl_smm /*= 0*/;
32
33LIBXSMM_ATTRIBUTE_CTOR static void dbm_multiply_opencl_initialize(void) {
34 const char *const smm_env = getenv("DBM_MULTIPLY_SMM");
35 const int smm = (NULL == smm_env ? 0 /*default*/ : atoi(smm_env));
36 dbm_multiply_opencl_smm =
37 LIBXSMM_MIN(1 != smm ? smm : 64, (1 << (OPENCL_LIBSMM_PFORMAT - 1)) - 1);
38 if (0 > dbm_multiply_opencl_smm) {
39 opencl_libsmm_acc_set_dbm_launch_fn(dbm_multiply_opencl_launch_kernel);
40 }
41 ++dbm_multiply_opencl_initialized;
42}
43#endif
44
45typedef struct {
46 int max_m, max_n, max_k, mnk_changes;
47} dbm_multiply_gpu_launch_info_t;
48
49static void dbm_multiply_gpu_launch_info(dbm_multiply_gpu_launch_info_t *info,
50 const int *params, int ntasks,
51 int param_format) {
52 if (0 == param_format) { /* native */
53 const int stride = sizeof(dbm_task_t) / sizeof(int);
54 int avg_m = params[0], avg_n = params[1], avg_k = params[2], i = stride;
55 info->max_m = avg_m;
56 info->max_n = avg_n;
57 info->max_k = avg_k;
58 for (info->mnk_changes = 0; i < (ntasks * stride); i += stride) {
59 const int m = params[i + 0], n = params[i + 1], k = params[i + 2];
60 info->max_m = imax(info->max_m, m);
61 info->max_n = imax(info->max_n, n);
62 info->max_k = imax(info->max_k, k);
63 if (m != avg_m || n != avg_n || k != avg_k) { /* approximation */
64 avg_m = (avg_m + m) / 2;
65 avg_n = (avg_n + n) / 2;
66 avg_k = (avg_k + k) / 2;
67 ++info->mnk_changes;
68 }
69 }
70 } else {
71#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
72 const int mask = (1 << OPENCL_LIBSMM_PFORMAT) - 1;
73 info->max_m = mask & (param_format);
74 info->max_n = mask & (param_format >> (OPENCL_LIBSMM_PFORMAT));
75 info->max_k = mask & (param_format >> (OPENCL_LIBSMM_PFORMAT * 2));
76 info->mnk_changes = 0; /* homogeneous */
77#else
78 assert(0);
79#endif
80 }
81}
82
83static void dbm_multiply_opencl_print(FILE *stream, const char *name, int val) {
84 if (0 != val) {
85 fprintf(stream, " %s=%i", name, val);
86 }
87}
88
89int dbm_multiply_opencl_launch_kernel(void *stream, double alpha, int ntasks,
90 int param_format, const int *params_host,
91 const int *params,
92 const double *pack_a_data,
93 const double *pack_b_data,
94 double *shard_c_data) {
95 const DBM_TIMER_TICKINT start = DBM_TIMER_TICK();
96 const c_dbcsr_acc_opencl_config_t *const config = &c_dbcsr_acc_opencl_config;
97 const int verbosity = config->verbosity;
98 int result = EXIT_SUCCESS;
99 cl_event e = NULL, *const event =
100 ((0 <= verbosity && 2 >= verbosity) ? NULL : &e);
101 dbm_multiply_gpu_launch_info_t info = {0};
102 assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
103 assert(NULL != params_host || 0 == ntasks);
104 assert(NULL != params || 0 == ntasks);
105 if (0 == ntasks) {
106 return result;
107 }
108#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
109 if (0 == dbm_multiply_opencl_initialized) {
110 dbm_multiply_opencl_initialize();
111 }
112 if (0 != dbm_multiply_opencl_smm) {
113 dbm_multiply_gpu_launch_info(&info, params_host, ntasks, param_format);
114 }
115 if (0 > dbm_multiply_opencl_smm || dbm_multiply_opencl_smm < info.max_m ||
116 dbm_multiply_opencl_smm < info.max_n ||
117 dbm_multiply_opencl_smm < info.max_k || 0 == info.max_k || 1 != alpha)
118#endif
119 {
120#if defined(OPENCL_DBM_SOURCE_MULTIPLY)
121 /* creating/calling kernel must be consistent across threads */
122 static cl_kernel kernel_global = NULL;
123 static LIBXSMM_TLS cl_kernel kernel = NULL;
124 static int ndims = 1, clinear = 0;
125 static size_t wgsize[] = {0, 0, 0};
126 const c_dbcsr_acc_opencl_stream_t *const str = ACC_OPENCL_STREAM(stream);
127 const c_dbcsr_acc_opencl_device_t *const devinfo = &config->device;
128 ACC_OPENCL_LOCKTYPE *const lock_memory =
129 (NULL != devinfo->clSetKernelArgMemPointerINTEL ? NULL
130 : config->lock_memory);
131 c_dbcsr_acc_opencl_info_memptr_t adata, bdata, cdata, batch;
132 const int stride = (0 == param_format ? 6 : 3);
133 size_t work_size[] = {1, 1, 1}, ibatch = 0;
134 size_t iadata = 0, ibdata = 0, icdata = 0;
135 const size_t work_tasks = ntasks;
136 assert(NULL != str && NULL != str->queue);
137 if (NULL == kernel_global) { /* initial check if kernel is present */
138 ACC_OPENCL_ACQUIRE(config->lock_main);
139 if (NULL == kernel_global) {
140 char flags[ACC_OPENCL_BUFFERSIZE] =
141 "-cl-fast-relaxed-math -cl-denorms-are-zero";
142 const char *const gen_env = getenv("DBM_MULTIPLY_GEN");
143 const char *const lin_env = getenv("DBM_MULTIPLY_LIN");
144 const char *const bn_env = getenv("DBM_MULTIPLY_BN");
145 const char *const sm_env = getenv("DBM_MULTIPLY_SM");
146 const char *const wg_env = getenv("DBM_MULTIPLY_WG");
147 const char *const lu_env = getenv("DBM_MULTIPLY_LU");
148 const char *const xf_env = getenv("DBM_MULTIPLY_XF");
149 int sm = (NULL == sm_env ? 0 /*default*/ : atoi(sm_env));
150 const int bn0 = (0 == devinfo->nv ? (0 == devinfo->amd ? 4 : 8) : 2);
151 const int bn1 = ((0 == sm && 0 == clinear) ? bn0 : (bn0 * 2));
152 int bn = LIBXSMM_CLMP(NULL == bn_env ? bn1 : atoi(bn_env), 1, 32);
153 int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
154 int gen = ((NULL == bn_env && NULL == sm_env && NULL == wg_env &&
155 NULL == lu_env && NULL == lin_env && 0 == param_format)
156 ? (NULL == gen_env ? 1 /*default*/ : atoi(gen_env))
157 : 0);
158 const int gpu = (CL_DEVICE_TYPE_GPU == devinfo->type);
159 const int xf = (NULL == xf_env ? -1 /*default*/ : atoi(xf_env));
160 const char *extensions[] = {NULL, NULL}, *options = NULL;
161 size_t nextensions = sizeof(extensions) / sizeof(*extensions);
162 const size_t wgsize0 = devinfo->wgsize[0], wgsize1 = devinfo->wgsize[1];
163 size_t wgsize2 = devinfo->wgsize[2];
164 size_t offset =
165 ((0 == config->debug && 0 == config->dump) ? strlen(flags) : 0);
166 offset += (size_t)c_dbcsr_acc_opencl_flags_atomics(
167 devinfo, c_dbcsr_acc_opencl_atomic_fp_64, extensions, &nextensions,
168 flags + offset, sizeof(flags) - offset);
169 if (2 <= gen ||
170 (0 != gen && 0 != wgsize2 /*subgroups*/ &&
171 2 <= *devinfo->std_level && NULL != extensions[1] &&
172 NULL != strstr(extensions[1], "cl_ext_float_atomics"))) {
173 offset +=
174 (size_t)LIBXSMM_SNPRINTF(flags + offset, sizeof(flags) - offset,
175 " -DDBM_MULTIPLY_OPENCL_GEN");
176 wgsize[1] = wgsize[2] = 1;
177 wgsize[0] = 16;
178 lu = bn = 0;
179 ndims = 3;
180 } else {
181 wgsize[0] = (NULL == wg_env ? (unsigned long int)LIBXSMM_ABS(sm)
182 : strtoul(wg_env, NULL, 10));
183 if (0 != wgsize2 && 0 < wgsize[0]) { /* subgroups */
184 if (LIBXSMM_DELTA(wgsize[0], wgsize1) <=
185 LIBXSMM_DELTA(wgsize[0], wgsize2)) { /* select SG-size */
186 wgsize2 = wgsize1;
187 }
188 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize2);
189 } else {
190 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize1);
191 wgsize2 = 0;
192 }
193 wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, wgsize0);
194 sm = ((0 != sm && 0 != wgsize[0])
195 ? (LIBXSMM_ISPOT(bn * sizeof(double)) + 1)
196 : 0);
197 clinear = (NULL == lin_env ? 0 /*default*/ : atoi(lin_env));
198 offset += (size_t)LIBXSMM_SNPRINTF(
199 flags + offset, sizeof(flags) - offset,
200 " %s %s -DBN=%i -DSM=%i -DLU=%i -DWG=%i -DSG=%i",
201 0 != gpu ? "-DGPU" : "", 0 == clinear ? "" : "-DCLINEAR", bn, sm,
202 lu, (int)wgsize[0], (int)wgsize2);
203 gen = 0;
204 }
205 if (0 != devinfo->intel && 0 < xf) {
206 options = "-cl-intel-256-GRF-per-thread";
207 }
208 result |= (sizeof(flags) > offset ? EXIT_SUCCESS : EXIT_FAILURE);
209 result |= c_dbcsr_acc_opencl_kernel(
210 0 /*source_is_file*/, OPENCL_DBM_SOURCE_MULTIPLY, "dbm_multiply",
211 flags, options, NULL /*try*/, NULL /*try_ok*/, extensions,
212 nextensions, &kernel_global);
213 if (2 <= verbosity || 0 > verbosity) {
214 if (EXIT_SUCCESS == result) {
215 const double ds = DBM_TIMER_DIFF(start, DBM_TIMER_TICK());
216 fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu);
217 dbm_multiply_opencl_print(stderr, "gen", gen); /* generated */
218 dbm_multiply_opencl_print(stderr, "lin", clinear);
219 dbm_multiply_opencl_print(stderr, "bn", bn);
220 dbm_multiply_opencl_print(stderr, "sm", sm);
221 dbm_multiply_opencl_print(stderr, "wg", (int)wgsize[0]);
222 dbm_multiply_opencl_print(stderr, "sg", (int)wgsize2);
223 dbm_multiply_opencl_print(stderr, "lu", lu);
224 fprintf(stderr, " ms=%.1f\n", 1E3 * ds);
225 } else {
226 fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel failed to generate\n");
227 }
228 }
229 }
230 kernel = clCloneKernel(kernel_global, &result); /* always clone */
231 ACC_OPENCL_RELEASE(config->lock_main);
232 } else if (NULL == kernel) {
233 kernel = clCloneKernel(kernel_global, &result);
234 }
235#else
236#error "OpenCL kernel code not found!"
237#endif
238 if (NULL != lock_memory) {
239 ACC_OPENCL_ACQUIRE(lock_memory);
240 }
241 result |= c_dbcsr_acc_opencl_info_devptr_lock(&adata, NULL /*lock*/,
242 pack_a_data, 1 /*esize*/,
243 NULL /*amount*/, &iadata);
244 result |= c_dbcsr_acc_opencl_info_devptr_lock(&bdata, NULL /*lock*/,
245 pack_b_data, 1 /*esize*/,
246 NULL /*amount*/, &ibdata);
247 result |= c_dbcsr_acc_opencl_info_devptr_lock(&cdata, NULL /*lock*/,
248 shard_c_data, 1 /*esize*/,
249 NULL /*amount*/, &icdata);
250 result |= c_dbcsr_acc_opencl_info_devptr_lock(
251 &batch, NULL /*lock*/, params /*batch*/, sizeof(int) * stride,
252 &work_tasks, &ibatch);
253 if (NULL != lock_memory) {
254 ACC_OPENCL_RELEASE(lock_memory);
255 }
256 assert(0 == iadata && 0 == ibdata && 0 == icdata);
257 result |= clSetKernelArg(kernel, 0, sizeof(cl_double), &alpha);
258 result |= clSetKernelArg(kernel, 1, sizeof(cl_int), &ibatch);
259 if (NULL != event || 1 == ndims) {
260 dbm_multiply_gpu_launch_info(&info, params_host, ntasks, param_format);
261 }
262 if (1 < ndims) { /* DBM_MULTIPLY_GEN */
263 const cl_uint zero = 0;
264 assert(0 != wgsize[1] && 0 != wgsize[1] && 0 != wgsize[2]);
265 work_size[0] = 16;
266 assert(1 == work_size[1]);
267 work_size[2] = work_tasks;
268 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 2, batch.memory);
269 result |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &zero /*shape*/);
270 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, adata.memory);
271 result |= clSetKernelArg(kernel, 5, sizeof(cl_uint), &zero /*A_shape0*/);
272 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
273 result |= clSetKernelArg(kernel, 7, sizeof(cl_uint), &zero /*B_shape0*/);
274 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
275 result |= clSetKernelArg(kernel, 9, sizeof(cl_uint), &zero /*C_shape0*/);
276 } else {
277 size_t size = work_tasks;
278 size *= (0 == clinear ? info.max_m : info.max_n);
279 /* fixup to be a multiple of the WG-size */
280 work_size[0] = (0 < wgsize[0] ? LIBXSMM_UP(size, wgsize[0]) : size);
281 result |= clSetKernelArg(kernel, 2, sizeof(cl_int), &ntasks);
282 result |= clSetKernelArg(kernel, 3, sizeof(cl_int), &size);
283 result |= clSetKernelArg(kernel, 4, sizeof(cl_int), &param_format);
284 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 5, batch.memory);
285 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, adata.memory);
286 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 7, bdata.memory);
287 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
288 }
289 result |= clEnqueueNDRangeKernel(str->queue, kernel, ndims, NULL, work_size,
290 0 < wgsize[0] ? wgsize : NULL,
291 0 /*num_wait*/, NULL /*wait_list*/, event);
292 }
293#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
294 else { /* homogeneous */
295 result |= opencl_libsmm_acc_process(
296 params_host, params, ntasks, dbcsr_type_real_8, pack_a_data,
297 pack_b_data, shard_c_data, info.max_m, info.max_n, info.max_k,
298 dbm_multiply_opencl_smm, 1 /*homogeneous*/, stream, NULL /*c_stream*/,
299 info.max_m | info.max_n << OPENCL_LIBSMM_PFORMAT |
300 (info.max_k << (OPENCL_LIBSMM_PFORMAT * 2)),
301 event);
302 }
303#endif
304 if (NULL != event && NULL != *event && EXIT_SUCCESS == result &&
305 EXIT_SUCCESS == clWaitForEvents(1, event)) {
306 static LIBXSMM_TLS DBM_TIMER_TICKINT start2 = 0;
307 const DBM_TIMER_TICKINT stop = DBM_TIMER_TICK();
308 const double dhost = DBM_TIMER_DIFF(start, stop);
309 const double diter = (0 < start2 ? DBM_TIMER_DIFF(start, start2) : dhost);
310#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
311 const char *const kind = (0 >= dbm_multiply_opencl_smm ? "DBM" : "SMM");
312#else
313 const char *const kind = "DBM";
314#endif
315 const int pure = (100 * (ntasks - info.mnk_changes) + ntasks - 1) / ntasks;
316 double dkrnl = dhost, dtotl;
317 if (c_dbcsr_acc_opencl_timer_host == config->timer) {
318 cl_ulong begin = 0, end = 0;
319 const int r0 = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START,
320 sizeof(cl_ulong), &begin, NULL);
321 const int r1 = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END,
322 sizeof(cl_ulong), &end, NULL);
323 if (EXIT_SUCCESS == r0 && EXIT_SUCCESS == r1) {
324 dkrnl = 1E-9 * LIBXSMM_DELTA(begin, end);
325 }
326 }
327 start2 = stop;
328 dtotl = LIBXSMM_MIN(LIBXSMM_MIN(diter, dhost), dkrnl);
329 fprintf(stderr,
330 "INFO ACC/LIBDBM: %s-kernel mnk=%ix%ix%i pure=%i%% ntasks=%i "
331 "ims=%.1f hms=%.1f kms=%.1f gflops=%.1f\n",
332 kind, info.max_m, info.max_n, info.max_k, pure, ntasks,
333 1E+3 * diter, 1E+3 * dhost, 1E+3 * dkrnl,
334 1E-9 * info.max_m * info.max_n * info.max_k * ntasks / dtotl);
335 }
336 return result;
337}
338
339void dbm_multiply_gpu_launch_kernel(offloadStream_t stream, double alpha,
340 int ntasks, const dbm_task_t *tasks_host,
341 const dbm_task_t *tasks,
342 const double *pack_a_data,
343 const double *pack_b_data,
344 double *shard_c_data) {
345 const int result = dbm_multiply_opencl_launch_kernel(
346 stream, alpha, ntasks, 0 /*param_format*/, &tasks_host->m, &tasks->m,
347 pack_a_data, pack_b_data, shard_c_data);
348 OFFLOAD_CHECK(result);
349}
350
351#endif // defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
352
353// EOF
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static void const int const int i
static grid_library_config config
Internal struct for storing a task, ie. a single block multiplication.