(git:1f9fd2c)
Loading...
Searching...
No Matches
dbm_multiply_opencl.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7#include "../offload/offload_runtime.h"
8#if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
9
11#include "dbm_multiply_opencl.cl.h"
12#include <libxs/libxs_reg.h>
13#include <libxs/libxs_timer.h>
14#include <libxstream/libxstream_opencl.h>
15
16#if !defined(OPENCL_DBM_SOURCE_MULTIPLY)
17#error "OpenCL kernel source code not found!"
18#endif
19
20#if defined(OPENCL_LIBSMM_PFORMAT)
21#define DBM_OPENCL_LIBSMM_PFORMAT OPENCL_LIBSMM_PFORMAT
22#else
23#define DBM_OPENCL_LIBSMM_PFORMAT 8
24#endif
25
26#define DBM_OPENCL_CMEM LIBXSTREAM_CMEM
27#define DBM_TIMER_DIFF(A, B) libxs_timer_duration(A, B)
28#define DBM_TIMER_TICK() libxs_timer_tick()
29#define DBM_TIMER_TICKINT libxs_timer_tick_t
30
31#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
32#define DBM_OPENCL_DBCSR_TYPE_REAL_8 3
33typedef int (*opencl_libsmm_acc_dbm_launch_fn_t)(
34 void *stream, double alpha, int ntasks, int param_format,
35 const int *params_host, const int *params, const double *pack_a_data,
36 const double *pack_b_data, double *shard_c_data);
37void opencl_libsmm_acc_set_dbm_launch_fn(
38 opencl_libsmm_acc_dbm_launch_fn_t launch_fn);
39int opencl_libsmm_acc_process(
40 const int *host_param_stack, const int *dev_param_stack, int stack_size,
41 int datatype, const void *dev_a_data, const void *dev_b_data,
42 void *dev_c_data, int m_max, int n_max, int k_max, int max_kernel_dim,
43 int def_mnk, void *stream, void *c_stream, int param_format, void *event);
44LIBXS_PRAGMA_WEAK(opencl_libsmm_acc_set_dbm_launch_fn)
45LIBXS_PRAGMA_WEAK(opencl_libsmm_acc_process)
46#endif
47
48/* Dispatch key for specialized kernels (must be memset-initialized).
49 For homogeneous batches: m,n,k hold the exact shape (bk=k).
50 For heterogeneous batches: m=n=k=0, bk holds the unroll threshold,
51 max_m enables compile-time division in the flat dispatch mapping. */
52typedef struct {
53 int bk;
54 int m, n, k;
55 int max_m;
56} dbm_multiply_opencl_key_t;
57
58typedef struct {
59 int max_m, max_n, max_k, mnk_changes;
60} dbm_multiply_gpu_launch_info_t;
61
62#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
63int dbm_multiply_opencl_initialized /*= 0*/;
64int dbm_multiply_opencl_smm /*= 0*/;
65#endif
66
67int dbm_multiply_opencl_launch_kernel(void *stream, double alpha, int ntasks,
68 int param_format, const int *params_host,
69 const int *params,
70 const double *pack_a_data,
71 const double *pack_b_data,
72 double *shard_c_data);
73
74#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
75LIBXS_ATTRIBUTE_CTOR static void dbm_multiply_opencl_initialize(void) {
76 const char *const smm_env = getenv("DBM_MULTIPLY_SMM");
77 const int smm = (NULL == smm_env ? 0 /*default*/ : atoi(smm_env));
78 dbm_multiply_opencl_smm =
79 LIBXS_MIN(1 != smm ? smm : 64,
80 (1 << (DBM_OPENCL_LIBSMM_PFORMAT - 1)) - 1);
81 if (0 > dbm_multiply_opencl_smm &&
82 NULL != opencl_libsmm_acc_set_dbm_launch_fn) {
83 opencl_libsmm_acc_set_dbm_launch_fn(dbm_multiply_opencl_launch_kernel);
84 }
85 LIBXS_ATOMIC_STORE(&dbm_multiply_opencl_initialized, 1, LIBXS_ATOMIC_SEQ_CST);
86}
87#endif
88
89static int dbm_multiply_gpu_launch_info(dbm_multiply_gpu_launch_info_t *info,
90 const int *params, int ntasks,
91 int param_format, int stop_at_impure) {
92 assert(0 < ntasks);
93 if (0 == param_format) { /* native */
94 const int stride = sizeof(dbm_task_t) / sizeof(int);
95 const int first_m = params[0], first_n = params[1], first_k = params[2];
96 int i = stride;
97 info->max_m = first_m;
98 info->max_n = first_n;
99 info->max_k = first_k;
100 for (info->mnk_changes = 0; i < (ntasks * stride); i += stride) {
101 const int m = params[i + 0], n = params[i + 1], k = params[i + 2];
102 info->max_m = imax(info->max_m, m);
103 info->max_n = imax(info->max_n, n);
104 info->max_k = imax(info->max_k, k);
105 if (m != first_m || n != first_n || k != first_k) {
106 info->mnk_changes = 1;
107 if (0 != stop_at_impure) {
108 return 0;
109 }
110 }
111 }
112 } else {
113#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
114 const int mask = (1 << DBM_OPENCL_LIBSMM_PFORMAT) - 1;
115 info->max_m = mask & (param_format);
116 info->max_n = mask & (param_format >> (DBM_OPENCL_LIBSMM_PFORMAT));
117 info->max_k = mask & (param_format >> (DBM_OPENCL_LIBSMM_PFORMAT * 2));
118 info->mnk_changes = 0; /* homogeneous */
119#else
120 assert(0);
121#endif
122 }
123 return 1;
124}
125
126static void dbm_multiply_opencl_print(FILE *stream, const char *name, int val) {
127 if (0 != val) {
128 fprintf(stream, " %s=%i", name, val);
129 }
130}
131
132static int dbm_multiply_opencl_bk(int max_k) {
133 int result = 1;
134 if (16 <= max_k) {
135 result = 16;
136 } else if (8 <= max_k) {
137 result = 8;
138 } else if (4 <= max_k) {
139 result = 4;
140 }
141 return result;
142}
143
144int dbm_multiply_opencl_launch_kernel(void *stream, double alpha, int ntasks,
145 int param_format, const int *params_host,
146 const int *params,
147 const double *pack_a_data,
148 const double *pack_b_data,
149 double *shard_c_data) {
150 const DBM_TIMER_TICKINT start = DBM_TIMER_TICK();
151 const libxstream_opencl_config_t *const config = &libxstream_opencl_config;
152 const int verbosity = config->verbosity,
153 trace = (0 > verbosity || 2 < verbosity);
154 int result = EXIT_SUCCESS;
155#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
156 int dbcsr = 0;
157#endif
158 dbm_multiply_gpu_launch_info_t task = {0};
159 int task_complete = 0;
160 assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
161 assert(NULL != params_host || 0 == ntasks);
162 assert(NULL != params || 0 == ntasks);
163 if (0 < ntasks) {
164#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
165 if (0 == LIBXS_ATOMIC_LOAD(&dbm_multiply_opencl_initialized,
166 LIBXS_ATOMIC_SEQ_CST)) {
167 dbm_multiply_opencl_initialize();
168 }
169 if (0 != dbm_multiply_opencl_smm || 0 != trace) {
170 task_complete = dbm_multiply_gpu_launch_info(
171 &task, params_host, ntasks, param_format, 0 == trace);
172 }
173 if (NULL == opencl_libsmm_acc_process || 0 > dbm_multiply_opencl_smm ||
174 0 != task.mnk_changes ||
175 dbm_multiply_opencl_smm < task.max_m ||
176 dbm_multiply_opencl_smm < task.max_n ||
177 dbm_multiply_opencl_smm < task.max_k || 0 == task.max_k || 1 != alpha)
178#endif
179 { /* base init state: computed once, shared across all specializations */
180 static int clinear = 0, sgbcst = 0, bk_max = 0;
181 static int nz = 0, blkrd = 0, base_ready = 0;
182 static size_t wgsize[] = {1, 1, 1}, sgsize_s = 0;
183 static char base_flags[LIBXSTREAM_BUFFERSIZE];
184 static const char *base_options /*= NULL*/;
185 static const char *base_source /*= NULL*/;
186 static const char *base_exts[2] /*= {NULL, NULL}*/;
187 static size_t base_nexts, base_source_kind;
188 /* registry of BK-specialized kernels */
189 static libxs_registry_t *kernel_registry /*= NULL*/;
190 /* serializes kernel compilation (get-compile-set) */
191 static libxs_lock_t compile_lock /*= LIBXS_LOCK_INITIALIZER*/;
192 /* serializes kernel arg setting + enqueue (no TLS clones needed) */
193 static libxs_lock_t kernel_lock /*= LIBXS_LOCK_INITIALIZER*/;
194 const libxstream_opencl_stream_t *const str =
195 (const libxstream_opencl_stream_t *)(stream);
196 const libxstream_opencl_device_t *const devinfo = &config->device;
197 libxs_lock_t *const lock_memory =
198 (NULL != devinfo->clSetKernelArgMemPointerINTEL
199 ? NULL
200 : config->lock_memory);
201 libxstream_opencl_info_memptr_t adata, bdata, cdata, batch;
202 const int stride = (0 == param_format ? 6 : 3);
203 size_t work_size[] = {1, 1, 1}, ibatch = 0;
204 const size_t work_tasks = ntasks;
205 cl_kernel kernel = NULL;
206 int bk, bk0, use_blkrd = 0;
207 cl_int size;
208 assert(NULL != str && NULL != str->queue);
209 /* base init (once): env vars, flags, source, extensions */
210 if (0 == LIBXS_ATOMIC_LOAD(&base_ready, LIBXS_ATOMIC_SEQ_CST)) {
211 LIBXS_LOCK_ACQUIRE(LIBXS_LOCK, config->lock_main);
212 if (0 == base_ready) {
213 const char *const krn_env = getenv("DBM_MULTIPLY_KERNEL");
214 const char *const sgb_env = getenv("DBM_MULTIPLY_SGB");
215 const char *const blk_env = getenv("DBM_MULTIPLY_BLK");
216 const char *const lin_env = getenv("DBM_MULTIPLY_LIN");
217 const char *const fp_env = getenv("DBM_MULTIPLY_FP");
218 const char *const bn_env = getenv("DBM_MULTIPLY_BN");
219 const char *const bk_env = getenv("DBM_MULTIPLY_BK");
220 const char *const sm_env = getenv("DBM_MULTIPLY_SM");
221 const char *const wg_env = getenv("DBM_MULTIPLY_WG");
222 const char *const lu_env = getenv("DBM_MULTIPLY_LU");
223 const char *const ro_env = getenv("DBM_MULTIPLY_RO");
224 const char *const xf_env = getenv("DBM_MULTIPLY_XF");
225 const char *const nz_env = getenv("DBM_MULTIPLY_NZ");
226 const char *options = NULL;
227 const int dd = (0 != config->debug && 0 != config->dump);
228 const int ro = (NULL == ro_env ? -1 /*default*/ : atoi(ro_env));
229 const int xf = (NULL == xf_env ? -1 /*default*/ : atoi(xf_env));
230 const int sm0 = (NULL == sm_env ? 0 : atoi(sm_env));
231 int source_kind = 0, sm = LIBXS_ABS(sm0);
232 const int bn0 = (0 == devinfo->nv ? 8 : 2);
233 const int bn1 = ((0 == sm && 0 == clinear) ? bn0 : (bn0 * sm * 2));
234 const int gpu = (CL_DEVICE_TYPE_GPU == devinfo->type);
235 const int precision = (NULL == fp_env ? 0 /*default*/ : atoi(fp_env));
236 int bn = LIBXS_CLMP(NULL == bn_env ? bn1 : atoi(bn_env), 1, 32);
237 int lu = LIBXS_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
238 size_t sgsize = devinfo->wgsize[2];
239 size_t offset;
240 const char *source = OPENCL_DBM_SOURCE_MULTIPLY, *cmem = NULL;
241 LIBXS_MEMZERO(base_flags);
242 LIBXS_SNPRINTF(base_flags, sizeof(base_flags),
243 "-cl-fast-relaxed-math -cl-denorms-are-zero");
244 offset = (0 == dd ? strlen(base_flags) : 0);
245 base_exts[0] = base_exts[1] = NULL;
246 base_nexts = sizeof(base_exts) / sizeof(*base_exts);
247 offset += (size_t)libxstream_opencl_flags_atomics(
248 devinfo, libxstream_opencl_atomic_fp_64, base_exts, &base_nexts,
249 base_flags + offset, sizeof(base_flags) - offset);
250 if (NULL != krn_env) {
251 FILE *const krn_file = fopen(krn_env, "rb");
252 if (NULL != krn_file) {
253 fclose(krn_file);
254 source = krn_env;
255 source_kind = 1;
256 }
257 }
258 wgsize[0] = (NULL == wg_env ? LIBXS_MAX((unsigned long int)sm,
259 devinfo->wgsize[1])
260 : strtoul(wg_env, NULL, 10));
261 if (1 < sgsize && 0 < wgsize[0]) { /* subgroups */
262 if (LIBXS_DELTA(wgsize[0], devinfo->wgsize[1]) <=
263 LIBXS_DELTA(wgsize[0], sgsize)) {
264 sgsize = devinfo->wgsize[1];
265 }
266 wgsize[0] = LIBXS_UP(wgsize[0], sgsize);
267 } else {
268 wgsize[0] = LIBXS_UP(wgsize[0], devinfo->wgsize[1]);
269 sgsize = 0;
270 }
271 { /* 256-GRF */
272 const int biggrf = (0 <= xf ? xf : devinfo->biggrf);
273 const size_t max_wgs =
274 (0 != biggrf) ? devinfo->wgsize[0] / 2 : devinfo->wgsize[0];
275 wgsize[0] = LIBXS_CLMP(wgsize[0], 0, max_wgs);
276 if (0 != biggrf && 0 != devinfo->intel && 0 == devinfo->biggrf) {
277 options = "-cl-intel-256-GRF-per-thread";
278 }
279 }
280 sm = ((0 != sm && 0 != wgsize[0])
281 ? (LIBXS_ISPOT(bn * sizeof(double)) + 1)
282 : 0);
283 clinear = (NULL == lin_env ? 0 /*default*/ : atoi(lin_env));
284 sgbcst = (0 != gpu && 0 < sgsize && 0 < wgsize[0] &&
285 2 <= devinfo->std_level[0] &&
286 (NULL == sgb_env ? 1 /*default*/ : (0 != atoi(sgb_env))));
287 cmem =
288#if defined(DBM_OPENCL_CMEM)
289 (0 > ro && EXIT_SUCCESS == libxstream_opencl_use_cmem(devinfo))
290 ? "constant"
291 :
292#endif
293 (0 >= ro ? "global" : "constant");
294 blkrd = (0 == clinear && 0 != devinfo->intel && 0 < (int)sgsize &&
295 (NULL == blk_env ? 1 /*default*/ : (0 != atoi(blk_env))));
296 if (0 != blkrd && 'g' != cmem[0]) {
297 cmem = "global"; /* block reads require global address space */
298 }
299 offset += (size_t)LIBXS_SNPRINTF(
300 base_flags + offset, sizeof(base_flags) - offset,
301 " %s %s -DCONSTANT=%s"
302 " -DBN=%i -DSM=%i -DLU=%i -DSG=%i -DINTEL=%i",
303 0 != gpu ? "-DGPU" : "", 0 == clinear ? "" : "-DCLINEAR", cmem,
304 bn, sm, lu, (int)sgsize, (int)(0 != devinfo->intel));
305 if (0 != precision) {
306 offset += (size_t)LIBXS_SNPRINTF(base_flags + offset,
307 sizeof(base_flags) - offset,
308 " -DPRECISION=%i", precision);
309 }
310 bk_max = (NULL == bk_env ? 0 /*default*/ : atoi(bk_env));
311 nz = (NULL == nz_env ? 0 /*default*/ : atoi(nz_env));
312 if (0 != nz) {
313 offset += (size_t)LIBXS_SNPRINTF(base_flags + offset,
314 sizeof(base_flags) - offset,
315 " -DNZ=%i", nz);
316 }
317 sgsize_s = sgsize;
318 base_source = source;
319 base_source_kind = source_kind;
320 base_options = options;
321 kernel_registry = libxs_registry_create();
322 if (2 <= verbosity || 0 > verbosity) {
323 fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu);
324 dbm_multiply_opencl_print(stderr, "sgb", sgbcst);
325 dbm_multiply_opencl_print(stderr, "lin", clinear);
326 dbm_multiply_opencl_print(stderr, "fp", precision);
327 dbm_multiply_opencl_print(stderr, "bn", bn);
328 dbm_multiply_opencl_print(stderr, "sm", sm);
329 dbm_multiply_opencl_print(stderr, "wg", (int)wgsize[0]);
330 dbm_multiply_opencl_print(stderr, "sg", (int)sgsize);
331 dbm_multiply_opencl_print(stderr, "lu", lu);
332 dbm_multiply_opencl_print(stderr, "nz", nz);
333 dbm_multiply_opencl_print(stderr, "blk", blkrd);
334 fprintf(stderr, " -> %.1f ms\n",
335 1E3 * DBM_TIMER_DIFF(start, DBM_TIMER_TICK()));
336 }
337 LIBXS_ATOMIC_STORE(&base_ready, 1, LIBXS_ATOMIC_SEQ_CST);
338 }
339 LIBXS_LOCK_RELEASE(LIBXS_LOCK, config->lock_main);
340 }
341 /* per-launch: compute task info and dispatch key */
342#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
343 if (0 == task_complete)
344#endif
345 {
346 task_complete = dbm_multiply_gpu_launch_info(&task, params_host, ntasks,
347 param_format, 0);
348 }
349 bk0 = dbm_multiply_opencl_bk(task.max_k);
350 bk = (0 < bk_max ? LIBXS_MIN(bk0, bk_max) : bk0);
351 { /* per-shape kernel lookup/compile */
352 dbm_multiply_opencl_key_t key;
353 cl_kernel *kptr;
354 LIBXS_MEMZERO(&key);
355 key.bk = bk;
356 if (0 == task.mnk_changes) { /* homogeneous: exact shape */
357 key.m = task.max_m;
358 key.n = task.max_n;
359 key.k = task.max_k;
360 key.bk = task.max_k; /* exact K for full unrolling */
361 } else { /* heterogeneous: max_m for compile-time division.
362 With BLKRD_P (per-task dispatch), max_m is unused
363 so omit it to consolidate into fewer kernels. */
364 if (0 == blkrd) {
365 key.max_m = (0 == clinear ? task.max_m : task.max_n);
366 }
367 }
368 use_blkrd = (0 != blkrd && 0 != key.m && 16 <= key.m &&
369 key.m <= (int)sgsize_s && 0 == (key.m & (key.m - 1)));
370 kptr = (cl_kernel *)libxs_registry_get(
371 kernel_registry, &key, sizeof(key),
372 libxs_registry_lock(kernel_registry));
373 if (NULL == kptr || NULL == *kptr) { /* compile specialization */
374 LIBXS_LOCK_ACQUIRE(LIBXS_LOCK, &compile_lock);
375 kptr = (cl_kernel *)libxs_registry_get(
376 kernel_registry, &key, sizeof(key),
377 libxs_registry_lock(kernel_registry));
378 if (NULL == kptr || NULL == *kptr) {
379 char flags[LIBXSTREAM_BUFFERSIZE];
380 const cl_device_id device_id = config->devices[config->device_id];
381 cl_kernel kernel_new = NULL;
382 size_t wgs[3];
383 if (0 != key.m) { /* homogeneous: add shape defines */
384 const int use_wg = (0 != use_blkrd || 0 != sgbcst);
385 const int n = LIBXS_SNPRINTF(
386 flags, sizeof(flags),
387 "%s -DWG=%i -DBK=%i -DDBM_M=%i -DDBM_N=%i -DDBM_K=%i%s%s",
388 base_flags, use_wg ? (int)wgsize[0] : 0, key.bk, key.m, key.n,
389 key.k, 0 != use_blkrd ? " -DBLKRD_A" : "",
390 (0 != sgbcst && 0 == use_blkrd) ? " -DSGBCST" : "");
391 assert(0 < n && (size_t)n < sizeof(flags));
392 LIBXS_UNUSED(n);
393 } else if (0 < key.max_m) { /* heterogeneous with known max_m */
394 const int n = LIBXS_SNPRINTF(
395 flags, sizeof(flags), "%s -DWG=%i -DBK=%i -DMAX_M=%i%s%s",
396 base_flags, (int)wgsize[0], bk, key.max_m,
397 0 != blkrd ? " -DBLKRD_P" : "",
398 (0 != sgbcst && 0 == blkrd) ? " -DSGBCST" : "");
399 assert(0 < n && (size_t)n < sizeof(flags));
400 LIBXS_UNUSED(n);
401 } else { /* heterogeneous: BK only */
402 const int n = LIBXS_SNPRINTF(
403 flags, sizeof(flags), "%s -DWG=%i -DBK=%i%s%s", base_flags,
404 (int)wgsize[0], bk, 0 != blkrd ? " -DBLKRD_P" : "",
405 (0 != sgbcst && 0 == blkrd) ? " -DSGBCST" : "");
406 assert(0 < n && (size_t)n < sizeof(flags));
407 LIBXS_UNUSED(n);
408 }
409 result |= libxstream_opencl_kernel(
410 base_source_kind, base_source, "dbm_multiply", flags,
411 base_options, NULL /*try*/, NULL /*try_ok*/, base_exts,
412 base_nexts, &kernel_new);
413 if (EXIT_SUCCESS == result &&
414 EXIT_SUCCESS ==
415 clGetKernelWorkGroupInfo(kernel_new, device_id,
416 CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
417 sizeof(wgs), wgs, NULL) &&
418 0 != wgs[0] && 0 != wgs[1]) {
419 wgsize[0] = wgs[0];
420 wgsize[1] = wgs[1];
421 }
422 kptr = (cl_kernel *)libxs_registry_set(
423 kernel_registry, &key, sizeof(key), &kernel_new,
424 sizeof(kernel_new), libxs_registry_lock(kernel_registry));
425 if (2 <= verbosity || 0 > verbosity || EXIT_SUCCESS != result) {
426 const char *const kind =
427 (EXIT_SUCCESS == result ? "INFO" : "ERROR");
428 fprintf(stderr, "%s ACC/LIBDBM: DBM-kernel bk=%i", kind, key.bk);
429 if (0 != key.m) {
430 fprintf(stderr, " mnk=%ix%ix%i", key.m, key.n, key.k);
431 }
432 fprintf(stderr, " -> ");
433 if (EXIT_SUCCESS == result) {
434 fprintf(stderr, "%.1f ms\n",
435 1E3 * DBM_TIMER_DIFF(start, DBM_TIMER_TICK()));
436 } else {
437 fprintf(stderr, "FAILED!\n");
438 }
439 }
440 }
441 LIBXS_LOCK_RELEASE(LIBXS_LOCK, &compile_lock);
442 }
443 kernel = (NULL != kptr) ? *kptr : NULL;
444 }
445 LIBXS_LOCK_ACQUIRE(LIBXS_LOCK, &kernel_lock);
446 if (NULL != lock_memory) {
447 LIBXS_LOCK_ACQUIRE(LIBXS_LOCK, lock_memory);
448 }
449 { /* assume A, B, and C do not carry an offset */
450 size_t iadata = 0, ibdata = 0, icdata = 0;
451 result |= libxstream_opencl_info_devptr_lock(&adata, NULL /*lock*/,
452 pack_a_data, 1 /*esize*/,
453 NULL /*amount*/, &iadata);
454 result |= libxstream_opencl_info_devptr_lock(&bdata, NULL /*lock*/,
455 pack_b_data, 1 /*esize*/,
456 NULL /*amount*/, &ibdata);
457 result |= libxstream_opencl_info_devptr_lock(&cdata, NULL /*lock*/,
458 shard_c_data, 1 /*esize*/,
459 NULL /*amount*/, &icdata);
460 assert(0 == iadata && 0 == ibdata && 0 == icdata);
461 }
462 result |= libxstream_opencl_info_devptr_lock(
463 &batch, NULL /*lock*/, params /*batch*/, sizeof(int) * stride,
464 &work_tasks, &ibatch);
465 if (NULL != lock_memory) {
466 LIBXS_LOCK_RELEASE(LIBXS_LOCK, lock_memory);
467 }
468 if (EXIT_SUCCESS ==
469 result) { /* determine dispatch mode: per-task vs flat */
470 const int per_task = (0 != sgbcst && 0 == use_blkrd) ||
471 (0 != blkrd && 0 != task.mnk_changes);
472 const int use_wg =
473 (0 != task.mnk_changes || 0 != use_blkrd || 0 != sgbcst);
474 size = (cl_int)(work_tasks * (0 == clinear ? task.max_m : task.max_n));
475 if (per_task) {
476 work_size[0] = work_tasks * wgsize[0];
477 } else if (use_wg) {
478 work_size[0] = LIBXS_UP((size_t)size, wgsize[0]);
479 } else {
480 work_size[0] = (size_t)size;
481 }
482 result |= clSetKernelArg(kernel, 0, sizeof(cl_double), &alpha);
483 result |= clSetKernelArg(kernel, 1, sizeof(cl_int), &ibatch);
484 result |= clSetKernelArg(kernel, 2, sizeof(cl_int), &ntasks);
485 result |= clSetKernelArg(kernel, 3, sizeof(cl_int), &size);
486 result |= clSetKernelArg(kernel, 4, sizeof(cl_int), &param_format);
487 result |= libxstream_opencl_set_kernel_ptr(kernel, 5, batch.memory);
488 result |= libxstream_opencl_set_kernel_ptr(kernel, 6, adata.memory);
489 result |= libxstream_opencl_set_kernel_ptr(kernel, 7, bdata.memory);
490 result |= libxstream_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
491 result |=
492 clEnqueueNDRangeKernel(str->queue, kernel, 1, NULL, work_size,
493 0 < wgsize[0] ? wgsize : NULL,
494 0 /*num_wait*/, NULL /*wait_list*/, NULL);
495 }
496 LIBXS_LOCK_RELEASE(LIBXS_LOCK, &kernel_lock);
497 }
498#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
499 else { /* homogeneous */
500 result |= opencl_libsmm_acc_process(
501 params_host, params, ntasks, DBM_OPENCL_DBCSR_TYPE_REAL_8,
502 pack_a_data, pack_b_data, shard_c_data, task.max_m, task.max_n,
503 task.max_k, dbm_multiply_opencl_smm, 1 /*homogeneous*/, stream,
504 NULL /*c_stream*/,
505 task.max_m | task.max_n << DBM_OPENCL_LIBSMM_PFORMAT |
506 (task.max_k << (DBM_OPENCL_LIBSMM_PFORMAT * 2)),
507 NULL);
508 dbcsr = 1;
509 }
510#endif
511 if (0 != trace && EXIT_SUCCESS == result) {
512 static LIBXS_TLS DBM_TIMER_TICKINT start2 = 0;
513 const DBM_TIMER_TICKINT stop = DBM_TIMER_TICK();
514 const double dhost = DBM_TIMER_DIFF(start, stop);
515 const double diter = (0 < start2 ? DBM_TIMER_DIFF(start, start2) : dhost);
516#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
517 const char *const kind = (0 == dbcsr ? "DBM" : "SMM");
518#else
519 const char *const kind = "DBM";
520#endif
521 const int pure = (0 == task.mnk_changes ? 100 : 0);
522 const double dtotl = LIBXS_MAX(diter, dhost);
523 start2 = stop;
524 fprintf(stderr,
525 "INFO ACC/LIBDBM: %s-kernel mnk=%ix%ix%i "
526 "pure=%i%% ntasks=%i ms=%.1f\n",
527 kind, task.max_m, task.max_n, task.max_k, pure, ntasks,
528 1E+3 * dtotl);
529 }
530 }
531 return result;
532}
533
534void dbm_multiply_gpu_launch_kernel(offloadStream_t stream, double alpha,
535 int ntasks, const dbm_task_t *tasks_host,
536 const dbm_task_t *tasks,
537 const double *pack_a_data,
538 const double *pack_b_data,
539 double *shard_c_data) {
540 const int result = dbm_multiply_opencl_launch_kernel(
541 stream, alpha, ntasks, 0 /*param_format*/, &tasks_host->m, &tasks->m,
542 pack_a_data, pack_b_data, shard_c_data);
543 OFFLOAD_CHECK(result);
544}
545
546#endif /* defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM) */
547
548/* EOF */
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static void const int const int i
static grid_library_config config
Internal struct for storing a task, ie. a single block multiplication.