7#include "../offload/offload_runtime.h"
8#if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
11#include "dbm_multiply_opencl.cl.h"
12#include <libxs/libxs_reg.h>
13#include <libxs/libxs_timer.h>
14#include <libxstream/libxstream_opencl.h>
16#if !defined(OPENCL_DBM_SOURCE_MULTIPLY)
17#error "OpenCL kernel source code not found!"
20#if defined(OPENCL_LIBSMM_PFORMAT)
21#define DBM_OPENCL_LIBSMM_PFORMAT OPENCL_LIBSMM_PFORMAT
23#define DBM_OPENCL_LIBSMM_PFORMAT 8
26#define DBM_OPENCL_CMEM LIBXSTREAM_CMEM
27#define DBM_TIMER_DIFF(A, B) libxs_timer_duration(A, B)
28#define DBM_TIMER_TICK() libxs_timer_tick()
29#define DBM_TIMER_TICKINT libxs_timer_tick_t
31#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
32#define DBM_OPENCL_DBCSR_TYPE_REAL_8 3
33typedef int (*opencl_libsmm_acc_dbm_launch_fn_t)(
34 void *stream,
double alpha,
int ntasks,
int param_format,
35 const int *params_host,
const int *params,
const double *pack_a_data,
36 const double *pack_b_data,
double *shard_c_data);
37void opencl_libsmm_acc_set_dbm_launch_fn(
38 opencl_libsmm_acc_dbm_launch_fn_t launch_fn);
39int opencl_libsmm_acc_process(
40 const int *host_param_stack,
const int *dev_param_stack,
int stack_size,
41 int datatype,
const void *dev_a_data,
const void *dev_b_data,
42 void *dev_c_data,
int m_max,
int n_max,
int k_max,
int max_kernel_dim,
43 int def_mnk,
void *stream,
void *c_stream,
int param_format,
void *event);
44LIBXS_PRAGMA_WEAK(opencl_libsmm_acc_set_dbm_launch_fn)
45LIBXS_PRAGMA_WEAK(opencl_libsmm_acc_process)
56} dbm_multiply_opencl_key_t;
59 int max_m, max_n, max_k, mnk_changes;
60} dbm_multiply_gpu_launch_info_t;
62#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
63int dbm_multiply_opencl_initialized ;
64int dbm_multiply_opencl_smm ;
67int dbm_multiply_opencl_launch_kernel(
void *stream,
double alpha,
int ntasks,
68 int param_format,
const int *params_host,
70 const double *pack_a_data,
71 const double *pack_b_data,
72 double *shard_c_data);
74#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
75LIBXS_ATTRIBUTE_CTOR
static void dbm_multiply_opencl_initialize(
void) {
76 const char *
const smm_env = getenv(
"DBM_MULTIPLY_SMM");
77 const int smm = (NULL == smm_env ? 0 : atoi(smm_env));
78 dbm_multiply_opencl_smm =
79 LIBXS_MIN(1 != smm ? smm : 64,
80 (1 << (DBM_OPENCL_LIBSMM_PFORMAT - 1)) - 1);
81 if (0 > dbm_multiply_opencl_smm &&
82 NULL != opencl_libsmm_acc_set_dbm_launch_fn) {
83 opencl_libsmm_acc_set_dbm_launch_fn(dbm_multiply_opencl_launch_kernel);
85 LIBXS_ATOMIC_STORE(&dbm_multiply_opencl_initialized, 1, LIBXS_ATOMIC_SEQ_CST);
89static int dbm_multiply_gpu_launch_info(dbm_multiply_gpu_launch_info_t *info,
90 const int *params,
int ntasks,
91 int param_format,
int stop_at_impure) {
93 if (0 == param_format) {
94 const int stride =
sizeof(
dbm_task_t) /
sizeof(
int);
95 const int first_m = params[0], first_n = params[1], first_k = params[2];
97 info->max_m = first_m;
98 info->max_n = first_n;
99 info->max_k = first_k;
100 for (info->mnk_changes = 0;
i < (ntasks * stride);
i += stride) {
101 const int m = params[
i + 0], n = params[
i + 1], k = params[
i + 2];
102 info->max_m =
imax(info->max_m, m);
103 info->max_n =
imax(info->max_n, n);
104 info->max_k =
imax(info->max_k, k);
105 if (m != first_m || n != first_n || k != first_k) {
106 info->mnk_changes = 1;
107 if (0 != stop_at_impure) {
113#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
114 const int mask = (1 << DBM_OPENCL_LIBSMM_PFORMAT) - 1;
115 info->max_m = mask & (param_format);
116 info->max_n = mask & (param_format >> (DBM_OPENCL_LIBSMM_PFORMAT));
117 info->max_k = mask & (param_format >> (DBM_OPENCL_LIBSMM_PFORMAT * 2));
118 info->mnk_changes = 0;
126static void dbm_multiply_opencl_print(FILE *stream,
const char *name,
int val) {
128 fprintf(stream,
" %s=%i", name, val);
132static int dbm_multiply_opencl_bk(
int max_k) {
136 }
else if (8 <= max_k) {
138 }
else if (4 <= max_k) {
144int dbm_multiply_opencl_launch_kernel(
void *stream,
double alpha,
int ntasks,
145 int param_format,
const int *params_host,
147 const double *pack_a_data,
148 const double *pack_b_data,
149 double *shard_c_data) {
150 const DBM_TIMER_TICKINT start = DBM_TIMER_TICK();
151 const libxstream_opencl_config_t *
const config = &libxstream_opencl_config;
152 const int verbosity =
config->verbosity,
153 trace = (0 > verbosity || 2 < verbosity);
154 int result = EXIT_SUCCESS;
155#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
158 dbm_multiply_gpu_launch_info_t task = {0};
159 int task_complete = 0;
160 assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
161 assert(NULL != params_host || 0 == ntasks);
162 assert(NULL != params || 0 == ntasks);
164#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
165 if (0 == LIBXS_ATOMIC_LOAD(&dbm_multiply_opencl_initialized,
166 LIBXS_ATOMIC_SEQ_CST)) {
167 dbm_multiply_opencl_initialize();
169 if (0 != dbm_multiply_opencl_smm || 0 != trace) {
170 task_complete = dbm_multiply_gpu_launch_info(
171 &task, params_host, ntasks, param_format, 0 == trace);
173 if (NULL == opencl_libsmm_acc_process || 0 > dbm_multiply_opencl_smm ||
174 0 != task.mnk_changes ||
175 dbm_multiply_opencl_smm < task.max_m ||
176 dbm_multiply_opencl_smm < task.max_n ||
177 dbm_multiply_opencl_smm < task.max_k || 0 == task.max_k || 1 != alpha)
180 static int clinear = 0, sgbcst = 0, bk_max = 0;
181 static int nz = 0, blkrd = 0, base_ready = 0;
182 static size_t wgsize[] = {1, 1, 1}, sgsize_s = 0;
183 static char base_flags[LIBXSTREAM_BUFFERSIZE];
184 static const char *base_options ;
185 static const char *base_source ;
186 static const char *base_exts[2] ;
187 static size_t base_nexts, base_source_kind;
189 static libxs_registry_t *kernel_registry ;
191 static libxs_lock_t compile_lock ;
193 static libxs_lock_t kernel_lock ;
194 const libxstream_opencl_stream_t *
const str =
195 (
const libxstream_opencl_stream_t *)(stream);
196 const libxstream_opencl_device_t *
const devinfo = &
config->device;
197 libxs_lock_t *
const lock_memory =
198 (NULL != devinfo->clSetKernelArgMemPointerINTEL
201 libxstream_opencl_info_memptr_t adata, bdata, cdata, batch;
202 const int stride = (0 == param_format ? 6 : 3);
203 size_t work_size[] = {1, 1, 1}, ibatch = 0;
204 const size_t work_tasks = ntasks;
205 cl_kernel kernel = NULL;
206 int bk, bk0, use_blkrd = 0;
208 assert(NULL != str && NULL != str->queue);
210 if (0 == LIBXS_ATOMIC_LOAD(&base_ready, LIBXS_ATOMIC_SEQ_CST)) {
211 LIBXS_LOCK_ACQUIRE(LIBXS_LOCK,
config->lock_main);
212 if (0 == base_ready) {
213 const char *
const krn_env = getenv(
"DBM_MULTIPLY_KERNEL");
214 const char *
const sgb_env = getenv(
"DBM_MULTIPLY_SGB");
215 const char *
const blk_env = getenv(
"DBM_MULTIPLY_BLK");
216 const char *
const lin_env = getenv(
"DBM_MULTIPLY_LIN");
217 const char *
const fp_env = getenv(
"DBM_MULTIPLY_FP");
218 const char *
const bn_env = getenv(
"DBM_MULTIPLY_BN");
219 const char *
const bk_env = getenv(
"DBM_MULTIPLY_BK");
220 const char *
const sm_env = getenv(
"DBM_MULTIPLY_SM");
221 const char *
const wg_env = getenv(
"DBM_MULTIPLY_WG");
222 const char *
const lu_env = getenv(
"DBM_MULTIPLY_LU");
223 const char *
const ro_env = getenv(
"DBM_MULTIPLY_RO");
224 const char *
const xf_env = getenv(
"DBM_MULTIPLY_XF");
225 const char *
const nz_env = getenv(
"DBM_MULTIPLY_NZ");
226 const char *options = NULL;
227 const int dd = (0 !=
config->debug && 0 !=
config->dump);
228 const int ro = (NULL == ro_env ? -1 : atoi(ro_env));
229 const int xf = (NULL == xf_env ? -1 : atoi(xf_env));
230 const int sm0 = (NULL == sm_env ? 0 : atoi(sm_env));
231 int source_kind = 0, sm = LIBXS_ABS(sm0);
232 const int bn0 = (0 == devinfo->nv ? 8 : 2);
233 const int bn1 = ((0 == sm && 0 == clinear) ? bn0 : (bn0 * sm * 2));
234 const int gpu = (CL_DEVICE_TYPE_GPU == devinfo->type);
235 const int precision = (NULL == fp_env ? 0 : atoi(fp_env));
236 int bn = LIBXS_CLMP(NULL == bn_env ? bn1 : atoi(bn_env), 1, 32);
237 int lu = LIBXS_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
238 size_t sgsize = devinfo->wgsize[2];
240 const char *source = OPENCL_DBM_SOURCE_MULTIPLY, *cmem = NULL;
241 LIBXS_MEMZERO(base_flags);
242 LIBXS_SNPRINTF(base_flags,
sizeof(base_flags),
243 "-cl-fast-relaxed-math -cl-denorms-are-zero");
244 offset = (0 == dd ? strlen(base_flags) : 0);
245 base_exts[0] = base_exts[1] = NULL;
246 base_nexts =
sizeof(base_exts) /
sizeof(*base_exts);
247 offset += (size_t)libxstream_opencl_flags_atomics(
248 devinfo, libxstream_opencl_atomic_fp_64, base_exts, &base_nexts,
249 base_flags + offset,
sizeof(base_flags) - offset);
250 if (NULL != krn_env) {
251 FILE *
const krn_file = fopen(krn_env,
"rb");
252 if (NULL != krn_file) {
258 wgsize[0] = (NULL == wg_env ? LIBXS_MAX((
unsigned long int)sm,
260 : strtoul(wg_env, NULL, 10));
261 if (1 < sgsize && 0 < wgsize[0]) {
262 if (LIBXS_DELTA(wgsize[0], devinfo->wgsize[1]) <=
263 LIBXS_DELTA(wgsize[0], sgsize)) {
264 sgsize = devinfo->wgsize[1];
266 wgsize[0] = LIBXS_UP(wgsize[0], sgsize);
268 wgsize[0] = LIBXS_UP(wgsize[0], devinfo->wgsize[1]);
272 const int biggrf = (0 <= xf ? xf : devinfo->biggrf);
273 const size_t max_wgs =
274 (0 != biggrf) ? devinfo->wgsize[0] / 2 : devinfo->wgsize[0];
275 wgsize[0] = LIBXS_CLMP(wgsize[0], 0, max_wgs);
276 if (0 != biggrf && 0 != devinfo->intel && 0 == devinfo->biggrf) {
277 options =
"-cl-intel-256-GRF-per-thread";
280 sm = ((0 != sm && 0 != wgsize[0])
281 ? (LIBXS_ISPOT(bn *
sizeof(
double)) + 1)
283 clinear = (NULL == lin_env ? 0 : atoi(lin_env));
284 sgbcst = (0 != gpu && 0 < sgsize && 0 < wgsize[0] &&
285 2 <= devinfo->std_level[0] &&
286 (NULL == sgb_env ? 1 : (0 != atoi(sgb_env))));
288#if defined(DBM_OPENCL_CMEM)
289 (0 > ro && EXIT_SUCCESS == libxstream_opencl_use_cmem(devinfo))
293 (0 >= ro ?
"global" :
"constant");
294 blkrd = (0 == clinear && 0 != devinfo->intel && 0 < (int)sgsize &&
295 (NULL == blk_env ? 1 : (0 != atoi(blk_env))));
296 if (0 != blkrd &&
'g' != cmem[0]) {
299 offset += (size_t)LIBXS_SNPRINTF(
300 base_flags + offset,
sizeof(base_flags) - offset,
301 " %s %s -DCONSTANT=%s"
302 " -DBN=%i -DSM=%i -DLU=%i -DSG=%i -DINTEL=%i",
303 0 != gpu ?
"-DGPU" :
"", 0 == clinear ?
"" :
"-DCLINEAR", cmem,
304 bn, sm, lu, (int)sgsize, (
int)(0 != devinfo->intel));
305 if (0 != precision) {
306 offset += (size_t)LIBXS_SNPRINTF(base_flags + offset,
307 sizeof(base_flags) - offset,
308 " -DPRECISION=%i", precision);
310 bk_max = (NULL == bk_env ? 0 : atoi(bk_env));
311 nz = (NULL == nz_env ? 0 : atoi(nz_env));
313 offset += (size_t)LIBXS_SNPRINTF(base_flags + offset,
314 sizeof(base_flags) - offset,
318 base_source = source;
319 base_source_kind = source_kind;
320 base_options = options;
321 kernel_registry = libxs_registry_create();
322 if (2 <= verbosity || 0 > verbosity) {
323 fprintf(stderr,
"INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu);
324 dbm_multiply_opencl_print(stderr,
"sgb", sgbcst);
325 dbm_multiply_opencl_print(stderr,
"lin", clinear);
326 dbm_multiply_opencl_print(stderr,
"fp", precision);
327 dbm_multiply_opencl_print(stderr,
"bn", bn);
328 dbm_multiply_opencl_print(stderr,
"sm", sm);
329 dbm_multiply_opencl_print(stderr,
"wg", (
int)wgsize[0]);
330 dbm_multiply_opencl_print(stderr,
"sg", (
int)sgsize);
331 dbm_multiply_opencl_print(stderr,
"lu", lu);
332 dbm_multiply_opencl_print(stderr,
"nz", nz);
333 dbm_multiply_opencl_print(stderr,
"blk", blkrd);
334 fprintf(stderr,
" -> %.1f ms\n",
335 1E3 * DBM_TIMER_DIFF(start, DBM_TIMER_TICK()));
337 LIBXS_ATOMIC_STORE(&base_ready, 1, LIBXS_ATOMIC_SEQ_CST);
339 LIBXS_LOCK_RELEASE(LIBXS_LOCK,
config->lock_main);
342#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
343 if (0 == task_complete)
346 task_complete = dbm_multiply_gpu_launch_info(&task, params_host, ntasks,
349 bk0 = dbm_multiply_opencl_bk(task.max_k);
350 bk = (0 < bk_max ? LIBXS_MIN(bk0, bk_max) : bk0);
352 dbm_multiply_opencl_key_t key;
356 if (0 == task.mnk_changes) {
365 key.max_m = (0 == clinear ? task.max_m : task.max_n);
368 use_blkrd = (0 != blkrd && 0 != key.m && 16 <= key.m &&
369 key.m <= (int)sgsize_s && 0 == (key.m & (key.m - 1)));
370 kptr = (cl_kernel *)libxs_registry_get(
371 kernel_registry, &key,
sizeof(key),
372 libxs_registry_lock(kernel_registry));
373 if (NULL == kptr || NULL == *kptr) {
374 LIBXS_LOCK_ACQUIRE(LIBXS_LOCK, &compile_lock);
375 kptr = (cl_kernel *)libxs_registry_get(
376 kernel_registry, &key,
sizeof(key),
377 libxs_registry_lock(kernel_registry));
378 if (NULL == kptr || NULL == *kptr) {
379 char flags[LIBXSTREAM_BUFFERSIZE];
380 const cl_device_id device_id =
config->devices[
config->device_id];
381 cl_kernel kernel_new = NULL;
384 const int use_wg = (0 != use_blkrd || 0 != sgbcst);
385 const int n = LIBXS_SNPRINTF(
386 flags,
sizeof(flags),
387 "%s -DWG=%i -DBK=%i -DDBM_M=%i -DDBM_N=%i -DDBM_K=%i%s%s",
388 base_flags, use_wg ? (
int)wgsize[0] : 0, key.bk, key.m, key.n,
389 key.k, 0 != use_blkrd ?
" -DBLKRD_A" :
"",
390 (0 != sgbcst && 0 == use_blkrd) ?
" -DSGBCST" :
"");
391 assert(0 < n && (
size_t)n <
sizeof(flags));
393 }
else if (0 < key.max_m) {
394 const int n = LIBXS_SNPRINTF(
395 flags,
sizeof(flags),
"%s -DWG=%i -DBK=%i -DMAX_M=%i%s%s",
396 base_flags, (
int)wgsize[0], bk, key.max_m,
397 0 != blkrd ?
" -DBLKRD_P" :
"",
398 (0 != sgbcst && 0 == blkrd) ?
" -DSGBCST" :
"");
399 assert(0 < n && (
size_t)n <
sizeof(flags));
402 const int n = LIBXS_SNPRINTF(
403 flags,
sizeof(flags),
"%s -DWG=%i -DBK=%i%s%s", base_flags,
404 (
int)wgsize[0], bk, 0 != blkrd ?
" -DBLKRD_P" :
"",
405 (0 != sgbcst && 0 == blkrd) ?
" -DSGBCST" :
"");
406 assert(0 < n && (
size_t)n <
sizeof(flags));
409 result |= libxstream_opencl_kernel(
410 base_source_kind, base_source,
"dbm_multiply", flags,
411 base_options, NULL , NULL , base_exts,
412 base_nexts, &kernel_new);
413 if (EXIT_SUCCESS == result &&
415 clGetKernelWorkGroupInfo(kernel_new, device_id,
416 CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
417 sizeof(wgs), wgs, NULL) &&
418 0 != wgs[0] && 0 != wgs[1]) {
422 kptr = (cl_kernel *)libxs_registry_set(
423 kernel_registry, &key,
sizeof(key), &kernel_new,
424 sizeof(kernel_new), libxs_registry_lock(kernel_registry));
425 if (2 <= verbosity || 0 > verbosity || EXIT_SUCCESS != result) {
426 const char *
const kind =
427 (EXIT_SUCCESS == result ?
"INFO" :
"ERROR");
428 fprintf(stderr,
"%s ACC/LIBDBM: DBM-kernel bk=%i", kind, key.bk);
430 fprintf(stderr,
" mnk=%ix%ix%i", key.m, key.n, key.k);
432 fprintf(stderr,
" -> ");
433 if (EXIT_SUCCESS == result) {
434 fprintf(stderr,
"%.1f ms\n",
435 1E3 * DBM_TIMER_DIFF(start, DBM_TIMER_TICK()));
437 fprintf(stderr,
"FAILED!\n");
441 LIBXS_LOCK_RELEASE(LIBXS_LOCK, &compile_lock);
443 kernel = (NULL != kptr) ? *kptr : NULL;
445 LIBXS_LOCK_ACQUIRE(LIBXS_LOCK, &kernel_lock);
446 if (NULL != lock_memory) {
447 LIBXS_LOCK_ACQUIRE(LIBXS_LOCK, lock_memory);
450 size_t iadata = 0, ibdata = 0, icdata = 0;
451 result |= libxstream_opencl_info_devptr_lock(&adata, NULL ,
454 result |= libxstream_opencl_info_devptr_lock(&bdata, NULL ,
457 result |= libxstream_opencl_info_devptr_lock(&cdata, NULL ,
460 assert(0 == iadata && 0 == ibdata && 0 == icdata);
462 result |= libxstream_opencl_info_devptr_lock(
463 &batch, NULL , params ,
sizeof(
int) * stride,
464 &work_tasks, &ibatch);
465 if (NULL != lock_memory) {
466 LIBXS_LOCK_RELEASE(LIBXS_LOCK, lock_memory);
470 const int per_task = (0 != sgbcst && 0 == use_blkrd) ||
471 (0 != blkrd && 0 != task.mnk_changes);
473 (0 != task.mnk_changes || 0 != use_blkrd || 0 != sgbcst);
474 size = (cl_int)(work_tasks * (0 == clinear ? task.max_m : task.max_n));
476 work_size[0] = work_tasks * wgsize[0];
478 work_size[0] = LIBXS_UP((
size_t)size, wgsize[0]);
480 work_size[0] = (size_t)size;
482 result |= clSetKernelArg(kernel, 0,
sizeof(cl_double), &alpha);
483 result |= clSetKernelArg(kernel, 1,
sizeof(cl_int), &ibatch);
484 result |= clSetKernelArg(kernel, 2,
sizeof(cl_int), &ntasks);
485 result |= clSetKernelArg(kernel, 3,
sizeof(cl_int), &size);
486 result |= clSetKernelArg(kernel, 4,
sizeof(cl_int), ¶m_format);
487 result |= libxstream_opencl_set_kernel_ptr(kernel, 5, batch.memory);
488 result |= libxstream_opencl_set_kernel_ptr(kernel, 6, adata.memory);
489 result |= libxstream_opencl_set_kernel_ptr(kernel, 7, bdata.memory);
490 result |= libxstream_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
492 clEnqueueNDRangeKernel(str->queue, kernel, 1, NULL, work_size,
493 0 < wgsize[0] ? wgsize : NULL,
496 LIBXS_LOCK_RELEASE(LIBXS_LOCK, &kernel_lock);
498#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
500 result |= opencl_libsmm_acc_process(
501 params_host, params, ntasks, DBM_OPENCL_DBCSR_TYPE_REAL_8,
502 pack_a_data, pack_b_data, shard_c_data, task.max_m, task.max_n,
503 task.max_k, dbm_multiply_opencl_smm, 1 , stream,
505 task.max_m | task.max_n << DBM_OPENCL_LIBSMM_PFORMAT |
506 (task.max_k << (DBM_OPENCL_LIBSMM_PFORMAT * 2)),
511 if (0 != trace && EXIT_SUCCESS == result) {
512 static LIBXS_TLS DBM_TIMER_TICKINT start2 = 0;
513 const DBM_TIMER_TICKINT stop = DBM_TIMER_TICK();
514 const double dhost = DBM_TIMER_DIFF(start, stop);
515 const double diter = (0 < start2 ? DBM_TIMER_DIFF(start, start2) : dhost);
516#if 0 < DBM_OPENCL_LIBSMM_PFORMAT
517 const char *
const kind = (0 == dbcsr ?
"DBM" :
"SMM");
519 const char *
const kind =
"DBM";
521 const int pure = (0 == task.mnk_changes ? 100 : 0);
522 const double dtotl = LIBXS_MAX(diter, dhost);
525 "INFO ACC/LIBDBM: %s-kernel mnk=%ix%ix%i "
526 "pure=%i%% ntasks=%i ms=%.1f\n",
527 kind, task.max_m, task.max_n, task.max_k, pure, ntasks,
534void dbm_multiply_gpu_launch_kernel(offloadStream_t stream,
double alpha,
537 const double *pack_a_data,
538 const double *pack_b_data,
539 double *shard_c_data) {
540 const int result = dbm_multiply_opencl_launch_kernel(
541 stream, alpha, ntasks, 0 , &tasks_host->
m, &tasks->
m,
542 pack_a_data, pack_b_data, shard_c_data);
543 OFFLOAD_CHECK(result);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static void const int const int i
static grid_library_config config
Internal struct for storing a task, ie. a single block multiplication.