8#include "../offload/offload_runtime.h"
9#if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
12#include "dbm_multiply_opencl.cl.h"
14#if defined(__DBCSR_ACC)
15#include <smm/opencl_libsmm.h>
18#define DBM_TIMER_DIFF(A, B) libxsmm_timer_duration(A, B)
19#define DBM_TIMER_TICK() libxsmm_timer_tick()
20#define DBM_TIMER_TICKINT libxsmm_timer_tickint
22int dbm_multiply_opencl_launch_kernel(
void *stream,
double alpha,
int ntasks,
23 int param_format,
const int *params_host,
25 const double *pack_a_data,
26 const double *pack_b_data,
27 double *shard_c_data);
29#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
30int dbm_multiply_opencl_initialized ;
31int dbm_multiply_opencl_smm ;
33LIBXSMM_ATTRIBUTE_CTOR
static void dbm_multiply_opencl_initialize(
void) {
34 const char *
const smm_env = getenv(
"DBM_MULTIPLY_SMM");
35 const int smm = (NULL == smm_env ? 0 : atoi(smm_env));
36 dbm_multiply_opencl_smm =
37 LIBXSMM_MIN(1 != smm ? smm : 64, (1 << (OPENCL_LIBSMM_PFORMAT - 1)) - 1);
38 if (0 > dbm_multiply_opencl_smm) {
39 opencl_libsmm_acc_set_dbm_launch_fn(dbm_multiply_opencl_launch_kernel);
41 ++dbm_multiply_opencl_initialized;
46 int max_m, max_n, max_k, mnk_changes;
47} dbm_multiply_gpu_launch_info_t;
49static void dbm_multiply_gpu_launch_info(dbm_multiply_gpu_launch_info_t *info,
50 const int *params,
int ntasks,
52 if (0 == param_format) {
53 const int stride =
sizeof(
dbm_task_t) /
sizeof(
int);
54 int avg_m = params[0], avg_n = params[1], avg_k = params[2],
i = stride;
58 for (info->mnk_changes = 0;
i < (ntasks * stride);
i += stride) {
59 const int m = params[
i + 0], n = params[
i + 1], k = params[
i + 2];
60 info->max_m =
imax(info->max_m, m);
61 info->max_n =
imax(info->max_n, n);
62 info->max_k =
imax(info->max_k, k);
63 if (m != avg_m || n != avg_n || k != avg_k) {
64 avg_m = (avg_m + m) / 2;
65 avg_n = (avg_n + n) / 2;
66 avg_k = (avg_k + k) / 2;
71#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
72 const int mask = (1 << OPENCL_LIBSMM_PFORMAT) - 1;
73 info->max_m = mask & (param_format);
74 info->max_n = mask & (param_format >> (OPENCL_LIBSMM_PFORMAT));
75 info->max_k = mask & (param_format >> (OPENCL_LIBSMM_PFORMAT * 2));
76 info->mnk_changes = 0;
83static void dbm_multiply_opencl_print(FILE *stream,
const char *name,
int val) {
85 fprintf(stream,
" %s=%i", name, val);
89int dbm_multiply_opencl_launch_kernel(
void *stream,
double alpha,
int ntasks,
90 int param_format,
const int *params_host,
92 const double *pack_a_data,
93 const double *pack_b_data,
94 double *shard_c_data) {
95 const DBM_TIMER_TICKINT start = DBM_TIMER_TICK();
96 const c_dbcsr_acc_opencl_config_t *
const config = &c_dbcsr_acc_opencl_config;
97 const int verbosity =
config->verbosity,
98 info = (0 > verbosity || 2 < verbosity);
99 int result = EXIT_SUCCESS;
100#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
103 dbm_multiply_gpu_launch_info_t task = {0};
104 assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
105 assert(NULL != params_host || 0 == ntasks);
106 assert(NULL != params || 0 == ntasks);
110#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
111 if (0 == dbm_multiply_opencl_initialized) {
112 dbm_multiply_opencl_initialize();
114 if (0 != dbm_multiply_opencl_smm || 0 != info) {
115 dbm_multiply_gpu_launch_info(&task, params_host, ntasks, param_format);
117 if (0 > dbm_multiply_opencl_smm || 0 != task.mnk_changes ||
118 dbm_multiply_opencl_smm < task.max_m ||
119 dbm_multiply_opencl_smm < task.max_n ||
120 dbm_multiply_opencl_smm < task.max_k || 0 == task.max_k || 1 != alpha)
123#if defined(OPENCL_DBM_SOURCE_MULTIPLY)
125 static cl_kernel kernel_global = NULL;
126 static LIBXSMM_TLS cl_kernel kernel = NULL;
127 static int ndims = 1, clinear = 0;
128 static size_t wgsize[] = {0, 0, 0};
129 const c_dbcsr_acc_opencl_stream_t *
const str = ACC_OPENCL_STREAM(stream);
130 const c_dbcsr_acc_opencl_device_t *
const devinfo = &
config->device;
131 ACC_OPENCL_LOCKTYPE *
const lock_memory =
132 (NULL != devinfo->clSetKernelArgMemPointerINTEL ? NULL
134 c_dbcsr_acc_opencl_info_memptr_t adata, bdata, cdata, batch;
135 const int stride = (0 == param_format ? 6 : 3);
136 size_t work_size[] = {1, 1, 1}, ibatch = 0;
137 size_t iadata = 0, ibdata = 0, icdata = 0;
138 const size_t work_tasks = ntasks;
139 assert(NULL != str && NULL != str->queue);
140 if (NULL == kernel_global) {
141 ACC_OPENCL_ACQUIRE(
config->lock_main);
142 if (NULL == kernel_global) {
143 char flags[ACC_OPENCL_BUFFERSIZE] =
144 "-cl-fast-relaxed-math -cl-denorms-are-zero";
145 const char *
const gen_env = getenv(
"DBM_MULTIPLY_GEN");
146 const char *
const lin_env = getenv(
"DBM_MULTIPLY_LIN");
147 const char *
const fp_env = getenv(
"DBM_MULTIPLY_FP");
148 const char *
const bn_env = getenv(
"DBM_MULTIPLY_BN");
149 const char *
const sm_env = getenv(
"DBM_MULTIPLY_SM");
150 const char *
const wg_env = getenv(
"DBM_MULTIPLY_WG");
151 const char *
const lu_env = getenv(
"DBM_MULTIPLY_LU");
152 const char *
const ro_env = getenv(
"DBM_MULTIPLY_RO");
153 const char *
const xf_env = getenv(
"DBM_MULTIPLY_XF");
154 const char *exts[] = {NULL, NULL}, *options = NULL;
155 int sm = (NULL == sm_env ? 0 : atoi(sm_env));
156 const int dd = (0 !=
config->debug && 0 !=
config->dump);
157 const int ro = (NULL == ro_env ? -1 : atoi(ro_env));
158 const int xf = (NULL == xf_env ? -1 : atoi(xf_env));
159 const int bn0 = (0 == devinfo->nv ? 8 : 2), uid = devinfo->uid;
160 const int bn1 = ((0 == sm && 0 == clinear) ? bn0 : (bn0 * 2));
161 const int gpu = (CL_DEVICE_TYPE_GPU == devinfo->type);
162 const int precision = (NULL == fp_env ? 0 : atoi(fp_env));
163 const int gen0 = (NULL == fp_env && NULL == bn_env && NULL == sm_env &&
164 NULL == wg_env && NULL == lu_env && NULL == lin_env &&
165 NULL == ro_env && 0 == param_format);
166 const int gen1 = (devinfo->intel && 0x0bd0 <= uid && 0x0bdb >= uid);
167 int gen = (0 != gen0 ? (NULL == gen_env ? gen1 : atoi(gen_env)) : 0);
168 int bn = LIBXSMM_CLMP(NULL == bn_env ? bn1 : atoi(bn_env), 1, 32);
169 int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
170 size_t nexts =
sizeof(exts) /
sizeof(*exts);
171 size_t sgsize = devinfo->wgsize[2];
172 size_t offset = (0 == dd ? strlen(flags) : 0);
173 offset += (size_t)c_dbcsr_acc_opencl_flags_atomics(
174 devinfo, c_dbcsr_acc_opencl_atomic_fp_64, exts, &nexts,
175 flags + offset,
sizeof(flags) - offset);
176 if (2 <= gen || (0 != gen && 1 < sgsize &&
177 2 <= *devinfo->std_level && NULL != exts[1] &&
178 NULL != strstr(exts[1],
"cl_ext_float_atomics"))) {
180 (size_t)LIBXSMM_SNPRINTF(flags + offset,
sizeof(flags) - offset,
181 " -DDBM_MULTIPLY_OPENCL_GEN");
182 wgsize[1] = wgsize[2] = 1;
187 wgsize[0] = (NULL == wg_env ? (
unsigned long int)LIBXSMM_ABS(sm)
188 : strtoul(wg_env, NULL, 10));
189 if (1 < sgsize && 0 < wgsize[0]) {
190 if (LIBXSMM_DELTA(wgsize[0], devinfo->wgsize[1]) <=
191 LIBXSMM_DELTA(wgsize[0], sgsize)) {
192 sgsize = devinfo->wgsize[1];
194 wgsize[0] = LIBXSMM_UP(wgsize[0], sgsize);
196 wgsize[0] = LIBXSMM_UP(wgsize[0], devinfo->wgsize[1]);
199 wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, devinfo->wgsize[0]);
200 sm = ((0 != sm && 0 != wgsize[0])
201 ? (LIBXSMM_ISPOT(bn *
sizeof(
double)) + 1)
203 clinear = (NULL == lin_env ? 0 : atoi(lin_env));
204 offset += (size_t)LIBXSMM_SNPRINTF(
205 flags + offset,
sizeof(flags) - offset,
206 " %s %s -DCONSTANT=%s -DBN=%i -DSM=%i -DLU=%i -DWG=%i -DSG=%i",
207 0 != gpu ?
"-DGPU" :
"", 0 == clinear ?
"" :
"-DCLINEAR",
208#if defined(ACC_OPENCL_CMEM)
209 (0 > ro && EXIT_SUCCESS == c_dbcsr_acc_opencl_use_cmem(devinfo))
213 (0 >= ro ?
"global" :
"constant"),
214 bn, sm, lu, (int)wgsize[0], (int)sgsize);
215 if (0 != precision) {
217 (size_t)LIBXSMM_SNPRINTF(flags + offset,
sizeof(flags) - offset,
218 " -DPRECISION=%i", precision);
222 if (0 != devinfo->intel && 0 < xf) {
223 options =
"-cl-intel-256-GRF-per-thread";
225 result |= (
sizeof(flags) > offset ? EXIT_SUCCESS : EXIT_FAILURE);
226 if (2 <= verbosity || 0 > verbosity || EXIT_SUCCESS != result) {
227 const char *
const kind = (EXIT_SUCCESS == result ?
"INFO" :
"ERROR");
228 fprintf(stderr,
"%s ACC/LIBDBM: DBM-kernel gpu=%i", kind, gpu);
229 dbm_multiply_opencl_print(stderr,
"gen", gen);
230 dbm_multiply_opencl_print(stderr,
"lin", clinear);
231 dbm_multiply_opencl_print(stderr,
"fp", precision);
232 dbm_multiply_opencl_print(stderr,
"bn", bn);
233 dbm_multiply_opencl_print(stderr,
"sm", sm);
234 dbm_multiply_opencl_print(stderr,
"wg", (
int)wgsize[0]);
235 dbm_multiply_opencl_print(stderr,
"sg", (
int)sgsize);
236 dbm_multiply_opencl_print(stderr,
"lu", lu);
237 fprintf(stderr,
" -> ");
239 result |= c_dbcsr_acc_opencl_kernel(
240 0 , OPENCL_DBM_SOURCE_MULTIPLY,
"dbm_multiply",
241 flags, options, NULL , NULL , exts, nexts,
243 if (2 <= verbosity || 0 > verbosity || EXIT_SUCCESS != result) {
244 if (EXIT_SUCCESS == result) {
245 const double ds = DBM_TIMER_DIFF(start, DBM_TIMER_TICK());
246 fprintf(stderr,
"%.1f ms\n", 1E3 * ds);
248 fprintf(stderr,
"FAILED!\n");
252 kernel = clCloneKernel(kernel_global, &result);
253 ACC_OPENCL_RELEASE(
config->lock_main);
254 }
else if (NULL == kernel) {
255 kernel = clCloneKernel(kernel_global, &result);
258#error "OpenCL kernel code not found!"
260 if (NULL != lock_memory) {
261 ACC_OPENCL_ACQUIRE(lock_memory);
263 result |= c_dbcsr_acc_opencl_info_devptr_lock(&adata, NULL ,
266 result |= c_dbcsr_acc_opencl_info_devptr_lock(&bdata, NULL ,
269 result |= c_dbcsr_acc_opencl_info_devptr_lock(&cdata, NULL ,
272 result |= c_dbcsr_acc_opencl_info_devptr_lock(
273 &batch, NULL , params ,
sizeof(
int) * stride,
274 &work_tasks, &ibatch);
275 if (NULL != lock_memory) {
276 ACC_OPENCL_RELEASE(lock_memory);
278 assert(0 == iadata && 0 == ibdata && 0 == icdata);
279 result |= clSetKernelArg(kernel, 0,
sizeof(cl_double), &alpha);
280 result |= clSetKernelArg(kernel, 1,
sizeof(cl_int), &ibatch);
282 const cl_uint
zero = 0;
283 assert(0 != wgsize[1] && 0 != wgsize[1] && 0 != wgsize[2]);
285 assert(1 == work_size[1]);
286 work_size[2] = work_tasks;
287 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 2, batch.memory);
288 result |= clSetKernelArg(kernel, 3,
sizeof(cl_uint), &zero );
289 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, adata.memory);
290 result |= clSetKernelArg(kernel, 5,
sizeof(cl_uint), &zero );
291 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
292 result |= clSetKernelArg(kernel, 7,
sizeof(cl_uint), &zero );
293 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
294 result |= clSetKernelArg(kernel, 9,
sizeof(cl_uint), &zero );
295#if !(defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT))
297 dbm_multiply_gpu_launch_info(&task, params_host, ntasks, param_format);
301 size_t size = work_tasks;
302#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
303 if (0 == dbm_multiply_opencl_smm && 0 == info)
306 dbm_multiply_gpu_launch_info(&task, params_host, ntasks, param_format);
308 size *= (0 == clinear ? task.max_m : task.max_n);
310 work_size[0] = (0 < wgsize[0] ? LIBXSMM_UP(size, wgsize[0]) : size);
311 result |= clSetKernelArg(kernel, 2,
sizeof(cl_int), &ntasks);
312 result |= clSetKernelArg(kernel, 3,
sizeof(cl_int), &size);
313 result |= clSetKernelArg(kernel, 4,
sizeof(cl_int), ¶m_format);
314 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 5, batch.memory);
315 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, adata.memory);
316 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 7, bdata.memory);
317 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
319 result |= clEnqueueNDRangeKernel(str->queue, kernel, ndims, NULL, work_size,
320 0 < wgsize[0] ? wgsize : NULL,
323#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
325 result |= opencl_libsmm_acc_process(
326 params_host, params, ntasks, dbcsr_type_real_8, pack_a_data,
327 pack_b_data, shard_c_data, task.max_m, task.max_n, task.max_k,
328 dbm_multiply_opencl_smm, 1 , stream, NULL ,
329 task.max_m | task.max_n << OPENCL_LIBSMM_PFORMAT |
330 (task.max_k << (OPENCL_LIBSMM_PFORMAT * 2)),
335 if (0 != info && EXIT_SUCCESS == result) {
336 static LIBXSMM_TLS DBM_TIMER_TICKINT start2 = 0;
337 const DBM_TIMER_TICKINT stop = DBM_TIMER_TICK();
338 const double dhost = DBM_TIMER_DIFF(start, stop);
339 const double diter = (0 < start2 ? DBM_TIMER_DIFF(start, start2) : dhost);
340#if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT)
341 const char *
const kind = (0 == dbcsr ?
"DBM" :
"SMM");
343 const char *
const kind =
"DBM";
345 const int pure = (100 * (ntasks - task.mnk_changes) + ntasks - 1) / ntasks;
346 const double dtotl = LIBXSMM_MAX(diter, dhost);
349 "INFO ACC/LIBDBM: %s-kernel mnk=%ix%ix%i "
350 "pure=%i%% ntasks=%i ms=%.1f\n",
351 kind, task.max_m, task.max_n, task.max_k, pure, ntasks,
357void dbm_multiply_gpu_launch_kernel(offloadStream_t stream,
double alpha,
360 const double *pack_a_data,
361 const double *pack_b_data,
362 double *shard_c_data) {
363 const int result = dbm_multiply_opencl_launch_kernel(
364 stream, alpha, ntasks, 0 , &tasks_host->
m, &tasks->
m,
365 pack_a_data, pack_b_data, shard_c_data);
366 OFFLOAD_CHECK(result);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static void const int const int i
static grid_library_config config
real(kind=dp), parameter zero
Internal struct for storing a task, ie. a single block multiplication.