8 #include "../offload/offload_runtime.h"
9 #if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
12 #include "dbm_multiply_opencl.cl.h"
14 void dbm_multiply_gpu_launch_kernel(
const offloadStream_t stream,
15 const int mnk_range[3][2],
double alpha,
17 const double *pack_a_data,
18 const double *pack_b_data,
19 double *shard_c_data) {
20 static cl_kernel kernel = NULL;
21 static int ndims = 1, split = 0;
22 static size_t wgsize[] = {0, 0, 0};
23 int result = EXIT_SUCCESS, verbosity = c_dbcsr_acc_opencl_config.verbosity;
24 cl_event event, *
const perf_event =
25 ((0 <= verbosity && 2 >= verbosity) ? NULL : &event);
26 const c_dbcsr_acc_opencl_stream_t *
const str = ACC_OPENCL_STREAM(stream);
27 const size_t max_m = mnk_range[0][1], work_tasks = ntasks;
28 size_t work_size[] = {1, 1, 1}, ibatch = 0, iadata = 0, ibdata = 0,
30 c_dbcsr_acc_opencl_info_memptr_t adata, bdata, cdata, batch;
31 assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
32 assert(0 < mnk_range[0][0] && 0 < mnk_range[0][1] &&
33 mnk_range[0][0] <= mnk_range[0][1]);
34 assert(0 < mnk_range[1][0] && 0 < mnk_range[1][1] &&
35 mnk_range[1][0] <= mnk_range[1][1]);
36 assert(0 < mnk_range[2][0] && 0 < mnk_range[2][1] &&
37 mnk_range[2][0] <= mnk_range[2][1]);
38 assert(NULL != str && NULL != str->queue);
39 assert(0 < ntasks && NULL != tasks);
41 ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_main);
42 #if defined(OPENCL_DBM_SOURCE_MULTIPLY)
44 const libxsmm_timer_tickint start = libxsmm_timer_tick();
45 char params[ACC_OPENCL_BUFFERSIZE] =
46 "-cl-fast-relaxed-math -cl-denorms-are-zero";
47 const char *
const gen_env = getenv(
"DBM_MULTIPLY_GEN");
48 const char *
const xf_env = getenv(
"DBM_MULTIPLY_XF");
49 const char *
const lu_env = getenv(
"DBM_MULTIPLY_LU");
50 const char *
const bn_env = getenv(
"DBM_MULTIPLY_BN");
52 (CL_DEVICE_TYPE_GPU == c_dbcsr_acc_opencl_config.device.type);
53 const int gen = (NULL == gen_env ? 0 : atoi(gen_env));
54 const int xf = (NULL == xf_env ? -1 : atoi(xf_env));
55 const int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
56 int bn = (NULL == bn_env ? 8 : atoi(bn_env));
57 const char *extensions[] = {NULL, NULL}, *flags = NULL;
58 size_t nextensions =
sizeof(extensions) /
sizeof(*extensions);
59 const size_t wgsize0 = c_dbcsr_acc_opencl_config.device.wgsize[0];
60 const size_t wgsize1 = c_dbcsr_acc_opencl_config.device.wgsize[1];
61 size_t wgsize2 = c_dbcsr_acc_opencl_config.device.wgsize[2];
62 size_t offset = (0 == c_dbcsr_acc_opencl_config.debug ? strlen(params) : 0);
63 offset += (size_t)c_dbcsr_acc_opencl_flags_atomics(
64 &c_dbcsr_acc_opencl_config.device, c_dbcsr_acc_opencl_atomic_fp_64,
65 extensions, &nextensions, params + offset,
sizeof(params) - offset);
66 if (2 <= gen || (0 != gen && 0 != wgsize2 &&
67 2 <= *c_dbcsr_acc_opencl_config.device.std_level &&
68 NULL != extensions[1] &&
69 NULL != strstr(extensions[1],
"cl_ext_float_atomics"))) {
71 (size_t)LIBXSMM_SNPRINTF(params + offset,
sizeof(params) - offset,
72 " -DDBM_MULTIPLY_OPENCL_GEN");
73 if (0 != c_dbcsr_acc_opencl_config.device.intel && 0 != xf) {
74 flags =
"-cl-intel-256-GRF-per-thread";
76 wgsize[1] = wgsize[2] = 1;
80 const char *
const split_env = getenv(
"DBM_MULTIPLY_SPLIT");
81 const char *
const wg_env = getenv(
"DBM_MULTIPLY_WG");
82 split = (NULL == split_env ? 1 : atoi(split_env));
84 (NULL == wg_env ? (1 != split ? (wgsize1 * LIBXSMM_ABS(split)) : 0)
85 : strtoul(wg_env, NULL, 10));
86 if (0 != split && 1 != split && (bn * bn) > (
int)wgsize[0]) {
89 if (0 != split && 0 != wgsize2 && 0 < wgsize[0]) {
90 if (LIBXSMM_DELTA(wgsize[0], wgsize1) <=
91 LIBXSMM_DELTA(wgsize[0], wgsize2)) {
94 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize2);
96 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize1);
99 wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, wgsize0);
100 if (NULL == bn_env && 0 != split && 1 != split &&
101 (bn * bn) < (
int)wgsize[0]) {
102 bn = libxsmm_isqrt2_u32(wgsize[0]);
104 bn = LIBXSMM_CLMP(bn, 4, 32);
105 offset += (size_t)LIBXSMM_SNPRINTF(
106 params + offset,
sizeof(params) - offset,
107 " %s -DSPLIT=%i -DBN=%i -DWG=%i -DSG=%i -DLU=%i",
108 0 != gpu ?
"-DGPU" :
"", split, bn, (int)wgsize[0], (
int)wgsize2, lu);
109 if (0 != c_dbcsr_acc_opencl_config.device.intel && 0 < xf) {
110 flags =
"-cl-intel-256-GRF-per-thread";
113 result |= (
sizeof(params) > offset ? EXIT_SUCCESS : EXIT_FAILURE);
114 result |= c_dbcsr_acc_opencl_kernel(
115 0 , OPENCL_DBM_SOURCE_MULTIPLY,
"dbm_multiply",
116 params, flags, NULL , NULL , extensions, nextensions,
118 if (2 <= verbosity || 0 > verbosity) {
119 if (EXIT_SUCCESS == result) {
120 const double d = libxsmm_timer_duration(start, libxsmm_timer_tick());
121 fprintf(stderr,
"INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu);
123 fprintf(stderr,
" split=%i lu=%i bn=%i", split, lu, bn);
125 fprintf(stderr,
" gen=%i", gen);
127 fprintf(stderr,
" wg=%i sg=%i ms=%.1f\n", (
int)wgsize[0], (
int)wgsize2,
130 fprintf(stderr,
"INFO ACC/LIBDBM: DBM-kernel failed to generate\n");
135 #error "OpenCL kernel code not found!"
137 result |= c_dbcsr_acc_opencl_info_devptr_lock(&adata, NULL ,
140 result |= c_dbcsr_acc_opencl_info_devptr_lock(&bdata, NULL ,
143 result |= c_dbcsr_acc_opencl_info_devptr_lock(&cdata, NULL ,
146 result |= c_dbcsr_acc_opencl_info_devptr_lock(
147 &batch, NULL , tasks ,
sizeof(
dbm_task_t), &work_tasks,
149 assert(0 == iadata && 0 == ibdata && 0 == icdata);
150 result |= clSetKernelArg(kernel, 0,
sizeof(cl_double), &alpha);
151 result |= clSetKernelArg(kernel, 1,
sizeof(cl_int), &ibatch);
153 const cl_uint zero = 0;
154 assert(0 != wgsize[1] && 0 != wgsize[1] && 0 != wgsize[2]);
156 assert(1 == work_size[1]);
157 work_size[2] = work_tasks;
158 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 2, batch.memory);
159 result |= clSetKernelArg(kernel, 3,
sizeof(cl_uint), &zero );
160 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, adata.memory);
161 result |= clSetKernelArg(kernel, 5,
sizeof(cl_uint), &zero );
162 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
163 result |= clSetKernelArg(kernel, 7,
sizeof(cl_uint), &zero );
164 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
165 result |= clSetKernelArg(kernel, 9,
sizeof(cl_uint), &zero );
167 result |= clSetKernelArg(kernel, 2,
sizeof(cl_int), &ntasks);
169 if (1 == split || 0 == wgsize[0]) {
170 work_size[0] = work_tasks * max_m;
171 result |= clSetKernelArg(kernel, 3,
sizeof(cl_int), work_size);
173 work_size[0] = LIBXSMM_UP(work_size[0], wgsize[0]);
176 work_size[0] = work_tasks * wgsize[0];
177 result |= clSetKernelArg(kernel, 3,
sizeof(cl_int), work_size);
180 work_size[0] = work_tasks;
181 result |= clSetKernelArg(kernel, 3,
sizeof(cl_int), work_size);
183 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, batch.memory);
184 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 5, adata.memory);
185 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
186 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 7, cdata.memory);
188 result |= clEnqueueNDRangeKernel(
189 str->queue, kernel, ndims, NULL, work_size, 0 < wgsize[0] ? wgsize : NULL,
190 0 , NULL , perf_event);
191 if (NULL != perf_event && EXIT_SUCCESS == result) {
192 cl_ulong begin = 0, end = 0;
193 clWaitForEvents(1, perf_event);
194 result |= clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START,
195 sizeof(cl_ulong), &begin, NULL);
196 result |= clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_END,
197 sizeof(cl_ulong), &end, NULL);
198 if (EXIT_SUCCESS == result) {
199 const double duration_ns = LIBXSMM_DELTA(begin, end);
200 const double gflops =
201 (max_m * mnk_range[1][1] * mnk_range[2][1] * ntasks) / duration_ns;
203 "INFO ACC/LIBDBM: DBM-kernel mnk=%ix%ix%i "
204 "ntasks=%i gflops=%.1f ms=%.2g\n",
205 mnk_range[0][1], mnk_range[1][1], mnk_range[2][1], ntasks, gflops,
209 ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_main);
210 OFFLOAD_CHECK(result);
Internal struct for storing a task, ie. a single block multiplication.