8#include "../offload/offload_runtime.h"
9#if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
12#include "dbm_multiply_opencl.cl.h"
15 int max_m, max_n, avg_m, avg_n, avg_k, changes;
16} dbm_multiply_gpu_launch_info_t;
18static void dbm_multiply_gpu_launch_info(dbm_multiply_gpu_launch_info_t *info,
21 info->max_m = tasks[0].
m;
22 info->avg_m = tasks[0].
m;
23 info->max_n = tasks[0].
n;
24 info->avg_n = tasks[0].
n;
25 info->avg_k = tasks[0].
k;
26 for (info->changes = 0;
i < ntasks; ++
i) {
27 const int m = tasks[
i].
m, n = tasks[
i].
n, k = tasks[
i].
k;
28 info->max_m =
imax(info->max_m, m);
29 info->max_n =
imax(info->max_n, n);
30 if (info->avg_m != m || info->avg_n != n || info->avg_k != k) {
31 info->avg_m = (info->avg_m + m) / 2;
32 info->avg_n = (info->avg_n + n) / 2;
33 info->avg_k = (info->avg_k + k) / 2;
39void dbm_multiply_gpu_launch_kernel(
const offloadStream_t stream,
double alpha,
42 const double *pack_a_data,
43 const double *pack_b_data,
44 double *shard_c_data) {
46 static cl_kernel kernel_global = NULL;
47 static LIBXSMM_TLS cl_kernel kernel = NULL;
48 static int ndims = 1, clinear = 0;
49 static size_t wgsize[] = {0, 0, 0};
50 const libxsmm_timer_tickint start = libxsmm_timer_tick();
51 const c_dbcsr_acc_opencl_config_t *
const config = &c_dbcsr_acc_opencl_config;
52 const int verbosity =
config->verbosity;
53 int result = EXIT_SUCCESS;
54 cl_event event, *
const perf_event =
55 ((0 <= verbosity && 2 >= verbosity) ? NULL : &event);
56 const c_dbcsr_acc_opencl_stream_t *
const str = ACC_OPENCL_STREAM(stream);
57 size_t work_size[] = {1, 1, 1}, ibatch = 0;
58 size_t iadata = 0, ibdata = 0, icdata = 0;
59 const size_t work_tasks = ntasks;
60 dbm_multiply_gpu_launch_info_t info = {0};
61 c_dbcsr_acc_opencl_info_memptr_t adata, bdata, cdata, batch;
62 assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
63 assert(NULL != str && NULL != str->queue);
64 assert(0 < ntasks && NULL != tasks);
65#if defined(OPENCL_DBM_SOURCE_MULTIPLY)
66 if (NULL == kernel_global) {
67 ACC_OPENCL_ACQUIRE(
config->lock_main);
68 if (NULL == kernel_global) {
69 char params[ACC_OPENCL_BUFFERSIZE] =
70 "-cl-fast-relaxed-math -cl-denorms-are-zero";
71 const char *
const gen_env = getenv(
"DBM_MULTIPLY_GEN");
72 const char *
const lin_env = getenv(
"DBM_MULTIPLY_LIN");
73 const char *
const bn_env = getenv(
"DBM_MULTIPLY_BN");
74 const char *
const sm_env = getenv(
"DBM_MULTIPLY_SM");
75 const char *
const wg_env = getenv(
"DBM_MULTIPLY_WG");
76 const char *
const lu_env = getenv(
"DBM_MULTIPLY_LU");
77 const char *
const xf_env = getenv(
"DBM_MULTIPLY_XF");
78 const c_dbcsr_acc_opencl_device_t *
const devinfo = &
config->device;
79 int sm = (NULL == sm_env ? 0 : atoi(sm_env));
80 const int bn0 = (0 == devinfo->nv ? (0 == devinfo->amd ? 4 : 8) : 2);
81 const int bn1 = ((0 == sm && 0 == clinear) ? bn0 : (bn0 * 2));
82 int bn = LIBXSMM_CLMP(NULL == bn_env ? bn1 : atoi(bn_env), 1, 32);
83 int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
84 int gen = ((NULL == bn_env && NULL == sm_env && NULL == wg_env &&
85 NULL == lu_env && NULL == lin_env)
86 ? (NULL == gen_env ? 1 : atoi(gen_env))
88 const int gpu = (CL_DEVICE_TYPE_GPU == devinfo->type);
89 const int xf = (NULL == xf_env ? -1 : atoi(xf_env));
90 const char *extensions[] = {NULL, NULL}, *flags = NULL;
91 size_t nextensions =
sizeof(extensions) /
sizeof(*extensions);
92 const size_t wgsize0 = devinfo->wgsize[0], wgsize1 = devinfo->wgsize[1];
93 size_t wgsize2 = devinfo->wgsize[2];
95 ((0 ==
config->debug && 0 ==
config->dump) ? strlen(params) : 0);
96 offset += (size_t)c_dbcsr_acc_opencl_flags_atomics(
97 devinfo, c_dbcsr_acc_opencl_atomic_fp_64, extensions, &nextensions,
98 params + offset,
sizeof(params) - offset);
99 if (2 <= gen || (0 != gen && 0 != wgsize2 &&
100 2 <= *devinfo->std_level && NULL != extensions[1] &&
101 NULL != strstr(extensions[1],
"cl_ext_float_atomics"))) {
103 (size_t)LIBXSMM_SNPRINTF(params + offset,
sizeof(params) - offset,
104 " -DDBM_MULTIPLY_OPENCL_GEN");
105 wgsize[1] = wgsize[2] = 1;
110 wgsize[0] = (NULL == wg_env ? (
unsigned long int)LIBXSMM_ABS(sm)
111 : strtoul(wg_env, NULL, 10));
112 if (0 != wgsize2 && 0 < wgsize[0]) {
113 if (LIBXSMM_DELTA(wgsize[0], wgsize1) <=
114 LIBXSMM_DELTA(wgsize[0], wgsize2)) {
117 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize2);
119 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize1);
122 wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, wgsize0);
123 sm = ((0 != sm && 0 != wgsize[0])
124 ? (LIBXSMM_ISPOT(bn *
sizeof(
double)) + 1)
126 clinear = (NULL == lin_env ? 0 : atoi(lin_env));
127 offset += (size_t)LIBXSMM_SNPRINTF(
128 params + offset,
sizeof(params) - offset,
129 " %s %s -DBN=%i -DSM=%i -DLU=%i -DWG=%i -DSG=%i",
130 0 != gpu ?
"-DGPU" :
"", 0 == clinear ?
"" :
"-DCLINEAR", bn, sm,
131 lu, (int)wgsize[0], (
int)wgsize2);
134 if (0 != devinfo->intel && 0 < xf) {
135 flags =
"-cl-intel-256-GRF-per-thread";
137 result |= (
sizeof(params) > offset ? EXIT_SUCCESS : EXIT_FAILURE);
138 result |= c_dbcsr_acc_opencl_kernel(
139 0 , OPENCL_DBM_SOURCE_MULTIPLY,
"dbm_multiply",
140 params, flags, NULL , NULL , extensions, nextensions,
142 if (2 <= verbosity || 0 > verbosity) {
143 if (EXIT_SUCCESS == result) {
144 const double ds = libxsmm_timer_duration(start, libxsmm_timer_tick());
145 fprintf(stderr,
"INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu);
147 fprintf(stderr,
" gen=%i", gen);
150 fprintf(stderr,
" lin=%i", clinear);
153 fprintf(stderr,
" bn=%i", bn);
156 fprintf(stderr,
" sm=%i", sm);
158 if (0 != wgsize[0]) {
159 fprintf(stderr,
" wg=%i", (
int)wgsize[0]);
162 fprintf(stderr,
" sg=%i", (
int)wgsize2);
165 fprintf(stderr,
" lu=%i", lu);
167 fprintf(stderr,
" ms=%.1f\n", 1E3 * ds);
169 fprintf(stderr,
"INFO ACC/LIBDBM: DBM-kernel failed to generate\n");
173 kernel = clCloneKernel(kernel_global, &result);
174 ACC_OPENCL_RELEASE(
config->lock_main);
175 }
else if (NULL == kernel) {
176 kernel = clCloneKernel(kernel_global, &result);
179#error "OpenCL kernel code not found!"
181 result |= c_dbcsr_acc_opencl_info_devptr_lock(&adata, NULL ,
184 result |= c_dbcsr_acc_opencl_info_devptr_lock(&bdata, NULL ,
187 result |= c_dbcsr_acc_opencl_info_devptr_lock(&cdata, NULL ,
190 result |= c_dbcsr_acc_opencl_info_devptr_lock(
191 &batch, NULL , tasks ,
sizeof(
dbm_task_t), &work_tasks,
193 assert(0 == iadata && 0 == ibdata && 0 == icdata);
194 result |= clSetKernelArg(kernel, 0,
sizeof(cl_double), &alpha);
195 result |= clSetKernelArg(kernel, 1,
sizeof(cl_int), &ibatch);
197 const cl_uint zero = 0;
198 assert(0 != wgsize[1] && 0 != wgsize[1] && 0 != wgsize[2]);
200 assert(1 == work_size[1]);
201 work_size[2] = work_tasks;
202 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 2, batch.memory);
203 result |= clSetKernelArg(kernel, 3,
sizeof(cl_uint), &zero );
204 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, adata.memory);
205 result |= clSetKernelArg(kernel, 5,
sizeof(cl_uint), &zero );
206 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
207 result |= clSetKernelArg(kernel, 7,
sizeof(cl_uint), &zero );
208 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
209 result |= clSetKernelArg(kernel, 9,
sizeof(cl_uint), &zero );
211 size_t size = work_tasks;
212 dbm_multiply_gpu_launch_info(&info, tasks_host, ntasks);
213 size *= (0 == clinear ? info.max_m : info.max_n);
215 work_size[0] = (0 < wgsize[0] ? LIBXSMM_UP(size, wgsize[0]) : size);
216 result |= clSetKernelArg(kernel, 2,
sizeof(cl_int), &ntasks);
217 result |= clSetKernelArg(kernel, 3,
sizeof(cl_int), &size);
218 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, batch.memory);
219 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 5, adata.memory);
220 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
221 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 7, cdata.memory);
223 result |= clEnqueueNDRangeKernel(
224 str->queue, kernel, ndims, NULL, work_size, 0 < wgsize[0] ? wgsize : NULL,
225 0 , NULL , perf_event);
226 if (NULL != perf_event && EXIT_SUCCESS == result &&
227 EXIT_SUCCESS == clWaitForEvents(1, perf_event)) {
228 const double dhost = libxsmm_timer_duration(start, libxsmm_timer_tick());
229 cl_ulong begin = 0, end = 0;
231 clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START,
232 sizeof(cl_ulong), &begin, NULL) &&
233 EXIT_SUCCESS == clGetEventProfilingInfo(*perf_event,
234 CL_PROFILING_COMMAND_END,
235 sizeof(cl_ulong), &end, NULL)) {
236 const double dkrnl = LIBXSMM_DELTA(begin, end) * 1E-6;
238 LIBXSMM_MAX(dkrnl, dhost / c_dbcsr_acc_opencl_config.nthreads * 1E+3);
241 dbm_multiply_gpu_launch_info(&info, tasks_host, ntasks);
243 pure = (100 * (ntasks - info.changes) + ntasks - 1) / ntasks;
245 "INFO ACC/LIBDBM: DBM-kernel mnk=%ix%ix%i pure=%i%% "
246 "ntasks=%i kernel_ms=%.1f total_ms=%.1f gflops=%.1f\n",
247 info.avg_m, info.avg_n, info.avg_k, pure, ntasks, dkrnl, dtotl,
248 2E-6 * info.avg_m * info.avg_n * info.avg_k * ntasks *
249 c_dbcsr_acc_opencl_config.nranks / dtotl);
252 OFFLOAD_CHECK(result);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static void const int const int i
static grid_library_config config
Internal struct for storing a task, ie. a single block multiplication.