(git:ed6f26b)
Loading...
Searching...
No Matches
dbm_multiply_opencl.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include "../offload/offload_runtime.h"
9#if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
10
12#include "dbm_multiply_opencl.cl.h"
13
14typedef struct {
15 int max_m, max_n, avg_m, avg_n, avg_k, changes;
16} dbm_multiply_gpu_launch_info_t;
17
18static void dbm_multiply_gpu_launch_info(dbm_multiply_gpu_launch_info_t *info,
19 const dbm_task_t *tasks, int ntasks) {
20 int i = 1;
21 info->max_m = tasks[0].m;
22 info->avg_m = tasks[0].m;
23 info->max_n = tasks[0].n;
24 info->avg_n = tasks[0].n;
25 info->avg_k = tasks[0].k;
26 for (info->changes = 0; i < ntasks; ++i) {
27 const int m = tasks[i].m, n = tasks[i].n, k = tasks[i].k;
28 info->max_m = imax(info->max_m, m);
29 info->max_n = imax(info->max_n, n);
30 if (info->avg_m != m || info->avg_n != n || info->avg_k != k) {
31 info->avg_m = (info->avg_m + m) / 2;
32 info->avg_n = (info->avg_n + n) / 2;
33 info->avg_k = (info->avg_k + k) / 2;
34 ++info->changes;
35 }
36 }
37}
38
39void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream, double alpha,
40 int ntasks, const dbm_task_t *tasks_host,
41 const dbm_task_t *tasks,
42 const double *pack_a_data,
43 const double *pack_b_data,
44 double *shard_c_data) {
45 /* creating/calling kernel must be consistent across threads */
46 static cl_kernel kernel_global = NULL;
47 static LIBXSMM_TLS cl_kernel kernel = NULL;
48 static int ndims = 1, clinear = 0;
49 static size_t wgsize[] = {0, 0, 0};
50 const libxsmm_timer_tickint start = libxsmm_timer_tick();
51 const c_dbcsr_acc_opencl_config_t *const config = &c_dbcsr_acc_opencl_config;
52 const int verbosity = config->verbosity;
53 int result = EXIT_SUCCESS;
54 cl_event event, *const perf_event =
55 ((0 <= verbosity && 2 >= verbosity) ? NULL : &event);
56 const c_dbcsr_acc_opencl_stream_t *const str = ACC_OPENCL_STREAM(stream);
57 size_t work_size[] = {1, 1, 1}, ibatch = 0;
58 size_t iadata = 0, ibdata = 0, icdata = 0;
59 const size_t work_tasks = ntasks;
60 dbm_multiply_gpu_launch_info_t info = {0};
61 c_dbcsr_acc_opencl_info_memptr_t adata, bdata, cdata, batch;
62 assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
63 assert(NULL != str && NULL != str->queue);
64 assert(0 < ntasks && NULL != tasks);
65#if defined(OPENCL_DBM_SOURCE_MULTIPLY)
66 if (NULL == kernel_global) { /* initial check if kernel is present */
67 ACC_OPENCL_ACQUIRE(config->lock_main);
68 if (NULL == kernel_global) {
69 char params[ACC_OPENCL_BUFFERSIZE] =
70 "-cl-fast-relaxed-math -cl-denorms-are-zero";
71 const char *const gen_env = getenv("DBM_MULTIPLY_GEN");
72 const char *const lin_env = getenv("DBM_MULTIPLY_LIN");
73 const char *const bn_env = getenv("DBM_MULTIPLY_BN");
74 const char *const sm_env = getenv("DBM_MULTIPLY_SM");
75 const char *const wg_env = getenv("DBM_MULTIPLY_WG");
76 const char *const lu_env = getenv("DBM_MULTIPLY_LU");
77 const char *const xf_env = getenv("DBM_MULTIPLY_XF");
78 const c_dbcsr_acc_opencl_device_t *const devinfo = &config->device;
79 int sm = (NULL == sm_env ? 0 /*default*/ : atoi(sm_env));
80 const int bn0 = (0 == devinfo->nv ? (0 == devinfo->amd ? 4 : 8) : 2);
81 const int bn1 = ((0 == sm && 0 == clinear) ? bn0 : (bn0 * 2));
82 int bn = LIBXSMM_CLMP(NULL == bn_env ? bn1 : atoi(bn_env), 1, 32);
83 int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
84 int gen = ((NULL == bn_env && NULL == sm_env && NULL == wg_env &&
85 NULL == lu_env && NULL == lin_env)
86 ? (NULL == gen_env ? 1 /*default*/ : atoi(gen_env))
87 : 0);
88 const int gpu = (CL_DEVICE_TYPE_GPU == devinfo->type);
89 const int xf = (NULL == xf_env ? -1 /*default*/ : atoi(xf_env));
90 const char *extensions[] = {NULL, NULL}, *flags = NULL;
91 size_t nextensions = sizeof(extensions) / sizeof(*extensions);
92 const size_t wgsize0 = devinfo->wgsize[0], wgsize1 = devinfo->wgsize[1];
93 size_t wgsize2 = devinfo->wgsize[2];
94 size_t offset =
95 ((0 == config->debug && 0 == config->dump) ? strlen(params) : 0);
96 offset += (size_t)c_dbcsr_acc_opencl_flags_atomics(
97 devinfo, c_dbcsr_acc_opencl_atomic_fp_64, extensions, &nextensions,
98 params + offset, sizeof(params) - offset);
99 if (2 <= gen || (0 != gen && 0 != wgsize2 /*subgroups*/ &&
100 2 <= *devinfo->std_level && NULL != extensions[1] &&
101 NULL != strstr(extensions[1], "cl_ext_float_atomics"))) {
102 offset +=
103 (size_t)LIBXSMM_SNPRINTF(params + offset, sizeof(params) - offset,
104 " -DDBM_MULTIPLY_OPENCL_GEN");
105 wgsize[1] = wgsize[2] = 1;
106 wgsize[0] = 16;
107 lu = bn = 0;
108 ndims = 3;
109 } else {
110 wgsize[0] = (NULL == wg_env ? (unsigned long int)LIBXSMM_ABS(sm)
111 : strtoul(wg_env, NULL, 10));
112 if (0 != wgsize2 && 0 < wgsize[0]) { /* subgroups */
113 if (LIBXSMM_DELTA(wgsize[0], wgsize1) <=
114 LIBXSMM_DELTA(wgsize[0], wgsize2)) { /* select SG-size */
115 wgsize2 = wgsize1;
116 }
117 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize2);
118 } else {
119 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize1);
120 wgsize2 = 0;
121 }
122 wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, wgsize0);
123 sm = ((0 != sm && 0 != wgsize[0])
124 ? (LIBXSMM_ISPOT(bn * sizeof(double)) + 1)
125 : 0);
126 clinear = (NULL == lin_env ? 0 /*default*/ : atoi(lin_env));
127 offset += (size_t)LIBXSMM_SNPRINTF(
128 params + offset, sizeof(params) - offset,
129 " %s %s -DBN=%i -DSM=%i -DLU=%i -DWG=%i -DSG=%i",
130 0 != gpu ? "-DGPU" : "", 0 == clinear ? "" : "-DCLINEAR", bn, sm,
131 lu, (int)wgsize[0], (int)wgsize2);
132 gen = 0;
133 }
134 if (0 != devinfo->intel && 0 < xf) {
135 flags = "-cl-intel-256-GRF-per-thread";
136 }
137 result |= (sizeof(params) > offset ? EXIT_SUCCESS : EXIT_FAILURE);
138 result |= c_dbcsr_acc_opencl_kernel(
139 0 /*source_is_file*/, OPENCL_DBM_SOURCE_MULTIPLY, "dbm_multiply",
140 params, flags, NULL /*try*/, NULL /*try_ok*/, extensions, nextensions,
141 &kernel_global);
142 if (2 <= verbosity || 0 > verbosity) {
143 if (EXIT_SUCCESS == result) {
144 const double ds = libxsmm_timer_duration(start, libxsmm_timer_tick());
145 fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu);
146 if (0 != gen) { /* generated kernel */
147 fprintf(stderr, " gen=%i", gen);
148 }
149 if (0 != clinear) {
150 fprintf(stderr, " lin=%i", clinear);
151 }
152 if (0 != bn) {
153 fprintf(stderr, " bn=%i", bn);
154 }
155 if (0 != sm) {
156 fprintf(stderr, " sm=%i", sm);
157 }
158 if (0 != wgsize[0]) {
159 fprintf(stderr, " wg=%i", (int)wgsize[0]);
160 }
161 if (0 != wgsize2) {
162 fprintf(stderr, " sg=%i", (int)wgsize2);
163 }
164 if (0 != lu) {
165 fprintf(stderr, " lu=%i", lu);
166 }
167 fprintf(stderr, " ms=%.1f\n", 1E3 * ds);
168 } else {
169 fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel failed to generate\n");
170 }
171 }
172 }
173 kernel = clCloneKernel(kernel_global, &result); /* always clone */
174 ACC_OPENCL_RELEASE(config->lock_main);
175 } else if (NULL == kernel) {
176 kernel = clCloneKernel(kernel_global, &result);
177 }
178#else
179#error "OpenCL kernel code not found!"
180#endif
181 result |= c_dbcsr_acc_opencl_info_devptr_lock(&adata, NULL /*lock*/,
182 pack_a_data, 1 /*esize*/,
183 NULL /*amount*/, &iadata);
184 result |= c_dbcsr_acc_opencl_info_devptr_lock(&bdata, NULL /*lock*/,
185 pack_b_data, 1 /*esize*/,
186 NULL /*amount*/, &ibdata);
187 result |= c_dbcsr_acc_opencl_info_devptr_lock(&cdata, NULL /*lock*/,
188 shard_c_data, 1 /*esize*/,
189 NULL /*amount*/, &icdata);
190 result |= c_dbcsr_acc_opencl_info_devptr_lock(
191 &batch, NULL /*lock*/, tasks /*batch*/, sizeof(dbm_task_t), &work_tasks,
192 &ibatch);
193 assert(0 == iadata && 0 == ibdata && 0 == icdata);
194 result |= clSetKernelArg(kernel, 0, sizeof(cl_double), &alpha);
195 result |= clSetKernelArg(kernel, 1, sizeof(cl_int), &ibatch);
196 if (1 < ndims) { /* DBM_MULTIPLY_GEN */
197 const cl_uint zero = 0;
198 assert(0 != wgsize[1] && 0 != wgsize[1] && 0 != wgsize[2]);
199 work_size[0] = 16;
200 assert(1 == work_size[1]);
201 work_size[2] = work_tasks;
202 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 2, batch.memory);
203 result |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &zero /*shape*/);
204 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, adata.memory);
205 result |= clSetKernelArg(kernel, 5, sizeof(cl_uint), &zero /*A_shape0*/);
206 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
207 result |= clSetKernelArg(kernel, 7, sizeof(cl_uint), &zero /*B_shape0*/);
208 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
209 result |= clSetKernelArg(kernel, 9, sizeof(cl_uint), &zero /*C_shape0*/);
210 } else {
211 size_t size = work_tasks;
212 dbm_multiply_gpu_launch_info(&info, tasks_host, ntasks);
213 size *= (0 == clinear ? info.max_m : info.max_n);
214 /* fixup to be a multiple of the WG-size */
215 work_size[0] = (0 < wgsize[0] ? LIBXSMM_UP(size, wgsize[0]) : size);
216 result |= clSetKernelArg(kernel, 2, sizeof(cl_int), &ntasks);
217 result |= clSetKernelArg(kernel, 3, sizeof(cl_int), &size);
218 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, batch.memory);
219 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 5, adata.memory);
220 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
221 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 7, cdata.memory);
222 }
223 result |= clEnqueueNDRangeKernel(
224 str->queue, kernel, ndims, NULL, work_size, 0 < wgsize[0] ? wgsize : NULL,
225 0 /*num_wait*/, NULL /*wait_list*/, perf_event);
226 if (NULL != perf_event && EXIT_SUCCESS == result &&
227 EXIT_SUCCESS == clWaitForEvents(1, perf_event)) {
228 const double dhost = libxsmm_timer_duration(start, libxsmm_timer_tick());
229 cl_ulong begin = 0, end = 0;
230 if (EXIT_SUCCESS ==
231 clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START,
232 sizeof(cl_ulong), &begin, NULL) &&
233 EXIT_SUCCESS == clGetEventProfilingInfo(*perf_event,
234 CL_PROFILING_COMMAND_END,
235 sizeof(cl_ulong), &end, NULL)) {
236 const double dkrnl = LIBXSMM_DELTA(begin, end) * 1E-6;
237 const double dtotl =
238 LIBXSMM_MAX(dkrnl, dhost / c_dbcsr_acc_opencl_config.nthreads * 1E+3);
239 int pure;
240 if (1 < ndims) { /* DBM_MULTIPLY_GEN */
241 dbm_multiply_gpu_launch_info(&info, tasks_host, ntasks);
242 }
243 pure = (100 * (ntasks - info.changes) + ntasks - 1) / ntasks;
244 fprintf(stderr,
245 "INFO ACC/LIBDBM: DBM-kernel mnk=%ix%ix%i pure=%i%% "
246 "ntasks=%i kernel_ms=%.1f total_ms=%.1f gflops=%.1f\n",
247 info.avg_m, info.avg_n, info.avg_k, pure, ntasks, dkrnl, dtotl,
248 2E-6 * info.avg_m * info.avg_n * info.avg_k * ntasks *
249 c_dbcsr_acc_opencl_config.nranks / dtotl);
250 }
251 }
252 OFFLOAD_CHECK(result);
253}
254
255#endif // defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
256
257// EOF
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static void const int const int i
static grid_library_config config
Internal struct for storing a task, ie. a single block multiplication.