(git:374b731)
Loading...
Searching...
No Matches
dbm_multiply_opencl.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include "../offload/offload_runtime.h"
9#if defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
10
12#include "dbm_multiply_opencl.cl.h"
13
14void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream,
15 const int mnk_range[3][2], double alpha,
16 int ntasks, const dbm_task_t *tasks,
17 const double *pack_a_data,
18 const double *pack_b_data,
19 double *shard_c_data) {
20 static cl_kernel kernel = NULL;
21 static int ndims = 1, split = 0;
22 static size_t wgsize[] = {0, 0, 0};
23 int result = EXIT_SUCCESS, verbosity = c_dbcsr_acc_opencl_config.verbosity;
24 cl_event event, *const perf_event =
25 ((0 <= verbosity && 2 >= verbosity) ? NULL : &event);
26 const c_dbcsr_acc_opencl_stream_t *const str = ACC_OPENCL_STREAM(stream);
27 const size_t max_m = mnk_range[0][1], work_tasks = ntasks;
28 size_t work_size[] = {1, 1, 1}, ibatch = 0, iadata = 0, ibdata = 0,
29 icdata = 0;
30 c_dbcsr_acc_opencl_info_memptr_t adata, bdata, cdata, batch;
31 assert(NULL != pack_a_data && NULL != pack_b_data && NULL != shard_c_data);
32 assert(0 < mnk_range[0][0] && 0 < mnk_range[0][1] &&
33 mnk_range[0][0] <= mnk_range[0][1]);
34 assert(0 < mnk_range[1][0] && 0 < mnk_range[1][1] &&
35 mnk_range[1][0] <= mnk_range[1][1]);
36 assert(0 < mnk_range[2][0] && 0 < mnk_range[2][1] &&
37 mnk_range[2][0] <= mnk_range[2][1]);
38 assert(NULL != str && NULL != str->queue);
39 assert(0 < ntasks && NULL != tasks);
40 /* creating/calling kernel must be consistent across threads */
41 ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_main);
42#if defined(OPENCL_DBM_SOURCE_MULTIPLY)
43 if (NULL == kernel) { /* first-time check if kernel is present */
44 const libxsmm_timer_tickint start = libxsmm_timer_tick();
45 char params[ACC_OPENCL_BUFFERSIZE] =
46 "-cl-fast-relaxed-math -cl-denorms-are-zero";
47 const char *const gen_env = getenv("DBM_MULTIPLY_GEN");
48 const char *const xf_env = getenv("DBM_MULTIPLY_XF");
49 const char *const lu_env = getenv("DBM_MULTIPLY_LU");
50 const char *const bn_env = getenv("DBM_MULTIPLY_BN");
51 const int gpu =
52 (CL_DEVICE_TYPE_GPU == c_dbcsr_acc_opencl_config.device.type);
53 const int gen = (NULL == gen_env ? 0 /*default*/ : atoi(gen_env));
54 const int xf = (NULL == xf_env ? -1 /*default*/ : atoi(xf_env));
55 const int lu = LIBXSMM_CLMP(NULL == lu_env ? 0 : atoi(lu_env), -2, 1);
56 int bn = (NULL == bn_env ? 8 : atoi(bn_env));
57 const char *extensions[] = {NULL, NULL}, *flags = NULL;
58 size_t nextensions = sizeof(extensions) / sizeof(*extensions);
59 const size_t wgsize0 = c_dbcsr_acc_opencl_config.device.wgsize[0];
60 const size_t wgsize1 = c_dbcsr_acc_opencl_config.device.wgsize[1];
61 size_t wgsize2 = c_dbcsr_acc_opencl_config.device.wgsize[2];
62 size_t offset = (0 == c_dbcsr_acc_opencl_config.debug ? strlen(params) : 0);
63 offset += (size_t)c_dbcsr_acc_opencl_flags_atomics(
64 &c_dbcsr_acc_opencl_config.device, c_dbcsr_acc_opencl_atomic_fp_64,
65 extensions, &nextensions, params + offset, sizeof(params) - offset);
66 if (2 <= gen || (0 != gen && 0 != wgsize2 /*subgroups*/ &&
67 2 <= *c_dbcsr_acc_opencl_config.device.std_level &&
68 NULL != extensions[1] &&
69 NULL != strstr(extensions[1], "cl_ext_float_atomics"))) {
70 offset +=
71 (size_t)LIBXSMM_SNPRINTF(params + offset, sizeof(params) - offset,
72 " -DDBM_MULTIPLY_OPENCL_GEN");
73 if (0 != c_dbcsr_acc_opencl_config.device.intel && 0 != xf) {
74 flags = "-cl-intel-256-GRF-per-thread";
75 }
76 wgsize[1] = wgsize[2] = 1;
77 wgsize[0] = 16;
78 ndims = 3;
79 } else {
80 const char *const split_env = getenv("DBM_MULTIPLY_SPLIT");
81 const char *const wg_env = getenv("DBM_MULTIPLY_WG");
82 split = (NULL == split_env ? 1 /*default*/ : atoi(split_env));
83 wgsize[0] =
84 (NULL == wg_env ? (1 != split ? (wgsize1 * LIBXSMM_ABS(split)) : 0)
85 : strtoul(wg_env, NULL, 10));
86 if (0 != split && 1 != split && (bn * bn) > (int)wgsize[0]) {
87 wgsize[0] = bn * bn;
88 }
89 if (0 != split && 0 != wgsize2 && 0 < wgsize[0]) { /* subgroups */
90 if (LIBXSMM_DELTA(wgsize[0], wgsize1) <=
91 LIBXSMM_DELTA(wgsize[0], wgsize2)) { /* select SG-size */
92 wgsize2 = wgsize1;
93 }
94 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize2);
95 } else {
96 wgsize[0] = LIBXSMM_UP(wgsize[0], wgsize1);
97 wgsize2 = 0;
98 }
99 wgsize[0] = LIBXSMM_CLMP(wgsize[0], 0, wgsize0);
100 if (NULL == bn_env && 0 != split && 1 != split &&
101 (bn * bn) < (int)wgsize[0]) {
102 bn = libxsmm_isqrt2_u32(wgsize[0]);
103 }
104 bn = LIBXSMM_CLMP(bn, 4, 32);
105 offset += (size_t)LIBXSMM_SNPRINTF(
106 params + offset, sizeof(params) - offset,
107 " %s -DSPLIT=%i -DBN=%i -DWG=%i -DSG=%i -DLU=%i",
108 0 != gpu ? "-DGPU" : "", split, bn, (int)wgsize[0], (int)wgsize2, lu);
109 if (0 != c_dbcsr_acc_opencl_config.device.intel && 0 < xf) {
110 flags = "-cl-intel-256-GRF-per-thread";
111 }
112 }
113 result |= (sizeof(params) > offset ? EXIT_SUCCESS : EXIT_FAILURE);
114 result |= c_dbcsr_acc_opencl_kernel(
115 0 /*source_is_file*/, OPENCL_DBM_SOURCE_MULTIPLY, "dbm_multiply",
116 params, flags, NULL /*try*/, NULL /*try_ok*/, extensions, nextensions,
117 &kernel);
118 if (2 <= verbosity || 0 > verbosity) {
119 if (EXIT_SUCCESS == result) {
120 const double d = libxsmm_timer_duration(start, libxsmm_timer_tick());
121 fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel gpu=%i", gpu);
122 if (0 == gen) {
123 fprintf(stderr, " split=%i lu=%i bn=%i", split, lu, bn);
124 } else { /* generated kernel */
125 fprintf(stderr, " gen=%i", gen);
126 }
127 fprintf(stderr, " wg=%i sg=%i ms=%.1f\n", (int)wgsize[0], (int)wgsize2,
128 1E3 * d);
129 } else {
130 fprintf(stderr, "INFO ACC/LIBDBM: DBM-kernel failed to generate\n");
131 }
132 }
133 }
134#else
135#error "OpenCL kernel code not found!"
136#endif
137 result |= c_dbcsr_acc_opencl_info_devptr_lock(&adata, NULL /*lock*/,
138 pack_a_data, 1 /*esize*/,
139 NULL /*amount*/, &iadata);
140 result |= c_dbcsr_acc_opencl_info_devptr_lock(&bdata, NULL /*lock*/,
141 pack_b_data, 1 /*esize*/,
142 NULL /*amount*/, &ibdata);
143 result |= c_dbcsr_acc_opencl_info_devptr_lock(&cdata, NULL /*lock*/,
144 shard_c_data, 1 /*esize*/,
145 NULL /*amount*/, &icdata);
146 result |= c_dbcsr_acc_opencl_info_devptr_lock(
147 &batch, NULL /*lock*/, tasks /*batch*/, sizeof(dbm_task_t), &work_tasks,
148 &ibatch);
149 assert(0 == iadata && 0 == ibdata && 0 == icdata);
150 result |= clSetKernelArg(kernel, 0, sizeof(cl_double), &alpha);
151 result |= clSetKernelArg(kernel, 1, sizeof(cl_int), &ibatch);
152 if (1 < ndims) { /* generated kernel */
153 const cl_uint zero = 0;
154 assert(0 != wgsize[1] && 0 != wgsize[1] && 0 != wgsize[2]);
155 work_size[0] = 16;
156 assert(1 == work_size[1]);
157 work_size[2] = work_tasks;
158 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 2, batch.memory);
159 result |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &zero /*shape*/);
160 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, adata.memory);
161 result |= clSetKernelArg(kernel, 5, sizeof(cl_uint), &zero /*A_shape0*/);
162 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
163 result |= clSetKernelArg(kernel, 7, sizeof(cl_uint), &zero /*B_shape0*/);
164 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 8, cdata.memory);
165 result |= clSetKernelArg(kernel, 9, sizeof(cl_uint), &zero /*C_shape0*/);
166 } else {
167 result |= clSetKernelArg(kernel, 2, sizeof(cl_int), &ntasks);
168 if (0 != split) {
169 if (1 == split || 0 == wgsize[0]) {
170 work_size[0] = work_tasks * max_m;
171 result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size);
172 if (0 < wgsize[0]) { /* fixup to be a multiple of the WG-size */
173 work_size[0] = LIBXSMM_UP(work_size[0], wgsize[0]);
174 }
175 } else {
176 work_size[0] = work_tasks * wgsize[0];
177 result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size);
178 }
179 } else {
180 work_size[0] = work_tasks;
181 result |= clSetKernelArg(kernel, 3, sizeof(cl_int), work_size);
182 }
183 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 4, batch.memory);
184 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 5, adata.memory);
185 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 6, bdata.memory);
186 result |= c_dbcsr_acc_opencl_set_kernel_ptr(kernel, 7, cdata.memory);
187 }
188 result |= clEnqueueNDRangeKernel(
189 str->queue, kernel, ndims, NULL, work_size, 0 < wgsize[0] ? wgsize : NULL,
190 0 /*num_wait*/, NULL /*wait_list*/, perf_event);
191 if (NULL != perf_event && EXIT_SUCCESS == result) {
192 cl_ulong begin = 0, end = 0;
193 clWaitForEvents(1, perf_event);
194 result |= clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START,
195 sizeof(cl_ulong), &begin, NULL);
196 result |= clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_END,
197 sizeof(cl_ulong), &end, NULL);
198 if (EXIT_SUCCESS == result) {
199 const double duration_ns = LIBXSMM_DELTA(begin, end);
200 const double gflops =
201 (max_m * mnk_range[1][1] * mnk_range[2][1] * ntasks) / duration_ns;
202 fprintf(stderr,
203 "INFO ACC/LIBDBM: DBM-kernel mnk=%ix%ix%i "
204 "ntasks=%i gflops=%.1f ms=%.2g\n",
205 mnk_range[0][1], mnk_range[1][1], mnk_range[2][1], ntasks, gflops,
206 1E-6 * duration_ns);
207 }
208 }
209 ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_main);
210 OFFLOAD_CHECK(result);
211}
212
213#endif // defined(__OFFLOAD_OPENCL) && !defined(__NO_OFFLOAD_DBM)
214
215// EOF
real(dp), dimension(3) d
Internal struct for storing a task, ie. a single block multiplication.