(git:e5fdd81)
offload_runtime.h
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: BSD-3-Clause */
6 /*----------------------------------------------------------------------------*/
7 #ifndef OFFLOAD_RUNTIME_H
8 #define OFFLOAD_RUNTIME_H
9 
10 #if defined(__OFFLOAD_OPENCL) && !defined(__DBCSR_ACC)
11 #undef __OFFLOAD_OPENCL
12 #endif
13 /* TODO: implement support or missing features */
14 #if defined(__OFFLOAD_OPENCL)
15 #if !defined(__NO_OFFLOAD_GRID)
16 #define __NO_OFFLOAD_GRID
17 #endif
18 #if !defined(__NO_OFFLOAD_PW)
19 #define __NO_OFFLOAD_PW
20 #endif
21 #endif
22 
23 #if defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) || \
24  defined(__OFFLOAD_OPENCL)
25 #define __OFFLOAD
26 
#include <assert.h> /* assert() is used by several wrappers below */
#include <stdio.h>
#include <stdlib.h>

#if defined(__OFFLOAD_CUDA)
#include <cuda_runtime.h>
#elif defined(__OFFLOAD_HIP)
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#elif defined(__OFFLOAD_OPENCL)
/* No relative path aka double-quote to avoid PACKAGE deps (-Iexts/dbcsr). */
#include <src/acc/opencl/acc_opencl.h>
#endif
39 
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43 
/* Backend-neutral aliases for the stream, event and error handle types of
 * the selected runtime (CUDA, HIP, or the DBCSR OpenCL ACC layer). */
#if defined(__OFFLOAD_CUDA)
typedef cudaStream_t offloadStream_t;
typedef cudaEvent_t offloadEvent_t;
typedef cudaError_t offloadError_t;
#elif defined(__OFFLOAD_HIP)
typedef hipStream_t offloadStream_t;
typedef hipEvent_t offloadEvent_t;
typedef hipError_t offloadError_t;
#elif defined(__OFFLOAD_OPENCL)
/* The ACC interface exposes streams and events as opaque pointers and
 * reports errors as plain int (EXIT_SUCCESS on success). */
typedef void *offloadStream_t;
typedef void *offloadEvent_t;
typedef int offloadError_t;
#endif

/* Success value that OFFLOAD_CHECK (below) compares against. */
#if defined(__OFFLOAD_CUDA)
#define offloadSuccess cudaSuccess
#elif defined(__OFFLOAD_HIP)
#define offloadSuccess hipSuccess
#elif defined(__OFFLOAD_OPENCL)
#define offloadSuccess EXIT_SUCCESS
#endif
65 
/*******************************************************************************
 * \brief Check the given offload (CUDA/HIP/OpenCL) status and upon failure
 *        abort with a nice message naming the call site (__FILE__:__LINE__).
 *        If the backend provides no error name (empty string), the numeric
 *        error code is printed instead. References offloadGetErrorName,
 *        which is defined further down; this is fine because the macro is
 *        only expanded at use sites, all of which follow that definition.
 * \author Ole Schuett
 ******************************************************************************/
#if !defined(OFFLOAD_CHECK)
#define OFFLOAD_CHECK(CMD)                                                     \
  do {                                                                         \
    const offloadError_t error = (CMD);                                        \
    if (error != offloadSuccess) {                                             \
      const char *const name = offloadGetErrorName(error);                     \
      if (NULL != name && '\0' != *name) {                                     \
        fprintf(stderr, "ERROR: \"%s\" at %s:%d\n", name, __FILE__, __LINE__); \
      } else {                                                                 \
        fprintf(stderr, "ERROR %i: %s:%d\n", (int)error, __FILE__, __LINE__);  \
      }                                                                        \
      abort();                                                                 \
    }                                                                          \
  } while (0)
#endif
85 
/*******************************************************************************
 * \brief Wrapper around cudaGetErrorName (resp. hipGetErrorName).
 * \param error Backend error code to translate.
 * \return Pointer to a static human-readable error name; never NULL here,
 *         but possibly the empty string (OpenCL backend).
 ******************************************************************************/
static inline const char *offloadGetErrorName(offloadError_t error) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetErrorName(error);
#elif defined(__OFFLOAD_HIP)
  return hipGetErrorName(error);
#elif defined(__OFFLOAD_OPENCL)
  (void)error; /* mark used */
  /* No name lookup implemented for the OpenCL backend yet; returning the
   * empty string makes OFFLOAD_CHECK print the numeric code instead. */
  return ""; /* TODO */
#endif
}
99 
/*******************************************************************************
 * \brief Wrapper around cudaGetLastError (resp. hipGetLastError).
 * \return The last error produced by the runtime (and clears it, per the
 *         CUDA/HIP API), or offloadSuccess unconditionally on OpenCL where
 *         this is not implemented yet.
 ******************************************************************************/
static inline offloadError_t offloadGetLastError(void) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetLastError();
#elif defined(__OFFLOAD_HIP)
  return hipGetLastError();
#elif defined(__OFFLOAD_OPENCL)
  return offloadSuccess; /* TODO */
#endif
}
112 
/*******************************************************************************
 * \brief Wrapper around cudaMemsetAsync.
 *        Asynchronously fills "size" bytes at device pointer "ptr" with the
 *        byte value "val" (byte-wise fill, like memset), ordered with
 *        respect to the given stream. Aborts on failure via OFFLOAD_CHECK.
 ******************************************************************************/
static inline void offloadMemsetAsync(void *const ptr, const int val,
                                      const size_t size,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_OPENCL)
  /* ACC memset takes an explicit byte offset; we always fill from offset 0 */
  OFFLOAD_CHECK(
      c_dbcsr_acc_opencl_memset(ptr, val, 0 /*offset*/, size, stream));
#endif
}
128 
/*******************************************************************************
 * \brief Wrapper around cudaMemset (blocking, byte-wise fill of device mem).
 ******************************************************************************/
static inline void offloadMemset(void *ptr, const int val, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemset(ptr, val, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemset(ptr, val, size));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): delegates to the async variant with a NULL stream --
   * presumably the ACC layer treats that as a blocking/default-ordered
   * operation; confirm against DBCSR ACC semantics. */
  offloadMemsetAsync(ptr, val, size, NULL /*stream*/);
#endif
}
141 
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyHostToDevice,...).
 * \param ptr1   Destination buffer on the device.
 * \param ptr2   Source buffer on the host.
 * \param size   Number of bytes to copy.
 * \param stream Stream the copy is ordered in.
 ******************************************************************************/
static inline void offloadMemcpyAsyncHtoD(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* Unified memory: host and device pointers may alias; nothing to copy. */
  if (ptr1 == ptr2) {
    return;
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): argument order is intentionally swapped -- the ACC h2d
   * interface apparently takes (host_src, dev_dst, ...); confirm against
   * the DBCSR ACC headers. */
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_h2d(ptr2, ptr1, size, stream));
#endif
}
163 
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyDeviceToHost,...).
 * \param ptr1   Destination buffer on the host.
 * \param ptr2   Source buffer on the device.
 * \param size   Number of bytes to copy.
 * \param stream Stream the copy is ordered in.
 ******************************************************************************/
static inline void offloadMemcpyAsyncDtoH(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* Unified memory: host and device pointers may alias; nothing to copy. */
  if (ptr1 == ptr2) {
    return;
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): ACC d2h apparently takes (dev_src, host_dst, ...), hence
   * the swapped argument order; confirm against the DBCSR ACC headers. */
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2h(ptr2, ptr1, size, stream));
#endif
}
185 
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyDeviceToDevice).
 * \param ptr1   Destination buffer on the device.
 * \param ptr2   Source buffer on the device.
 * \param size   Number of bytes to copy.
 * \param stream Stream the copy is ordered in.
 ******************************************************************************/
static inline void offloadMemcpyAsyncDtoD(void *ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): (src, dst) order follows the ACC d2d convention used by
   * the h2d/d2h wrappers above; confirm against the DBCSR ACC headers. */
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2d(ptr2, ptr1, size, stream));
#endif
}
202 
/*******************************************************************************
 * \brief Wrapper around cudaMemcpy(...,cudaMemcpyHostToDevice).
 *        Blocking copy of "size" bytes from host to device memory.
 * \param ptr_device Destination buffer on the device.
 * \param ptr_host   Source buffer on the host.
 * \param size       Number of bytes to copy.
 ******************************************************************************/
static inline void offloadMemcpyHtoD(void *ptr_device, const void *ptr_host,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_device, ptr_host, size, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* Unified memory: host and device pointers may alias; nothing to copy. */
  if (ptr_device == ptr_host) {
    return;
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_device, ptr_host, size, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): presumably blocking via the NULL (default) stream --
   * confirm against DBCSR ACC semantics. */
  offloadMemcpyAsyncHtoD(ptr_device, ptr_host, size, NULL /*stream*/);
#endif
}
221 
/*******************************************************************************
 * \brief Wrapper around cudaMemcpy(...,cudaMemcpyDeviceToHost).
 *        Blocking copy of "size" bytes from device to host memory.
 *        Note: the parameters were previously named the wrong way around
 *        (the first, destination argument was called "ptr_device" although
 *        the copy direction makes it the host buffer); the names below now
 *        match the actual roles. Callers are unaffected (positional args).
 * \param ptr_host   Destination buffer on the host.
 * \param ptr_device Source buffer on the device.
 * \param size       Number of bytes to copy.
 ******************************************************************************/
static inline void offloadMemcpyDtoH(void *ptr_host, const void *ptr_device,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_host, ptr_device, size, cudaMemcpyDeviceToHost));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* Unified memory: host and device pointers may alias; nothing to copy. */
  if (ptr_host == ptr_device) {
    return;
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_host, ptr_device, size, hipMemcpyDeviceToHost));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): presumably blocking via the NULL (default) stream --
   * confirm against DBCSR ACC semantics. */
  offloadMemcpyAsyncDtoH(ptr_host, ptr_device, size, NULL /*stream*/);
#endif
}
240 
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyToSymbol.
 *        Copies "count" bytes from host memory "src" into the device symbol
 *        (e.g. a __constant__ variable) at offset 0.
 * \param symbol Address of the device symbol (CUDA/HIP).
 * \param src    Source buffer on the host.
 * \param count  Number of bytes to copy.
 ******************************************************************************/
static inline void offloadMemcpyToSymbol(const void *symbol, const void *src,
                                         const size_t count) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyToSymbol(symbol, src, count, 0, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyToSymbol(symbol, src, count, 0, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  /* Not implemented for OpenCL: only tolerate trivial (no-op) invocations. */
  assert(NULL == symbol || NULL == src || 0 == count); /* TODO */
#endif
}
256 
/*******************************************************************************
 * \brief Wrapper around cudaEventCreate.
 * \param event [out] Receives the newly created event handle.
 ******************************************************************************/
static inline void offloadEventCreate(offloadEvent_t *event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventCreate(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventCreate(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_create(event));
#endif
}
269 
/*******************************************************************************
 * \brief Wrapper around cudaEventDestroy.
 * \param event Event handle to release; must not be used afterwards.
 ******************************************************************************/
static inline void offloadEventDestroy(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventDestroy(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventDestroy(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_destroy(event));
#endif
}
282 
283 /*******************************************************************************
284  * \brief Wrapper around cudaStreamCreate.
285  ******************************************************************************/
286 static inline void offloadStreamCreate(offloadStream_t *stream) {
287 #if defined(__OFFLOAD_CUDA)
288  OFFLOAD_CHECK(cudaStreamCreate(stream));
289 #elif defined(__OFFLOAD_HIP)
290  OFFLOAD_CHECK(hipStreamCreate(stream));
291 #elif defined(__OFFLOAD_OPENCL)
292  int least = 0;
293  OFFLOAD_CHECK(c_dbcsr_acc_stream_priority_range(&least, NULL /*greatest*/));
294  OFFLOAD_CHECK(c_dbcsr_acc_stream_create(stream, "Offload Stream", least));
295 #endif
296 }
297 
/*******************************************************************************
 * \brief Wrapper around cudaStreamDestroy.
 * \param stream Stream handle to release; must not be used afterwards.
 ******************************************************************************/
static inline void offloadStreamDestroy(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamDestroy(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamDestroy(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_destroy(stream));
#endif
}
310 
/*******************************************************************************
 * \brief Wrapper around cudaEventSynchronize.
 *        Blocks the host until the given event has completed.
 ******************************************************************************/
static inline void offloadEventSynchronize(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventSynchronize(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventSynchronize(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_synchronize(event));
#endif
}
323 
/*******************************************************************************
 * \brief Wrapper around cudaStreamSynchronize.
 *        Blocks the host until all work queued in the stream has completed.
 ******************************************************************************/
static inline void offloadStreamSynchronize(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamSynchronize(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamSynchronize(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_sync(stream));
#endif
}
336 
/*******************************************************************************
 * \brief Wrapper around cudaEventRecord.
 *        Records the event in the given stream (captures the stream's
 *        current work state for later synchronization/waiting).
 ******************************************************************************/
static inline void offloadEventRecord(offloadEvent_t event,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventRecord(event, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventRecord(event, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_record(event, stream));
#endif
}
350 
/*******************************************************************************
 * \brief Wrapper around cudaMallocHost (pinned/page-locked host allocation).
 * \param ptr  [out] Receives the allocated host pointer.
 * \param size Number of bytes to allocate.
 ******************************************************************************/
static inline void offloadMallocHost(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMallocHost(ptr, size));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostMalloc(ptr, size, hipHostMallocDefault)); // inconsistent
#else
  /* NOTE(review): under unified memory no separate host buffer is
   * allocated; *ptr is set to NULL -- presumably the caller then uses the
   * device allocation directly (cf. the ptr1 == ptr2 shortcuts in the
   * memcpy wrappers above); confirm with callers. */
  assert(NULL != ptr);
  *ptr = NULL;
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_allocate(ptr, size, NULL /*stream*/));
#else
  /* Defensive fallback: unreachable, since the enclosing header guard
   * requires one of the backend macros to be defined. */
  assert(NULL != ptr);
  *ptr = malloc(size);
#endif
}
371 
/*******************************************************************************
 * \brief Wrapper around cudaMalloc (device memory allocation).
 * \param ptr  [out] Receives the allocated device pointer.
 * \param size Number of bytes to allocate.
 ******************************************************************************/
static inline void offloadMalloc(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMalloc(ptr, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMalloc(ptr, size));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_allocate(ptr, size));
#else
  /* Defensive fallback: unreachable, since the enclosing header guard
   * requires one of the backend macros to be defined. */
  assert(NULL != ptr);
  *ptr = NULL;
#endif
}
387 
/*******************************************************************************
 * \brief Wrapper around cudaFree (release device memory from offloadMalloc).
 ******************************************************************************/
static inline void offloadFree(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFree(ptr));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipFree(ptr));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_deallocate(ptr));
#else
  /* Defensive fallback (unreachable): without a backend, offloadMalloc only
   * ever yields NULL, so anything else indicates a logic error. */
  assert(NULL == ptr);
#endif
}
402 
/*******************************************************************************
 * \brief Wrapper around cudaFreeHost (release memory from offloadMallocHost).
 ******************************************************************************/
static inline void offloadFreeHost(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFreeHost(ptr));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostFree(ptr)); // inconsistent
#endif
  /* Under unified memory offloadMallocHost returned NULL: nothing to free. */
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_deallocate(ptr, NULL /*stream*/));
#else
  /* Defensive fallback matching the malloc() branch of offloadMallocHost. */
  free(ptr);
#endif
}
419 
/*******************************************************************************
 * \brief Wrapper around cudaStreamWaitEvent.
 *        Makes all future work in "stream" wait for "event" to complete.
 * \param val Passed through as the flags argument of cuda/hipStreamWaitEvent;
 *            callers are expected to pass 0 (the OpenCL path asserts this).
 ******************************************************************************/
static inline void offloadStreamWaitEvent(offloadStream_t stream,
                                          offloadEvent_t event, const int val) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamWaitEvent(stream, event, val));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamWaitEvent(stream, event, val));
#elif defined(__OFFLOAD_OPENCL)
  assert(0 == val); /* TODO */
  OFFLOAD_CHECK(c_dbcsr_acc_stream_wait_event(stream, event));
#endif
}
434 
/*******************************************************************************
 * \brief Wrapper around cudaDeviceSynchronize.
 *        Blocks the host until all previously issued device work is done.
 ******************************************************************************/
static inline void offloadDeviceSynchronize(void) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaDeviceSynchronize());
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipDeviceSynchronize());
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_device_synchronize());
#endif
}
447 
/*******************************************************************************
 * \brief Wrapper around cudaDeviceSetLimit(cudaLimitMallocHeapSize,...).
 *        Grows the device-side malloc heap to at least required_size bytes;
 *        an already sufficient limit is left untouched. HIP support needs
 *        hipDeviceGetLimit/hipDeviceSetLimit (HIP >= 5.3); older HIP builds
 *        fall through to the no-op branch.
 * \param required_size Minimum heap size in bytes.
 ******************************************************************************/
static inline void offloadEnsureMallocHeapSize(const size_t required_size) {
#if defined(__OFFLOAD_CUDA)
  size_t heap_size = 0;
  OFFLOAD_CHECK(cudaDeviceGetLimit(&heap_size, cudaLimitMallocHeapSize));
  if (required_size > heap_size) {
    OFFLOAD_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_HIP) && (HIP_VERSION >= 50300000)
  size_t heap_size = 0;
  OFFLOAD_CHECK(hipDeviceGetLimit(&heap_size, hipLimitMallocHeapSize));
  if (required_size > heap_size) {
    OFFLOAD_CHECK(hipDeviceSetLimit(hipLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_OPENCL)
  /* Not implemented for OpenCL: only tolerate trivial (no-op) requests. */
  assert(0 == required_size); /* TODO */
#else
  (void)required_size; /* mark used */
#endif
}
470 
471 #ifdef __cplusplus
472 }
473 #endif
474 
475 #endif // defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) ||
476  // defined(__OFFLOAD_OPENCL)
477 #endif