offload_runtime.h
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
/* */
/* SPDX-License-Identifier: BSD-3-Clause */
/*----------------------------------------------------------------------------*/
#ifndef OFFLOAD_RUNTIME_H
#define OFFLOAD_RUNTIME_H

#if defined(__OFFLOAD_OPENCL) && !defined(__DBCSR_ACC)
#undef __OFFLOAD_OPENCL
#endif
/* TODO: implement support for missing features */
#if defined(__OFFLOAD_OPENCL)
#if !defined(__NO_OFFLOAD_GRID)
#define __NO_OFFLOAD_GRID
#endif
#if !defined(__NO_OFFLOAD_PW)
#define __NO_OFFLOAD_PW
#endif
#endif
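
/* Editorial note (not part of the original header): the __OFFLOAD_CUDA,
 * __OFFLOAD_HIP and __OFFLOAD_OPENCL macros tested below are expected to be
 * supplied by the build system, e.g. via -D__OFFLOAD_CUDA; typically at most
 * one backend is selected per build, and the #elif chains below pick the
 * first one that is defined. */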

#if defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) || \
    defined(__OFFLOAD_OPENCL)
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#if !defined(__OFFLOAD)
#define __OFFLOAD
#endif

#if defined(__OFFLOAD_CUDA)
#include <cuda_runtime.h>
#elif defined(__OFFLOAD_HIP)
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#elif defined(__OFFLOAD_OPENCL)
/* Use an angle-bracket include (no relative/double-quoted path) to avoid
 * PACKAGE dependencies (-Iexts/dbcsr). */
#include <acc_opencl.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

#if defined(__OFFLOAD_CUDA)
typedef cudaStream_t offloadStream_t;
typedef cudaEvent_t offloadEvent_t;
typedef cudaError_t offloadError_t;
#elif defined(__OFFLOAD_HIP)
typedef hipStream_t offloadStream_t;
typedef hipEvent_t offloadEvent_t;
typedef hipError_t offloadError_t;
#elif defined(__OFFLOAD_OPENCL)
typedef void *offloadStream_t;
typedef void *offloadEvent_t;
typedef int offloadError_t;
#endif

#if defined(__OFFLOAD_CUDA)
#define offloadSuccess cudaSuccess
#elif defined(__OFFLOAD_HIP)
#define offloadSuccess hipSuccess
#elif defined(__OFFLOAD_OPENCL)
#define offloadSuccess EXIT_SUCCESS
#endif

/*******************************************************************************
 * \brief Check the given offload status and upon failure abort with a nice
 *        message.
 * \author Ole Schuett
 ******************************************************************************/
#if !defined(OFFLOAD_CHECK)
#define OFFLOAD_CHECK(CMD) \
  do { \
    const offloadError_t error = (CMD); \
    if (error != offloadSuccess) { \
      const char *const name = offloadGetErrorName(error); \
      if (NULL != name && '\0' != *name) { \
        fprintf(stderr, "ERROR: \"%s\" at %s:%d\n", name, __FILE__, __LINE__); \
      } else { \
        fprintf(stderr, "ERROR %i: %s:%d\n", (int)error, __FILE__, __LINE__); \
      } \
      abort(); \
    } \
  } while (0)
#endif

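/* Illustrative usage (editorial addition, not part of the original header):
 * OFFLOAD_CHECK wraps any call that returns an offloadError_t and aborts the
 * program on failure, e.g.
 *   OFFLOAD_CHECK(cudaMalloc(&ptr, size));   // under __OFFLOAD_CUDA
 *   OFFLOAD_CHECK(hipStreamCreate(&stream)); // under __OFFLOAD_HIP
 * All wrappers below use it for every runtime call. */
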
/*******************************************************************************
 * \brief Wrapper around cudaGetErrorName.
 ******************************************************************************/
static inline const char *offloadGetErrorName(offloadError_t error) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetErrorName(error);
#elif defined(__OFFLOAD_HIP)
  return hipGetErrorName(error);
#elif defined(__OFFLOAD_OPENCL)
#if defined(ACC_OPENCL_ERROR_NAME)
  return ACC_OPENCL_ERROR_NAME(error);
#else
  (void)error; /* mark used */
  return "";
#endif
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaGetLastError.
 ******************************************************************************/
static inline offloadError_t offloadGetLastError(void) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetLastError();
#elif defined(__OFFLOAD_HIP)
  return hipGetLastError();
#elif defined(__OFFLOAD_OPENCL)
#if defined(ACC_OPENCL_ERROR)
  return ACC_OPENCL_ERROR();
#else
  return offloadSuccess;
#endif
#endif
}

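/*******************************************************************************
 * \brief Illustrative sketch (editorial addition, not part of the original
 *        header): how the error helpers above can be combined to verify that
 *        a preceding asynchronous operation (e.g. a kernel launch) was
 *        enqueued successfully. The function name is hypothetical.
 ******************************************************************************/
static inline void offload_example_check_last_error(void) {
  const offloadError_t error = offloadGetLastError();
  if (error != offloadSuccess) {
    fprintf(stderr, "ERROR: \"%s\"\n", offloadGetErrorName(error));
    abort();
  }
}
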
/*******************************************************************************
 * \brief Wrapper around cudaMemsetAsync.
 ******************************************************************************/
static inline void offloadMemsetAsync(void *const ptr, const int val,
                                      const size_t size,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(
      c_dbcsr_acc_opencl_memset(ptr, val, 0 /*offset*/, size, stream));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaMemset.
 ******************************************************************************/
static inline void offloadMemset(void *ptr, const int val, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemset(ptr, val, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemset(ptr, val, size));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemsetAsync(ptr, val, size, NULL /*stream*/);
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyHostToDevice,...).
 ******************************************************************************/
static inline void offloadMemcpyAsyncHtoD(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr1 == ptr2) {
    return;
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_h2d(ptr2, ptr1, size, stream));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyDeviceToHost,...).
 ******************************************************************************/
static inline void offloadMemcpyAsyncDtoH(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr1 == ptr2) {
    return;
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2h(ptr2, ptr1, size, stream));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyDeviceToDevice).
 ******************************************************************************/
static inline void offloadMemcpyAsyncDtoD(void *ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2d(ptr2, ptr1, size, stream));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaMemcpy(...,cudaMemcpyHostToDevice).
 ******************************************************************************/
static inline void offloadMemcpyHtoD(void *ptr_device, const void *ptr_host,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_device, ptr_host, size, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr_device == ptr_host) {
    return;
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_device, ptr_host, size, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemcpyAsyncHtoD(ptr_device, ptr_host, size, NULL /*stream*/);
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaMemcpy(...,cudaMemcpyDeviceToHost).
 ******************************************************************************/
static inline void offloadMemcpyDtoH(void *ptr_host, const void *ptr_device,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_host, ptr_device, size, cudaMemcpyDeviceToHost));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr_host == ptr_device) {
    return;
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_host, ptr_device, size, hipMemcpyDeviceToHost));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemcpyAsyncDtoH(ptr_host, ptr_device, size, NULL /*stream*/);
#endif
}

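/*******************************************************************************
 * \brief Illustrative sketch (editorial addition, not part of the original
 *        header): a hypothetical helper that uploads a host buffer to device
 *        memory and copies it back, using the blocking copy wrappers above.
 ******************************************************************************/
static inline void offload_example_roundtrip(void *ptr_device, void *ptr_host,
                                             const size_t size) {
  offloadMemcpyHtoD(ptr_device, ptr_host, size); /* host -> device */
  offloadMemcpyDtoH(ptr_host, ptr_device, size); /* device -> host */
}
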
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyToSymbol.
 ******************************************************************************/
static inline void offloadMemcpyToSymbol(const void *symbol, const void *src,
                                         const size_t count) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyToSymbol(symbol, src, count, 0, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyToSymbol(symbol, src, count, 0, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  assert(NULL == symbol || NULL == src || 0 == count); /* TODO */
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaEventCreate.
 ******************************************************************************/
static inline void offloadEventCreate(offloadEvent_t *event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventCreate(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventCreate(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_create(event));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaEventDestroy.
 ******************************************************************************/
static inline void offloadEventDestroy(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventDestroy(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventDestroy(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_destroy(event));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaStreamCreate.
 ******************************************************************************/
static inline void offloadStreamCreate(offloadStream_t *stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamCreate(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamCreate(stream));
#elif defined(__OFFLOAD_OPENCL)
  int least = 0;
  OFFLOAD_CHECK(c_dbcsr_acc_stream_priority_range(&least, NULL /*greatest*/));
  OFFLOAD_CHECK(c_dbcsr_acc_stream_create(stream, "Offload Stream", least));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaStreamDestroy.
 ******************************************************************************/
static inline void offloadStreamDestroy(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamDestroy(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamDestroy(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_destroy(stream));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaEventSynchronize.
 ******************************************************************************/
static inline void offloadEventSynchronize(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventSynchronize(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventSynchronize(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_synchronize(event));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaStreamSynchronize.
 ******************************************************************************/
static inline void offloadStreamSynchronize(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamSynchronize(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamSynchronize(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_sync(stream));
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaEventRecord.
 ******************************************************************************/
static inline void offloadEventRecord(offloadEvent_t event,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventRecord(event, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventRecord(event, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_record(event, stream));
#endif
}

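/*******************************************************************************
 * \brief Illustrative sketch (editorial addition, not part of the original
 *        header): a hypothetical helper combining the stream and event
 *        wrappers above to enqueue an asynchronous host-to-device copy and
 *        wait for its completion via an event.
 ******************************************************************************/
static inline void offload_example_async_upload(void *ptr_device,
                                                const void *ptr_host,
                                                const size_t size) {
  offloadStream_t stream;
  offloadEvent_t done;
  offloadStreamCreate(&stream);
  offloadEventCreate(&done);
  offloadMemcpyAsyncHtoD(ptr_device, ptr_host, size, stream); /* enqueue copy */
  offloadEventRecord(done, stream);    /* mark completion of the copy */
  offloadEventSynchronize(done);       /* host waits for the event */
  offloadEventDestroy(done);
  offloadStreamDestroy(stream);
}
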
/*******************************************************************************
 * \brief Wrapper around cudaMallocHost.
 ******************************************************************************/
static inline void offloadMallocHost(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMallocHost(ptr, size));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostMalloc(ptr, size, hipHostMallocDefault)); // inconsistent
#else
  assert(NULL != ptr);
  *ptr = NULL;
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_allocate(ptr, size, NULL /*stream*/));
#else
  assert(NULL != ptr);
  *ptr = malloc(size);
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaMalloc.
 ******************************************************************************/
static inline void offloadMalloc(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMalloc(ptr, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMalloc(ptr, size));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_allocate(ptr, size));
#else
  assert(NULL != ptr);
  *ptr = NULL;
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaFree.
 ******************************************************************************/
static inline void offloadFree(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFree(ptr));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipFree(ptr));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_deallocate(ptr));
#else
  assert(NULL == ptr);
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaFreeHost.
 ******************************************************************************/
static inline void offloadFreeHost(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFreeHost(ptr));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostFree(ptr)); // inconsistent
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_deallocate(ptr, NULL /*stream*/));
#else
  free(ptr);
#endif
}

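/*******************************************************************************
 * \brief Illustrative sketch (editorial addition, not part of the original
 *        header): typical pairing of the allocation wrappers above, i.e. a
 *        pinned host buffer plus a device buffer of the same size. The
 *        function name is hypothetical.
 ******************************************************************************/
static inline void offload_example_buffers(const size_t size) {
  void *ptr_host = NULL, *ptr_device = NULL;
  offloadMallocHost(&ptr_host, size); /* page-locked host memory */
  offloadMalloc(&ptr_device, size);   /* device memory */
  offloadMemset(ptr_device, 0, size); /* zero-initialize on the device */
  offloadFree(ptr_device);
  offloadFreeHost(ptr_host);
}
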
/*******************************************************************************
 * \brief Wrapper around cudaStreamWaitEvent.
 ******************************************************************************/
static inline void offloadStreamWaitEvent(offloadStream_t stream,
                                          offloadEvent_t event, const int val) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamWaitEvent(stream, event, val));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamWaitEvent(stream, event, val));
#elif defined(__OFFLOAD_OPENCL)
  assert(0 == val); /* TODO */
  OFFLOAD_CHECK(c_dbcsr_acc_stream_wait_event(stream, event));
#endif
}

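/*******************************************************************************
 * \brief Illustrative sketch (editorial addition, not part of the original
 *        header): establishing a cross-stream dependency with the wrappers
 *        above, i.e. work queued on the consumer stream after this call waits
 *        for everything queued on the producer stream before the recorded
 *        event. The function name is hypothetical.
 ******************************************************************************/
static inline void offload_example_stream_order(offloadStream_t producer,
                                                offloadStream_t consumer,
                                                offloadEvent_t event) {
  offloadEventRecord(event, producer);        /* mark progress on producer */
  offloadStreamWaitEvent(consumer, event, 0); /* consumer waits for the mark */
}
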
/*******************************************************************************
 * \brief Wrapper around cudaDeviceSynchronize.
 ******************************************************************************/
static inline void offloadDeviceSynchronize(void) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaDeviceSynchronize());
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipDeviceSynchronize());
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_device_synchronize());
#endif
}

/*******************************************************************************
 * \brief Wrapper around cudaDeviceSetLimit(cudaLimitMallocHeapSize,...).
 ******************************************************************************/
static inline void offloadEnsureMallocHeapSize(const size_t required_size) {
#if defined(__OFFLOAD_CUDA)
  size_t current_size;
  OFFLOAD_CHECK(cudaDeviceGetLimit(&current_size, cudaLimitMallocHeapSize));
  if (current_size < required_size) {
    OFFLOAD_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_HIP) && (HIP_VERSION >= 50300000)
  size_t current_size;
  OFFLOAD_CHECK(hipDeviceGetLimit(&current_size, hipLimitMallocHeapSize));
  if (current_size < required_size) {
    OFFLOAD_CHECK(hipDeviceSetLimit(hipLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_OPENCL)
  assert(0 == required_size); /* TODO */
#else
  (void)required_size; /* mark used */
#endif
}

#ifdef __cplusplus
}
#endif

#endif // defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) ||
       // defined(__OFFLOAD_OPENCL)
#endif // OFFLOAD_RUNTIME_H