#ifndef OFFLOAD_RUNTIME_H
#define OFFLOAD_RUNTIME_H
#if defined(__OFFLOAD_OPENCL) && !defined(__DBCSR_ACC)
#undef __OFFLOAD_OPENCL /* the OpenCL backend requires DBCSR's ACC interface */
#endif
#if defined(__OFFLOAD_OPENCL)
#if !defined(__NO_OFFLOAD_GRID)
#define __NO_OFFLOAD_GRID
#endif
#if !defined(__NO_OFFLOAD_PW)
#define __NO_OFFLOAD_PW
#endif
#endif
#if defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) ||                      \
    defined(__OFFLOAD_OPENCL)

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#if !defined(__OFFLOAD)
#define __OFFLOAD
#endif
#if defined(__OFFLOAD_CUDA)
#include <cuda_runtime.h>
#elif defined(__OFFLOAD_HIP)
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#elif defined(__OFFLOAD_OPENCL)
#include <acc_opencl.h>
#endif
#if defined(__OFFLOAD_CUDA)
typedef cudaStream_t offloadStream_t;
typedef cudaEvent_t offloadEvent_t;
typedef cudaError_t offloadError_t;
#elif defined(__OFFLOAD_HIP)
typedef hipStream_t offloadStream_t;
typedef hipEvent_t offloadEvent_t;
typedef hipError_t offloadError_t;
#elif defined(__OFFLOAD_OPENCL)
typedef void *offloadStream_t;
typedef void *offloadEvent_t;
typedef int offloadError_t;
#endif
#if defined(__OFFLOAD_CUDA)
#define offloadSuccess cudaSuccess
#elif defined(__OFFLOAD_HIP)
#define offloadSuccess hipSuccess
#elif defined(__OFFLOAD_OPENCL)
#define offloadSuccess EXIT_SUCCESS
#endif
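
/*******************************************************************************
 * \brief Checks the given offload status and aborts with a message on failure.
 ******************************************************************************/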
#if !defined(OFFLOAD_CHECK)
#define OFFLOAD_CHECK(CMD)                                                     \
  do {                                                                         \
    const offloadError_t error = (CMD);                                        \
    if (error != offloadSuccess) {                                             \
      const char *const name = offloadGetErrorName(error);                     \
      if (NULL != name && '\0' != *name) {                                     \
        fprintf(stderr, "ERROR: \"%s\" at %s:%d\n", name, __FILE__, __LINE__); \
      } else {                                                                 \
        fprintf(stderr, "ERROR %i: %s:%d\n", (int)error, __FILE__, __LINE__);  \
      }                                                                        \
      abort();                                                                 \
    }                                                                          \
  } while (0)
#endif
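
/* Usage sketch (illustrative): wrap any call that returns an offloadError_t,
 * e.g. OFFLOAD_CHECK(offloadGetLastError()); on failure the macro prints the
 * error name (or its numeric code) together with file/line and aborts. */

/*******************************************************************************
 * \brief Returns a printable name for the given backend error code.
 ******************************************************************************/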
static inline const char *offloadGetErrorName(offloadError_t error) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetErrorName(error);
#elif defined(__OFFLOAD_HIP)
  return hipGetErrorName(error);
#elif defined(__OFFLOAD_OPENCL)
#if defined(ACC_OPENCL_ERROR_NAME)
  return ACC_OPENCL_ERROR_NAME(error);
#else
  (void)error; /* fallback when the OpenCL backend provides no name lookup */
  return "";
#endif
#endif
}
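
/*******************************************************************************
 * \brief Returns the last error reported by the offload runtime.
 ******************************************************************************/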
static inline offloadError_t offloadGetLastError(void) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetLastError();
#elif defined(__OFFLOAD_HIP)
  return hipGetLastError();
#elif defined(__OFFLOAD_OPENCL)
#if defined(ACC_OPENCL_ERROR)
  return ACC_OPENCL_ERROR();
#else
  return offloadSuccess;
#endif
#endif
}
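
/*******************************************************************************
 * \brief Sets size bytes at the device pointer ptr to val, asynchronously.
 ******************************************************************************/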
static inline void offloadMemsetAsync(void *const ptr, const int val,
                                      const size_t size,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(
      c_dbcsr_acc_opencl_memset(ptr, val, 0 /*offset*/, size, stream));
#endif
}
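
/*******************************************************************************
 * \brief Sets size bytes at the device pointer ptr to val (blocking).
 ******************************************************************************/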
static inline void offloadMemset(void *ptr, const int val, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemset(ptr, val, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemset(ptr, val, size));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemsetAsync(ptr, val, size, NULL /*stream*/);
#endif
}
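
/*******************************************************************************
 * \brief Asynchronous host-to-device (h2d) memcpy.
 ******************************************************************************/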
static inline void offloadMemcpyAsyncHtoD(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr1 == ptr2) {
    return; /* host and device alias the same unified allocation */
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_h2d(ptr2, ptr1, size, stream));
#endif
}
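
/*******************************************************************************
 * \brief Asynchronous device-to-host (d2h) memcpy.
 ******************************************************************************/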
static inline void offloadMemcpyAsyncDtoH(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr1 == ptr2) {
    return; /* host and device alias the same unified allocation */
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2h(ptr2, ptr1, size, stream));
#endif
}
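
/*******************************************************************************
 * \brief Asynchronous device-to-device (d2d) memcpy.
 ******************************************************************************/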
static inline void offloadMemcpyAsyncDtoD(void *ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2d(ptr2, ptr1, size, stream));
#endif
}
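
/*******************************************************************************
 * \brief Blocking host-to-device memcpy.
 ******************************************************************************/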
static inline void offloadMemcpyHtoD(void *ptr_device, const void *ptr_host,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_device, ptr_host, size, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr_device == ptr_host) {
    return; /* nothing to copy within a unified allocation */
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_device, ptr_host, size, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemcpyAsyncHtoD(ptr_device, ptr_host, size, NULL /*stream*/);
#endif
}
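
/*******************************************************************************
 * \brief Blocking device-to-host memcpy.
 ******************************************************************************/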
static inline void offloadMemcpyDtoH(void *ptr_host, const void *ptr_device,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_host, ptr_device, size, cudaMemcpyDeviceToHost));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr_host == ptr_device) {
    return; /* nothing to copy within a unified allocation */
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_host, ptr_device, size, hipMemcpyDeviceToHost));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemcpyAsyncDtoH(ptr_host, ptr_device, size, NULL /*stream*/);
#endif
}
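
/*******************************************************************************
 * \brief Copies count bytes from host memory to a device symbol.
 ******************************************************************************/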
static inline void offloadMemcpyToSymbol(const void *symbol, const void *src,
                                         const size_t count) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyToSymbol(symbol, src, count, 0, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyToSymbol(symbol, src, count, 0, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  assert(NULL == symbol || NULL == src || 0 == count); /* not supported */
#endif
}
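
/*******************************************************************************
 * \brief Creates an event.
 ******************************************************************************/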
static inline void offloadEventCreate(offloadEvent_t *event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventCreate(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventCreate(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_create(event));
#endif
}
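
/*******************************************************************************
 * \brief Destroys an event.
 ******************************************************************************/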
static inline void offloadEventDestroy(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventDestroy(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventDestroy(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_destroy(event));
#endif
}
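
/*******************************************************************************
 * \brief Creates a stream.
 ******************************************************************************/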
static inline void offloadStreamCreate(offloadStream_t *stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamCreate(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamCreate(stream));
#elif defined(__OFFLOAD_OPENCL)
  int least = -1; /* queried below; -1 is only an initializer */
  OFFLOAD_CHECK(c_dbcsr_acc_stream_priority_range(&least, NULL /*greatest*/));
  OFFLOAD_CHECK(c_dbcsr_acc_stream_create(stream, "Offload Stream", least));
#endif
}
static inline void offloadStreamDestroy(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamDestroy(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamDestroy(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_destroy(stream));
#endif
}
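
/*******************************************************************************
 * \brief Waits until the given event has completed.
 ******************************************************************************/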
static inline void offloadEventSynchronize(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventSynchronize(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventSynchronize(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_synchronize(event));
#endif
}
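
/*******************************************************************************
 * \brief Waits until all work on the given stream has completed.
 ******************************************************************************/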
static inline void offloadStreamSynchronize(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamSynchronize(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamSynchronize(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_sync(stream));
#endif
}
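
/*******************************************************************************
 * \brief Records an event on the given stream.
 ******************************************************************************/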
static inline void offloadEventRecord(offloadEvent_t event,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventRecord(event, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventRecord(event, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_record(event, stream));
#endif
}
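
/* Usage sketch (illustrative): order work across two streams with an event.
 *   offloadEventRecord(event, stream_a);        // mark a point on stream_a
 *   offloadStreamWaitEvent(stream_b, event, 0); // stream_b waits for it
 * (offloadStreamWaitEvent is defined further below.) */

/*******************************************************************************
 * \brief Allocates page-locked ("pinned") host memory.
 ******************************************************************************/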
static inline void offloadMallocHost(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMallocHost(ptr, size));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostMalloc(ptr, size, hipHostMallocDefault));
#else
  /* reconstructed branch: assume a managed allocation backs the host buffer */
  OFFLOAD_CHECK(hipMallocManaged(ptr, size, hipMemAttachGlobal));
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_allocate(ptr, size, NULL /*stream*/));
#endif
}
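
/*******************************************************************************
 * \brief Allocates device memory.
 ******************************************************************************/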
static inline void offloadMalloc(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMalloc(ptr, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMalloc(ptr, size));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_allocate(ptr, size));
#endif
}
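
/*******************************************************************************
 * \brief Frees device memory.
 ******************************************************************************/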
static inline void offloadFree(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFree(ptr));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipFree(ptr));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_deallocate(ptr));
#endif
}
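
/*******************************************************************************
 * \brief Frees page-locked host memory.
 ******************************************************************************/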
static inline void offloadFreeHost(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFreeHost(ptr));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostFree(ptr));
#else
  /* reconstructed branch: pairs with the managed allocation made above */
  OFFLOAD_CHECK(hipFree(ptr));
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_deallocate(ptr, NULL /*stream*/));
#endif
}
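
/*******************************************************************************
 * \brief Makes the stream wait for the given event (val passes backend flags).
 ******************************************************************************/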
static inline void offloadStreamWaitEvent(offloadStream_t stream,
                                          offloadEvent_t event,
                                          const int val) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamWaitEvent(stream, event, val));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamWaitEvent(stream, event, val));
#elif defined(__OFFLOAD_OPENCL)
  (void)val; /* flags are not used by the OpenCL backend */
  OFFLOAD_CHECK(c_dbcsr_acc_stream_wait_event(stream, event));
#endif
}
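
/*******************************************************************************
 * \brief Waits until all work on the device has completed.
 ******************************************************************************/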
static inline void offloadDeviceSynchronize(void) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaDeviceSynchronize());
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipDeviceSynchronize());
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_device_synchronize());
#endif
}
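
/*******************************************************************************
 * \brief Ensures the device-side malloc heap is at least required_size bytes.
 ******************************************************************************/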
static inline void offloadEnsureMallocHeapSize(const size_t required_size) {
#if defined(__OFFLOAD_CUDA)
  size_t current_size;
  OFFLOAD_CHECK(cudaDeviceGetLimit(&current_size, cudaLimitMallocHeapSize));
  if (current_size < required_size) {
    OFFLOAD_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_HIP) && (HIP_VERSION >= 50300000)
  size_t current_size;
  OFFLOAD_CHECK(hipDeviceGetLimit(&current_size, hipLimitMallocHeapSize));
  if (current_size < required_size) {
    OFFLOAD_CHECK(hipDeviceSetLimit(hipLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_OPENCL)
  assert(0 == required_size); /* not supported by the OpenCL backend */
#else
  (void)required_size; /* older HIP: leave the device heap limit unchanged */
#endif
}

#endif /* __OFFLOAD_CUDA || __OFFLOAD_HIP || __OFFLOAD_OPENCL */
#endif /* OFFLOAD_RUNTIME_H */