#ifndef OFFLOAD_RUNTIME_H
#define OFFLOAD_RUNTIME_H
#if defined(__OFFLOAD_OPENCL) && !defined(__DBCSR_ACC)
#undef __OFFLOAD_OPENCL
#endif
#if defined(__OFFLOAD_OPENCL)
#if !defined(__NO_OFFLOAD_GRID)
#define __NO_OFFLOAD_GRID
#endif
#if !defined(__NO_OFFLOAD_PW)
#define __NO_OFFLOAD_PW
#endif
#endif
#if defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) || \
    defined(__OFFLOAD_OPENCL)
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#if !defined(__OFFLOAD)
#define __OFFLOAD
#endif
#if defined(__OFFLOAD_CUDA)
#include <cuda_runtime.h>
#elif defined(__OFFLOAD_HIP)
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#elif defined(__OFFLOAD_OPENCL)
#include <acc_opencl.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif
#if defined(__OFFLOAD_CUDA)
typedef cudaStream_t offloadStream_t;
typedef cudaEvent_t offloadEvent_t;
typedef cudaError_t offloadError_t;
#elif defined(__OFFLOAD_HIP)
typedef hipStream_t offloadStream_t;
typedef hipEvent_t offloadEvent_t;
typedef hipError_t offloadError_t;
#elif defined(__OFFLOAD_OPENCL)
typedef void *offloadStream_t;
typedef void *offloadEvent_t;
typedef int offloadError_t;
#endif
#if defined(__OFFLOAD_CUDA)
#define offloadSuccess cudaSuccess
#elif defined(__OFFLOAD_HIP)
#define offloadSuccess hipSuccess
#elif defined(__OFFLOAD_OPENCL)
#define offloadSuccess EXIT_SUCCESS
#endif
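
/*******************************************************************************
 * \brief Checks the result of an offload API call and aborts on failure.
 ******************************************************************************/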
#if !defined(OFFLOAD_CHECK)
#define OFFLOAD_CHECK(CMD) \
  do { \
    const offloadError_t error = (CMD); \
    if (error != offloadSuccess) { \
      const char *const name = offloadGetErrorName(error); \
      if (NULL != name && '\0' != *name) { \
        fprintf(stderr, "ERROR: \"%s\" at %s:%d\n", name, __FILE__, __LINE__); \
      } else { \
        fprintf(stderr, "ERROR %i: %s:%d\n", (int)error, __FILE__, __LINE__); \
      } \
      abort(); \
    } \
  } while (0)
#endif
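
/*******************************************************************************
 * \brief Returns a human-readable name for the given error (may be empty).
 ******************************************************************************/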
static inline const char *offloadGetErrorName(offloadError_t error) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetErrorName(error);
#elif defined(__OFFLOAD_HIP)
  return hipGetErrorName(error);
#elif defined(__OFFLOAD_OPENCL)
#if defined(ACC_OPENCL_ERROR_NAME)
  return ACC_OPENCL_ERROR_NAME(error);
#else
  (void)error; /* unused */
  return "";
#endif
#endif
}
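
/*******************************************************************************
 * \brief Returns the last error reported by the offload runtime.
 ******************************************************************************/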
static inline offloadError_t offloadGetLastError(void) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetLastError();
#elif defined(__OFFLOAD_HIP)
  return hipGetLastError();
#elif defined(__OFFLOAD_OPENCL)
#if defined(ACC_OPENCL_ERROR)
  return ACC_OPENCL_ERROR();
#else
  return offloadSuccess;
#endif
#endif
}
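
/*******************************************************************************
 * \brief Asynchronously sets size bytes of device memory to val.
 ******************************************************************************/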
static inline void offloadMemsetAsync(void *const ptr, const int val,
                                      const size_t size,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(
      c_dbcsr_acc_opencl_memset(ptr, val, 0 /*offset*/, size, stream));
#endif
}
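
/*******************************************************************************
 * \brief Sets size bytes of device memory to val (blocking).
 ******************************************************************************/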
static inline void offloadMemset(void *ptr, const int val, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemset(ptr, val, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemset(ptr, val, size));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemsetAsync(ptr, val, size, NULL /*stream*/);
#endif
}
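
/*******************************************************************************
 * \brief Asynchronous host-to-device copy: ptr1 is the device destination,
 *        ptr2 the host source.
 ******************************************************************************/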
static inline void offloadMemcpyAsyncHtoD(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr1 == ptr2) {
    return; /* skip no-op copy with unified memory */
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_h2d(ptr2, ptr1, size, stream));
#endif
}
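
/*******************************************************************************
 * \brief Asynchronous device-to-host copy: ptr1 is the host destination,
 *        ptr2 the device source.
 ******************************************************************************/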
static inline void offloadMemcpyAsyncDtoH(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr1 == ptr2) {
    return; /* skip no-op copy with unified memory */
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2h(ptr2, ptr1, size, stream));
#endif
}
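
/*******************************************************************************
 * \brief Asynchronous device-to-device copy: ptr1 is the destination,
 *        ptr2 the source.
 ******************************************************************************/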
static inline void offloadMemcpyAsyncDtoD(void *ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2d(ptr2, ptr1, size, stream));
#endif
}
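
/*******************************************************************************
 * \brief Blocking host-to-device copy of size bytes.
 ******************************************************************************/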
static inline void offloadMemcpyHtoD(void *ptr_device, const void *ptr_host,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_device, ptr_host, size, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr_device == ptr_host) {
    return; /* skip no-op copy with unified memory */
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_device, ptr_host, size, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemcpyAsyncHtoD(ptr_device, ptr_host, size, NULL /*stream*/);
#endif
}
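
/*******************************************************************************
 * \brief Blocking device-to-host copy: ptr_host is the destination,
 *        ptr_device the source.
 ******************************************************************************/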
static inline void offloadMemcpyDtoH(void *ptr_host, const void *ptr_device,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_host, ptr_device, size, cudaMemcpyDeviceToHost));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr_host == ptr_device) {
    return; /* skip no-op copy with unified memory */
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_host, ptr_device, size, hipMemcpyDeviceToHost));
#elif defined(__OFFLOAD_OPENCL)
  offloadMemcpyAsyncDtoH(ptr_host, ptr_device, size, NULL /*stream*/);
#endif
}
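
/*******************************************************************************
 * \brief Copies count bytes from host to a device symbol (unsupported with
 *        the OpenCL backend).
 ******************************************************************************/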
static inline void offloadMemcpyToSymbol(const void *symbol, const void *src,
                                         const size_t count) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyToSymbol(symbol, src, count, 0, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyToSymbol(symbol, src, count, 0, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  assert(NULL == symbol || NULL == src || 0 == count); /* not implemented */
#endif
}
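
/*******************************************************************************
 * \brief Creates an event.
 ******************************************************************************/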
static inline void offloadEventCreate(offloadEvent_t *event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventCreate(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventCreate(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_create(event));
#endif
}
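
/*******************************************************************************
 * \brief Destroys an event.
 ******************************************************************************/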
static inline void offloadEventDestroy(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventDestroy(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventDestroy(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_destroy(event));
#endif
}
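
/*******************************************************************************
 * \brief Creates a stream.
 ******************************************************************************/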
static inline void offloadStreamCreate(offloadStream_t *stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamCreate(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamCreate(stream));
#elif defined(__OFFLOAD_OPENCL)
  int least = -1;
  OFFLOAD_CHECK(c_dbcsr_acc_stream_priority_range(&least, NULL /*greatest*/));
  OFFLOAD_CHECK(c_dbcsr_acc_stream_create(stream, "Offload Stream", least));
#endif
}
static inline void offloadStreamDestroy(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamDestroy(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamDestroy(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_destroy(stream));
#endif
}
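
/*******************************************************************************
 * \brief Blocks until the given event has completed.
 ******************************************************************************/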
static inline void offloadEventSynchronize(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventSynchronize(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventSynchronize(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_synchronize(event));
#endif
}
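
/*******************************************************************************
 * \brief Blocks until all work queued on the stream has completed.
 ******************************************************************************/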
static inline void offloadStreamSynchronize(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamSynchronize(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamSynchronize(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_sync(stream));
#endif
}
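
/*******************************************************************************
 * \brief Records an event on the given stream.
 ******************************************************************************/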
static inline void offloadEventRecord(offloadEvent_t event,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventRecord(event, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventRecord(event, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_record(event, stream));
#endif
}
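
/*******************************************************************************
 * \brief Allocates page-locked (pinned) host memory.
 ******************************************************************************/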
static inline void offloadMallocHost(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMallocHost(ptr, size));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostMalloc(ptr, size, hipHostMallocDefault));
#else
  /* assumption: fall back to managed memory when unified memory is enabled */
  OFFLOAD_CHECK(hipMallocManaged(ptr, size, hipMemAttachGlobal));
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_allocate(ptr, size, NULL /*stream*/));
#endif
}
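
/*******************************************************************************
 * \brief Allocates device memory.
 ******************************************************************************/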
static inline void offloadMalloc(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMalloc(ptr, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMalloc(ptr, size));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_allocate(ptr, size));
#endif
}
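
/*******************************************************************************
 * \brief Frees device memory.
 ******************************************************************************/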
static inline void offloadFree(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFree(ptr));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipFree(ptr));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_deallocate(ptr));
#endif
}
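
/*******************************************************************************
 * \brief Frees page-locked host memory (or managed memory, see above).
 ******************************************************************************/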
static inline void offloadFreeHost(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFreeHost(ptr));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostFree(ptr));
#else
  OFFLOAD_CHECK(hipFree(ptr)); /* assumption: pairs with hipMallocManaged */
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_deallocate(ptr, NULL /*stream*/));
#endif
}
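
/*******************************************************************************
 * \brief Makes the stream wait until the given event has completed.
 ******************************************************************************/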
static inline void offloadStreamWaitEvent(offloadStream_t stream,
                                          offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamWaitEvent(stream, event, 0 /*flags*/));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamWaitEvent(stream, event, 0 /*flags*/));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_wait_event(stream, event));
#endif
}
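
/*******************************************************************************
 * \brief Returns true if the event has completed (non-blocking).
 ******************************************************************************/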
static inline bool offloadEventQuery(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  return offloadSuccess == cudaEventQuery(event);
#elif defined(__OFFLOAD_HIP)
  return offloadSuccess == hipEventQuery(event);
#elif defined(__OFFLOAD_OPENCL)
  c_dbcsr_acc_bool_t has_occurred;
  OFFLOAD_CHECK(c_dbcsr_acc_event_query(event, &has_occurred));
  return (bool)has_occurred;
#endif
}
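
/*******************************************************************************
 * \brief Blocks until the device has finished all preceding work.
 ******************************************************************************/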
static inline void offloadDeviceSynchronize(void) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaDeviceSynchronize());
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipDeviceSynchronize());
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_device_synchronize());
#endif
}
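
/*******************************************************************************
 * Illustrative usage (a minimal sketch, not part of this header): the
 * variable names and buffer size below are hypothetical.
 *
 *   offloadStream_t stream;
 *   double host[16], *dev = NULL;
 *   offloadStreamCreate(&stream);
 *   offloadMalloc((void **)&dev, sizeof(host));
 *   offloadMemsetAsync(dev, 0, sizeof(host), stream);
 *   offloadMemcpyAsyncDtoH(host, dev, sizeof(host), stream);
 *   offloadStreamSynchronize(stream);
 *   offloadFree(dev);
 *   offloadStreamDestroy(stream);
 ******************************************************************************/

/*******************************************************************************
 * \brief Ensures the device malloc heap is at least required_size bytes.
 ******************************************************************************/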
static inline void offloadEnsureMallocHeapSize(const size_t required_size) {
#if defined(__OFFLOAD_CUDA)
  size_t current_size;
  OFFLOAD_CHECK(cudaDeviceGetLimit(&current_size, cudaLimitMallocHeapSize));
  if (current_size < required_size) {
    OFFLOAD_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_HIP) && (HIP_VERSION >= 50300000)
  size_t current_size;
  OFFLOAD_CHECK(hipDeviceGetLimit(&current_size, hipLimitMallocHeapSize));
  if (current_size < required_size) {
    OFFLOAD_CHECK(hipDeviceSetLimit(hipLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_OPENCL)
  assert(0 == required_size); /* adjusting the heap size is not supported */
  (void)required_size; /* unused with NDEBUG */
#endif
}

#ifdef __cplusplus
}
#endif

#endif /* __OFFLOAD_CUDA || __OFFLOAD_HIP || __OFFLOAD_OPENCL */
#endif /* OFFLOAD_RUNTIME_H */