7#ifndef OFFLOAD_RUNTIME_H
8#define OFFLOAD_RUNTIME_H
10#if defined(__OFFLOAD_OPENCL) && !defined(__DBCSR_ACC)
11#undef __OFFLOAD_OPENCL
14#if defined(__OFFLOAD_OPENCL)
15#if !defined(__NO_OFFLOAD_GRID)
16#define __NO_OFFLOAD_GRID
18#if !defined(__NO_OFFLOAD_PW)
19#define __NO_OFFLOAD_PW
23#if defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) || \
24 defined(__OFFLOAD_OPENCL)
30#if defined(__OFFLOAD_CUDA)
31#include <cuda_runtime.h>
32#elif defined(__OFFLOAD_HIP)
33#include <hip/hip_runtime.h>
34#include <hip/hip_version.h>
35#elif defined(__OFFLOAD_OPENCL)
37#include <src/acc/opencl/acc_opencl.h>
/*******************************************************************************
 * \brief Backend-neutral stream/event/error types for the selected runtime.
 ******************************************************************************/
#if defined(__OFFLOAD_CUDA)
typedef cudaStream_t offloadStream_t;
typedef cudaEvent_t offloadEvent_t;
typedef cudaError_t offloadError_t;
#elif defined(__OFFLOAD_HIP)
typedef hipStream_t offloadStream_t;
typedef hipEvent_t offloadEvent_t;
typedef hipError_t offloadError_t;
#elif defined(__OFFLOAD_OPENCL)
/* The ACC interface uses opaque handles and plain int error codes. */
typedef void *offloadStream_t;
typedef void *offloadEvent_t;
typedef int offloadError_t;
#endif

/* Error code signaling success for the selected backend. */
#if defined(__OFFLOAD_CUDA)
#define offloadSuccess cudaSuccess
#elif defined(__OFFLOAD_HIP)
#define offloadSuccess hipSuccess
#elif defined(__OFFLOAD_OPENCL)
#define offloadSuccess EXIT_SUCCESS
#endif
/*******************************************************************************
 * \brief Evaluates CMD and aborts with a diagnostic message on failure.
 ******************************************************************************/
#if !defined(OFFLOAD_CHECK)
#define OFFLOAD_CHECK(CMD)                                                     \
  do {                                                                         \
    const offloadError_t error = (CMD);                                        \
    if (error != offloadSuccess) {                                             \
      const char *const name = offloadGetErrorName(error);                     \
      if (NULL != name && '\0' != *name) {                                     \
        fprintf(stderr, "ERROR: \"%s\" at %s:%d\n", name, __FILE__, __LINE__); \
      } else { /* no symbolic name: fall back to the numeric code */           \
        fprintf(stderr, "ERROR %i: %s:%d\n", (int)error, __FILE__, __LINE__);  \
      }                                                                        \
      abort();                                                                 \
    }                                                                          \
  } while (0)
#endif
89static inline const char *offloadGetErrorName(offloadError_t error) {
90#if defined(__OFFLOAD_CUDA)
91 return cudaGetErrorName(error);
92#elif defined(__OFFLOAD_HIP)
93 return hipGetErrorName(error);
94#elif defined(__OFFLOAD_OPENCL)
103static inline offloadError_t offloadGetLastError(
void) {
104#if defined(__OFFLOAD_CUDA)
105 return cudaGetLastError();
106#elif defined(__OFFLOAD_HIP)
107 return hipGetLastError();
108#elif defined(__OFFLOAD_OPENCL)
109 return offloadSuccess;
116static inline void offloadMemsetAsync(
void *
const ptr,
const int val,
118 offloadStream_t stream) {
119#if defined(__OFFLOAD_CUDA)
120 OFFLOAD_CHECK(cudaMemsetAsync(ptr, val, size, stream));
121#elif defined(__OFFLOAD_HIP)
122 OFFLOAD_CHECK(hipMemsetAsync(ptr, val, size, stream));
123#elif defined(__OFFLOAD_OPENCL)
125 c_dbcsr_acc_opencl_memset(ptr, val, 0 , size, stream));
/*******************************************************************************
 * \brief Sets size bytes of device memory at ptr to val (blocking).
 ******************************************************************************/
static inline void offloadMemset(void *ptr, const int val, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemset(ptr, val, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemset(ptr, val, size));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): assumes a NULL stream yields a blocking operation in the
     ACC interface — confirm against the backend. */
  offloadMemsetAsync(ptr, val, size, NULL /*stream*/);
#endif
}
145static inline void offloadMemcpyAsyncHtoD(
void *
const ptr1,
const void *ptr2,
147 offloadStream_t stream) {
148#if defined(__OFFLOAD_CUDA)
150 cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyHostToDevice, stream));
151#elif defined(__OFFLOAD_HIP)
152#if defined(__OFFLOAD_UNIFIED_MEMORY)
158 hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyHostToDevice, stream));
159#elif defined(__OFFLOAD_OPENCL)
160 OFFLOAD_CHECK(c_dbcsr_acc_memcpy_h2d(ptr2, ptr1, size, stream));
167static inline void offloadMemcpyAsyncDtoH(
void *
const ptr1,
const void *ptr2,
169 const offloadStream_t stream) {
170#if defined(__OFFLOAD_CUDA)
172 cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToHost, stream));
173#elif defined(__OFFLOAD_HIP)
174#if defined(__OFFLOAD_UNIFIED_MEMORY)
180 hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToHost, stream));
181#elif defined(__OFFLOAD_OPENCL)
182 OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2h(ptr2, ptr1, size, stream));
189static inline void offloadMemcpyAsyncDtoD(
void *ptr1,
const void *ptr2,
191 const offloadStream_t stream) {
192#if defined(__OFFLOAD_CUDA)
194 cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToDevice, stream));
195#elif defined(__OFFLOAD_HIP)
197 hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToDevice, stream));
198#elif defined(__OFFLOAD_OPENCL)
199 OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2d(ptr2, ptr1, size, stream));
/*******************************************************************************
 * \brief Blocking host-to-device copy of size bytes.
 ******************************************************************************/
static inline void offloadMemcpyHtoD(void *ptr_device, const void *ptr_host,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_device, ptr_host, size, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr_device == ptr_host) {
    return; /* same (managed) allocation — nothing to copy */
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_device, ptr_host, size, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): assumes a NULL stream yields a blocking copy — confirm. */
  offloadMemcpyAsyncHtoD(ptr_device, ptr_host, size, NULL /*stream*/);
#endif
}
/*******************************************************************************
 * \brief Blocking device-to-host copy of size bytes.
 * \note Parameter names fixed: the first argument is the host destination and
 *       the second the device source, as established by the
 *       cudaMemcpyDeviceToHost/hipMemcpyDeviceToHost direction below.
 ******************************************************************************/
static inline void offloadMemcpyDtoH(void *ptr_host, const void *ptr_device,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_host, ptr_device, size, cudaMemcpyDeviceToHost));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  if (ptr_host == ptr_device) {
    return; /* same (managed) allocation — nothing to copy */
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_host, ptr_device, size, hipMemcpyDeviceToHost));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): assumes a NULL stream yields a blocking copy — confirm. */
  offloadMemcpyAsyncDtoH(ptr_host, ptr_device, size, NULL /*stream*/);
#endif
}
/*******************************************************************************
 * \brief Copies count bytes from host memory src to the device symbol.
 ******************************************************************************/
static inline void offloadMemcpyToSymbol(const void *symbol, const void *src,
                                         const size_t count) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyToSymbol(symbol, src, count, 0, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyToSymbol(symbol, src, count, 0, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  /* Not supported by the OpenCL backend: only tolerate no-op requests. */
  assert(NULL == symbol || NULL == src || 0 == count);
  (void)symbol; /* avoid unused-parameter warnings when NDEBUG is set */
  (void)src;
  (void)count;
#endif
}
260static inline void offloadEventCreate(offloadEvent_t *event) {
261#if defined(__OFFLOAD_CUDA)
262 OFFLOAD_CHECK(cudaEventCreate(event));
263#elif defined(__OFFLOAD_HIP)
264 OFFLOAD_CHECK(hipEventCreate(event));
265#elif defined(__OFFLOAD_OPENCL)
266 OFFLOAD_CHECK(c_dbcsr_acc_event_create(event));
273static inline void offloadEventDestroy(offloadEvent_t event) {
274#if defined(__OFFLOAD_CUDA)
275 OFFLOAD_CHECK(cudaEventDestroy(event));
276#elif defined(__OFFLOAD_HIP)
277 OFFLOAD_CHECK(hipEventDestroy(event));
278#elif defined(__OFFLOAD_OPENCL)
279 OFFLOAD_CHECK(c_dbcsr_acc_event_destroy(event));
286static inline void offloadStreamCreate(offloadStream_t *stream) {
287#if defined(__OFFLOAD_CUDA)
288 OFFLOAD_CHECK(cudaStreamCreate(stream));
289#elif defined(__OFFLOAD_HIP)
290 OFFLOAD_CHECK(hipStreamCreate(stream));
291#elif defined(__OFFLOAD_OPENCL)
293 OFFLOAD_CHECK(c_dbcsr_acc_stream_priority_range(&least, NULL ));
294 OFFLOAD_CHECK(c_dbcsr_acc_stream_create(stream,
"Offload Stream", least));
301static inline void offloadStreamDestroy(offloadStream_t stream) {
302#if defined(__OFFLOAD_CUDA)
303 OFFLOAD_CHECK(cudaStreamDestroy(stream));
304#elif defined(__OFFLOAD_HIP)
305 OFFLOAD_CHECK(hipStreamDestroy(stream));
306#elif defined(__OFFLOAD_OPENCL)
307 OFFLOAD_CHECK(c_dbcsr_acc_stream_destroy(stream));
314static inline void offloadEventSynchronize(offloadEvent_t event) {
315#if defined(__OFFLOAD_CUDA)
316 OFFLOAD_CHECK(cudaEventSynchronize(event));
317#elif defined(__OFFLOAD_HIP)
318 OFFLOAD_CHECK(hipEventSynchronize(event));
319#elif defined(__OFFLOAD_OPENCL)
320 OFFLOAD_CHECK(c_dbcsr_acc_event_synchronize(event));
327static inline void offloadStreamSynchronize(offloadStream_t stream) {
328#if defined(__OFFLOAD_CUDA)
329 OFFLOAD_CHECK(cudaStreamSynchronize(stream));
330#elif defined(__OFFLOAD_HIP)
331 OFFLOAD_CHECK(hipStreamSynchronize(stream));
332#elif defined(__OFFLOAD_OPENCL)
333 OFFLOAD_CHECK(c_dbcsr_acc_stream_sync(stream));
340static inline void offloadEventRecord(offloadEvent_t event,
341 offloadStream_t stream) {
342#if defined(__OFFLOAD_CUDA)
343 OFFLOAD_CHECK(cudaEventRecord(event, stream));
344#elif defined(__OFFLOAD_HIP)
345 OFFLOAD_CHECK(hipEventRecord(event, stream));
346#elif defined(__OFFLOAD_OPENCL)
347 OFFLOAD_CHECK(c_dbcsr_acc_event_record(event, stream));
/*******************************************************************************
 * \brief Allocates size bytes of page-locked host memory and stores it in
 *        *ptr (aborts on failure via OFFLOAD_CHECK).
 ******************************************************************************/
static inline void offloadMallocHost(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMallocHost(ptr, size));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostMalloc(ptr, size, hipHostMallocDefault));
#else
  /* NOTE(review): unified-memory branch was garbled in the original; a
     managed allocation is assumed here — confirm against upstream. */
  OFFLOAD_CHECK(hipMallocManaged(ptr, size, hipMemAttachGlobal));
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_allocate(ptr, size, NULL /*stream*/));
#endif
}
/*******************************************************************************
 * \brief Allocates size bytes of device memory and stores it in *ptr
 *        (aborts on failure via OFFLOAD_CHECK).
 ******************************************************************************/
static inline void offloadMalloc(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMalloc(ptr, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMalloc(ptr, size));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_allocate(ptr, size));
#endif
}
/*******************************************************************************
 * \brief Releases device memory previously obtained via offloadMalloc.
 ******************************************************************************/
static inline void offloadFree(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFree(ptr));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipFree(ptr));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_deallocate(ptr));
#endif
}
/*******************************************************************************
 * \brief Releases host memory previously obtained via offloadMallocHost.
 ******************************************************************************/
static inline void offloadFreeHost(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFreeHost(ptr));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  OFFLOAD_CHECK(hipHostFree(ptr));
#else
  /* NOTE(review): the original's unified-memory handling is not visible;
     assumed to release a managed allocation paired with offloadMallocHost —
     confirm against upstream. */
  OFFLOAD_CHECK(hipFree(ptr));
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_deallocate(ptr, NULL /*stream*/));
#endif
}
423static inline void offloadStreamWaitEvent(offloadStream_t stream,
424 offloadEvent_t event,
const int val) {
425#if defined(__OFFLOAD_CUDA)
426 OFFLOAD_CHECK(cudaStreamWaitEvent(stream, event, val));
427#elif defined(__OFFLOAD_HIP)
428 OFFLOAD_CHECK(hipStreamWaitEvent(stream, event, val));
429#elif defined(__OFFLOAD_OPENCL)
431 OFFLOAD_CHECK(c_dbcsr_acc_stream_wait_event(stream, event));
/*******************************************************************************
 * \brief Blocks the host until the device has finished all submitted work.
 ******************************************************************************/
static inline void offloadDeviceSynchronize(void) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaDeviceSynchronize());
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipDeviceSynchronize());
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_device_synchronize());
#endif
}
/*******************************************************************************
 * \brief Ensures the device-side malloc heap can hold at least required_size
 *        bytes (only grows the limit, never shrinks it).
 ******************************************************************************/
static inline void offloadEnsureMallocHeapSize(const size_t required_size) {
#if defined(__OFFLOAD_CUDA)
  size_t current_size;
  /* fixed mojibake: "&current_size" had been corrupted to "&curren;t_size" */
  OFFLOAD_CHECK(cudaDeviceGetLimit(&current_size, cudaLimitMallocHeapSize));
  if (current_size < required_size) {
    OFFLOAD_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_HIP) && (HIP_VERSION >= 50300000)
  /* hipDeviceGetLimit/hipDeviceSetLimit require ROCm >= 5.3 */
  size_t current_size;
  OFFLOAD_CHECK(hipDeviceGetLimit(&current_size, hipLimitMallocHeapSize));
  if (current_size < required_size) {
    OFFLOAD_CHECK(hipDeviceSetLimit(hipLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_OPENCL)
  /* Not supported by the OpenCL backend: only tolerate no-op requests. */
  assert(0 == required_size);
  (void)required_size;
#else
  (void)required_size; /* older HIP: no heap-limit API available */
#endif
}