(git:b17b328)
Loading...
Searching...
No Matches
offload_runtime.h
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7#ifndef OFFLOAD_RUNTIME_H
8#define OFFLOAD_RUNTIME_H
9
10#if defined(__OFFLOAD_OPENCL) && !defined(__DBCSR_ACC)
11#undef __OFFLOAD_OPENCL
12#endif
13/* TODO: implement support or missing features */
14#if defined(__OFFLOAD_OPENCL)
15#if !defined(__NO_OFFLOAD_GRID)
16#define __NO_OFFLOAD_GRID
17#endif
18#if !defined(__NO_OFFLOAD_PW)
19#define __NO_OFFLOAD_PW
20#endif
21#endif
22
23#if defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) || \
24 defined(__OFFLOAD_OPENCL)
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
28
29#if !defined(__OFFLOAD)
30#define __OFFLOAD
31#endif
32
33#if defined(__OFFLOAD_CUDA)
34#include <cuda_runtime.h>
35#elif defined(__OFFLOAD_HIP)
36#include <hip/hip_runtime.h>
37#include <hip/hip_version.h>
38#elif defined(__OFFLOAD_OPENCL)
39/* No relative path aka double-quote to avoid PACKAGE deps (-Iexts/dbcsr). */
40#include <acc_opencl.h>
41#endif
42
43#ifdef __cplusplus
44extern "C" {
45#endif
46
47#if defined(__OFFLOAD_CUDA)
48typedef cudaStream_t offloadStream_t;
49typedef cudaEvent_t offloadEvent_t;
50typedef cudaError_t offloadError_t;
51#elif defined(__OFFLOAD_HIP)
52typedef hipStream_t offloadStream_t;
53typedef hipEvent_t offloadEvent_t;
54typedef hipError_t offloadError_t;
55#elif defined(__OFFLOAD_OPENCL)
56typedef void *offloadStream_t;
57typedef void *offloadEvent_t;
58typedef int offloadError_t;
59#endif
60
61#if defined(__OFFLOAD_CUDA)
62#define offloadSuccess cudaSuccess
63#elif defined(__OFFLOAD_HIP)
64#define offloadSuccess hipSuccess
65#elif defined(__OFFLOAD_OPENCL)
66#define offloadSuccess EXIT_SUCCESS
67#endif
68
69/*******************************************************************************
70 * \brief Check given Cuda status and upon failure abort with a nice message.
71 * \author Ole Schuett
72 ******************************************************************************/
73#if !defined(OFFLOAD_CHECK)
74#define OFFLOAD_CHECK(CMD) \
75 do { \
76 const offloadError_t error = (CMD); \
77 if (error != offloadSuccess) { \
78 const char *const name = offloadGetErrorName(error); \
79 if (NULL != name && '\0' != *name) { \
80 fprintf(stderr, "ERROR: \"%s\" at %s:%d\n", name, __FILE__, __LINE__); \
81 } else { \
82 fprintf(stderr, "ERROR %i: %s:%d\n", (int)error, __FILE__, __LINE__); \
83 } \
84 abort(); \
85 } \
86 } while (0)
87#endif
88
89/*******************************************************************************
90 * \brief Wrapper around cudaGetErrorName.
91 ******************************************************************************/
92static inline const char *offloadGetErrorName(offloadError_t error) {
93#if defined(__OFFLOAD_CUDA)
94 return cudaGetErrorName(error);
95#elif defined(__OFFLOAD_HIP)
96 return hipGetErrorName(error);
97#elif defined(__OFFLOAD_OPENCL)
98#if defined(ACC_OPENCL_ERROR_NAME)
99 return ACC_OPENCL_ERROR_NAME(error);
100#else
101 (void)error; /* mark used */
102 return "";
103#endif
104#endif
105}
106
107/*******************************************************************************
108 * \brief Wrapper around cudaGetLastError.
109 ******************************************************************************/
110static inline offloadError_t offloadGetLastError(void) {
111#if defined(__OFFLOAD_CUDA)
112 return cudaGetLastError();
113#elif defined(__OFFLOAD_HIP)
114 return hipGetLastError();
115#elif defined(__OFFLOAD_OPENCL)
116#if defined(ACC_OPENCL_ERROR)
117 return ACC_OPENCL_ERROR();
118#else
119 return offloadSuccess;
120#endif
121#endif
122}
123
124/*******************************************************************************
125 * \brief Wrapper around cudaMemsetAsync.
126 ******************************************************************************/
127static inline void offloadMemsetAsync(void *const ptr, const int val,
128 const size_t size,
129 offloadStream_t stream) {
130#if defined(__OFFLOAD_CUDA)
131 OFFLOAD_CHECK(cudaMemsetAsync(ptr, val, size, stream));
132#elif defined(__OFFLOAD_HIP)
133 OFFLOAD_CHECK(hipMemsetAsync(ptr, val, size, stream));
134#elif defined(__OFFLOAD_OPENCL)
135 OFFLOAD_CHECK(
136 c_dbcsr_acc_opencl_memset(ptr, val, 0 /*offset*/, size, stream));
137#endif
138}
139
140/*******************************************************************************
141 * \brief Wrapper around cudaMemset.
142 ******************************************************************************/
143static inline void offloadMemset(void *ptr, const int val, size_t size) {
144#if defined(__OFFLOAD_CUDA)
145 OFFLOAD_CHECK(cudaMemset(ptr, val, size));
146#elif defined(__OFFLOAD_HIP)
147 OFFLOAD_CHECK(hipMemset(ptr, val, size));
148#elif defined(__OFFLOAD_OPENCL)
149 offloadMemsetAsync(ptr, val, size, NULL /*stream*/);
150#endif
151}
152
153/*******************************************************************************
154 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyHostToDevice,...).
155 ******************************************************************************/
156static inline void offloadMemcpyAsyncHtoD(void *const ptr1, const void *ptr2,
157 const size_t size,
158 offloadStream_t stream) {
159#if defined(__OFFLOAD_CUDA)
160 OFFLOAD_CHECK(
161 cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyHostToDevice, stream));
162#elif defined(__OFFLOAD_HIP)
163#if defined(__OFFLOAD_UNIFIED_MEMORY)
164 if (ptr1 == ptr2) {
165 return;
166 }
167#endif
168 OFFLOAD_CHECK(
169 hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyHostToDevice, stream));
170#elif defined(__OFFLOAD_OPENCL)
171 OFFLOAD_CHECK(c_dbcsr_acc_memcpy_h2d(ptr2, ptr1, size, stream));
172#endif
173}
174
175/*******************************************************************************
176 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyDeviceToHost,...).
177 ******************************************************************************/
178static inline void offloadMemcpyAsyncDtoH(void *const ptr1, const void *ptr2,
179 const size_t size,
180 const offloadStream_t stream) {
181#if defined(__OFFLOAD_CUDA)
182 OFFLOAD_CHECK(
183 cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToHost, stream));
184#elif defined(__OFFLOAD_HIP)
185#if defined(__OFFLOAD_UNIFIED_MEMORY)
186 if (ptr1 == ptr2) {
187 return;
188 }
189#endif
190 OFFLOAD_CHECK(
191 hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToHost, stream));
192#elif defined(__OFFLOAD_OPENCL)
193 OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2h(ptr2, ptr1, size, stream));
194#endif
195}
196
197/*******************************************************************************
198 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyDeviceToDevice).
199 ******************************************************************************/
200static inline void offloadMemcpyAsyncDtoD(void *ptr1, const void *ptr2,
201 const size_t size,
202 const offloadStream_t stream) {
203#if defined(__OFFLOAD_CUDA)
204 OFFLOAD_CHECK(
205 cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToDevice, stream));
206#elif defined(__OFFLOAD_HIP)
207 OFFLOAD_CHECK(
208 hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToDevice, stream));
209#elif defined(__OFFLOAD_OPENCL)
210 OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2d(ptr2, ptr1, size, stream));
211#endif
212}
213
214/*******************************************************************************
215 * \brief Wrapper around cudaMemcpy(...,cudaMemcpyHostToDevice).
216 ******************************************************************************/
217static inline void offloadMemcpyHtoD(void *ptr_device, const void *ptr_host,
218 const size_t size) {
219#if defined(__OFFLOAD_CUDA)
220 OFFLOAD_CHECK(cudaMemcpy(ptr_device, ptr_host, size, cudaMemcpyHostToDevice));
221#elif defined(__OFFLOAD_HIP)
222#if defined(__OFFLOAD_UNIFIED_MEMORY)
223 if (ptr_device == ptr_host) {
224 return;
225 }
226#endif
227 OFFLOAD_CHECK(hipMemcpy(ptr_device, ptr_host, size, hipMemcpyHostToDevice));
228#elif defined(__OFFLOAD_OPENCL)
229 offloadMemcpyAsyncHtoD(ptr_device, ptr_host, size, NULL /*stream*/);
230#endif
231}
232
/*******************************************************************************
 * \brief Wrapper around cudaMemcpy(...,cudaMemcpyDeviceToHost).
 *        Blocking copy of size bytes from device memory (ptr_device) to host
 *        memory (ptr_host). Parameter names fixed: the destination of a DtoH
 *        copy is the host buffer — they were previously swapped (call sites
 *        are positionally unaffected).
 ******************************************************************************/
static inline void offloadMemcpyDtoH(void *ptr_host, const void *ptr_device,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_host, ptr_device, size, cudaMemcpyDeviceToHost));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* With unified memory both pointers may refer to the same allocation. */
  if (ptr_host == ptr_device) {
    return;
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_host, ptr_device, size, hipMemcpyDeviceToHost));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): delegated to the async variant with the NULL (default)
   * stream; confirm this is blocking in the ACC/OpenCL backend. */
  offloadMemcpyAsyncDtoH(ptr_host, ptr_device, size, NULL /*stream*/);
#endif
}
251
252/*******************************************************************************
253 * \brief Wrapper around cudaMemcpyToSymbol.
254 ******************************************************************************/
255static inline void offloadMemcpyToSymbol(const void *symbol, const void *src,
256 const size_t count) {
257#if defined(__OFFLOAD_CUDA)
258 OFFLOAD_CHECK(
259 cudaMemcpyToSymbol(symbol, src, count, 0, cudaMemcpyHostToDevice));
260#elif defined(__OFFLOAD_HIP)
261 OFFLOAD_CHECK(
262 hipMemcpyToSymbol(symbol, src, count, 0, hipMemcpyHostToDevice));
263#elif defined(__OFFLOAD_OPENCL)
264 assert(NULL == symbol || NULL == src || 0 == count); /* TODO */
265#endif
266}
267
268/*******************************************************************************
269 * \brief Wrapper around cudaEventCreate.
270 ******************************************************************************/
271static inline void offloadEventCreate(offloadEvent_t *event) {
272#if defined(__OFFLOAD_CUDA)
273 OFFLOAD_CHECK(cudaEventCreate(event));
274#elif defined(__OFFLOAD_HIP)
275 OFFLOAD_CHECK(hipEventCreate(event));
276#elif defined(__OFFLOAD_OPENCL)
277 OFFLOAD_CHECK(c_dbcsr_acc_event_create(event));
278#endif
279}
280
281/*******************************************************************************
282 * \brief Wrapper around cudaEventDestroy.
283 ******************************************************************************/
284static inline void offloadEventDestroy(offloadEvent_t event) {
285#if defined(__OFFLOAD_CUDA)
286 OFFLOAD_CHECK(cudaEventDestroy(event));
287#elif defined(__OFFLOAD_HIP)
288 OFFLOAD_CHECK(hipEventDestroy(event));
289#elif defined(__OFFLOAD_OPENCL)
290 OFFLOAD_CHECK(c_dbcsr_acc_event_destroy(event));
291#endif
292}
293
294/*******************************************************************************
295 * \brief Wrapper around cudaStreamCreate.
296 ******************************************************************************/
297static inline void offloadStreamCreate(offloadStream_t *stream) {
298#if defined(__OFFLOAD_CUDA)
299 OFFLOAD_CHECK(cudaStreamCreate(stream));
300#elif defined(__OFFLOAD_HIP)
301 OFFLOAD_CHECK(hipStreamCreate(stream));
302#elif defined(__OFFLOAD_OPENCL)
303 int least = 0;
304 OFFLOAD_CHECK(c_dbcsr_acc_stream_priority_range(&least, NULL /*greatest*/));
305 OFFLOAD_CHECK(c_dbcsr_acc_stream_create(stream, "Offload Stream", least));
306#endif
307}
308
309/*******************************************************************************
310 * \brief Wrapper around cudaStreamDestroy.
311 ******************************************************************************/
312static inline void offloadStreamDestroy(offloadStream_t stream) {
313#if defined(__OFFLOAD_CUDA)
314 OFFLOAD_CHECK(cudaStreamDestroy(stream));
315#elif defined(__OFFLOAD_HIP)
316 OFFLOAD_CHECK(hipStreamDestroy(stream));
317#elif defined(__OFFLOAD_OPENCL)
318 OFFLOAD_CHECK(c_dbcsr_acc_stream_destroy(stream));
319#endif
320}
321
322/*******************************************************************************
323 * \brief Wrapper around cudaEventSynchronize.
324 ******************************************************************************/
325static inline void offloadEventSynchronize(offloadEvent_t event) {
326#if defined(__OFFLOAD_CUDA)
327 OFFLOAD_CHECK(cudaEventSynchronize(event));
328#elif defined(__OFFLOAD_HIP)
329 OFFLOAD_CHECK(hipEventSynchronize(event));
330#elif defined(__OFFLOAD_OPENCL)
331 OFFLOAD_CHECK(c_dbcsr_acc_event_synchronize(event));
332#endif
333}
334
335/*******************************************************************************
336 * \brief Wrapper around cudaStreamSynchronize.
337 ******************************************************************************/
338static inline void offloadStreamSynchronize(offloadStream_t stream) {
339#if defined(__OFFLOAD_CUDA)
340 OFFLOAD_CHECK(cudaStreamSynchronize(stream));
341#elif defined(__OFFLOAD_HIP)
342 OFFLOAD_CHECK(hipStreamSynchronize(stream));
343#elif defined(__OFFLOAD_OPENCL)
344 OFFLOAD_CHECK(c_dbcsr_acc_stream_sync(stream));
345#endif
346}
347
348/*******************************************************************************
349 * \brief Wrapper around cudaEventRecord.
350 ******************************************************************************/
351static inline void offloadEventRecord(offloadEvent_t event,
352 offloadStream_t stream) {
353#if defined(__OFFLOAD_CUDA)
354 OFFLOAD_CHECK(cudaEventRecord(event, stream));
355#elif defined(__OFFLOAD_HIP)
356 OFFLOAD_CHECK(hipEventRecord(event, stream));
357#elif defined(__OFFLOAD_OPENCL)
358 OFFLOAD_CHECK(c_dbcsr_acc_event_record(event, stream));
359#endif
360}
361
362/*******************************************************************************
363 * \brief Wrapper around cudaMallocHost.
364 ******************************************************************************/
365static inline void offloadMallocHost(void **ptr, size_t size) {
366#if defined(__OFFLOAD_CUDA)
367 OFFLOAD_CHECK(cudaMallocHost(ptr, size));
368#elif defined(__OFFLOAD_HIP)
369#if !defined(__OFFLOAD_UNIFIED_MEMORY)
370 OFFLOAD_CHECK(hipHostMalloc(ptr, size, hipHostMallocDefault)); // inconsistent
371#else
372 assert(NULL != ptr);
373 *ptr = NULL;
374#endif
375#elif defined(__OFFLOAD_OPENCL)
376 OFFLOAD_CHECK(c_dbcsr_acc_host_mem_allocate(ptr, size, NULL /*stream*/));
377#else
378 assert(NULL != ptr);
379 *ptr = malloc(size);
380#endif
381}
382
383/*******************************************************************************
384 * \brief Wrapper around cudaMalloc.
385 ******************************************************************************/
386static inline void offloadMalloc(void **ptr, size_t size) {
387#if defined(__OFFLOAD_CUDA)
388 OFFLOAD_CHECK(cudaMalloc(ptr, size));
389#elif defined(__OFFLOAD_HIP)
390 OFFLOAD_CHECK(hipMalloc(ptr, size));
391#elif defined(__OFFLOAD_OPENCL)
392 OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_allocate(ptr, size));
393#else
394 assert(NULL != ptr);
395 *ptr = NULL;
396#endif
397}
398
399/*******************************************************************************
400 * \brief Wrapper around cudaFree.
401 ******************************************************************************/
402static inline void offloadFree(void *ptr) {
403#if defined(__OFFLOAD_CUDA)
404 OFFLOAD_CHECK(cudaFree(ptr));
405#elif defined(__OFFLOAD_HIP)
406 OFFLOAD_CHECK(hipFree(ptr));
407#elif defined(__OFFLOAD_OPENCL)
408 OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_deallocate(ptr));
409#else
410 assert(NULL == ptr);
411#endif
412}
413
414/*******************************************************************************
415 * \brief Wrapper around cudaFreeHost.
416 ******************************************************************************/
417static inline void offloadFreeHost(void *ptr) {
418#if defined(__OFFLOAD_CUDA)
419 OFFLOAD_CHECK(cudaFreeHost(ptr));
420#elif defined(__OFFLOAD_HIP)
421#if !defined(__OFFLOAD_UNIFIED_MEMORY)
422 OFFLOAD_CHECK(hipHostFree(ptr)); // inconsistent
423#endif
424#elif defined(__OFFLOAD_OPENCL)
425 OFFLOAD_CHECK(c_dbcsr_acc_host_mem_deallocate(ptr, NULL /*stream*/));
426#else
427 free(ptr);
428#endif
429}
430
431/*******************************************************************************
432 * \brief Wrapper around cudaStreamWaitEvent.
433 ******************************************************************************/
434static inline void offloadStreamWaitEvent(offloadStream_t stream,
435 offloadEvent_t event) {
436#if defined(__OFFLOAD_CUDA)
437 OFFLOAD_CHECK(cudaStreamWaitEvent(stream, event, 0 /*flags*/));
438#elif defined(__OFFLOAD_HIP)
439 OFFLOAD_CHECK(hipStreamWaitEvent(stream, event, 0 /*flags*/));
440#elif defined(__OFFLOAD_OPENCL)
441 OFFLOAD_CHECK(c_dbcsr_acc_stream_wait_event(stream, event));
442#endif
443}
444
445/*******************************************************************************
446 * \brief Wrapper around cudaEventQuery.
447 ******************************************************************************/
448static inline bool offloadEventQuery(offloadEvent_t event) {
449#if defined(__OFFLOAD_CUDA)
450 return offloadSuccess == cudaEventQuery(event);
451#elif defined(__OFFLOAD_HIP)
452 return offloadSuccess == hipEventQuery(event);
453#elif defined(__OFFLOAD_OPENCL)
454 c_dbcsr_acc_bool_t has_occurred;
455 OFFLOAD_CHECK(c_dbcsr_acc_event_query(event, &has_occurred));
456 return (bool)has_occurred;
457#endif
458}
459
460/*******************************************************************************
461 * \brief Wrapper around cudaDeviceSynchronize.
462 ******************************************************************************/
463static inline void offloadDeviceSynchronize(void) {
464#if defined(__OFFLOAD_CUDA)
465 OFFLOAD_CHECK(cudaDeviceSynchronize());
466#elif defined(__OFFLOAD_HIP)
467 OFFLOAD_CHECK(hipDeviceSynchronize());
468#elif defined(__OFFLOAD_OPENCL)
469 OFFLOAD_CHECK(c_dbcsr_acc_device_synchronize());
470#endif
471}
472
473/*******************************************************************************
474 * \brief Wrapper around cudaDeviceSetLimit(cudaLimitMallocHeapSize,...).
475 ******************************************************************************/
476static inline void offloadEnsureMallocHeapSize(const size_t required_size) {
477#if defined(__OFFLOAD_CUDA)
478 size_t current_size;
479 OFFLOAD_CHECK(cudaDeviceGetLimit(&current_size, cudaLimitMallocHeapSize));
480 if (current_size < required_size) {
481 OFFLOAD_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, required_size));
482 }
483#elif defined(__OFFLOAD_HIP) && (HIP_VERSION >= 50300000)
484 size_t current_size;
485 OFFLOAD_CHECK(hipDeviceGetLimit(&current_size, hipLimitMallocHeapSize));
486 if (current_size < required_size) {
487 OFFLOAD_CHECK(hipDeviceSetLimit(hipLimitMallocHeapSize, required_size));
488 }
489#elif defined(__OFFLOAD_OPENCL)
490 assert(0 == required_size); /* TODO */
491#else
492 (void)required_size; /* mark used */
493#endif
494}
495
496#ifdef __cplusplus
497}
498#endif
499
500#endif // defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) ||
501 // defined(__OFFLOAD_OPENCL)
502#endif