(git:374b731)
Loading...
Searching...
No Matches
offload_runtime.h
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
#ifndef OFFLOAD_RUNTIME_H
#define OFFLOAD_RUNTIME_H

/* The OpenCL backend is supplied by DBCSR's ACC layer; without __DBCSR_ACC
 * the OpenCL offload cannot work, so disable it. */
#if defined(__OFFLOAD_OPENCL) && !defined(__DBCSR_ACC)
#undef __OFFLOAD_OPENCL
#endif
/* TODO: implement support or missing features */
/* The OpenCL backend does not yet cover the grid and PW offload code paths;
 * force-disable them here. */
#if defined(__OFFLOAD_OPENCL)
#if !defined(__NO_OFFLOAD_GRID)
#define __NO_OFFLOAD_GRID
#endif
#if !defined(__NO_OFFLOAD_PW)
#define __NO_OFFLOAD_PW
#endif
#endif

/* __OFFLOAD marks that some GPU backend (CUDA, HIP, or OpenCL) is enabled;
 * everything below is compiled only in that case. */
#if defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) || \
    defined(__OFFLOAD_OPENCL)
#define __OFFLOAD
26
#include <assert.h> /* assert() is used in the OpenCL/fallback branches below */
#include <stdio.h>
#include <stdlib.h>

#if defined(__OFFLOAD_CUDA)
#include <cuda_runtime.h>
#elif defined(__OFFLOAD_HIP)
#include <hip/hip_runtime.h>
#include <hip/hip_version.h>
#elif defined(__OFFLOAD_OPENCL)
/* No relative path aka double-quote to avoid PACKAGE deps (-Iexts/dbcsr). */
#include <src/acc/opencl/acc_opencl.h>
#endif
39
40#ifdef __cplusplus
41extern "C" {
42#endif
43
/* Backend-neutral aliases for the stream, event, and error types of the
 * selected runtime (CUDA, HIP, or the DBCSR OpenCL ACC layer). */
#if defined(__OFFLOAD_CUDA)
typedef cudaStream_t offloadStream_t;
typedef cudaEvent_t offloadEvent_t;
typedef cudaError_t offloadError_t;
#elif defined(__OFFLOAD_HIP)
typedef hipStream_t offloadStream_t;
typedef hipEvent_t offloadEvent_t;
typedef hipError_t offloadError_t;
#elif defined(__OFFLOAD_OPENCL)
/* The ACC layer exposes streams/events as opaque handles and reports
 * errors as plain int status codes. */
typedef void *offloadStream_t;
typedef void *offloadEvent_t;
typedef int offloadError_t;
#endif

/* Success value matching offloadError_t of the selected backend. */
#if defined(__OFFLOAD_CUDA)
#define offloadSuccess cudaSuccess
#elif defined(__OFFLOAD_HIP)
#define offloadSuccess hipSuccess
#elif defined(__OFFLOAD_OPENCL)
#define offloadSuccess EXIT_SUCCESS
#endif
65
/*******************************************************************************
 * \brief Check given offload runtime status and upon failure abort with a
 *        nice message: the symbolic error name when the backend provides a
 *        non-empty one, otherwise the numeric error code.
 * \author Ole Schuett
 ******************************************************************************/
#if !defined(OFFLOAD_CHECK)
#define OFFLOAD_CHECK(CMD)                                                     \
  do {                                                                         \
    const offloadError_t error = (CMD);                                        \
    if (error != offloadSuccess) {                                             \
      const char *const name = offloadGetErrorName(error);                     \
      if (NULL != name && '\0' != *name) {                                     \
        fprintf(stderr, "ERROR: \"%s\" at %s:%d\n", name, __FILE__, __LINE__); \
      } else {                                                                 \
        fprintf(stderr, "ERROR %i: %s:%d\n", (int)error, __FILE__, __LINE__);  \
      }                                                                        \
      abort();                                                                 \
    }                                                                          \
  } while (0)
#endif
85
/*******************************************************************************
 * \brief Wrapper around cudaGetErrorName / hipGetErrorName.
 *        The OpenCL backend has no name lookup yet and returns an empty
 *        string, which makes OFFLOAD_CHECK print the numeric code instead.
 ******************************************************************************/
static inline const char *offloadGetErrorName(offloadError_t error) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetErrorName(error);
#elif defined(__OFFLOAD_HIP)
  return hipGetErrorName(error);
#elif defined(__OFFLOAD_OPENCL)
  (void)error; /* mark used */
  return "";   /* TODO: translate ACC status codes to names */
#endif
}
99
/*******************************************************************************
 * \brief Wrapper around cudaGetLastError.
 *        Returns (and for CUDA/HIP clears) the last error of the runtime;
 *        the OpenCL backend has no such query yet and always reports success.
 ******************************************************************************/
static inline offloadError_t offloadGetLastError(void) {
#if defined(__OFFLOAD_CUDA)
  return cudaGetLastError();
#elif defined(__OFFLOAD_HIP)
  return hipGetLastError();
#elif defined(__OFFLOAD_OPENCL)
  return offloadSuccess; /* TODO: no equivalent ACC query yet */
#endif
}
112
/*******************************************************************************
 * \brief Wrapper around cudaMemsetAsync.
 *        Asynchronously fills size bytes of device memory at ptr with the
 *        byte value val, ordered with respect to the given stream.
 *        Aborts via OFFLOAD_CHECK on any backend error.
 ******************************************************************************/
static inline void offloadMemsetAsync(void *const ptr, const int val,
                                      const size_t size,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemsetAsync(ptr, val, size, stream));
#elif defined(__OFFLOAD_OPENCL)
  /* The ACC memset takes an additional byte-offset argument (zero here). */
  OFFLOAD_CHECK(
      c_dbcsr_acc_opencl_memset(ptr, val, 0 /*offset*/, size, stream));
#endif
}
128
/*******************************************************************************
 * \brief Wrapper around cudaMemset (synchronous byte-fill of device memory).
 ******************************************************************************/
static inline void offloadMemset(void *ptr, const int val, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemset(ptr, val, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMemset(ptr, val, size));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): delegates to the async variant on the NULL stream;
   * presumably blocking in the ACC layer — confirm against DBCSR. */
  offloadMemsetAsync(ptr, val, size, NULL /*stream*/);
#endif
}
141
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyHostToDevice,...).
 * \param ptr1 device destination pointer
 * \param ptr2 host source pointer
 ******************************************************************************/
static inline void offloadMemcpyAsyncHtoD(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* Unified memory: host and device alias the same allocation, no copy. */
  if (ptr1 == ptr2) {
    return;
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyHostToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  /* The ACC h2d copy takes (host_src, dev_dst) — hence swapped arguments. */
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_h2d(ptr2, ptr1, size, stream));
#endif
}
163
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyDeviceToHost,...).
 * \param ptr1 host destination pointer
 * \param ptr2 device source pointer
 ******************************************************************************/
static inline void offloadMemcpyAsyncDtoH(void *const ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* Unified memory: host and device alias the same allocation, no copy. */
  if (ptr1 == ptr2) {
    return;
  }
#endif
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToHost, stream));
#elif defined(__OFFLOAD_OPENCL)
  /* The ACC d2h copy takes (dev_src, host_dst) — hence swapped arguments. */
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2h(ptr2, ptr1, size, stream));
#endif
}
185
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyAsync(...,cudaMemcpyDeviceToDevice).
 * \param ptr1 device destination pointer
 * \param ptr2 device source pointer
 ******************************************************************************/
static inline void offloadMemcpyAsyncDtoD(void *ptr1, const void *ptr2,
                                          const size_t size,
                                          const offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyAsync(ptr1, ptr2, size, cudaMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyAsync(ptr1, ptr2, size, hipMemcpyDeviceToDevice, stream));
#elif defined(__OFFLOAD_OPENCL)
  /* The ACC d2d copy takes (src, dst) — hence swapped arguments. */
  OFFLOAD_CHECK(c_dbcsr_acc_memcpy_d2d(ptr2, ptr1, size, stream));
#endif
}
202
/*******************************************************************************
 * \brief Wrapper around cudaMemcpy(...,cudaMemcpyHostToDevice).
 *        Blocking copy of size bytes from host memory to device memory.
 ******************************************************************************/
static inline void offloadMemcpyHtoD(void *ptr_device, const void *ptr_host,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_device, ptr_host, size, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* Unified memory: host and device alias the same allocation, no copy. */
  if (ptr_device == ptr_host) {
    return;
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_device, ptr_host, size, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): falls back to the async variant on the NULL stream;
   * presumably blocking in the ACC layer — confirm against DBCSR. */
  offloadMemcpyAsyncHtoD(ptr_device, ptr_host, size, NULL /*stream*/);
#endif
}
221
/*******************************************************************************
 * \brief Wrapper around cudaMemcpy(...,cudaMemcpyDeviceToHost).
 *        Blocking copy of size bytes from device memory to host memory.
 * \param ptr_host   host destination pointer (first, matching cudaMemcpy's
 *                   dst-first convention)
 * \param ptr_device device source pointer
 * \note The parameter names were previously swapped (the destination was
 *       called ptr_device), a copy-paste from offloadMemcpyHtoD; callers are
 *       positional in C, so this rename does not affect them.
 ******************************************************************************/
static inline void offloadMemcpyDtoH(void *ptr_host, const void *ptr_device,
                                     const size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMemcpy(ptr_host, ptr_device, size, cudaMemcpyDeviceToHost));
#elif defined(__OFFLOAD_HIP)
#if defined(__OFFLOAD_UNIFIED_MEMORY)
  /* Unified memory: host and device alias the same allocation, no copy. */
  if (ptr_host == ptr_device) {
    return;
  }
#endif
  OFFLOAD_CHECK(hipMemcpy(ptr_host, ptr_device, size, hipMemcpyDeviceToHost));
#elif defined(__OFFLOAD_OPENCL)
  /* NOTE(review): falls back to the async variant on the NULL stream;
   * presumably blocking in the ACC layer — confirm against DBCSR. */
  offloadMemcpyAsyncDtoH(ptr_host, ptr_device, size, NULL /*stream*/);
#endif
}
240
/*******************************************************************************
 * \brief Wrapper around cudaMemcpyToSymbol.
 *        Copies count bytes from host memory to a device __constant__/global
 *        symbol (offset 0). Not implemented for OpenCL: the assert only
 *        tolerates calls that request no actual copy.
 ******************************************************************************/
static inline void offloadMemcpyToSymbol(const void *symbol, const void *src,
                                         const size_t count) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(
      cudaMemcpyToSymbol(symbol, src, count, 0, cudaMemcpyHostToDevice));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(
      hipMemcpyToSymbol(symbol, src, count, 0, hipMemcpyHostToDevice));
#elif defined(__OFFLOAD_OPENCL)
  assert(NULL == symbol || NULL == src || 0 == count); /* TODO */
#endif
}
256
/*******************************************************************************
 * \brief Wrapper around cudaEventCreate.
 * \param event output: receives the newly created event handle.
 ******************************************************************************/
static inline void offloadEventCreate(offloadEvent_t *event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventCreate(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventCreate(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_create(event));
#endif
}
269
/*******************************************************************************
 * \brief Wrapper around cudaEventDestroy.
 *        Releases an event previously created with offloadEventCreate.
 ******************************************************************************/
static inline void offloadEventDestroy(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventDestroy(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventDestroy(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_destroy(event));
#endif
}
282
283/*******************************************************************************
284 * \brief Wrapper around cudaStreamCreate.
285 ******************************************************************************/
286static inline void offloadStreamCreate(offloadStream_t *stream) {
287#if defined(__OFFLOAD_CUDA)
288 OFFLOAD_CHECK(cudaStreamCreate(stream));
289#elif defined(__OFFLOAD_HIP)
290 OFFLOAD_CHECK(hipStreamCreate(stream));
291#elif defined(__OFFLOAD_OPENCL)
292 int least = 0;
293 OFFLOAD_CHECK(c_dbcsr_acc_stream_priority_range(&least, NULL /*greatest*/));
294 OFFLOAD_CHECK(c_dbcsr_acc_stream_create(stream, "Offload Stream", least));
295#endif
296}
297
/*******************************************************************************
 * \brief Wrapper around cudaStreamDestroy.
 *        Releases a stream previously created with offloadStreamCreate.
 ******************************************************************************/
static inline void offloadStreamDestroy(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamDestroy(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamDestroy(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_destroy(stream));
#endif
}
310
/*******************************************************************************
 * \brief Wrapper around cudaEventSynchronize.
 *        Blocks the host until the given event has completed.
 ******************************************************************************/
static inline void offloadEventSynchronize(offloadEvent_t event) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventSynchronize(event));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventSynchronize(event));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_synchronize(event));
#endif
}
323
/*******************************************************************************
 * \brief Wrapper around cudaStreamSynchronize.
 *        Blocks the host until all work queued on the stream has completed.
 ******************************************************************************/
static inline void offloadStreamSynchronize(offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamSynchronize(stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamSynchronize(stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_stream_sync(stream));
#endif
}
336
/*******************************************************************************
 * \brief Wrapper around cudaEventRecord.
 *        Records the event into the given stream's work queue.
 ******************************************************************************/
static inline void offloadEventRecord(offloadEvent_t event,
                                      offloadStream_t stream) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaEventRecord(event, stream));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipEventRecord(event, stream));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_event_record(event, stream));
#endif
}
350
/*******************************************************************************
 * \brief Wrapper around cudaMallocHost (pinned host memory allocation).
 * \param ptr  output: receives the allocated host pointer.
 * \param size number of bytes to allocate.
 ******************************************************************************/
static inline void offloadMallocHost(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMallocHost(ptr, size));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  /* NOTE: hipHostMalloc naming/signature is inconsistent with CUDA's
   * cudaMallocHost (extra flags argument). */
  OFFLOAD_CHECK(hipHostMalloc(ptr, size, hipHostMallocDefault)); // inconsistent
#else
  /* Unified memory: no separate host buffer is handed out, only NULL. */
  assert(NULL != ptr);
  *ptr = NULL;
#endif
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_allocate(ptr, size, NULL /*stream*/));
#else
  /* No offload backend: plain malloc (unreachable while this header is
   * compiled only with CUDA/HIP/OpenCL enabled). */
  assert(NULL != ptr);
  *ptr = malloc(size);
#endif
}
371
/*******************************************************************************
 * \brief Wrapper around cudaMalloc (device memory allocation).
 * \param ptr  output: receives the allocated device pointer.
 * \param size number of bytes to allocate.
 ******************************************************************************/
static inline void offloadMalloc(void **ptr, size_t size) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaMalloc(ptr, size));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipMalloc(ptr, size));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_allocate(ptr, size));
#else
  /* No offload backend: there is no device memory, hand out NULL. */
  assert(NULL != ptr);
  *ptr = NULL;
#endif
}
387
/*******************************************************************************
 * \brief Wrapper around cudaFree (device memory deallocation).
 ******************************************************************************/
static inline void offloadFree(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFree(ptr));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipFree(ptr));
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_dev_mem_deallocate(ptr));
#else
  /* No offload backend: offloadMalloc yields NULL, so only NULL is legal. */
  assert(NULL == ptr);
#endif
}
402
/*******************************************************************************
 * \brief Wrapper around cudaFreeHost (pinned host memory deallocation).
 ******************************************************************************/
static inline void offloadFreeHost(void *ptr) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaFreeHost(ptr));
#elif defined(__OFFLOAD_HIP)
#if !defined(__OFFLOAD_UNIFIED_MEMORY)
  /* NOTE: hipHostFree naming is inconsistent with CUDA's cudaFreeHost. */
  OFFLOAD_CHECK(hipHostFree(ptr)); // inconsistent
#endif
  /* With unified memory offloadMallocHost hands out NULL — nothing to free. */
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_host_mem_deallocate(ptr, NULL /*stream*/));
#else
  /* No offload backend: counterpart of the malloc() fallback. */
  free(ptr);
#endif
}
419
/*******************************************************************************
 * \brief Wrapper around cudaStreamWaitEvent.
 *        Makes future work on the stream wait for the event.
 * \param val forwarded as the flags argument of the CUDA/HIP call; the
 *            OpenCL backend supports only 0 (enforced by the assert).
 ******************************************************************************/
static inline void offloadStreamWaitEvent(offloadStream_t stream,
                                          offloadEvent_t event, const int val) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaStreamWaitEvent(stream, event, val));
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipStreamWaitEvent(stream, event, val));
#elif defined(__OFFLOAD_OPENCL)
  assert(0 == val); /* TODO: flags not supported by the ACC layer */
  OFFLOAD_CHECK(c_dbcsr_acc_stream_wait_event(stream, event));
#endif
}
434
/*******************************************************************************
 * \brief Wrapper around cudaDeviceSynchronize.
 *        Blocks the host until all previously queued device work is done.
 ******************************************************************************/
static inline void offloadDeviceSynchronize(void) {
#if defined(__OFFLOAD_CUDA)
  OFFLOAD_CHECK(cudaDeviceSynchronize());
#elif defined(__OFFLOAD_HIP)
  OFFLOAD_CHECK(hipDeviceSynchronize());
#elif defined(__OFFLOAD_OPENCL)
  OFFLOAD_CHECK(c_dbcsr_acc_device_synchronize());
#endif
}
447
/*******************************************************************************
 * \brief Wrapper around cudaDeviceSetLimit(cudaLimitMallocHeapSize,...).
 *        Grows the device-side malloc heap when it is smaller than
 *        required_size; an already larger heap is left untouched.
 ******************************************************************************/
static inline void offloadEnsureMallocHeapSize(const size_t required_size) {
#if defined(__OFFLOAD_CUDA)
  size_t heap_size = 0;
  OFFLOAD_CHECK(cudaDeviceGetLimit(&heap_size, cudaLimitMallocHeapSize));
  if (heap_size < required_size) {
    OFFLOAD_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_HIP) && (HIP_VERSION >= 50300000)
  /* hipDeviceGetLimit/hipDeviceSetLimit need HIP 5.3 or newer. */
  size_t heap_size = 0;
  OFFLOAD_CHECK(hipDeviceGetLimit(&heap_size, hipLimitMallocHeapSize));
  if (heap_size < required_size) {
    OFFLOAD_CHECK(hipDeviceSetLimit(hipLimitMallocHeapSize, required_size));
  }
#elif defined(__OFFLOAD_OPENCL)
  assert(0 == required_size); /* TODO */
#else
  (void)required_size; /* mark used */
#endif
}
470
471#ifdef __cplusplus
472}
473#endif
474
475#endif // defined(__OFFLOAD_CUDA) || defined(__OFFLOAD_HIP) ||
476 // defined(__OFFLOAD_OPENCL)
477#endif