dbm_multiply_gpu.c (git:ed6f26b)
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "../offload/offload_library.h"
#include "dbm_hyperparams.h"
#include "dbm_mempool.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_gpu_kernel.h"

#include <assert.h>
#include <stdio.h>

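/*******************************************************************************
 * \brief Illustrative call sequence of this backend (a rough sketch only; the
 *        actual call sites live in the surrounding DBM multiplication code):
 *
 *          dbm_multiply_gpu_context_t ctx;
 *          dbm_multiply_gpu_start(max_batch_size, nshards, shards_c, &ctx);
 *          dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx);
 *          dbm_multiply_gpu_process_batch(ntasks, batch, alpha, kshard, &ctx);
 *          dbm_multiply_gpu_download_results(&ctx);
 *          dbm_multiply_gpu_stop(&ctx);
 *
 *        Each shard of the result matrix C owns a dedicated stream, so batches
 *        for different shards can be uploaded and processed concurrently; the
 *        shared packs A and B are uploaded on the main stream.
 ******************************************************************************/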
/*******************************************************************************
 * \brief Internal routine for initializing the GPU backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
                            dbm_shard_t *shards_c_host,
                            dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  ctx->nshards = nshards;
  ctx->shards_c_host = shards_c_host;
  ctx->max_batch_size = max_batch_size;
  offloadStreamCreate(&ctx->main_stream);

  // Allocate device storage for batches.
  const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
  ctx->batches_dev = dbm_mempool_device_malloc(size);

  // Allocate and upload shards of result matrix C.
  ctx->shards_c_dev = malloc(nshards * sizeof(dbm_shard_gpu_t));
  assert(ctx->shards_c_dev != NULL);
  for (int i = 0; i < nshards; i++) {
    const dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamCreate(&shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_size;
    // Only allocate data_size doubles on the device rather than data_allocated.
    shard_c_dev->data_allocated = shard_c_host->data_size;
    shard_c_dev->data =
        dbm_mempool_device_malloc(shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncHtoD(shard_c_dev->data, shard_c_host->data,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
  }
}

/*******************************************************************************
 * \brief Private routine for uploading a single pack onto the device.
 * \author Ole Schuett
 ******************************************************************************/
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
                        const offloadStream_t stream) {

  const size_t size = pack_host->data_size * sizeof(double);
  if (pack_dev->data_size < pack_host->data_size) {
    // Free the old device buffer and allocate a larger one.
    dbm_mempool_device_free(pack_dev->data);
    pack_dev->data = dbm_mempool_device_malloc(size);
  }
  offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}

/*******************************************************************************
 * \brief Internal routine for uploading newly arrived packs onto the device.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
                                   const dbm_pack_t *pack_b,
                                   dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for all c-streams to complete before overwriting old packs.
  offloadEvent_t event;
  offloadEventCreate(&event);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadEventRecord(event, ctx->shards_c_dev[i].stream);
    offloadStreamWaitEvent(ctx->main_stream, event, 0);
  }

  upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
  upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

  // Have all c-streams wait until the new packs are uploaded.
  offloadEventRecord(event, ctx->main_stream);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0);
  }
  offloadEventDestroy(event);
}

/*******************************************************************************
 * \brief Internal routine for executing the tasks in a given batch on the GPU.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
                                    const double alpha, const int kshard,
                                    dbm_multiply_gpu_context_t *ctx) {
  if (ntasks == 0) {
    return; // Nothing to do.
  }

  // Select GPU device.
  offload_activate_chosen_device();

  const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard];
  dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[kshard];
  assert(NULL != shard_c_host && NULL != shard_c_dev);

  // Upload new batch.
  dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
  const size_t size = ntasks * sizeof(dbm_task_t);
  offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_c_dev->stream);
  offloadEvent_t memsetup;
  offloadEventCreate(&memsetup);

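  // The over-allocation below (assuming DBM_ALLOCATION_FACTOR > 1) leaves
  // headroom so that a slightly larger promise in a later batch does not force
  // yet another reallocation and device-to-device copy.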
  // Reallocate shard_c_dev->data if necessary.
  double *old_data_dev = NULL;
  if (shard_c_host->data_promised > shard_c_dev->data_allocated) {
    shard_c_dev->data_allocated =
        DBM_ALLOCATION_FACTOR * shard_c_host->data_promised;
    assert(shard_c_host->data_promised <= shard_c_dev->data_allocated);
    old_data_dev = shard_c_dev->data;
    shard_c_dev->data =
        dbm_mempool_device_malloc(shard_c_dev->data_allocated * sizeof(double));
    // The copy is asynchronous; the old buffer is freed only after the
    // memsetup event below has been synchronized.
    offloadMemcpyAsyncDtoD(shard_c_dev->data, old_data_dev,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
  }
  offloadEventRecord(memsetup, shard_c_dev->stream);

  // Zero new blocks if necessary.
  if (shard_c_host->data_promised > shard_c_dev->data_size) {
    const int tail = shard_c_host->data_promised - shard_c_dev->data_size;
    offloadMemsetAsync(&shard_c_dev->data[shard_c_dev->data_size], 0,
                       tail * sizeof(double), shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_promised;
  }

  // Launch kernel.
  assert(0 != shard_c_dev->data_size);
  dbm_multiply_gpu_launch_kernel(shard_c_dev->stream, alpha, ntasks, batch,
                                 batch_dev, ctx->pack_a_dev.data,
                                 ctx->pack_b_dev.data, shard_c_dev->data);
  OFFLOAD_CHECK(offloadGetLastError());

  // Wait for:
  // - the batch to be uploaded (so the host batch buffer can be refilled), and
  // - the device-to-device copy into the resized buffer (if any).
  offloadEventSynchronize(memsetup);
  offloadEventDestroy(memsetup);

  // Now the old buffer can be freed safely.
  if (NULL != old_data_dev) {
    dbm_mempool_device_free(old_data_dev);
  }
}

/*******************************************************************************
 * \brief Internal routine for downloading results from the device.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

#pragma omp parallel for DBM_OMP_SCHEDULE
  for (int i = 0; i < ctx->nshards; i++) {
    // Grow host buffer if necessary.
    dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_allocate_promised_blocks(shard_c_host);

    // Download results from device.
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    assert(shard_c_host->data_size == shard_c_dev->data_size);
    const size_t size = shard_c_dev->data_size * sizeof(double);
    offloadMemcpyAsyncDtoH(shard_c_host->data, shard_c_dev->data, size,
                           shard_c_dev->stream);
  }
}
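// Note: the downloads above are asynchronous; they are guaranteed to be
// complete only once the per-shard streams have been synchronized, as done
// in dbm_multiply_gpu_stop() below.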

/*******************************************************************************
 * \brief Internal routine for shutting down the GPU backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for completion, then free GPU resources.
#pragma omp parallel for DBM_OMP_SCHEDULE
  for (int i = 0; i < ctx->nshards; i++) {
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamSynchronize(shard_c_dev->stream);
    offloadStreamDestroy(shard_c_dev->stream);
    dbm_mempool_device_free(shard_c_dev->data);
  }
  free(ctx->shards_c_dev);

  dbm_mempool_device_free(ctx->pack_a_dev.data);
  dbm_mempool_device_free(ctx->pack_b_dev.data);
  dbm_mempool_device_free(ctx->batches_dev);
  offloadStreamDestroy(ctx->main_stream);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

218// EOF