dbm_multiply_gpu.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations         */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "../offload/offload_library.h"
#include "../offload/offload_mempool.h"
#include "dbm_hyperparams.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_gpu_kernel.h"

#include <assert.h>
#include <stdio.h>

/*******************************************************************************
 * \brief Internal routine for initializing the gpu backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
                            dbm_shard_t *shards_c_host,
                            dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  ctx->nshards = nshards;
  ctx->max_batch_size = max_batch_size;
  offloadStreamCreate(&ctx->main_stream);
  offloadEventCreate(&ctx->upload_event);

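  // The main stream carries the pack uploads shared by all shards, while each
  // shard of C gets its own stream (created in the loop below) so batches for
  // different shards can be processed concurrently. The upload event is used
  // to order pack uploads against those per-shard streams.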
  // Allocate device storage for batches.
  const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
  ctx->batches_dev = offload_mempool_device_malloc(size);

  // Allocate and upload shards of result matrix C.
  ctx->shards_c_dev = malloc(nshards * sizeof(dbm_shard_gpu_t));
  assert(ctx->shards_c_dev != NULL || nshards == 0);
  for (int i = 0; i < nshards; i++) {
    const dbm_shard_t *const shard_c_host = &shards_c_host[i];
    dbm_shard_gpu_t *shard_g = &ctx->shards_c_dev[i];
    offloadStreamCreate(&shard_g->stream);
    offloadEventCreate(&shard_g->event);
    shard_g->data_size = shard_c_host->data_size;
    // Allocate only data_size on the device rather than data_allocated.
    shard_g->data_allocated = shard_c_host->data_size;
    shard_g->data =
        offload_mempool_device_malloc(shard_g->data_allocated * sizeof(double));
    offloadMemcpyAsyncHtoD(shard_g->data, shard_c_host->data,
                           shard_g->data_size * sizeof(double),
                           shard_g->stream);
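    // The copy runs asynchronously on the shard's own stream; later work
    // queued on that stream (batch uploads, kernels) is implicitly ordered
    // behind it, so no synchronization is needed here.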
  }
}

/*******************************************************************************
 * \brief Private routine for uploading a single pack onto the device.
 * \author Ole Schuett
 ******************************************************************************/
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
                        const offloadStream_t stream) {

  const size_t size = pack_host->data_size * sizeof(double);
  if (pack_dev->data_size < pack_host->data_size) {
    // Release the old, too small buffer before growing the allocation.
    offload_mempool_device_free(pack_dev->data);
    pack_dev->data = offload_mempool_device_malloc(size);
  }
  offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}

/*******************************************************************************
 * \brief Internal routine for uploading newly arrived packs onto the device.
 * \author Ole Schuett and Hans Pabst
 ******************************************************************************/
bool dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
                                   const dbm_pack_t *pack_b,
                                   dbm_multiply_gpu_context_t *ctx) {
  // Assume GPU device was activated earlier.
  // Wait for all c-streams to complete before overwriting old packs.
  for (int i = 0; i < ctx->nshards; i++) {
    offloadEventRecord(ctx->upload_event, ctx->shards_c_dev[i].stream);
    offloadStreamWaitEvent(ctx->main_stream, ctx->upload_event);
  }
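  // After this loop, the new pack uploads on the main stream cannot start
  // until every c-stream has drained the work that may still read the old
  // packs. Re-recording the same event per iteration is fine because each
  // offloadStreamWaitEvent() captures the most recent record at call time.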
  // Record event to check if all c-streams already completed.
  offloadEventRecord(ctx->upload_event, ctx->main_stream);

  bool uploaded = false;
  /*if (offloadEventQuery(ctx->upload_event))*/
  {
    upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
    upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

    // Have all c-streams wait until new packs are uploaded.
    offloadEventRecord(ctx->upload_event, ctx->main_stream);
    for (int i = 0; i < ctx->nshards; i++) {
      offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, ctx->upload_event);
    }
    uploaded = true;
  }
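  // With the offloadEventQuery() guard above commented out, the packs are
  // currently uploaded unconditionally and the function always returns true.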

  return uploaded;
}

/*******************************************************************************
 * \brief Internal routine for executing the tasks in the given batch on the GPU.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
                                    const double alpha, dbm_shard_t *shard_c,
                                    const int kshard, const bool finish,
                                    dbm_multiply_gpu_context_t *ctx) {
  // Assume GPU device was activated earlier.
  dbm_shard_gpu_t *const shard_g = &ctx->shards_c_dev[kshard];
  double *old_data_dev = NULL;

  if (0 < ntasks) {
    assert(NULL != shard_c && NULL != shard_g);

    // Upload new batch.
    dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
    const size_t size = ntasks * sizeof(dbm_task_t);
    offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_g->stream);
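    // The batch is copied into this shard's slice of ctx->batches_dev on the
    // shard's stream, so the kernel launched below on the same stream sees
    // the complete task list.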

    // Reallocate shard_g->data if necessary.
    if (shard_c->data_promised > shard_g->data_allocated) {
      shard_g->data_allocated = DBM_ALLOCATION_FACTOR * shard_c->data_promised;
      assert(shard_c->data_promised <= shard_g->data_allocated);
      old_data_dev = shard_g->data;
      shard_g->data = offload_mempool_device_malloc(shard_g->data_allocated *
                                                    sizeof(double));
      // No need to wait for this copy before freeing the old buffer: the free
      // happens only after shard_g->event (recorded below) has completed.
      offloadMemcpyAsyncDtoD(shard_g->data, old_data_dev,
                             shard_g->data_size * sizeof(double),
                             shard_g->stream);
    }
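    // The event recorded below marks the batch upload and, if the buffer was
    // grown, the device-to-device copy. Synchronizing it at the end of this
    // function makes it safe to refill the host batch and to free the old
    // device buffer.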
    offloadEventRecord(shard_g->event, shard_g->stream);

    // Zero new blocks if necessary.
    if (shard_c->data_promised > shard_g->data_size) {
      const int tail = shard_c->data_promised - shard_g->data_size;
      offloadMemsetAsync(&shard_g->data[shard_g->data_size], 0,
                         tail * sizeof(double), shard_g->stream);
      shard_g->data_size = shard_c->data_promised;
    }
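    // The regions just zeroed correspond to newly promised blocks of C, so
    // the kernel below always operates on well-defined data.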

    OFFLOAD_CHECK(offloadGetLastError());
    assert(0 != shard_g->data_size);

    // Launch kernel.
    dbm_multiply_gpu_launch_kernel(shard_g->stream, alpha, ntasks, batch,
                                   batch_dev, ctx->pack_a_dev.data,
                                   ctx->pack_b_dev.data, shard_g->data);
    OFFLOAD_CHECK(offloadGetLastError());
  }

  if (finish) { // Start downloading the current shard of matrix_c.
    // Grow host buffer if necessary.
    dbm_shard_allocate_promised_blocks(shard_c);
    // Download results from device.
    assert(shard_c->data_size == shard_g->data_size);
    offloadMemcpyAsyncDtoH(shard_c->data, shard_g->data,
                           shard_g->data_size * sizeof(double),
                           shard_g->stream);
  }
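  // The download is asynchronous on shard_g->stream; shard_c->data becomes
  // valid on the host only after that stream is synchronized later (e.g. in
  // dbm_multiply_gpu_stop).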

  if (0 < ntasks) {
    // Wait for:
    // - The batch to be uploaded (before refilling it).
    // - The old device buffer (if resized) to become safe to free.
    offloadEventSynchronize(shard_g->event);

    if (NULL != old_data_dev) {
      offload_mempool_device_free(old_data_dev);
    }
  }
}

/*******************************************************************************
 * \brief Internal routine for shutting down the gpu backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
  // Assume GPU device was activated earlier.
  // Wait for completion, then free GPU resources.
#pragma omp parallel for DBM_OMP_SCHEDULE
  for (int i = 0; i < ctx->nshards; i++) {
    dbm_shard_gpu_t *const shard_g = &ctx->shards_c_dev[i];
    offloadStreamSynchronize(shard_g->stream);
    offloadStreamDestroy(shard_g->stream);
    offloadEventDestroy(shard_g->event);
    offload_mempool_device_free(shard_g->data);
  }
  free(ctx->shards_c_dev);

  offload_mempool_device_free(ctx->pack_a_dev.data);
  offload_mempool_device_free(ctx->pack_b_dev.data);
  offload_mempool_device_free(ctx->batches_dev);
  offloadStreamDestroy(ctx->main_stream);
  offloadEventDestroy(ctx->upload_event);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

// EOF