dbm_multiply_gpu.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "../offload/offload_library.h"
#include "dbm_hyperparams.h"
#include "dbm_mempool.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_gpu_kernel.h"

#include <assert.h>
#include <stdio.h>

/*******************************************************************************
 * \brief Internal routine for initializing the gpu backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
                            dbm_shard_t *shards_c_host,
                            dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  ctx->nshards = nshards;
  ctx->shards_c_host = shards_c_host;
  ctx->max_batch_size = max_batch_size;
  offloadStreamCreate(&ctx->main_stream);

  // Allocate device storage for batches.
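  // Each shard owns a fixed slice of max_batch_size tasks within this buffer.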
  const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
  ctx->batches_dev = (dbm_task_t *)dbm_mempool_device_malloc(size);

  // Allocate and upload shards of result matrix C.
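  // Each shard gets its own stream so work on different shards can overlap.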
  ctx->shards_c_dev =
      (dbm_shard_gpu_t *)malloc(nshards * sizeof(dbm_shard_gpu_t));
  for (int i = 0; i < nshards; i++) {
    const dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamCreate(&shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_size;
    shard_c_dev->data_allocated = shard_c_host->data_allocated;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncHtoD(shard_c_dev->data, shard_c_host->data,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
  }
}

/*******************************************************************************
 * \brief Private routine for uploading a single pack onto the device.
 * \author Ole Schuett
 ******************************************************************************/
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
                        const offloadStream_t stream) {

  const size_t size = pack_host->data_size * sizeof(double);
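  // Grow the device buffer only when the incoming pack no longer fits;
  // the previous contents are not needed and are simply dropped.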
  if (pack_dev->data_size < pack_host->data_size) {
    dbm_mempool_free(pack_dev->data);
    pack_dev->data = (double *)dbm_mempool_device_malloc(size);
  }
  offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}

/*******************************************************************************
 * \brief Internal routine for uploading newly arrived packs onto the device.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
                                   const dbm_pack_t *pack_b,
                                   dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for all c-streams to complete before overwriting old packs.
  offloadEvent_t event;
  offloadEventCreate(&event);
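  // Reusing a single event is safe: each wait is enqueued right after the
  // corresponding record, before the event is recorded again.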
  for (int i = 0; i < ctx->nshards; i++) {
    offloadEventRecord(event, ctx->shards_c_dev[i].stream);
    offloadStreamWaitEvent(ctx->main_stream, event, 0);
  }

  upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
  upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

  // Have all c-streams wait until new packs are uploaded.
  offloadEventRecord(event, ctx->main_stream);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0);
  }
  offloadEventDestroy(event);
}

/*******************************************************************************
 * \brief Internal routine for executing the tasks in given batch on the GPU.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
                                    const int mnk_range[3][2],
                                    const double alpha, const int kshard,
                                    dbm_multiply_gpu_context_t *ctx) {

  if (ntasks == 0) {
    return; // Nothing to do.
  }

  // Select GPU device.
  offload_activate_chosen_device();

  const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard];
  dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[kshard];

  // Upload new batch.
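  // Each shard uses its own slice of batches_dev and its own stream, so
  // uploads for different shards do not interfere with each other.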
  dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
  const size_t size = ntasks * sizeof(dbm_task_t);
  offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_c_dev->stream);
  offloadEvent_t batch_uploaded;
  offloadEventCreate(&batch_uploaded);
  offloadEventRecord(batch_uploaded, shard_c_dev->stream);

  // Reallocate shard_c_dev->data if necessary.
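  // Over-allocate by ALLOCATION_FACTOR to reduce the number of future
  // reallocations as the shard keeps growing.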
  if (shard_c_host->data_promised > shard_c_dev->data_allocated) {
    double *old_data_dev = shard_c_dev->data;
    shard_c_dev->data_allocated =
        ALLOCATION_FACTOR * shard_c_host->data_promised;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncDtoD(shard_c_dev->data, old_data_dev,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
    // Wait for copy to complete before freeing old buffer.
    offloadStreamSynchronize(shard_c_dev->stream);
    dbm_mempool_free(old_data_dev);
  }

  // Zero new blocks if necessary.
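  // Newly promised blocks have no data on the device yet; zero their tail so
  // the kernel can safely accumulate into them.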
  if (shard_c_host->data_promised > shard_c_dev->data_size) {
    const int tail = shard_c_host->data_promised - shard_c_dev->data_size;
    offloadMemsetAsync(&shard_c_dev->data[shard_c_dev->data_size], 0,
                       tail * sizeof(double), shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_promised;
  }

  // Launch kernel.
  dbm_multiply_gpu_launch_kernel(shard_c_dev->stream, mnk_range, alpha, ntasks,
                                 batch_dev, ctx->pack_a_dev.data,
                                 ctx->pack_b_dev.data, shard_c_dev->data);
  OFFLOAD_CHECK(offloadGetLastError());

  // Wait for batch to be uploaded before refilling it.
  offloadEventSynchronize(batch_uploaded);
  offloadEventDestroy(batch_uploaded);
}

/*******************************************************************************
 * \brief Internal routine for downloading results from the device.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < ctx->nshards; i++) {
    // Grow host buffer if necessary.
    dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_allocate_promised_blocks(shard_c_host);

    // Download results from device.
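    // The copy is asynchronous; it is only guaranteed to be complete once the
    // shard's stream has been synchronized (see dbm_multiply_gpu_stop).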
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    assert(shard_c_host->data_size == shard_c_dev->data_size);
    const size_t size = shard_c_dev->data_size * sizeof(double);
    offloadMemcpyAsyncDtoH(shard_c_host->data, shard_c_dev->data, size,
                           shard_c_dev->stream);
  }
}

/*******************************************************************************
 * \brief Internal routine for shutting down the gpu backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for completion, then free gpu resources.
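  // Synchronizing each shard stream also ensures that any result downloads
  // still in flight have completed.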
#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < ctx->nshards; i++) {
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamSynchronize(shard_c_dev->stream);
    offloadStreamDestroy(shard_c_dev->stream);
    dbm_mempool_free(shard_c_dev->data);
  }
  free(ctx->shards_c_dev);

  dbm_mempool_free(ctx->pack_a_dev.data);
  dbm_mempool_free(ctx->pack_b_dev.data);
  dbm_mempool_free(ctx->batches_dev);
  offloadStreamDestroy(ctx->main_stream);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

// EOF