dbm_multiply_gpu.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations         */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "../offload/offload_library.h"
#include "dbm_hyperparams.h"
#include "dbm_mempool.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_gpu_kernel.h"

#include <assert.h>
#include <stdio.h>

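/*
 * Illustrative sketch only (not part of the documented API): a typical call
 * sequence for this backend. The names max_batch_size, nshards, shards_c_host,
 * pack_a, pack_b, ntasks, batch, mnk_range, alpha and kshard stand for values
 * supplied by the caller.
 *
 *   dbm_multiply_gpu_context_t ctx;
 *   dbm_multiply_gpu_start(max_batch_size, nshards, shards_c_host, &ctx);
 *   dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx);
 *   dbm_multiply_gpu_process_batch(ntasks, batch, mnk_range, alpha, kshard, &ctx);
 *   dbm_multiply_gpu_download_results(&ctx);
 *   dbm_multiply_gpu_stop(&ctx);
 */
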
/*******************************************************************************
 * \brief Internal routine for initializing the gpu backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
                            dbm_shard_t *shards_c_host,
                            dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  ctx->nshards = nshards;
  ctx->shards_c_host = shards_c_host;
  ctx->max_batch_size = max_batch_size;
  offloadStreamCreate(&ctx->main_stream);

  // Allocate device storage for batches.
  const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
  ctx->batches_dev = (dbm_task_t *)dbm_mempool_device_malloc(size);
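  // Shard i uses the contiguous slice starting at
  // batches_dev[i * max_batch_size], see dbm_multiply_gpu_process_batch().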

  // Allocate and upload shards of result matrix C.
  ctx->shards_c_dev =
      (dbm_shard_gpu_t *)malloc(nshards * sizeof(dbm_shard_gpu_t));
  for (int i = 0; i < nshards; i++) {
    const dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamCreate(&shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_size;
    shard_c_dev->data_allocated = shard_c_host->data_allocated;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncHtoD(shard_c_dev->data, shard_c_host->data,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
  }
}

/*******************************************************************************
 * \brief Private routine for uploading a single pack onto the device.
 * \author Ole Schuett
 ******************************************************************************/
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
                        const offloadStream_t stream) {

  const size_t size = pack_host->data_size * sizeof(double);
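  // Reallocate the device buffer only if the new pack does not fit. Its old
  // content need not be preserved because the full pack is uploaded below.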
  if (pack_dev->data_size < pack_host->data_size) {
    dbm_mempool_free(pack_dev->data);
    pack_dev->data = (double *)dbm_mempool_device_malloc(size);
  }
  offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}

/*******************************************************************************
 * \brief Internal routine for uploading newly arrived packs onto the device.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
                                   const dbm_pack_t *pack_b,
                                   dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for all c-streams to complete before overwriting old packs.
  offloadEvent_t event;
  offloadEventCreate(&event);
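  // The same event is reused for all shards: each record/wait pair is issued
  // before the event is recorded again for the next shard's stream.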
  for (int i = 0; i < ctx->nshards; i++) {
    offloadEventRecord(event, ctx->shards_c_dev[i].stream);
    offloadStreamWaitEvent(ctx->main_stream, event, 0);
  }

  upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
  upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

  // Have all c-streams wait until new packs are uploaded.
  offloadEventRecord(event, ctx->main_stream);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0);
  }
  offloadEventDestroy(event);
}

/*******************************************************************************
 * \brief Internal routine for executing the tasks in given batch on the GPU.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
                                    const int mnk_range[3][2],
                                    const double alpha, const int kshard,
                                    dbm_multiply_gpu_context_t *ctx) {

  if (ntasks == 0) {
    return; // Nothing to do.
  }

  // Select GPU device.
  offload_activate_chosen_device();

  const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard];
  dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[kshard];

  // Upload new batch.
  dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
  const size_t size = ntasks * sizeof(dbm_task_t);
  offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_c_dev->stream);
  offloadEvent_t batch_uploaded;
  offloadEventCreate(&batch_uploaded);
  offloadEventRecord(batch_uploaded, shard_c_dev->stream);
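  // The event is awaited at the end of this routine so that the caller may
  // safely refill the host-side batch buffer after we return.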

  // Reallocate shard_c_dev->data if necessary.
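  // Growing to ALLOCATION_FACTOR * data_promised over-allocates slightly to
  // amortize the cost of repeated reallocations.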
  if (shard_c_host->data_promised > shard_c_dev->data_allocated) {
    double *old_data_dev = shard_c_dev->data;
    shard_c_dev->data_allocated =
        ALLOCATION_FACTOR * shard_c_host->data_promised;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncDtoD(shard_c_dev->data, old_data_dev,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
    // Wait for copy to complete before freeing old buffer.
    offloadStreamSynchronize(shard_c_dev->stream);
    dbm_mempool_free(old_data_dev);
  }

  // Zero new blocks if necessary.
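  // This mirrors the host side, where dbm_shard_allocate_promised_blocks()
  // zeroes the data of newly promised blocks.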
  if (shard_c_host->data_promised > shard_c_dev->data_size) {
    const int tail = shard_c_host->data_promised - shard_c_dev->data_size;
    offloadMemsetAsync(&shard_c_dev->data[shard_c_dev->data_size], 0,
                       tail * sizeof(double), shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_promised;
  }

  // Launch kernel.
  dbm_multiply_gpu_launch_kernel(shard_c_dev->stream, mnk_range, alpha, ntasks,
                                 batch_dev, ctx->pack_a_dev.data,
                                 ctx->pack_b_dev.data, shard_c_dev->data);
  OFFLOAD_CHECK(offloadGetLastError());

  // Wait for batch to be uploaded before refilling it.
  offloadEventSynchronize(batch_uploaded);
  offloadEventDestroy(batch_uploaded);
}

/*******************************************************************************
 * \brief Internal routine for downloading results from the device.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < ctx->nshards; i++) {
    // Grow host buffer if necessary.
    dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_allocate_promised_blocks(shard_c_host);

    // Download results from device.
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
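    // After allocating the promised blocks, the host shard must hold exactly
    // as much data as its device counterpart.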
    assert(shard_c_host->data_size == shard_c_dev->data_size);
    const size_t size = shard_c_dev->data_size * sizeof(double);
    offloadMemcpyAsyncDtoH(shard_c_host->data, shard_c_dev->data, size,
                           shard_c_dev->stream);
  }
}

/*******************************************************************************
 * \brief Internal routine for shutting down the gpu backend.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for completion, then free gpu resources.
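  // Each iteration handles one independent shard, so the loop may run in
  // parallel.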
#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < ctx->nshards; i++) {
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamSynchronize(shard_c_dev->stream);
    offloadStreamDestroy(shard_c_dev->stream);
    dbm_mempool_free(shard_c_dev->data);
  }
  free(ctx->shards_c_dev);

  dbm_mempool_free(ctx->pack_a_dev.data);
  dbm_mempool_free(ctx->pack_b_dev.data);
  dbm_mempool_free(ctx->batches_dev);
  offloadStreamDestroy(ctx->main_stream);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

// EOF