#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include <assert.h>
#include <stdlib.h>

#include "../offload/offload_library.h"
#include "dbm_hyperparams.h"
#include "dbm_mempool.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_gpu_kernel.h"
/*******************************************************************************
 * \brief Internal routine for initializing the gpu backend.
 ******************************************************************************/
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
                            dbm_shard_t *shards_c_host,
                            dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  ctx->nshards = nshards;
  ctx->shards_c_host = shards_c_host;
  ctx->max_batch_size = max_batch_size;
  offloadStreamCreate(&ctx->main_stream);

  // Allocate device storage for the task batches.
  const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
  ctx->batches_dev = (dbm_task_t *)dbm_mempool_device_malloc(size);

  // Allocate and upload the shards of the result matrix C.
  ctx->shards_c_dev =
      (dbm_shard_gpu_t *)malloc(nshards * sizeof(dbm_shard_gpu_t));
  for (int i = 0; i < nshards; i++) {
    const dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamCreate(&shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_size;
    shard_c_dev->data_allocated = shard_c_host->data_allocated;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncHtoD(shard_c_dev->data, shard_c_host->data,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
  }
}
/*******************************************************************************
 * \brief Private routine for uploading a single pack onto the device.
 ******************************************************************************/
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
                        const offloadStream_t stream) {

  const size_t size = pack_host->data_size * sizeof(double);
  if (pack_dev->data_size < pack_host->data_size) {
    // Grow the device buffer if the new pack does not fit.
    dbm_mempool_free(pack_dev->data);
    pack_dev->data = (double *)dbm_mempool_device_malloc(size);
  }
  pack_dev->data_size = pack_host->data_size;
  offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}
/*******************************************************************************
 * \brief Internal routine for uploading newly arrived packs onto the device.
 ******************************************************************************/
void dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
                                   const dbm_pack_t *pack_b,
                                   dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for all c-streams to complete before overwriting the old packs.
  offloadEvent_t event;
  offloadEventCreate(&event);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadEventRecord(event, ctx->shards_c_dev[i].stream);
    offloadStreamWaitEvent(ctx->main_stream, event, 0);
  }

  upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
  upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

  // Have all c-streams wait until the new packs are uploaded.
  offloadEventRecord(event, ctx->main_stream);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0);
  }
  offloadEventDestroy(event);
}
/*******************************************************************************
 * \brief Internal routine for executing a multiplication batch on the GPU.
 ******************************************************************************/
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
                                    const int mnk_range[3][2],
                                    const double alpha, const int kshard,
                                    dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard];
  dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[kshard];

  // Upload the new batch.
  dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
  const size_t size = ntasks * sizeof(dbm_task_t);
  offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_c_dev->stream);
  offloadEvent_t batch_uploaded;
  offloadEventCreate(&batch_uploaded);
  offloadEventRecord(batch_uploaded, shard_c_dev->stream);

  // Reallocate shard_c_dev->data if necessary.
  if (shard_c_host->data_promised > shard_c_dev->data_allocated) {
    double *old_data_dev = shard_c_dev->data;
    shard_c_dev->data_allocated =
        ALLOCATION_FACTOR * shard_c_host->data_promised;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncDtoD(shard_c_dev->data, old_data_dev,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
    // Wait for the copy to complete before freeing the old buffer.
    offloadStreamSynchronize(shard_c_dev->stream);
    dbm_mempool_free(old_data_dev);
  }

  // Zero the tail of newly promised blocks.
  if (shard_c_host->data_promised > shard_c_dev->data_size) {
    const int tail = shard_c_host->data_promised - shard_c_dev->data_size;
    offloadMemsetAsync(&shard_c_dev->data[shard_c_dev->data_size], 0,
                       tail * sizeof(double), shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_promised;
  }

  // Launch kernel.
  dbm_multiply_gpu_launch_kernel(shard_c_dev->stream, mnk_range, alpha, ntasks,
                                 batch_dev, ctx->pack_a_dev.data,
                                 ctx->pack_b_dev.data, shard_c_dev->data);
  OFFLOAD_CHECK(offloadGetLastError());

  // Wait until the batch is uploaded before the caller may refill it.
  offloadEventSynchronize(batch_uploaded);
  offloadEventDestroy(batch_uploaded);
}
/*******************************************************************************
 * \brief Internal routine for downloading results from the device.
 ******************************************************************************/
void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < ctx->nshards; i++) {
    // Grow the host buffer if necessary.
    dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_allocate_promised_blocks(shard_c_host);

    // Download the results from the device.
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    assert(shard_c_host->data_size == shard_c_dev->data_size);
    const size_t size = shard_c_dev->data_size * sizeof(double);
    offloadMemcpyAsyncDtoH(shard_c_host->data, shard_c_dev->data, size,
                           shard_c_dev->stream);
  }
}
/*******************************************************************************
 * \brief Internal routine for shutting down the gpu backend.
 ******************************************************************************/
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
  // Select GPU device.
  offload_activate_chosen_device();

  // Wait for completion, then free the per-shard GPU resources.
#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < ctx->nshards; i++) {
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamSynchronize(shard_c_dev->stream);
    offloadStreamDestroy(shard_c_dev->stream);
    dbm_mempool_free(shard_c_dev->data);
  }
  free(ctx->shards_c_dev);

  dbm_mempool_free(ctx->pack_a_dev.data);
  dbm_mempool_free(ctx->pack_b_dev.data);
  dbm_mempool_free(ctx->batches_dev);
  offloadStreamDestroy(ctx->main_stream);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
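
/*******************************************************************************
 * \brief Usage sketch (illustrative only, not part of the library). It shows
 *        the typical call order of the dbm_multiply_gpu_* API above. The
 *        shards of matrix C, the packs of A and B, the task batch, and
 *        mnk_range are assumed to be prepared by the host-side multiply
 *        driver; the function and variable names below are placeholders.
 ******************************************************************************/
#if 0
static void example_gpu_multiply(dbm_shard_t *shards_c, const int nshards,
                                 const dbm_pack_t *pack_a,
                                 const dbm_pack_t *pack_b,
                                 const dbm_task_t *batch, const int ntasks,
                                 const int mnk_range[3][2]) {
  dbm_multiply_gpu_context_t ctx;
  dbm_multiply_gpu_start(/*max_batch_size=*/ntasks, nshards, shards_c, &ctx);
  dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx);
  // One batch per C-shard; only shard 0 is processed here for brevity.
  dbm_multiply_gpu_process_batch(ntasks, batch, mnk_range, /*alpha=*/1.0,
                                 /*kshard=*/0, &ctx);
  dbm_multiply_gpu_download_results(&ctx);
  dbm_multiply_gpu_stop(&ctx); // Synchronizes streams and frees resources.
}
#endif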