#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include <assert.h>
#include <stdlib.h>

#include "../offload/offload_library.h"
#include "../offload/offload_mempool.h"
#include "dbm_hyperparams.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_gpu_kernel.h"
// Internal routine for initializing the GPU backend.
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
                            dbm_shard_t *shards_c_host,
                            dbm_multiply_gpu_context_t *ctx) {
  // Select the GPU device chosen via offload_set_chosen_device().
  offload_activate_chosen_device();

  ctx->nshards = nshards;
  ctx->shards_c_host = shards_c_host;
  ctx->max_batch_size = max_batch_size;
  offloadStreamCreate(&ctx->main_stream);
  offloadEventCreate(&ctx->upload_event);

  // Allocate device storage for the task batches.
  const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
  ctx->batches_dev = (dbm_task_t *)offload_mempool_device_malloc(size);

  // Allocate and upload the shards of the result matrix C.
  ctx->shards_c_dev = malloc(nshards * sizeof(dbm_shard_gpu_t));
  assert(ctx->shards_c_dev != NULL || nshards == 0);
  for (int i = 0; i < nshards; i++) {
    const dbm_shard_t *const shard_c_host = &shards_c_host[i];
    dbm_shard_gpu_t *shard_g = &ctx->shards_c_dev[i];
    offloadStreamCreate(&shard_g->stream);
    offloadEventCreate(&shard_g->event);
    shard_g->data_size = shard_c_host->data_size;
    shard_g->data_allocated = shard_c_host->data_size;
    shard_g->data = (double *)offload_mempool_device_malloc(
        shard_g->data_allocated * sizeof(double));
    offloadMemcpyAsyncHtoD(shard_g->data, shard_c_host->data,
                           shard_g->data_size * sizeof(double),
                           shard_g->stream);
  }
}
// Private routine for uploading a single pack onto the device.
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
                        const offloadStream_t stream) {
  const size_t size = pack_host->data_size * sizeof(double);
  // Grow the device buffer if the new pack does not fit.
  if (pack_dev->data_size < pack_host->data_size) {
    if (NULL != pack_dev->data) {
      offload_mempool_device_free(pack_dev->data);
    }
    pack_dev->data = (double *)offload_mempool_device_malloc(size);
    pack_dev->data_size = pack_host->data_size;
  }
  offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}
// Internal routine for uploading the newly arrived packs onto the device.
bool dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
                                   const dbm_pack_t *pack_b,
                                   dbm_multiply_gpu_context_t *ctx) {
  // Select the GPU device.
  offload_activate_chosen_device();

  // Wait for all c-streams to complete before overwriting the old packs.
  for (int i = 0; i < ctx->nshards; i++) {
    offloadEventRecord(ctx->upload_event, ctx->shards_c_dev[i].stream);
    offloadStreamWaitEvent(ctx->main_stream, ctx->upload_event);
  }
  offloadEventRecord(ctx->upload_event, ctx->main_stream);

  // Upload the new packs via the main stream.
  bool uploaded = false;
  upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
  upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);
  uploaded = true;

  // Have all c-streams wait until the new packs are uploaded.
  offloadEventRecord(ctx->upload_event, ctx->main_stream);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, ctx->upload_event);
  }
  return uploaded;
}
// Internal routine for executing one batch of tasks on the given C-shard.
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
                                    const double alpha, const int kshard,
                                    const bool finish,
                                    dbm_multiply_gpu_context_t *ctx) {
  dbm_shard_t *const shard_c = &ctx->shards_c_host[kshard];
  dbm_shard_gpu_t *const shard_g = &ctx->shards_c_dev[kshard];
  double *old_data_dev = NULL;

  assert(NULL != shard_c && NULL != shard_g);

  // Upload the batch of tasks to the device.
  dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
  const size_t size = ntasks * sizeof(dbm_task_t);
  offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_g->stream);

  // Grow the shard's device buffer if more blocks were promised.
  if (shard_c->data_promised > shard_g->data_allocated) {
    shard_g->data_allocated = DBM_ALLOCATION_FACTOR * shard_c->data_promised;
    old_data_dev = shard_g->data;
    shard_g->data = (double *)offload_mempool_device_malloc(
        shard_g->data_allocated * sizeof(double));
    // Copy the existing data into the new buffer and mark the copy with an
    // event so the old buffer can be freed once the copy has completed.
    offloadMemcpyAsyncDtoD(shard_g->data, old_data_dev,
                           shard_g->data_size * sizeof(double),
                           shard_g->stream);
    offloadEventRecord(shard_g->event, shard_g->stream);
  }

  // Zero the newly promised blocks at the tail of the device buffer.
  if (shard_c->data_promised > shard_g->data_size) {
    const int tail = shard_c->data_promised - shard_g->data_size;
    offloadMemsetAsync(&shard_g->data[shard_g->data_size], 0,
                       tail * sizeof(double), shard_g->stream);
    shard_g->data_size = shard_c->data_promised;
  }
  OFFLOAD_CHECK(offloadGetLastError());
  assert(0 != shard_g->data_size);

  // Launch the multiplication kernel.
  dbm_multiply_gpu_launch_kernel(shard_g->stream, alpha, ntasks, batch,
                                 batch_dev, ctx->pack_a_dev.data,
                                 ctx->pack_b_dev.data, shard_g->data);
  OFFLOAD_CHECK(offloadGetLastError());

  // When the shard is finished, download the result back to the host.
  if (finish) {
    dbm_shard_allocate_promised_blocks(shard_c);
    assert(shard_c->data_size == shard_g->data_size);
    offloadMemcpyAsyncDtoH(shard_c->data, shard_g->data,
                           shard_g->data_size * sizeof(double),
                           shard_g->stream);
  }

  // Wait for the device-to-device copy before freeing the old buffer.
  offloadEventSynchronize(shard_g->event);
  if (NULL != old_data_dev) {
    offload_mempool_device_free(old_data_dev);
  }
}
// Internal routine for shutting down the GPU backend and freeing resources.
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
  // Select the GPU device.
  offload_activate_chosen_device();

  // Wait for all shard streams, then release the per-shard device resources.
#pragma omp parallel for DBM_OMP_SCHEDULE
  for (int i = 0; i < ctx->nshards; i++) {
    dbm_shard_gpu_t *const shard_g = &ctx->shards_c_dev[i];
    offloadStreamSynchronize(shard_g->stream);
    offloadStreamDestroy(shard_g->stream);
    offloadEventDestroy(shard_g->event);
    offload_mempool_device_free(shard_g->data);
  }
  free(ctx->shards_c_dev);

  // Release the packs, the batch buffer, and the main stream.
  offload_mempool_device_free(ctx->pack_a_dev.data);
  offload_mempool_device_free(ctx->pack_b_dev.data);
  offload_mempool_device_free(ctx->batches_dev);
  offloadStreamDestroy(ctx->main_stream);
  offloadEventDestroy(ctx->upload_event);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
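The routines above are driven in the order start, upload_packs, process_batch, stop. The sketch below illustrates that sequence under the signatures as reconstructed here; it is a hypothetical driver for illustration only (the real sequencing lives elsewhere in the DBM library), and multiply_on_gpu, shards_c, pack_a, pack_b, batches, nbatch_tasks, and alpha are placeholder names, not part of this file.

// Hypothetical driver, for illustration only; assumes the host-side shards,
// packs, and task batches have already been prepared by the DBM library.
static void multiply_on_gpu(const int nshards, dbm_shard_t *shards_c,
                            const dbm_pack_t *pack_a, const dbm_pack_t *pack_b,
                            const int max_batch_size, dbm_task_t **batches,
                            const int *nbatch_tasks, const double alpha) {
  // Zero-initialize so the device pack buffers start out empty.
  dbm_multiply_gpu_context_t ctx = {0};
  dbm_multiply_gpu_start(max_batch_size, nshards, shards_c, &ctx);
  dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx);
  for (int kshard = 0; kshard < nshards; kshard++) {
    // finish=true triggers the download of the finished C-shard to the host.
    dbm_multiply_gpu_process_batch(nbatch_tasks[kshard], batches[kshard],
                                   alpha, kshard, /*finish=*/true, &ctx);
  }
  dbm_multiply_gpu_stop(&ctx);
}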