#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include <assert.h>
#include <stdlib.h>

#include "../offload/offload_library.h"
// Local headers assumed to declare the dbm_* types, the device memory pool,
// and the kernel launcher used below.
#include "dbm_mempool.h"
#include "dbm_multiply_gpu.h"
#include "dbm_multiply_gpu_kernel.h"
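
/*
 * Internal routine for initializing the GPU backend: creates the main and
 * per-shard streams and allocates/uploads the device copies of the result
 * shards.
 */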
void dbm_multiply_gpu_start(const int max_batch_size, const int nshards,
                            dbm_shard_t *shards_c_host,
                            dbm_multiply_gpu_context_t *ctx) {
  ctx->nshards = nshards;
  ctx->shards_c_host = shards_c_host;
  ctx->max_batch_size = max_batch_size;
  offloadStreamCreate(&ctx->main_stream);

  // Allocate device storage for the task batches of all shards.
  const size_t size = nshards * max_batch_size * sizeof(dbm_task_t);
  ctx->batches_dev = (dbm_task_t *)dbm_mempool_device_malloc(size);

  // Allocate the device-side bookkeeping for the shards of the result matrix.
  ctx->shards_c_dev = malloc(nshards * sizeof(dbm_shard_gpu_t));
  assert(ctx->shards_c_dev != NULL);
  for (int i = 0; i < nshards; i++) {
    const dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamCreate(&shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_size;
    shard_c_dev->data_allocated = shard_c_host->data_size;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncHtoD(shard_c_dev->data, shard_c_host->data,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
  }
}
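
/*
 * Private helper: uploads a single pack of blocks onto the device on the
 * given stream, growing the device buffer first if needed.
 */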
static void upload_pack(const dbm_pack_t *pack_host, dbm_pack_t *pack_dev,
                        const offloadStream_t stream) {

  const size_t size = pack_host->data_size * sizeof(double);
  // (Re)allocate the device buffer if the incoming pack does not fit.
  if (pack_dev->data_size < pack_host->data_size) {
    dbm_mempool_device_free(pack_dev->data);
    pack_dev->data = (double *)dbm_mempool_device_malloc(size);
    pack_dev->data_size = pack_host->data_size;
  }
  offloadMemcpyAsyncHtoD(pack_dev->data, pack_host->data, size, stream);
}
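
/*
 * Internal routine for uploading the newly arrived packs A and B onto the
 * device, synchronized against all per-shard streams.
 */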
void dbm_multiply_gpu_upload_packs(const dbm_pack_t *pack_a,
                                   const dbm_pack_t *pack_b,
                                   dbm_multiply_gpu_context_t *ctx) {
  // Wait for all shard streams to finish before the old packs are overwritten.
  offloadEvent_t event;
  offloadEventCreate(&event);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadEventRecord(event, ctx->shards_c_dev[i].stream);
    offloadStreamWaitEvent(ctx->main_stream, event, 0);
  }

  upload_pack(pack_a, &ctx->pack_a_dev, ctx->main_stream);
  upload_pack(pack_b, &ctx->pack_b_dev, ctx->main_stream);

  // Have all shard streams wait until the new packs are uploaded.
  offloadEventRecord(event, ctx->main_stream);
  for (int i = 0; i < ctx->nshards; i++) {
    offloadStreamWaitEvent(ctx->shards_c_dev[i].stream, event, 0);
  }
  offloadEventDestroy(event);
}
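
/*
 * Internal routine for executing the tasks of one batch on the GPU: uploads
 * the batch, grows and zero-pads the shard's device buffer if needed, and
 * launches the multiplication kernel on the shard's stream.
 */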
void dbm_multiply_gpu_process_batch(const int ntasks, const dbm_task_t *batch,
                                    const double alpha, const int kshard,
                                    dbm_multiply_gpu_context_t *ctx) {
  const dbm_shard_t *shard_c_host = &ctx->shards_c_host[kshard];
  dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[kshard];
  assert(NULL != shard_c_host && NULL != shard_c_dev);

  dbm_task_t *batch_dev = &ctx->batches_dev[kshard * ctx->max_batch_size];
  const size_t size = ntasks * sizeof(dbm_task_t);
  offloadMemcpyAsyncHtoD(batch_dev, batch, size, shard_c_dev->stream);
  offloadEvent_t memsetup;
  offloadEventCreate(&memsetup);

  // Grow the shard's device buffer if more data was promised than allocated,
  // keeping some headroom (DBM_ALLOCATION_FACTOR) to limit reallocations.
  double *old_data_dev = NULL;
  if (shard_c_host->data_promised > shard_c_dev->data_allocated) {
    shard_c_dev->data_allocated =
        DBM_ALLOCATION_FACTOR * shard_c_host->data_promised;
    assert(shard_c_host->data_promised <= shard_c_dev->data_allocated);
    old_data_dev = shard_c_dev->data;
    shard_c_dev->data = (double *)dbm_mempool_device_malloc(
        shard_c_dev->data_allocated * sizeof(double));
    offloadMemcpyAsyncDtoD(shard_c_dev->data, old_data_dev,
                           shard_c_dev->data_size * sizeof(double),
                           shard_c_dev->stream);
  }
  offloadEventRecord(memsetup, shard_c_dev->stream);

  // Zero the newly promised tail of the buffer.
  if (shard_c_host->data_promised > shard_c_dev->data_size) {
    const int tail = shard_c_host->data_promised - shard_c_dev->data_size;
    offloadMemsetAsync(&shard_c_dev->data[shard_c_dev->data_size], 0,
                       tail * sizeof(double), shard_c_dev->stream);
    shard_c_dev->data_size = shard_c_host->data_promised;
  }

  assert(0 != shard_c_dev->data_size);
  dbm_multiply_gpu_launch_kernel(shard_c_dev->stream, alpha, ntasks, batch,
                                 batch_dev, ctx->pack_a_dev.data,
                                 ctx->pack_b_dev.data, shard_c_dev->data);
  OFFLOAD_CHECK(offloadGetLastError());

  // Wait until the batch upload and any buffer reallocation have completed,
  // then release the superseded device buffer.
  offloadEventSynchronize(memsetup);
  offloadEventDestroy(memsetup);
  if (NULL != old_data_dev) {
    dbm_mempool_device_free(old_data_dev);
  }
}
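
/*
 * Internal routine for downloading the result shards from the device back
 * into the host shards.
 */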
void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx) {
#pragma omp parallel for DBM_OMP_SCHEDULE
  for (int i = 0; i < ctx->nshards; i++) {
    // Grow the host buffer so it can hold all promised blocks.
    dbm_shard_t *shard_c_host = &ctx->shards_c_host[i];
    dbm_shard_allocate_promised_blocks(shard_c_host);

    // Download the shard's results from the device.
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    assert(shard_c_host->data_size == shard_c_dev->data_size);
    const size_t size = shard_c_dev->data_size * sizeof(double);
    offloadMemcpyAsyncDtoH(shard_c_host->data, shard_c_dev->data, size,
                           shard_c_dev->stream);
  }
}
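
/*
 * Internal routine for shutting down the GPU backend: synchronizes and
 * destroys all streams and frees the device buffers.
 */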
void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx) {
  // Wait for completion, then free the per-shard GPU resources.
#pragma omp parallel for DBM_OMP_SCHEDULE
  for (int i = 0; i < ctx->nshards; i++) {
    dbm_shard_gpu_t *shard_c_dev = &ctx->shards_c_dev[i];
    offloadStreamSynchronize(shard_c_dev->stream);
    offloadStreamDestroy(shard_c_dev->stream);
    dbm_mempool_device_free(shard_c_dev->data);
  }
  free(ctx->shards_c_dev);

  // Release the remaining device buffers and the main stream.
  dbm_mempool_device_free(ctx->pack_a_dev.data);
  dbm_mempool_device_free(ctx->pack_b_dev.data);
  dbm_mempool_device_free(ctx->batches_dev);
  offloadStreamDestroy(ctx->main_stream);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
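
/*
 * Expected call sequence, sketched from the functions above (not taken from
 * the surrounding driver code; the loop structure and variable names such as
 * pack_a, pack_b, batch, and kshard are hypothetical):
 *
 *   dbm_multiply_gpu_context_t ctx;
 *   dbm_multiply_gpu_start(max_batch_size, nshards, shards_c, &ctx);
 *   for each round of packs:
 *     dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx);
 *     for each shard batch:
 *       dbm_multiply_gpu_process_batch(ntasks, batch, alpha, kshard, &ctx);
 *   dbm_multiply_gpu_download_results(&ctx);
 *   dbm_multiply_gpu_stop(&ctx);
 */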