8#ifndef DBM_MULTIPLY_GPU_H
9#define DBM_MULTIPLY_GPU_H
11#include "../offload/offload_runtime.h"
12#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
25 offloadStream_t stream;
34 offloadStream_t main_stream;
35 offloadEvent_t upload_event;
38 dbm_shard_gpu_t *shards_c_dev;
45} dbm_multiply_gpu_context_t;
/**
 * Declaration of dbm_multiply_gpu_start.
 *
 * \param max_batch_size  Integer passed by value; by its name, an upper bound
 *                        on the batch size (TODO confirm units and use).
 * \param nshards         Integer passed by value; by its name, a shard count.
 * \param ctx             Pointer to the dbm_multiply_gpu_context_t to operate
 *                        on. It is not const, so the callee may modify it.
 *
 * Returns nothing.
 *
 * NOTE(review): presumably sets up ctx for a GPU multiplication - confirm
 * against the implementation. The parameter list shown here may be
 * incomplete in this view of the file.
 */
51void dbm_multiply_gpu_start(
const int max_batch_size,
const int nshards,
53 dbm_multiply_gpu_context_t *ctx);
/**
 * Declaration of dbm_multiply_gpu_upload_packs.
 *
 * \param pack_a  Pointer to a dbm_pack_t; const-qualified, so the pack is not
 *                written through this pointer.
 * \param ctx     Pointer to the dbm_multiply_gpu_context_t to operate on. It
 *                is not const, so the callee may modify it.
 *
 * \return A bool. NOTE(review): the meaning of true/false is not visible
 *         from this declaration - confirm against the implementation.
 *
 * NOTE(review): by its name this presumably transfers pack data to the
 * device - confirm. The parameter list shown here may be incomplete in this
 * view of the file.
 */
59bool dbm_multiply_gpu_upload_packs(
const dbm_pack_t *pack_a,
61 dbm_multiply_gpu_context_t *ctx);
/**
 * Declaration of dbm_multiply_gpu_process_batch.
 *
 * \param ntasks  Integer passed by value; presumably the number of entries
 *                in batch (TODO confirm).
 * \param batch   Pointer to const dbm_task_t; the tasks are not written
 *                through this pointer.
 * \param kshard  Integer passed by value; by its name, a shard index
 *                (TODO confirm which shard set it indexes).
 * \param finish  Boolean flag passed by value; its effect is not visible
 *                from this declaration (TODO confirm).
 * \param ctx     Pointer to the dbm_multiply_gpu_context_t to operate on. It
 *                is not const, so the callee may modify it.
 *
 * Returns nothing.
 *
 * NOTE(review): the parameter list shown here may be incomplete in this view
 * of the file.
 */
67void dbm_multiply_gpu_process_batch(
const int ntasks,
const dbm_task_t *batch,
69 const int kshard,
const bool finish,
70 dbm_multiply_gpu_context_t *ctx);
/**
 * Declaration of dbm_multiply_gpu_stop.
 *
 * \param ctx  Pointer to the dbm_multiply_gpu_context_t to operate on. It is
 *             not const, so the callee may modify it.
 *
 * Returns nothing.
 *
 * NOTE(review): by its name this presumably tears down or releases what ctx
 * holds - confirm against the implementation.
 */
76void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx);
Internal struct for storing a pack - essentially a shard for MPI.
Internal struct for storing a matrix shard.
Internal struct for storing a task, i.e. a single block multiplication.