7#ifndef DBM_MULTIPLY_GPU_H
8#define DBM_MULTIPLY_GPU_H
10#include "../offload/offload_runtime.h"
11#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
24 offloadStream_t stream;
33 offloadStream_t main_stream;
34 offloadEvent_t upload_event;
37 dbm_shard_gpu_t *shards_c_dev;
44} dbm_multiply_gpu_context_t;
50void dbm_multiply_gpu_start(
const int max_batch_size,
const int nshards,
52 dbm_multiply_gpu_context_t *ctx);
58bool dbm_multiply_gpu_upload_packs(
const dbm_pack_t *pack_a,
60 dbm_multiply_gpu_context_t *ctx);
66void dbm_multiply_gpu_process_batch(
const int ntasks,
const dbm_task_t *batch,
68 const int kshard,
const bool finish,
69 dbm_multiply_gpu_context_t *ctx);
75void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx);
Internal struct for storing a pack - essentially a shard for MPI.
Internal struct for storing a matrix shard.
Internal struct for storing a task, i.e. a single block multiplication.