8#ifndef DBM_MULTIPLY_GPU_H
9#define DBM_MULTIPLY_GPU_H
11#include "../offload/offload_runtime.h"
12#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
25 offloadStream_t stream;
33 offloadStream_t main_stream;
37 dbm_shard_gpu_t *shards_c_dev;
44} dbm_multiply_gpu_context_t;
50void dbm_multiply_gpu_start(
const int max_batch_size,
const int nshards,
52 dbm_multiply_gpu_context_t *ctx);
58void dbm_multiply_gpu_upload_packs(
const dbm_pack_t *pack_a,
60 dbm_multiply_gpu_context_t *ctx);
66void dbm_multiply_gpu_process_batch(
const int ntasks,
const dbm_task_t *batch,
67 const double alpha,
const int kshard,
68 dbm_multiply_gpu_context_t *ctx);
74void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx);
80void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx);
Internal struct for storing a pack - essentially a shard for MPI.
Internal struct for storing a matrix shard.
Internal struct for storing a task, i.e. a single block multiplication.