8 #ifndef DBM_MULTIPLY_GPU_H
9 #define DBM_MULTIPLY_GPU_H
11 #include "../offload/offload_runtime.h"
12 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
25 offloadStream_t stream;
33 offloadStream_t main_stream;
37 dbm_shard_gpu_t *shards_c_dev;
44 } dbm_multiply_gpu_context_t;
50 void dbm_multiply_gpu_start(
const int max_batch_size,
const int nshards,
52 dbm_multiply_gpu_context_t *ctx);
58 void dbm_multiply_gpu_upload_packs(
const dbm_pack_t *pack_a,
60 dbm_multiply_gpu_context_t *ctx);
66 void dbm_multiply_gpu_process_batch(
const int ntasks,
const dbm_task_t *batch,
67 const int mnk_range[3][2],
68 const double alpha,
const int kshard,
69 dbm_multiply_gpu_context_t *ctx);
75 void dbm_multiply_gpu_download_results(dbm_multiply_gpu_context_t *ctx);
81 void dbm_multiply_gpu_stop(dbm_multiply_gpu_context_t *ctx);
Internal struct for storing a pack - essentially a shard for MPI.
Internal struct for storing a matrix shard.
Internal struct for storing a task, i.e. a single block multiplication.