8#include "../offload/offload_runtime.h"
9#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
14#error "OpenMP should not be used in .cu files to accommodate HIP."
17#define NUM_THREADS 256
23__device__
static void atomicAddDouble(
double *address,
double val) {
27#if __CUDA_ARCH__ >= 600
28 atomicAdd(address, val);
31 unsigned long long int *address_as_ull = (
unsigned long long int *)address;
32 unsigned long long int old = *address_as_ull, assumed;
36 old = atomicCAS(address_as_ull, assumed,
37 __double_as_longlong(val + __longlong_as_double(assumed)));
40 }
while (assumed != old);
49template <
int M,
int N>
50__device__
static void process_task(
const int m,
const int n,
const int k,
51 const double alpha,
const double *block_a,
52 const double *block_b,
double *block_c,
54 constexpr int MAXEXTENT = (M > N ? M : N);
55 constexpr int K = NUM_THREADS / MAXEXTENT;
57 static_assert(K * MAXEXTENT == NUM_THREADS,
"Wasting threads.");
58 static_assert(M * N <= NUM_THREADS,
"Not enough threads to cover tile.");
60 double *tile_a = shared_mem;
61 double *tile_b = &shared_mem[K * M];
65 const int y = threadIdx.x / M;
66 const int x = threadIdx.x - y * M;
67 const int xT = threadIdx.x / N;
68 const int yT = threadIdx.x - xT * N;
71 for (
int i_tile = 0; i_tile < m; i_tile += M) {
72 for (
int j_tile = 0; j_tile < n; j_tile += N) {
75 for (
int l_tile = 0; l_tile < k; l_tile += K) {
78 const int i = i_tile + x;
79 const int l = l_tile + y;
80 const int idx = l * m +
i;
81 const bool load = (l < k &&
i < m);
82 tile_a[y * M + x] = (load) ? block_a[
idx] : 0.0;
87 if (yT < N && xT < K) {
88 const int j = j_tile + yT;
89 const int l = l_tile + xT;
90 const int idx = l * n + j;
91 const bool load = (l < k && j < n);
92 tile_b[xT * N + yT] = (load) ? block_b[
idx] : 0.0;
99 for (
int z = 0; z < K; z++) {
100 result += tile_a[z * M + x] * tile_b[z * N + y];
107 if (x < M && y < N) {
108 const int i = i_tile + x;
109 const int j = j_tile + y;
110 if (
i < m && j < n) {
111 const int idx = j * m +
i;
113 atomicAddDouble(&block_c[
idx], alpha * result);
124__global__
static void process_batch_kernel(
const double alpha,
126 const double *pack_a_data,
127 const double *pack_b_data,
128 double *shard_c_data) {
129 __shared__
double shared_mem[2 * NUM_THREADS];
133 const int m = task.
m;
134 const int n = task.
n;
135 const int k = task.
k;
137 const double *blk_a = &pack_a_data[task.
offset_a];
138 const double *blk_b = &pack_b_data[task.
offset_b];
139 double *blk_c = &shard_c_data[task.
offset_c];
141 if (m <= 4 && n <= 4) {
142 process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
144 }
else if (m <= 4 && n <= 8) {
145 process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
147 }
else if (m <= 4 && n <= 16) {
148 process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
150 }
else if (m <= 4 && n <= 32) {
151 process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
154 process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
156 }
else if (m <= 8 && n <= 4) {
157 process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
159 }
else if (m <= 16 && n <= 4) {
160 process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
162 }
else if (m <= 32 && n <= 4) {
163 process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
166 process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
168 }
else if (m <= 8 && n <= 8) {
169 process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
171 }
else if (m <= 8 && n <= 16) {
172 process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
175 process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
177 }
else if (m <= 16 && n <= 8) {
178 process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
181 process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
184 process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
193void dbm_multiply_gpu_launch_kernel(offloadStream_t stream,
double alpha,
196 const double *pack_a_data,
197 const double *pack_b_data,
198 double *shard_c_data) {
199 const int nblocks = ntasks;
200 const int threads_per_block = NUM_THREADS;
201 const size_t smem_per_block = 0;
203 process_batch_kernel<<<nblocks, threads_per_block, smem_per_block, stream>>>(
204 alpha, tasks, pack_a_data, pack_b_data, shard_c_data);
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int i
Internal struct for storing a task, ie. a single block multiplication.