#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "dbm_multiply_gpu_kernel.h" // declares dbm_task_t and the launcher
#ifdef _OPENMP
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif
#define NUM_THREADS 256
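
/*******************************************************************************
 * \brief Atomic add for doubles that also works prior to compute capability 6.
 ******************************************************************************/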
__device__ static void atomicAddDouble(double *address, double val) {
#if __CUDA_ARCH__ >= 600
  atomicAdd(address, val); // Hardware-supported from compute capability 6.0.
#else
  // Emulate the atomic add with a compare-and-swap loop.
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));

    // The integer comparison avoids a hang on NaN (since NaN != NaN).
  } while (assumed != old);
#endif
}
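
/*******************************************************************************
 * \brief Processes a task using tile dimensions given by template parameters.
 ******************************************************************************/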
template <int M, int N>
__device__ static void process_task(const int m, const int n, const int k,
                                    const double alpha, const double *block_a,
                                    const double *block_b, double *block_c,
                                    double *shared_mem) {

  constexpr int MAXEXTENT = (M > N ? M : N);
  constexpr int K = NUM_THREADS / MAXEXTENT;

  static_assert(K * MAXEXTENT == NUM_THREADS, "Wasting threads.");
  static_assert(M * N <= NUM_THREADS, "Not enough threads to cover tile.");

  double *tile_a = shared_mem;
  double *tile_b = &shared_mem[K * M];

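  // Map the 1D thread index onto 2D tile coordinates: (x, y) for tiles of
  // A and C, and the transposed (yT, xT) for loads of B.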
  const int y = threadIdx.x / M;       // Can exceed K; guarded below.
  const int x = threadIdx.x - y * M;   // Fastest index, always less than M.
  const int xT = threadIdx.x / N;      // Can exceed K; guarded below.
  const int yT = threadIdx.x - xT * N; // Fastest index, always less than N.

  // Indices {ijl}_tile mark the beginning of the current tile.
  for (int i_tile = 0; i_tile < m; i_tile += M) {
    for (int j_tile = 0; j_tile < n; j_tile += N) {
      double result = 0.0;
      for (int l_tile = 0; l_tile < k; l_tile += K) {
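
        // Load a K x M tile of block_a from global into shared memory.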
        if (x < M && y < K) {
          const int i = i_tile + x;
          const int l = l_tile + y;
          const int idx = l * m + i;
          const bool load = (l < k && i < m);
          tile_a[y * M + x] = (load) ? block_a[idx] : 0.0;
        }
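
        // Load a K x N tile of block_b; the transposed thread mapping
        // (yT, xT) yields coalesced global memory reads.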
        if (yT < N && xT < K) {
          const int j = j_tile + yT;
          const int l = l_tile + xT;
          const int idx = l * n + j;
          const bool load = (l < k && j < n);
          tile_b[xT * N + yT] = (load) ? block_b[idx] : 0.0;
        }

        // Wait until both tiles are fully loaded, then multiply them.
        __syncthreads();
        if (x < M && y < N) {
          for (int z = 0; z < K; z++) {
            result += tile_a[z * M + x] * tile_b[z * N + y];
          }
        }
        __syncthreads(); // Protect tiles from being overwritten too early.
      }

      // Accumulate the result tile into block_c in global memory.
      if (x < M && y < N) {
        const int i = i_tile + x;
        const int j = j_tile + y;
        if (i < m && j < n) {
          const int idx = j * m + i;
          // Atomics needed: other thread blocks may update the same C block.
          atomicAddDouble(&block_c[idx], alpha * result);
        }
      }
    }
  }
}
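
/*******************************************************************************
 * \brief Kernel that processes a batch of block multiplication tasks.
 ******************************************************************************/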
__global__ static void process_batch_kernel(const double alpha,
                                            const dbm_task_t *batch,
                                            const double *pack_a_data,
                                            const double *pack_b_data,
                                            double *shard_c_data) {

  __shared__ double shared_mem[2 * NUM_THREADS];

  // One thread block per task.
  const dbm_task_t task = batch[blockIdx.x];

  const int m = task.m;
  const int n = task.n;
  const int k = task.k;

  const double *blk_a = &pack_a_data[task.offset_a];
  const double *blk_b = &pack_b_data[task.offset_b];
  double *blk_c = &shard_c_data[task.offset_c];
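
  // Dispatch to the smallest tile size that covers the block's m and n.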
  if (m <= 4 && n <= 4) {
    process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 8) {
    process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 16) {
    process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 32) {
    process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4) {
    process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 4) {
    process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 4) {
    process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 32 && n <= 4) {
    process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 4) {
    process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 8) {
    process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 16) {
    process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8) {
    process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 8) {
    process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 8) {
    process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else {
    process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
  }
}
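
/*******************************************************************************
 * \brief Internal routine for launching the batch-multiply kernel on the GPU.
 ******************************************************************************/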
void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream,
                                    const double alpha, const int ntasks,
                                    const dbm_task_t *tasks,
                                    const double *pack_a_data,
                                    const double *pack_b_data,
                                    double *shard_c_data) {
  const int nblocks = ntasks; // One thread block per task.
  const int threads_per_block = NUM_THREADS;
  const size_t smem_per_block = 0; // shared_mem is statically allocated.

  process_batch_kernel<<<nblocks, threads_per_block, smem_per_block, stream>>>(
      alpha, tasks, pack_a_data, pack_b_data, shard_c_data);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)