#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "dbm_multiply_gpu_kernel.h" // declares dbm_task_t and this launcher

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

#define NUM_THREADS 256
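
// Atomically adds val to *address; also works before compute capability 6.0,
// where no native double-precision atomicAdd() is available.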
__device__ static void atomicAddDouble(double *address, double val) {
#if __CUDA_ARCH__ >= 600
  atomicAdd(address, val); // native double-precision atomicAdd
#else
  // Emulate the double-precision atomicAdd with a compare-and-swap loop.
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // The integer comparison avoids looping forever on NaN (NaN != NaN).
  } while (assumed != old);
#endif
}
// Returns the larger of two given integers (missing from the C standard).
__device__ constexpr static inline int imax(int x, int y) {
  return (x > y ? x : y);
}
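
// Processes a single multiplication task with shared-memory tiles whose
// dimensions are given by the template parameters M and N, accumulating
// block_c(i, j) += alpha * sum_l block_a(i, l) * block_b(j, l).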
template <int M, int N>
__device__ static void process_task(const int m, const int n, const int k,
                                    const double alpha, const double *block_a,
                                    const double *block_b, double *block_c,
                                    double *shared_mem) {
  constexpr int K = NUM_THREADS / imax(M, N);

  static_assert(K * imax(M, N) == NUM_THREADS, "Wasting threads.");
  static_assert(M * N <= NUM_THREADS, "Not enough threads to cover tile.");
  double *tile_a = shared_mem;
  double *tile_b = &shared_mem[K * M];
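
  // Map the linear thread index onto the tiles twice: (x, y) addresses the
  // M x K tile of block_a, while the transposed mapping (xT, yT) addresses
  // the N x K tile of block_b so that its global-memory reads are coalesced.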
  const int y = threadIdx.x / M;
  const int x = threadIdx.x - y * M;
  const int xT = threadIdx.x / N;
  const int yT = threadIdx.x - xT * N;
  // The {i,j,l}_tile indices mark the first element of the current tile.
  for (int i_tile = 0; i_tile < m; i_tile += M) {
    for (int j_tile = 0; j_tile < n; j_tile += N) {
      double result = 0.0; // accumulates one C element across all l-tiles
      for (int l_tile = 0; l_tile < k; l_tile += K) {
        // Load tile_a from global into shared memory.
        if (x < M && y < K) {
          const int i = i_tile + x;
          const int l = l_tile + y;
          const int idx = l * m + i;
          const bool load = (l < k && i < m);
          tile_a[y * M + x] = (load) ? block_a[idx] : 0.0;
        }
        // Load tile_b from global into shared memory,
        // using the transposed mapping for coalesced reads.
        if (yT < N && xT < K) {
          const int j = j_tile + yT;
          const int l = l_tile + xT;
          const int idx = l * n + j;
          const bool load = (l < k && j < n);
          tile_b[xT * N + yT] = (load) ? block_b[idx] : 0.0;
        }
        // Multiply the tiles from shared memory.
        __syncthreads(); // wait until both tiles are fully loaded
        if (x < M && y < N) {
          for (int z = 0; z < K; z++) {
            result += tile_a[z * M + x] * tile_b[z * N + y];
          }
        }
        __syncthreads(); // wait before the tiles get overwritten
      }
      // Add the accumulated result to block_c in global memory.
      if (x < M && y < N) {
        const int i = i_tile + x;
        const int j = j_tile + y;
        if (i < m && j < n) {
          const int idx = j * m + i;
          // Atomics are needed because other thread blocks
          // might concurrently update the same block of C.
          atomicAddDouble(&block_c[idx], alpha * result);
        }
      }
    }
  }
}
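
// A generic matrix-multiplication kernel: each thread block processes one
// task from the batch, dispatching to the tile size that fits its block.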
__global__ static void process_batch_kernel(const double alpha,
                                            const dbm_task_t *batch,
                                            const double *pack_a_data,
                                            const double *pack_b_data,
                                            double *shard_c_data) {
  __shared__ double shared_mem[2 * NUM_THREADS];

  // Each task describes a single block multiplication.
  const dbm_task_t task = batch[blockIdx.x];

  const int m = task.m;
  const int n = task.n;
  const int k = task.k;

  const double *blk_a = &pack_a_data[task.offset_a];
  const double *blk_b = &pack_b_data[task.offset_b];
  double *blk_c = &shard_c_data[task.offset_c];
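
  // Pick the smallest template instantiation whose tile covers the block's
  // m and n; anything larger falls through to the generic 16 x 16 tiling.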
  if (m <= 4 && n <= 4) {
    process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 8) {
    process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 16) {
    process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 32) {
    process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4) {
    process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 4) {
    process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 4) {
    process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 32 && n <= 4) {
    process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 4) {
    process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 8) {
    process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 16) {
    process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8) {
    process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 8) {
    process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 8) {
    process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else {
    process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
  }
}
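
// Internal routine for launching the GPU kernel; all data arguments are
// assumed to be device pointers. A hypothetical call from the host side
// (names invented for illustration) would look like:
//
//   dbm_task_t *batch_dev; // ntasks task descriptors, already on the device
//   dbm_multiply_gpu_launch_kernel(stream, mnk_range, /*alpha=*/1.0, ntasks,
//                                  batch_dev, pack_a_dev, pack_b_dev,
//                                  shard_c_dev);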
void dbm_multiply_gpu_launch_kernel(
    const offloadStream_t stream, const int mnk_range[3][2],
    const double alpha, const int ntasks, const dbm_task_t *batch,
    const double *pack_a_data, const double *pack_b_data,
    double *shard_c_data) {
  const int nblocks = ntasks; // one thread block per task
  const int threads_per_block = NUM_THREADS;
  const size_t smem_per_block = 0; // shared memory is allocated statically
  (void)mnk_range; // unused here; the kernel reads m, n, k from each task
  process_batch_kernel<<<nblocks, threads_per_block, smem_per_block, stream>>>(
      alpha, batch, pack_a_data, pack_b_data, shard_c_data);
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)