dbm_multiply_gpu_kernel.cu
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>                */
/*                                                                             */
/* SPDX-License-Identifier: BSD-3-Clause                                       */
/*----------------------------------------------------------------------------*/

#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "dbm_multiply_gpu_kernel.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

#define NUM_THREADS 256

/*******************************************************************************
 * \brief Atomic add for doubles that also works prior to compute capability 6.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void atomicAddDouble(double *address, double val) {
  if (val == 0.0)
    return;

#if __CUDA_ARCH__ >= 600
  atomicAdd(address, val); // part of cuda library
#else
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));

    // Uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);

#endif
}

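// Illustration (editor's sketch, not part of the original file): a minimal
// kernel exercising atomicAddDouble. Launched as <<<1, 256>>> on a zeroed
// counter, all 256 threads race on the same address and *sum ends up as
// 256.0, regardless of which branch above was compiled.
#if 0
__global__ static void test_atomicAddDouble(double *sum) {
  atomicAddDouble(sum, 1.0);
}
#endif
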
/*******************************************************************************
 * \brief Returns the larger of two given integers (missing from the C standard)
 * \author Ole Schuett
 ******************************************************************************/
__device__ constexpr static inline int imax(int x, int y) {
  return (x > y ? x : y);
}

/*******************************************************************************
 * \brief Processes a task using tile dimensions given by template parameters.
 * \author Ole Schuett
 ******************************************************************************/
template <int M, int N>
__device__ static void process_task(const int m, const int n, const int k,
                                    const double alpha, const double *block_a,
                                    const double *block_b, double *block_c,
                                    double *shared_mem) {

  constexpr int K = NUM_THREADS / imax(M, N);

  static_assert(K * imax(M, N) == NUM_THREADS, "Wasting threads.");
  static_assert(M * N <= NUM_THREADS, "Not enough threads to cover tile.");

  double *tile_a = shared_mem;
  double *tile_b = &shared_mem[K * M];

  // Layout threads to cover the entire tile.
  // Indices x,y,z mark the position within the tile.
  const int y = threadIdx.x / M;       // Can exceed N, guaranteed to reach K.
  const int x = threadIdx.x - y * M;   // Fastest index, does not exceed M.
  const int xT = threadIdx.x / N;      // Can exceed M, guaranteed to reach K.
  const int yT = threadIdx.x - xT * N; // Fastest index, does not exceed N.

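  // Worked example (editor's illustration): with NUM_THREADS = 256 and
  // M = 4, N = 8, we get K = 256 / imax(4, 8) = 32. Thread 37 then maps to
  //   y  = 37 / 4 = 9, x  = 37 - 9 * 4 = 1  -> loads tile_a element (l=9, i=1)
  //   xT = 37 / 8 = 4, yT = 37 - 4 * 8 = 5  -> loads tile_b element (l=4, j=5)
  // Threads whose y (resp. xT) reaches K or beyond simply skip that load.
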
  // Indices {ijl}_tile mark the beginning of the current tile.
  for (int i_tile = 0; i_tile < m; i_tile += M) {
    for (int j_tile = 0; j_tile < n; j_tile += N) {
      double result = 0.0;
      for (int l_tile = 0; l_tile < k; l_tile += K) {

        // Load tile_a from global into shared memory.
        if (x < M && y < K) {
          const int i = i_tile + x;
          const int l = l_tile + y;
          const int idx = l * m + i; // transa = "N"
          const bool load = (l < k && i < m);
          tile_a[y * M + x] = (load) ? block_a[idx] : 0.0;
        }

        // Load tile_b from global into shared memory.
        // Use transposed thread mapping to achieve coalesced memory reads.
        if (yT < N && xT < K) {
          const int j = j_tile + yT;
          const int l = l_tile + xT;
          const int idx = l * n + j; // transb = "T"
          const bool load = (l < k && j < n);
          tile_b[xT * N + yT] = (load) ? block_b[idx] : 0.0;
        }

        // Multiply tiles from shared memory.
        __syncthreads();
        if (x < M && y < N) {
#pragma unroll
          for (int z = 0; z < K; z++) {
            result += tile_a[z * M + x] * tile_b[z * N + y];
          }
        }
        __syncthreads();
      }

      // Add result tile to block_c in global memory.
      if (x < M && y < N) {
        const int i = i_tile + x;
        const int j = j_tile + y;
        if (i < m && j < n) {
          const int idx = j * m + i;
          // Atomics are needed because other thread blocks might update the
          // same block of C concurrently.
          atomicAddDouble(&block_c[idx], alpha * result);
        }
      }
    }
  }
}

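// Illustration (editor's sketch): the 2 * NUM_THREADS doubles of shared
// memory declared in process_batch_kernel below always suffice, because
// tile_a holds K * M and tile_b holds K * N doubles, and the static_assert
// above guarantees K * imax(M, N) == NUM_THREADS. A hypothetical
// compile-time check of that budget:
#if 0
template <int M, int N> constexpr bool tiles_fit_shared_mem() {
  constexpr int K = NUM_THREADS / (M > N ? M : N);
  return K * (M + N) <= 2 * NUM_THREADS;
}
static_assert(tiles_fit_shared_mem<16, 16>(), "Tiles exceed shared_mem.");
static_assert(tiles_fit_shared_mem<4, 64>(), "Tiles exceed shared_mem.");
#endif
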
/*******************************************************************************
 * \brief A generic matrix multiplication kernel.
 * \author Ole Schuett
 ******************************************************************************/
__global__ static void process_batch_kernel(const double alpha,
                                            const dbm_task_t *batch,
                                            const double *pack_a_data,
                                            const double *pack_b_data,
                                            double *shard_c_data) {

  __shared__ double shared_mem[2 * NUM_THREADS];

  const dbm_task_t task = batch[blockIdx.x];

  const int m = task.m;
  const int n = task.n;
  const int k = task.k;

  const double *blk_a = &pack_a_data[task.offset_a];
  const double *blk_b = &pack_b_data[task.offset_b];
  double *blk_c = &shard_c_data[task.offset_c];

  if (m <= 4 && n <= 4) {
    process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 8) {
    process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 16) {
    process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 32) {
    process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4) {
    process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 4) {
    process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 4) {
    process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 32 && n <= 4) {
    process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 4) {
    process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 8) {
    process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 16) {
    process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8) {
    process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 8) {
    process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 8) {
    process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else {
    process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
  }
}

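// Note (editor's illustration): the ladder above picks the smallest tile
// shape whose bounds fit the block, defaulting to <16, 16>. Blocks larger
// than the chosen tile are still handled correctly, since process_task loops
// i_tile and j_tile over the full m and n. For example, m = 10, n = 20 falls
// through to process_task<16, 16>, which covers n = 20 in two j_tile steps.
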
/*******************************************************************************
 * \brief Internal routine for launching the GPU kernel.
 *        All arguments are assumed to be device pointers.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_launch_kernel(
    const offloadStream_t stream, const int mnk_range[3][2], const double alpha,
    const int ntasks, const dbm_task_t *batch, const double *pack_a_data,
    const double *pack_b_data, double *shard_c_data) {
  const int nblocks = ntasks; // TODO: tune launch parameters.
  const int threads_per_block = NUM_THREADS;
  const size_t smem_per_block = 0;
  (void)mnk_range; // mark used
  process_batch_kernel<<<nblocks, threads_per_block, smem_per_block, stream>>>(
      alpha, batch, pack_a_data, pack_b_data, shard_c_data);
}

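// Illustration (editor's sketch, hypothetical): a minimal host-side call of
// dbm_multiply_gpu_launch_kernel for a single 4x4x4 task. It assumes that
// offloadStream_t wraps the native CUDA stream type (per offload_runtime.h)
// and uses raw CUDA API calls for brevity; real callers go through CP2K's
// dbm_multiply_gpu layer instead.
#if 0
static void example_launch(void) {
  dbm_task_t task{};
  task.m = task.n = task.k = 4;
  task.offset_a = task.offset_b = task.offset_c = 0;

  dbm_task_t *batch_dev = nullptr;
  double *a_dev = nullptr, *b_dev = nullptr, *c_dev = nullptr;
  cudaMalloc(&batch_dev, sizeof(dbm_task_t));
  cudaMemcpy(batch_dev, &task, sizeof(dbm_task_t), cudaMemcpyHostToDevice);
  cudaMalloc(&a_dev, 16 * sizeof(double)); // fill with matrix A elsewhere
  cudaMalloc(&b_dev, 16 * sizeof(double)); // fill with matrix B elsewhere
  cudaMalloc(&c_dev, 16 * sizeof(double));
  cudaMemset(c_dev, 0, 16 * sizeof(double)); // kernel accumulates into C

  const int mnk_range[3][2] = {{4, 4}, {4, 4}, {4, 4}}; // currently unused
  offloadStream_t stream;
  cudaStreamCreate(&stream);
  dbm_multiply_gpu_launch_kernel(stream, mnk_range, /*alpha=*/1.0,
                                 /*ntasks=*/1, batch_dev, a_dev, b_dev, c_dev);
  cudaStreamSynchronize(stream);
}
#endif
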
#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

// EOF