dbm_multiply_gpu_kernel.cu
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations         */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "dbm_multiply_gpu_kernel.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

#define NUM_THREADS 256

/*******************************************************************************
 * \brief Atomic add for doubles that also works prior to compute capability 6.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void atomicAddDouble(double *address, double val) {
  if (val == 0.0)
    return;

#if __CUDA_ARCH__ >= 600
  atomicAdd(address, val); // part of the CUDA library
#else
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));

    // Uses integer comparison to avoid hang in case of NaN (since NaN != NaN).
  } while (assumed != old);

#endif
}
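
/*******************************************************************************
 * \brief Illustrative sketch, not part of the original file: a minimal way
 *        to exercise atomicAddDouble. Every thread of one block adds 1.0 to
 *        a single counter, so after a <<<1, NUM_THREADS>>> launch and a copy
 *        back to the host the counter should read 256.0. The kernel name is
 *        hypothetical; the block is disabled via #if 0.
 ******************************************************************************/
#if 0
__global__ static void example_atomic_add_kernel(double *counter) {
  atomicAddDouble(counter, 1.0); // all 256 threads contend on one address
}
#endif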

/*******************************************************************************
 * \brief Returns the larger of two given integers (missing from the C standard).
 * \author Ole Schuett
 ******************************************************************************/
__device__ constexpr static inline int imax(int x, int y) {
  return (x > y ? x : y);
}

/*******************************************************************************
 * \brief Processes a task using tile dimensions given by template parameters.
 * \author Ole Schuett
 ******************************************************************************/
template <int M, int N>
__device__ static void process_task(const int m, const int n, const int k,
                                    const double alpha, const double *block_a,
                                    const double *block_b, double *block_c,
                                    double *shared_mem) {

  constexpr int K = NUM_THREADS / imax(M, N);

  static_assert(K * imax(M, N) == NUM_THREADS, "Wasting threads.");
  static_assert(M * N <= NUM_THREADS, "Not enough threads to cover tile.");

  double *tile_a = shared_mem;
  double *tile_b = &shared_mem[K * M];

  // Layout threads to cover the entire tile.
  // Indices x,y,z mark the position within the tile.
  const int y = threadIdx.x / M;       // Can exceed N, guaranteed to reach K.
  const int x = threadIdx.x - y * M;   // Fastest index, does not exceed M.
  const int xT = threadIdx.x / N;      // Can exceed M, guaranteed to reach K.
  const int yT = threadIdx.x - xT * N; // Fastest index, does not exceed N.

  // Indices {ijl}_tile mark the beginning of the current tile.
  for (int i_tile = 0; i_tile < m; i_tile += M) {
    for (int j_tile = 0; j_tile < n; j_tile += N) {
      double result = 0.0;
      for (int l_tile = 0; l_tile < k; l_tile += K) {

        // Load tile_a from global into shared memory.
        if (x < M && y < K) {
          const int i = i_tile + x;
          const int l = l_tile + y;
          const int idx = l * m + i; // transa = "N"
          const bool load = (l < k && i < m);
          tile_a[y * M + x] = (load) ? block_a[idx] : 0.0;
        }

        // Load tile_b from global into shared memory.
        // Use transposed thread mapping to achieve coalesced memory reads.
        if (yT < N && xT < K) {
          const int j = j_tile + yT;
          const int l = l_tile + xT;
          const int idx = l * n + j; // transb = "T"
          const bool load = (l < k && j < n);
          tile_b[xT * N + yT] = (load) ? block_b[idx] : 0.0;
        }

        // Multiply tiles from shared memory.
        __syncthreads();
        if (x < M && y < N) {
#pragma unroll
          for (int z = 0; z < K; z++) {
            result += tile_a[z * M + x] * tile_b[z * N + y];
          }
        }
        __syncthreads();
      }

      // Add result tile to block_c in global memory.
      if (x < M && y < N) {
        const int i = i_tile + x;
        const int j = j_tile + y;
        if (i < m && j < n) {
          const int idx = j * m + i;
          // Need atomics because other thread blocks might work on same C.
          atomicAddDouble(&block_c[idx], alpha * result);
        }
      }
    }
  }
}
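
/*******************************************************************************
 * \brief Worked sizing example, not part of the original file: the caller
 *        below provides 2 * NUM_THREADS doubles of shared memory. A <M, N>
 *        instantiation needs K*M + K*N doubles with K = NUM_THREADS /
 *        imax(M, N), and since (M + N) / imax(M, N) <= 2 this always fits.
 *        The tightest case is <16, 16>: K = 256 / 16 = 16, i.e.
 *        16*16 + 16*16 = 512 = 2 * NUM_THREADS doubles exactly.
 ******************************************************************************/
static_assert((NUM_THREADS / 16) * (16 + 16) <= 2 * NUM_THREADS,
              "Shared-memory budget check for the largest tile <16,16>.");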

/*******************************************************************************
 * \brief A generic matrix multiplication kernel.
 * \author Ole Schuett
 ******************************************************************************/
__global__ static void process_batch_kernel(const double alpha,
                                            const dbm_task_t *batch,
                                            const double *pack_a_data,
                                            const double *pack_b_data,
                                            double *shard_c_data) {

  __shared__ double shared_mem[2 * NUM_THREADS];

  const dbm_task_t task = batch[blockIdx.x];

  const int m = task.m;
  const int n = task.n;
  const int k = task.k;

  const double *blk_a = &pack_a_data[task.offset_a];
  const double *blk_b = &pack_b_data[task.offset_b];
  double *blk_c = &shard_c_data[task.offset_c];

  if (m <= 4 && n <= 4) {
    process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 8) {
    process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 16) {
    process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 32) {
    process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4) {
    process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 4) {
    process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 4) {
    process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 32 && n <= 4) {
    process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 4) {
    process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 8) {
    process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 16) {
    process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8) {
    process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 8) {
    process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 8) {
    process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else {
    process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
  }
}
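
/*******************************************************************************
 * \brief Dispatch note, not part of the original file: the ladder above picks
 *        the smallest tile shape that covers small blocks in a single tile;
 *        larger blocks are still handled because process_task iterates over
 *        i_tile/j_tile. For example, a task with m = 23 and n = 23 falls
 *        through to process_task<16, 16>, which covers it with a 2x2 grid of
 *        16x16 tiles.
 ******************************************************************************/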

/*******************************************************************************
 * \brief Internal routine for launching the GPU kernel.
 *        All arguments are assumed to be device pointers.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_launch_kernel(
    const offloadStream_t stream, const int mnk_range[3][2], const double alpha,
    const int ntasks, const dbm_task_t *batch, const double *pack_a_data,
    const double *pack_b_data, double *shard_c_data) {
  const int nblocks = ntasks; // TODO: tune launch parameters.
  const int threads_per_block = NUM_THREADS;
  const size_t smem_per_block = 0;
  (void)mnk_range; // Mark as used to avoid an unused-parameter warning.
  process_batch_kernel<<<nblocks, threads_per_block, smem_per_block, stream>>>(
      alpha, batch, pack_a_data, pack_b_data, shard_c_data);
}
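
/*******************************************************************************
 * \brief Usage sketch, not part of the original file: how a caller might
 *        stage one batch and launch the kernel, assuming a CUDA build where
 *        offloadStream_t is cudaStream_t. The function name and the *_host
 *        buffers and sizes are hypothetical; the real driver code lives
 *        elsewhere in the DBM library. Disabled via #if 0.
 ******************************************************************************/
#if 0
static void example_launch(const dbm_task_t *batch_host, const int ntasks,
                           const double *a_host, const int a_size,
                           const double *b_host, const int b_size,
                           double *c_host, const int c_size) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Stage the task list and the packed A/B/C data on the device. The kernel
  // accumulates into C, so the existing C content is uploaded as well.
  dbm_task_t *batch_dev;
  double *a_dev, *b_dev, *c_dev;
  cudaMalloc(&batch_dev, ntasks * sizeof(dbm_task_t));
  cudaMalloc(&a_dev, a_size * sizeof(double));
  cudaMalloc(&b_dev, b_size * sizeof(double));
  cudaMalloc(&c_dev, c_size * sizeof(double));
  cudaMemcpyAsync(batch_dev, batch_host, ntasks * sizeof(dbm_task_t),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(a_dev, a_host, a_size * sizeof(double),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(b_dev, b_host, b_size * sizeof(double),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(c_dev, c_host, c_size * sizeof(double),
                  cudaMemcpyHostToDevice, stream);

  // mnk_range is currently unused by the launcher above; illustrative values.
  const int mnk_range[3][2] = {{1, 16}, {1, 16}, {1, 16}};
  dbm_multiply_gpu_launch_kernel(stream, mnk_range, /*alpha=*/1.0, ntasks,
                                 batch_dev, a_dev, b_dev, c_dev);

  // Retrieve the accumulated C blocks and release all resources.
  cudaMemcpyAsync(c_host, c_dev, c_size * sizeof(double),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  cudaFree(batch_dev);
  cudaFree(a_dev);
  cudaFree(b_dev);
  cudaFree(c_dev);
  cudaStreamDestroy(stream);
}
#endif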

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

// EOF