dbm_multiply_gpu_kernel.cu (git:ed6f26b)

/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

#include "dbm_multiply_gpu_kernel.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

#define NUM_THREADS 256

/*******************************************************************************
 * \brief Atomic add for doubles that also works prior to compute capability 6.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void atomicAddDouble(double *address, double val) {
  if (val == 0.0)
    return;

#if __CUDA_ARCH__ >= 600
  atomicAdd(address, val); // part of the CUDA library
#else
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));

    // Uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);

#endif
}
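
/*******************************************************************************
 * Illustrative sketch (not part of CP2K): a hypothetical test kernel showing
 * the intended semantics of atomicAddDouble. Launched on a zero-initialized
 * *sum with <<<nblocks, NUM_THREADS>>>, it would leave *sum equal to
 * nblocks * NUM_THREADS on any supported compute capability.
 ******************************************************************************/
#if 0
__global__ static void example_atomic_add_kernel(double *sum) {
  atomicAddDouble(sum, 1.0); // every thread contributes exactly 1.0
}
#endif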

/*******************************************************************************
 * \brief Processes a task using tile dimensions given by template parameters.
 * \author Ole Schuett
 ******************************************************************************/
template <int M, int N>
__device__ static void process_task(const int m, const int n, const int k,
                                    const double alpha, const double *block_a,
                                    const double *block_b, double *block_c,
                                    double *shared_mem) {

  constexpr int MAXEXTENT = (M > N ? M : N);
  constexpr int K = NUM_THREADS / MAXEXTENT;

  static_assert(K * MAXEXTENT == NUM_THREADS, "Wasting threads.");
  static_assert(M * N <= NUM_THREADS, "Not enough threads to cover tile.");

  double *tile_a = shared_mem;
  double *tile_b = &shared_mem[K * M];

  // Layout threads to cover the entire tile.
  // Indices x,y,z mark the position within the tile.
  const int y = threadIdx.x / M;       // Can exceed N, guaranteed to reach K.
  const int x = threadIdx.x - y * M;   // Fastest index, does not exceed M.
  const int xT = threadIdx.x / N;      // Can exceed M, guaranteed to reach K.
  const int yT = threadIdx.x - xT * N; // Fastest index, does not exceed N.

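  // Worked example of the mapping above: for the default <16, 16>
  // instantiation, MAXEXTENT = 16 and K = 256 / 16 = 16, so tile_a and tile_b
  // each hold 16 * 16 = 256 doubles. Thread 37 gets y = 37 / 16 = 2 and
  // x = 37 - 2 * 16 = 5; it loads tile_a[2 * 16 + 5], loads tile_b via the
  // transposed indices xT = 2, yT = 5, and accumulates the C-tile element at
  // (i_tile + 5, j_tile + 2).
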
  // Indices {ijl}_tile mark the beginning of the current tile.
  for (int i_tile = 0; i_tile < m; i_tile += M) {
    for (int j_tile = 0; j_tile < n; j_tile += N) {
      double result = 0.0;
      for (int l_tile = 0; l_tile < k; l_tile += K) {

        // Load tile_a from global into shared memory.
        if (x < M && y < K) {
          const int i = i_tile + x;
          const int l = l_tile + y;
          const int idx = l * m + i; // transa = "N"
          const bool load = (l < k && i < m);
          tile_a[y * M + x] = (load) ? block_a[idx] : 0.0;
        }

        // Load tile_b from global into shared memory.
        // Use transposed thread mapping to achieve coalesced memory reads.
        if (yT < N && xT < K) {
          const int j = j_tile + yT;
          const int l = l_tile + xT;
          const int idx = l * n + j; // transb = "T"
          const bool load = (l < k && j < n);
          tile_b[xT * N + yT] = (load) ? block_b[idx] : 0.0;
        }

        // Multiply tiles from shared memory.
        __syncthreads();
        if (x < M && y < N) {
#pragma unroll
          for (int z = 0; z < K; z++) {
            result += tile_a[z * M + x] * tile_b[z * N + y];
          }
        }
        __syncthreads();
      }

      // Add result tile to block_c in global memory.
      if (x < M && y < N) {
        const int i = i_tile + x;
        const int j = j_tile + y;
        if (i < m && j < n) {
          const int idx = j * m + i;
          // Need atomics because other thread blocks might work on same C.
          atomicAddDouble(&block_c[idx], alpha * result);
        }
      }
    }
  }
}

/*******************************************************************************
 * \brief A generic matrix multiplication kernel.
 * \author Ole Schuett
 ******************************************************************************/
__global__ static void process_batch_kernel(const double alpha,
                                            const dbm_task_t *batch,
                                            const double *pack_a_data,
                                            const double *pack_b_data,
                                            double *shard_c_data) {

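  // Two tiles of shared memory always suffice: process_task uses K * M doubles
  // for tile_a and K * N doubles for tile_b, and K * MAXEXTENT == NUM_THREADS,
  // so each tile needs at most NUM_THREADS doubles.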
  __shared__ double shared_mem[2 * NUM_THREADS];

  const dbm_task_t task = batch[blockIdx.x];

  const int m = task.m;
  const int n = task.n;
  const int k = task.k;

  const double *blk_a = &pack_a_data[task.offset_a];
  const double *blk_b = &pack_b_data[task.offset_b];
  double *blk_c = &shard_c_data[task.offset_c];

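  // Dispatch to the tile shape that best matches the block. Every
  // instantiation keeps M * N <= NUM_THREADS; blocks larger than the chosen
  // tile are covered by the i_tile/j_tile loops inside process_task.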
  if (m <= 4 && n <= 4) {
    process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 8) {
    process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 16) {
    process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4 && n <= 32) {
    process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 4) {
    process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 4) {
    process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 4) {
    process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 32 && n <= 4) {
    process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 4) {
    process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 8) {
    process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8 && n <= 16) {
    process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 8) {
    process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (m <= 16 && n <= 8) {
    process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else if (n <= 8) {
    process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);

  } else {
    process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
  }
}

/*******************************************************************************
 * \brief Internal routine for launching the GPU kernel.
 *        All pointer arguments except tasks_host are assumed to be
 *        device pointers.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_multiply_gpu_launch_kernel(const offloadStream_t stream,
                                    const double alpha, const int ntasks,
                                    const dbm_task_t *tasks_host,
                                    const dbm_task_t *tasks,
                                    const double *pack_a_data,
                                    const double *pack_b_data,
                                    double *shard_c_data) {
  const int nblocks = ntasks; // TODO tune launch parameters.
  const int threads_per_block = NUM_THREADS;
  const size_t smem_per_block = 0;
  (void)tasks_host; // mark used
  process_batch_kernel<<<nblocks, threads_per_block, smem_per_block, stream>>>(
      alpha, tasks, pack_a_data, pack_b_data, shard_c_data);
}
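
/*******************************************************************************
 * Illustrative host-side sketch (not part of CP2K): drives the launcher with a
 * single 4x5x6 task using plain CUDA runtime calls instead of the offload
 * wrappers. Assumes offloadStream_t maps to cudaStream_t on the CUDA backend;
 * the function and variable names below are hypothetical.
 ******************************************************************************/
#if 0
static void example_single_task_launch(void) {
  dbm_task_t task;
  task.m = 4;
  task.n = 5;
  task.k = 6;
  task.offset_a = task.offset_b = task.offset_c = 0;

  double a[4 * 6], b[5 * 6], c[4 * 5]; // A is m*k, B is stored n*k, C is m*n
  for (int i = 0; i < 4 * 6; i++)
    a[i] = 1.0;
  for (int i = 0; i < 5 * 6; i++)
    b[i] = 1.0;
  for (int i = 0; i < 4 * 5; i++)
    c[i] = 0.0; // C is accumulated into, so it must be initialized

  double *a_dev, *b_dev, *c_dev;
  dbm_task_t *task_dev;
  cudaMalloc(&a_dev, sizeof(a));
  cudaMalloc(&b_dev, sizeof(b));
  cudaMalloc(&c_dev, sizeof(c));
  cudaMalloc(&task_dev, sizeof(dbm_task_t));
  cudaMemcpy(a_dev, a, sizeof(a), cudaMemcpyHostToDevice);
  cudaMemcpy(b_dev, b, sizeof(b), cudaMemcpyHostToDevice);
  cudaMemcpy(c_dev, c, sizeof(c), cudaMemcpyHostToDevice);
  cudaMemcpy(task_dev, &task, sizeof(dbm_task_t), cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  dbm_multiply_gpu_launch_kernel(stream, /*alpha=*/1.0, /*ntasks=*/1, &task,
                                 task_dev, a_dev, b_dev, c_dev);
  cudaStreamSynchronize(stream);
  cudaMemcpy(c, c_dev, sizeof(c), cudaMemcpyDeviceToHost);
  // Every entry of C now equals alpha * sum_l A(i,l) * B(j,l) = 6.0.

  cudaFree(a_dev);
  cudaFree(b_dev);
  cudaFree(c_dev);
  cudaFree(task_dev);
  cudaStreamDestroy(stream);
}
#endif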

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)

// EOF