(git:9a1f30a)
Loading...
Searching...
No Matches
dbm_multiply_gpu_kernel.cu
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include "../offload/offload_runtime.h"
9#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
10
12
13#if defined(_OMP_H)
14#error "OpenMP should not be used in .cu files to accommodate HIP."
15#endif
16
17#define NUM_THREADS 256
18
19/*******************************************************************************
20 * \brief Atomic add for doubles that also works prior to compute capability 6.
21 * \author Ole Schuett
22 ******************************************************************************/
23__device__ static void atomicAddDouble(double *address, double val) {
24 if (val == 0.0)
25 return;
26
27#if __CUDA_ARCH__ >= 600
28 atomicAdd(address, val); // part of gpu library
29#else
30 // https://docs.nvidia.com/gpu/gpu-c-programming-guide/index.html#atomic-functions
31 unsigned long long int *address_as_ull = (unsigned long long int *)address;
32 unsigned long long int old = *address_as_ull, assumed;
33
34 do {
35 assumed = old;
36 old = atomicCAS(address_as_ull, assumed,
37 __double_as_longlong(val + __longlong_as_double(assumed)));
38
39 // Uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
40 } while (assumed != old);
41
42#endif
43}
44
45/*******************************************************************************
46 * \brief Processes a task using tile dimensions give by template parameters.
47 * \author Ole Schuett
48 ******************************************************************************/
49template <int M, int N>
50__device__ static void process_task(const int m, const int n, const int k,
51 const double alpha, const double *block_a,
52 const double *block_b, double *block_c,
53 double *shared_mem) {
54 constexpr int MAXEXTENT = (M > N ? M : N);
55 constexpr int K = NUM_THREADS / MAXEXTENT;
56
57 static_assert(K * MAXEXTENT == NUM_THREADS, "Wasting threads.");
58 static_assert(M * N <= NUM_THREADS, "Not enough threads to cover tile.");
59
60 double *tile_a = shared_mem;
61 double *tile_b = &shared_mem[K * M];
62
63 // Layout threads to cover the entire tile.
64 // Indices x,y,z mark the position within the tile.
65 const int y = threadIdx.x / M; // Can exceed N, guaranteed to reach K.
66 const int x = threadIdx.x - y * M; // Fastest index, does not exceed M.
67 const int xT = threadIdx.x / N; // Can exceed M, guaranteed to reach K.
68 const int yT = threadIdx.x - xT * N; // Fastest index, does not exceed N.
69
70 // Indices {ijl}_tile mark the beginning of the current tile.
71 for (int i_tile = 0; i_tile < m; i_tile += M) {
72 for (int j_tile = 0; j_tile < n; j_tile += N) {
73 double result = 0.0;
74
75 for (int l_tile = 0; l_tile < k; l_tile += K) {
76 // Load tile_a from global into shared memory.
77 if (x < M && y < K) {
78 const int i = i_tile + x;
79 const int l = l_tile + y;
80 const int idx = l * m + i; // transa = "N"
81 const bool load = (l < k && i < m);
82 tile_a[y * M + x] = (load) ? block_a[idx] : 0.0;
83 }
84
85 // Load tile_b from global into shared memory.
86 // Use transposed thread mapping to achieve coalesced memory reads.
87 if (yT < N && xT < K) {
88 const int j = j_tile + yT;
89 const int l = l_tile + xT;
90 const int idx = l * n + j; // transb = "T"
91 const bool load = (l < k && j < n);
92 tile_b[xT * N + yT] = (load) ? block_b[idx] : 0.0;
93 }
94
95 // Multiply tiles from shared memory.
96 __syncthreads();
97 if (x < M && y < N) {
98#pragma unroll
99 for (int z = 0; z < K; z++) {
100 result += tile_a[z * M + x] * tile_b[z * N + y];
101 }
102 }
103 __syncthreads();
104 }
105
106 // Add result tile to block_c in global memory.
107 if (x < M && y < N) {
108 const int i = i_tile + x;
109 const int j = j_tile + y;
110 if (i < m && j < n) {
111 const int idx = j * m + i;
112 // Need atomics because other thread blocks might work on same C.
113 atomicAddDouble(&block_c[idx], alpha * result);
114 }
115 }
116 }
117 }
118}
119
120/*******************************************************************************
121 * \brief A generic matrix multiplication kernel.
122 * \author Ole Schuett
123 ******************************************************************************/
124__global__ static void process_batch_kernel(const double alpha,
125 const dbm_task_t *batch,
126 const double *pack_a_data,
127 const double *pack_b_data,
128 double *shard_c_data) {
129 __shared__ double shared_mem[2 * NUM_THREADS];
130
131 const dbm_task_t task = batch[blockIdx.x];
132
133 const int m = task.m;
134 const int n = task.n;
135 const int k = task.k;
136
137 const double *blk_a = &pack_a_data[task.offset_a];
138 const double *blk_b = &pack_b_data[task.offset_b];
139 double *blk_c = &shard_c_data[task.offset_c];
140
141 if (m <= 4 && n <= 4) {
142 process_task<4, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
143
144 } else if (m <= 4 && n <= 8) {
145 process_task<4, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
146
147 } else if (m <= 4 && n <= 16) {
148 process_task<4, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
149
150 } else if (m <= 4 && n <= 32) {
151 process_task<4, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
152
153 } else if (m <= 4) {
154 process_task<4, 64>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
155
156 } else if (m <= 8 && n <= 4) {
157 process_task<8, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
158
159 } else if (m <= 16 && n <= 4) {
160 process_task<16, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
161
162 } else if (m <= 32 && n <= 4) {
163 process_task<32, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
164
165 } else if (n <= 4) {
166 process_task<64, 4>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
167
168 } else if (m <= 8 && n <= 8) {
169 process_task<8, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
170
171 } else if (m <= 8 && n <= 16) {
172 process_task<8, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
173
174 } else if (m <= 8) {
175 process_task<8, 32>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
176
177 } else if (m <= 16 && n <= 8) {
178 process_task<16, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
179
180 } else if (n <= 8) {
181 process_task<32, 8>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
182
183 } else {
184 process_task<16, 16>(m, n, k, alpha, blk_a, blk_b, blk_c, shared_mem);
185 }
186}
187
188/*******************************************************************************
189 * \brief Internal routine for launching the GPU kernel.
190 * All arguments are assumed to be device pointers.
191 * \author Ole Schuett
192 ******************************************************************************/
193void dbm_multiply_gpu_launch_kernel(offloadStream_t stream, double alpha,
194 int ntasks, const dbm_task_t *tasks_host,
195 const dbm_task_t *tasks,
196 const double *pack_a_data,
197 const double *pack_b_data,
198 double *shard_c_data) {
199 const int nblocks = ntasks; // TODO tune launch parameters.
200 const int threads_per_block = NUM_THREADS;
201 const size_t smem_per_block = 0;
202 (void)tasks_host; // mark used
203 process_batch_kernel<<<nblocks, threads_per_block, smem_per_block, stream>>>(
204 alpha, tasks, pack_a_data, pack_b_data, shard_c_data);
205}
206
207#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
208
209// EOF
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int i
Internal struct for storing a task, ie. a single block multiplication.