(git:d43dca4)
Loading...
Searching...
No Matches
grid_gpu_collocate.cu
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include "../../offload/offload_runtime.h"
9#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
10
11#include <algorithm>
12#include <assert.h>
13#include <limits.h>
14#include <math.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18
19#define GRID_DO_COLLOCATE 1
20#include "../common/grid_common.h"
21#include "grid_gpu_collint.h"
22#include "grid_gpu_collocate.h"
23
24// This has to be included after grid_gpu_collint.h
25#include "../common/grid_prepare_pab.h"
26
27#if defined(_OMP_H)
28#error "OpenMP should not be used in .cu files to accommodate HIP."
29#endif
30
/*******************************************************************************
 * \brief Collocate a single grid point with distance d{xyz} from center.
 *        Evaluates the polynomial P(dx,dy,dz) = sum_i cxyz[i] * dx^ax dy^ay
 *        dz^az (with exponents given by the coset ordering) multiplied by the
 *        Gaussian exp(-zetp * r2), and atomically accumulates the product
 *        into *gridpoint.
 * \param dx, dy, dz Distance of the grid point from the Gaussian center.
 * \param zetp       Exponent of the (combined) Gaussian.
 * \param lp         Total angular momentum, ie. highest polynomial degree.
 * \param cxyz       Coefficients in coset order (presumably cxyz_store is a
 *                   plain double array type — it is indexed directly below).
 * \param gridpoint  Target grid point, updated via atomic add because other
 *                   thread blocks may collocate onto the same point.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void cxyz_to_gridpoint(const double dx, const double dy,
                                         const double dz, const double zetp,
                                         const int lp, const cxyz_store *cxyz,
                                         double *gridpoint) {

  // Squared distance of point from center.
  const double r2 = dx * dx + dy * dy + dz * dz;
  const double gaussian = exp(-zetp * r2);

  // accumulate into register
  double gridpoint_reg = 0.0;

  // Manually unrolled loops based on terms in coset_inv.
  // For lp > 6 the register usage increases and the kernel becomes slower.
  // Coefficient 0 corresponds to the constant monomial dx^0 dy^0 dz^0.
  gridpoint_reg += cxyz[0];

  if (lp >= 1) {
    // Linear monomials: coset indices 1..3.
    gridpoint_reg += cxyz[1] * dx;
    gridpoint_reg += cxyz[2] * dy;
    gridpoint_reg += cxyz[3] * dz;
    if (lp >= 2) {
      // Quadratic monomials: coset indices 4..9.
      const double dx2 = dx * dx;
      const double dy2 = dy * dy;
      const double dz2 = dz * dz;
      gridpoint_reg += cxyz[4] * dx2;
      gridpoint_reg += cxyz[5] * dx * dy;
      gridpoint_reg += cxyz[6] * dx * dz;
      gridpoint_reg += cxyz[7] * dy2;
      gridpoint_reg += cxyz[8] * dy * dz;
      gridpoint_reg += cxyz[9] * dz2;
      if (lp >= 3) {
        // Cubic monomials: coset indices 10..19.
        const double dx3 = dx2 * dx;
        const double dy3 = dy2 * dy;
        const double dz3 = dz2 * dz;
        gridpoint_reg += cxyz[10] * dx3;
        gridpoint_reg += cxyz[11] * dx2 * dy;
        gridpoint_reg += cxyz[12] * dx2 * dz;
        gridpoint_reg += cxyz[13] * dx * dy2;
        gridpoint_reg += cxyz[14] * dx * dy * dz;
        gridpoint_reg += cxyz[15] * dx * dz2;
        gridpoint_reg += cxyz[16] * dy3;
        gridpoint_reg += cxyz[17] * dy2 * dz;
        gridpoint_reg += cxyz[18] * dy * dz2;
        gridpoint_reg += cxyz[19] * dz3;
        if (lp >= 4) {
          // Quartic monomials: coset indices 20..34.
          const double dx4 = dx3 * dx;
          const double dy4 = dy3 * dy;
          const double dz4 = dz3 * dz;
          gridpoint_reg += cxyz[20] * dx4;
          gridpoint_reg += cxyz[21] * dx3 * dy;
          gridpoint_reg += cxyz[22] * dx3 * dz;
          gridpoint_reg += cxyz[23] * dx2 * dy2;
          gridpoint_reg += cxyz[24] * dx2 * dy * dz;
          gridpoint_reg += cxyz[25] * dx2 * dz2;
          gridpoint_reg += cxyz[26] * dx * dy3;
          gridpoint_reg += cxyz[27] * dx * dy2 * dz;
          gridpoint_reg += cxyz[28] * dx * dy * dz2;
          gridpoint_reg += cxyz[29] * dx * dz3;
          gridpoint_reg += cxyz[30] * dy4;
          gridpoint_reg += cxyz[31] * dy3 * dz;
          gridpoint_reg += cxyz[32] * dy2 * dz2;
          gridpoint_reg += cxyz[33] * dy * dz3;
          gridpoint_reg += cxyz[34] * dz4;
          if (lp >= 5) {
            // Quintic monomials: coset indices 35..55.
            const double dx5 = dx4 * dx;
            const double dy5 = dy4 * dy;
            const double dz5 = dz4 * dz;
            gridpoint_reg += cxyz[35] * dx5;
            gridpoint_reg += cxyz[36] * dx4 * dy;
            gridpoint_reg += cxyz[37] * dx4 * dz;
            gridpoint_reg += cxyz[38] * dx3 * dy2;
            gridpoint_reg += cxyz[39] * dx3 * dy * dz;
            gridpoint_reg += cxyz[40] * dx3 * dz2;
            gridpoint_reg += cxyz[41] * dx2 * dy3;
            gridpoint_reg += cxyz[42] * dx2 * dy2 * dz;
            gridpoint_reg += cxyz[43] * dx2 * dy * dz2;
            gridpoint_reg += cxyz[44] * dx2 * dz3;
            gridpoint_reg += cxyz[45] * dx * dy4;
            gridpoint_reg += cxyz[46] * dx * dy3 * dz;
            gridpoint_reg += cxyz[47] * dx * dy2 * dz2;
            gridpoint_reg += cxyz[48] * dx * dy * dz3;
            gridpoint_reg += cxyz[49] * dx * dz4;
            gridpoint_reg += cxyz[50] * dy5;
            gridpoint_reg += cxyz[51] * dy4 * dz;
            gridpoint_reg += cxyz[52] * dy3 * dz2;
            gridpoint_reg += cxyz[53] * dy2 * dz3;
            gridpoint_reg += cxyz[54] * dy * dz4;
            gridpoint_reg += cxyz[55] * dz5;
            if (lp >= 6) {
              // Sextic monomials: coset indices 56..83.
              const double dx6 = dx5 * dx;
              const double dy6 = dy5 * dy;
              const double dz6 = dz5 * dz;
              gridpoint_reg += cxyz[56] * dx6;
              gridpoint_reg += cxyz[57] * dx5 * dy;
              gridpoint_reg += cxyz[58] * dx5 * dz;
              gridpoint_reg += cxyz[59] * dx4 * dy2;
              gridpoint_reg += cxyz[60] * dx4 * dy * dz;
              gridpoint_reg += cxyz[61] * dx4 * dz2;
              gridpoint_reg += cxyz[62] * dx3 * dy3;
              gridpoint_reg += cxyz[63] * dx3 * dy2 * dz;
              gridpoint_reg += cxyz[64] * dx3 * dy * dz2;
              gridpoint_reg += cxyz[65] * dx3 * dz3;
              gridpoint_reg += cxyz[66] * dx2 * dy4;
              gridpoint_reg += cxyz[67] * dx2 * dy3 * dz;
              gridpoint_reg += cxyz[68] * dx2 * dy2 * dz2;
              gridpoint_reg += cxyz[69] * dx2 * dy * dz3;
              gridpoint_reg += cxyz[70] * dx2 * dz4;
              gridpoint_reg += cxyz[71] * dx * dy5;
              gridpoint_reg += cxyz[72] * dx * dy4 * dz;
              gridpoint_reg += cxyz[73] * dx * dy3 * dz2;
              gridpoint_reg += cxyz[74] * dx * dy2 * dz3;
              gridpoint_reg += cxyz[75] * dx * dy * dz4;
              gridpoint_reg += cxyz[76] * dx * dz5;
              gridpoint_reg += cxyz[77] * dy6;
              gridpoint_reg += cxyz[78] * dy5 * dz;
              gridpoint_reg += cxyz[79] * dy4 * dz2;
              gridpoint_reg += cxyz[80] * dy3 * dz3;
              gridpoint_reg += cxyz[81] * dy2 * dz4;
              gridpoint_reg += cxyz[82] * dy * dz5;
              gridpoint_reg += cxyz[83] * dz6;
            }
          }
        }
      }
    }
  }

  // Handle higher values of lp.
  // Generic fallback: look up exponents in coset_inv and build each monomial
  // by repeated multiplication. Slower, but covers arbitrary lp.
  if (lp >= 7) {
    for (int i = 84; i < ncoset(lp); i++) {
      double val = cxyz[i];
      const orbital a = coset_inv[i];
      for (int j = 0; j < a.l[0]; j++) {
        val *= dx;
      }
      for (int j = 0; j < a.l[1]; j++) {
        val *= dy;
      }
      for (int j = 0; j < a.l[2]; j++) {
        val *= dz;
      }
      gridpoint_reg += val;
    }
  }

  // Single atomic update per thread; the Gaussian factor is applied once
  // at the end rather than to every term.
  atomicAddDouble(gridpoint, gridpoint_reg * gaussian);
}
182
/*******************************************************************************
 * \brief Collocates coefficients C_xyz onto the grid.
 *        Thin dispatcher: selects between the specialized orthorhombic
 *        implementation and the fully general one, based on a per-task flag.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void cxyz_to_grid(const kernel_params *params,
                                    const smem_task *task, const double *cxyz,
                                    double *grid) {

  // Fall back to the general routine unless the task qualifies for the
  // faster orthorhombic kernel.
  if (!task->use_orthorhombic_kernel) {
    general_cxyz_to_grid(params, task, cxyz, grid);
  } else {
    ortho_cxyz_to_grid(params, task, cxyz, grid);
  }
}
197
/*******************************************************************************
 * \brief Decontracts the subblock, going from spherical to cartesian harmonics.
 * \author Ole Schuett
 ******************************************************************************/
template <bool IS_FUNC_AB>
__device__ static void block_to_cab(const kernel_params *params,
                                    const smem_task *task, cab_store *cab) {

  // The cartesian index runs over exponents and then over angular momentum.
  // The spherical index runs over angular momentum and then over contractions.

  // Decontract block, apply prepare_pab, and store in cab.
  // This is a double matrix product. Since the pab block can be quite large the
  // two products are fused to conserve shared memory.

  // TODO find if we can make better bounds for ico and jco because
  // we only need a submatrix of cab.
  if (threadIdx.z == 0) { // TODO: How bad is this?
    const int jco_first = ncoset(task->lb_min_basis - 1) + threadIdx.y;
    const int jco_last = ncoset(task->lb_max_basis);
    for (int jco = jco_first; jco < jco_last; jco += blockDim.y) {
      const orbital b = coset_inv[jco];
      const int ico_first = ncoset(task->la_min_basis - 1) + threadIdx.x;
      const int ico_last = ncoset(task->la_max_basis);
      for (int ico = ico_first; ico < ico_last; ico += blockDim.x) {
        const orbital a = coset_inv[ico];
        // Fused double contraction: pab_val = sphib^T * block * sphia.
        double pab_val = 0.0;
        for (int isgf = 0; isgf < task->nsgf_setb; isgf++) {
          const double sphib = task->sphib[isgf * task->maxcob + idx(b)];
          for (int jsgf = 0; jsgf < task->nsgf_seta; jsgf++) {
            // The block may be stored transposed; pick the matching layout.
            const int block_idx = task->block_transposed
                                      ? jsgf * task->nsgfb + isgf
                                      : isgf * task->nsgfa + jsgf;
            const double block_val =
                task->pab_block[block_idx] * task->off_diag_twice;
            const double sphia = task->sphia[jsgf * task->maxcoa + idx(a)];
            pab_val += block_val * sphia * sphib;
          }
        }
        if (IS_FUNC_AB) {
          // fast path for common case
          prepare_pab(GRID_FUNC_AB, a, b, task->zeta, task->zetb, pab_val, cab);
        } else {
          // Since prepare_pab is a register hog we use it only when needed.
          prepare_pab(params->func, a, b, task->zeta, task->zetb, pab_val, cab);
        }
      }
    }
  }
  // Barrier is outside the divergent branch so all threads reach it.
  __syncthreads(); // because of concurrent writes to cab
}
252
/*******************************************************************************
 * \brief Cuda kernel for collocating all tasks of one grid level.
 *        Pipeline: load task -> decontract block into Cab -> transform Cab
 *        into Cxyz -> collocate Cxyz onto the grid.
 * \author Ole Schuett
 ******************************************************************************/
template <bool IS_FUNC_AB>
__device__ static void collocate_kernel(const kernel_params *params) {

  // Copy task from global to shared memory and precompute some stuff.
  __shared__ smem_task task;
  load_task(params, &task);

  // Check if radius is below the resolution of the grid.
  if (2.0 * task.radius < task.dh_max) {
    return; // nothing to do
  }

  // Allot dynamic shared memory.
  extern __shared__ double shared_memory[];
  double *const smem_cab = shared_memory + params->smem_cab_offset;
  double *const smem_alpha = shared_memory + params->smem_alpha_offset;
  double *const smem_cxyz = shared_memory + params->smem_cxyz_offset;

  // Allocate Cab from global memory if it does not fit into shared memory.
  const int cab_len = task.n1 * task.n2;
  const bool cab_spills = params->smem_cab_length < cab_len;
  cab_store cab = {.data = NULL, .n1 = task.n1};
  cab.data = cab_spills ? malloc_cab(&task) : smem_cab;

  zero_cab(&cab, cab_len);
  compute_alpha(&task, smem_alpha);

  block_to_cab<IS_FUNC_AB>(params, &task, &cab);
  cab_to_cxyz(&task, smem_alpha, &cab, smem_cxyz);
  cxyz_to_grid(params, &task, smem_cxyz, params->grid);

  // Release global-memory Cab if one was allocated above.
  if (cab_spills) {
    free_cab(cab.data);
  }
}
294
/*******************************************************************************
 * \brief Specialized Cuda kernel that can only collocate GRID_FUNC_AB.
 *        The compile-time template flag lets prepare_pab be specialized for
 *        the common density case, saving registers.
 * \author Ole Schuett
 ******************************************************************************/
__global__ static void collocate_kernel_density(const kernel_params params) {
  collocate_kernel<true>(&params);
}
302
/*******************************************************************************
 * \brief Cuda kernel that can collocate any function, ie. GRID_FUNC_*.
 *        Generic variant: the function to collocate is read from params at
 *        run time instead of being fixed at compile time.
 * \author Ole Schuett
 ******************************************************************************/
__global__ static void collocate_kernel_anyfunc(const kernel_params params) {
  collocate_kernel<false>(&params);
}
310
/*******************************************************************************
 * \brief Launches the Cuda kernel that collocates all tasks of one grid level.
 * \param task_list      Device task list; tasks [first_task, last_task] are
 *                       processed, one task per thread block.
 * \param first_task     Index of first task (inclusive).
 * \param last_task      Index of last task (inclusive).
 * \param func           Function to collocate, see GRID_FUNC_*.
 * \param layout         Grid layout (cell vectors, local/global extents).
 * \param stream         Cuda stream the kernel is launched on.
 * \param pab_blocks_dev Device buffer with the density matrix blocks.
 * \param grid_dev       Device grid buffer to collocate onto.
 * \param lp_diff        Output: increase in total angular momentum caused by
 *                       func, set even when there are no tasks (for stats).
 * \author Ole Schuett
 ******************************************************************************/
void grid_gpu_collocate_one_grid_level(
    const grid_gpu_task_list *task_list, const int first_task,
    const int last_task, const enum grid_func func,
    const grid_gpu_layout *layout, const offloadStream_t stream,
    const double *pab_blocks_dev, double *grid_dev, int *lp_diff) {

  // Compute max angular momentum.
  const prepare_ldiffs ldiffs = prepare_get_ldiffs(func);
  *lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // for reporting stats
  const int la_max = task_list->lmax + ldiffs.la_max_diff;
  const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
  const int lp_max = la_max + lb_max;

  // Bug fix: guard against an empty *or inverted* task range. The old
  // `ntasks == 0` check let last_task < first_task - 1 slip through and
  // launch a kernel with a negative grid size.
  const int ntasks = last_task - first_task + 1;
  if (ntasks <= 0) {
    return; // Nothing to do and lp_diff already set.
  }

  init_constant_memory();

  // Small Cab blocks are stored in shared mem, larger ones in global memory.
  const int CAB_SMEM_LIMIT = ncoset(5) * ncoset(5); // = 56 * 56 = 3136

  // Compute required shared memory.
  const int alpha_len = 3 * (lb_max + 1) * (la_max + 1) * (lp_max + 1);
  const int cxyz_len = ncoset(lp_max);
  const int cab_len = imin(CAB_SMEM_LIMIT, ncoset(lb_max) * ncoset(la_max));
  const size_t smem_per_block =
      (alpha_len + cxyz_len + cab_len) * sizeof(double);

  // kernel parameters
  kernel_params params;
  params.smem_cab_length = cab_len;
  params.smem_cab_offset = 0;
  // Cab, alpha and cxyz are laid out back-to-back in dynamic shared memory.
  params.smem_alpha_offset = params.smem_cab_offset + cab_len;
  params.smem_cxyz_offset = params.smem_alpha_offset + alpha_len;
  params.first_task = first_task;
  params.func = func;
  params.grid = grid_dev;
  params.la_min_diff = ldiffs.la_min_diff;
  params.lb_min_diff = ldiffs.lb_min_diff;
  params.la_max_diff = ldiffs.la_max_diff;
  params.lb_max_diff = ldiffs.lb_max_diff;
  params.tasks = task_list->tasks_dev;
  params.pab_blocks = pab_blocks_dev;
  memcpy(params.dh, layout->dh, 9 * sizeof(double));
  memcpy(params.dh_inv, layout->dh_inv, 9 * sizeof(double));
  memcpy(params.npts_global, layout->npts_global, 3 * sizeof(int));
  memcpy(params.npts_local, layout->npts_local, 3 * sizeof(int));
  memcpy(params.shift_local, layout->shift_local, 3 * sizeof(int));

  // Launch !
  // One block per task; 4x4x4 threads matches the blockDim.{x,y,z} strides
  // assumed by the device-side loops.
  const int nblocks = ntasks;
  const dim3 threads_per_block(4, 4, 4);

  if (func == GRID_FUNC_AB) {
    // Fast path: compile-time specialized kernel for plain density.
    collocate_kernel_density<<<nblocks, threads_per_block, smem_per_block,
                               stream>>>(params);
  } else {
    collocate_kernel_anyfunc<<<nblocks, threads_per_block, smem_per_block,
                               stream>>>(params);
  }
  // Kernel launches do not return errors directly; check for launch failures.
  OFFLOAD_CHECK(offloadGetLastError());
}
379
380#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
381// EOF
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
Definition dbm_miniapp.c:40
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
grid_func
@ GRID_FUNC_AB
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static GRID_DEVICE void prepare_pab(const enum grid_func func, const orbital a, const orbital b, const double zeta, const double zetb, const double pab_val, cab_store *cab)
Transforms a given element of the density matrix according to func.
static prepare_ldiffs prepare_get_ldiffs(const enum grid_func func)
Returns difference in angular momentum range for given func.
static void cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid.
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
real(dp), dimension(3) a
real(dp), dimension(3) b
integer, dimension(:), allocatable, public ncoset
integer, parameter, public gaussian
__constant__ orbital coset_inv[1330]
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
Cab matrix container to be passed through get_force/virial to cab_get.
const double * data
Orbital angular momentum.
Differences in angular momentum.