(git:374b731)
Loading...
Searching...
No Matches
grid_gpu_collocate.cu
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include "../../offload/offload_runtime.h"
9#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
10
11#include <algorithm>
12#include <assert.h>
13#include <limits.h>
14#include <math.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18
19#define GRID_DO_COLLOCATE 1
20#include "../common/grid_common.h"
21#include "grid_gpu_collint.h"
22#include "grid_gpu_collocate.h"
23
24// This has to be included after grid_gpu_collint.h
25#include "../common/grid_prepare_pab.h"
26
27#if defined(_OMP_H)
28#error "OpenMP should not be used in .cu files to accommodate HIP."
29#endif
30
31/*******************************************************************************
32 * \brief Collocate a single grid point with distance d{xyz} from center.
33 * \author Ole Schuett
34 ******************************************************************************/
35__device__ static void cxyz_to_gridpoint(const double dx, const double dy,
36 const double dz, const double zetp,
37 const int lp, const cxyz_store *cxyz,
38 double *gridpoint) {
39
40 // Squared distance of point from center.
41 const double r2 = dx * dx + dy * dy + dz * dz;
42 const double gaussian = exp(-zetp * r2);
43
44 // accumulate into register
45 double gridpoint_reg = 0.0;
46
47 // Manually unrolled loops based on terms in coset_inv.
48 // For lp > 6 the register usage increases and the kernel becomes slower.
49 gridpoint_reg += cxyz[0];
50
51 if (lp >= 1) {
52 gridpoint_reg += cxyz[1] * dx;
53 gridpoint_reg += cxyz[2] * dy;
54 gridpoint_reg += cxyz[3] * dz;
55 if (lp >= 2) {
56 const double dx2 = dx * dx;
57 const double dy2 = dy * dy;
58 const double dz2 = dz * dz;
59 gridpoint_reg += cxyz[4] * dx2;
60 gridpoint_reg += cxyz[5] * dx * dy;
61 gridpoint_reg += cxyz[6] * dx * dz;
62 gridpoint_reg += cxyz[7] * dy2;
63 gridpoint_reg += cxyz[8] * dy * dz;
64 gridpoint_reg += cxyz[9] * dz2;
65 if (lp >= 3) {
66 const double dx3 = dx2 * dx;
67 const double dy3 = dy2 * dy;
68 const double dz3 = dz2 * dz;
69 gridpoint_reg += cxyz[10] * dx3;
70 gridpoint_reg += cxyz[11] * dx2 * dy;
71 gridpoint_reg += cxyz[12] * dx2 * dz;
72 gridpoint_reg += cxyz[13] * dx * dy2;
73 gridpoint_reg += cxyz[14] * dx * dy * dz;
74 gridpoint_reg += cxyz[15] * dx * dz2;
75 gridpoint_reg += cxyz[16] * dy3;
76 gridpoint_reg += cxyz[17] * dy2 * dz;
77 gridpoint_reg += cxyz[18] * dy * dz2;
78 gridpoint_reg += cxyz[19] * dz3;
79 if (lp >= 4) {
80 const double dx4 = dx3 * dx;
81 const double dy4 = dy3 * dy;
82 const double dz4 = dz3 * dz;
83 gridpoint_reg += cxyz[20] * dx4;
84 gridpoint_reg += cxyz[21] * dx3 * dy;
85 gridpoint_reg += cxyz[22] * dx3 * dz;
86 gridpoint_reg += cxyz[23] * dx2 * dy2;
87 gridpoint_reg += cxyz[24] * dx2 * dy * dz;
88 gridpoint_reg += cxyz[25] * dx2 * dz2;
89 gridpoint_reg += cxyz[26] * dx * dy3;
90 gridpoint_reg += cxyz[27] * dx * dy2 * dz;
91 gridpoint_reg += cxyz[28] * dx * dy * dz2;
92 gridpoint_reg += cxyz[29] * dx * dz3;
93 gridpoint_reg += cxyz[30] * dy4;
94 gridpoint_reg += cxyz[31] * dy3 * dz;
95 gridpoint_reg += cxyz[32] * dy2 * dz2;
96 gridpoint_reg += cxyz[33] * dy * dz3;
97 gridpoint_reg += cxyz[34] * dz4;
98 if (lp >= 5) {
99 const double dx5 = dx4 * dx;
100 const double dy5 = dy4 * dy;
101 const double dz5 = dz4 * dz;
102 gridpoint_reg += cxyz[35] * dx5;
103 gridpoint_reg += cxyz[36] * dx4 * dy;
104 gridpoint_reg += cxyz[37] * dx4 * dz;
105 gridpoint_reg += cxyz[38] * dx3 * dy2;
106 gridpoint_reg += cxyz[39] * dx3 * dy * dz;
107 gridpoint_reg += cxyz[40] * dx3 * dz2;
108 gridpoint_reg += cxyz[41] * dx2 * dy3;
109 gridpoint_reg += cxyz[42] * dx2 * dy2 * dz;
110 gridpoint_reg += cxyz[43] * dx2 * dy * dz2;
111 gridpoint_reg += cxyz[44] * dx2 * dz3;
112 gridpoint_reg += cxyz[45] * dx * dy4;
113 gridpoint_reg += cxyz[46] * dx * dy3 * dz;
114 gridpoint_reg += cxyz[47] * dx * dy2 * dz2;
115 gridpoint_reg += cxyz[48] * dx * dy * dz3;
116 gridpoint_reg += cxyz[49] * dx * dz4;
117 gridpoint_reg += cxyz[50] * dy5;
118 gridpoint_reg += cxyz[51] * dy4 * dz;
119 gridpoint_reg += cxyz[52] * dy3 * dz2;
120 gridpoint_reg += cxyz[53] * dy2 * dz3;
121 gridpoint_reg += cxyz[54] * dy * dz4;
122 gridpoint_reg += cxyz[55] * dz5;
123 if (lp >= 6) {
124 const double dx6 = dx5 * dx;
125 const double dy6 = dy5 * dy;
126 const double dz6 = dz5 * dz;
127 gridpoint_reg += cxyz[56] * dx6;
128 gridpoint_reg += cxyz[57] * dx5 * dy;
129 gridpoint_reg += cxyz[58] * dx5 * dz;
130 gridpoint_reg += cxyz[59] * dx4 * dy2;
131 gridpoint_reg += cxyz[60] * dx4 * dy * dz;
132 gridpoint_reg += cxyz[61] * dx4 * dz2;
133 gridpoint_reg += cxyz[62] * dx3 * dy3;
134 gridpoint_reg += cxyz[63] * dx3 * dy2 * dz;
135 gridpoint_reg += cxyz[64] * dx3 * dy * dz2;
136 gridpoint_reg += cxyz[65] * dx3 * dz3;
137 gridpoint_reg += cxyz[66] * dx2 * dy4;
138 gridpoint_reg += cxyz[67] * dx2 * dy3 * dz;
139 gridpoint_reg += cxyz[68] * dx2 * dy2 * dz2;
140 gridpoint_reg += cxyz[69] * dx2 * dy * dz3;
141 gridpoint_reg += cxyz[70] * dx2 * dz4;
142 gridpoint_reg += cxyz[71] * dx * dy5;
143 gridpoint_reg += cxyz[72] * dx * dy4 * dz;
144 gridpoint_reg += cxyz[73] * dx * dy3 * dz2;
145 gridpoint_reg += cxyz[74] * dx * dy2 * dz3;
146 gridpoint_reg += cxyz[75] * dx * dy * dz4;
147 gridpoint_reg += cxyz[76] * dx * dz5;
148 gridpoint_reg += cxyz[77] * dy6;
149 gridpoint_reg += cxyz[78] * dy5 * dz;
150 gridpoint_reg += cxyz[79] * dy4 * dz2;
151 gridpoint_reg += cxyz[80] * dy3 * dz3;
152 gridpoint_reg += cxyz[81] * dy2 * dz4;
153 gridpoint_reg += cxyz[82] * dy * dz5;
154 gridpoint_reg += cxyz[83] * dz6;
155 }
156 }
157 }
158 }
159 }
160 }
161
162 // Handle higher values of lp.
163 if (lp >= 7) {
164 for (int i = 84; i < ncoset(lp); i++) {
165 double val = cxyz[i];
166 const orbital a = coset_inv[i];
167 for (int j = 0; j < a.l[0]; j++) {
168 val *= dx;
169 }
170 for (int j = 0; j < a.l[1]; j++) {
171 val *= dy;
172 }
173 for (int j = 0; j < a.l[2]; j++) {
174 val *= dz;
175 }
176 gridpoint_reg += val;
177 }
178 }
179
180 atomicAddDouble(gridpoint, gridpoint_reg * gaussian);
181}
182
183/*******************************************************************************
184 * \brief Collocates coefficients C_xyz onto the grid.
185 * \author Ole Schuett
186 ******************************************************************************/
187__device__ static void cxyz_to_grid(const kernel_params *params,
188 const smem_task *task, const double *cxyz,
189 double *grid) {
190
191 if (task->use_orthorhombic_kernel) {
192 ortho_cxyz_to_grid(params, task, cxyz, grid);
193 } else {
194 general_cxyz_to_grid(params, task, cxyz, grid);
195 }
196}
197
198/*******************************************************************************
199 * \brief Decontracts the subblock, going from spherical to cartesian harmonics.
200 * \author Ole Schuett
201 ******************************************************************************/
202template <bool IS_FUNC_AB>
203__device__ static void block_to_cab(const kernel_params *params,
204 const smem_task *task, cab_store *cab) {
205
206 // The spherical index runs over angular momentum and then over contractions.
207 // The carthesian index runs over exponents and then over angular momentum.
208
209 // Decontract block, apply prepare_pab, and store in cab.
210 // This is a double matrix product. Since the pab block can be quite large the
211 // two products are fused to conserve shared memory.
212 for (int i = threadIdx.z; i < task->nsgf_setb; i += blockDim.z) {
213 for (int j = threadIdx.y; j < task->nsgf_seta; j += blockDim.y) {
214 double block_val;
215 if (task->block_transposed) {
216 block_val = task->pab_block[j * task->nsgfb + i] * task->off_diag_twice;
217 } else {
218 block_val = task->pab_block[i * task->nsgfa + j] * task->off_diag_twice;
219 }
220
221 if (IS_FUNC_AB) {
222 // fast path for common case
223 const int jco_start = ncoset(task->lb_min_basis - 1) + threadIdx.x;
224 const int jco_end = ncoset(task->lb_max_basis);
225 for (int jco = jco_start; jco < jco_end; jco += blockDim.x) {
226 const orbital b = coset_inv[jco];
227 const double sphib = task->sphib[i * task->maxcob + jco];
228 const int ico_start = ncoset(task->la_min_basis - 1);
229 const int ico_end = ncoset(task->la_max_basis);
230 for (int ico = ico_start; ico < ico_end; ico++) {
231 const orbital a = coset_inv[ico];
232 const double sphia = task->sphia[j * task->maxcoa + ico];
233 const double pab_val = block_val * sphia * sphib;
234 prepare_pab(GRID_FUNC_AB, a, b, task->zeta, task->zetb, pab_val,
235 cab);
236 }
237 }
238 } else {
239 // TODO find if we can make better bounds for ico and jco because
240 // we only need a submatrix of cab.
241 // Since prepare_pab is a register hog we use it only when really needed
242 const int jco_start = ncoset(task->lb_min_basis - 1) + threadIdx.x;
243 const int jco_end = ncoset(task->lb_max_basis);
244 for (int jco = jco_start; jco < jco_end; jco += blockDim.x) {
245 const orbital b = coset_inv[jco];
246 const int ico_start = ncoset(task->la_min_basis - 1);
247 const int ico_end = ncoset(task->la_max_basis);
248 for (int ico = ico_start; ico < ico_end; ico++) {
249 const orbital a = coset_inv[ico];
250 const double sphia = task->sphia[j * task->maxcoa + idx(a)];
251 const double sphib = task->sphib[i * task->maxcob + idx(b)];
252 const double pab_val = block_val * sphia * sphib;
253 prepare_pab(params->func, a, b, task->zeta, task->zetb, pab_val,
254 cab);
255 }
256 }
257 }
258 }
259 }
260 __syncthreads(); // because of concurrent writes to cab
261}
262
263/*******************************************************************************
264 * \brief Cuda kernel for collocating all tasks of one grid level.
265 * \author Ole Schuett
266 ******************************************************************************/
267template <bool IS_FUNC_AB>
268__device__ static void collocate_kernel(const kernel_params *params) {
269
270 // Copy task from global to shared memory and precompute some stuff.
271 __shared__ smem_task task;
272 load_task(params, &task);
273
274 // Check if radius is below the resolution of the grid.
275 if (2.0 * task.radius < task.dh_max) {
276 return; // nothing to do
277 }
278
279 // Allot dynamic shared memory.
280 extern __shared__ double shared_memory[];
281 double *smem_cab = &shared_memory[params->smem_cab_offset];
282 double *smem_alpha = &shared_memory[params->smem_alpha_offset];
283 double *smem_cxyz = &shared_memory[params->smem_cxyz_offset];
284
285 // Allocate Cab from global memory if it does not fit into shared memory.
286 cab_store cab = {.data = NULL, .n1 = task.n1};
287 if (params->smem_cab_length < task.n1 * task.n2) {
288 cab.data = malloc_cab(&task);
289 } else {
290 cab.data = smem_cab;
291 }
292
293 zero_cab(&cab, task.n1 * task.n2);
294 compute_alpha(&task, smem_alpha);
295
296 block_to_cab<IS_FUNC_AB>(params, &task, &cab);
297 cab_to_cxyz(&task, smem_alpha, &cab, smem_cxyz);
298 cxyz_to_grid(params, &task, smem_cxyz, params->grid);
299
300 if (params->smem_cab_length < task.n1 * task.n2) {
301 free_cab(cab.data);
302 }
303}
304
305/*******************************************************************************
306 * \brief Specialized Cuda kernel that can only collocate GRID_FUNC_AB.
307 * \author Ole Schuett
308 ******************************************************************************/
309__global__ static void collocate_kernel_density(const kernel_params params) {
310 collocate_kernel<true>(&params);
311}
312
313/*******************************************************************************
314 * \brief Cuda kernel that can collocate any function, ie. GRID_FUNC_*.
315 * \author Ole Schuett
316 ******************************************************************************/
317__global__ static void collocate_kernel_anyfunc(const kernel_params params) {
318 collocate_kernel<false>(&params);
319}
320
321/*******************************************************************************
322 * \brief Launches the Cuda kernel that collocates all tasks of one grid level.
323 * \author Ole Schuett
324 ******************************************************************************/
325void grid_gpu_collocate_one_grid_level(
326 const grid_gpu_task_list *task_list, const int first_task,
327 const int last_task, const enum grid_func func,
328 const grid_gpu_layout *layout, const offloadStream_t stream,
329 const double *pab_blocks_dev, double *grid_dev, int *lp_diff) {
330
331 // Compute max angular momentum.
332 const prepare_ldiffs ldiffs = prepare_get_ldiffs(func);
333 *lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // for reporting stats
334 const int la_max = task_list->lmax + ldiffs.la_max_diff;
335 const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
336 const int lp_max = la_max + lb_max;
337
338 const int ntasks = last_task - first_task + 1;
339 if (ntasks == 0) {
340 return; // Nothing to do and lp_diff already set.
341 }
342
343 init_constant_memory();
344
345 // Small Cab blocks are stored in shared mem, larger ones in global memory.
346 const int CAB_SMEM_LIMIT = ncoset(5) * ncoset(5); // = 56 * 56 = 3136
347
348 // Compute required shared memory.
349 const int alpha_len = 3 * (lb_max + 1) * (la_max + 1) * (lp_max + 1);
350 const int cxyz_len = ncoset(lp_max);
351 const int cab_len = imin(CAB_SMEM_LIMIT, ncoset(lb_max) * ncoset(la_max));
352 const size_t smem_per_block =
353 (alpha_len + cxyz_len + cab_len) * sizeof(double);
354
355 // kernel parameters
356 kernel_params params;
357 params.smem_cab_length = cab_len;
358 params.smem_cab_offset = 0;
359 params.smem_alpha_offset = params.smem_cab_offset + cab_len;
360 params.smem_cxyz_offset = params.smem_alpha_offset + alpha_len;
361 params.first_task = first_task;
362 params.func = func;
363 params.grid = grid_dev;
364 params.la_min_diff = ldiffs.la_min_diff;
365 params.lb_min_diff = ldiffs.lb_min_diff;
366 params.la_max_diff = ldiffs.la_max_diff;
367 params.lb_max_diff = ldiffs.lb_max_diff;
368 params.tasks = task_list->tasks_dev;
369 params.pab_blocks = pab_blocks_dev;
370 memcpy(params.dh, layout->dh, 9 * sizeof(double));
371 memcpy(params.dh_inv, layout->dh_inv, 9 * sizeof(double));
372 memcpy(params.npts_global, layout->npts_global, 3 * sizeof(int));
373 memcpy(params.npts_local, layout->npts_local, 3 * sizeof(int));
374 memcpy(params.shift_local, layout->shift_local, 3 * sizeof(int));
375
376 // Launch !
377 const int nblocks = ntasks;
378 const dim3 threads_per_block(4, 4, 4);
379
380 if (func == GRID_FUNC_AB) {
381 collocate_kernel_density<<<nblocks, threads_per_block, smem_per_block,
382 stream>>>(params);
383 } else {
384 collocate_kernel_anyfunc<<<nblocks, threads_per_block, smem_per_block,
385 stream>>>(params);
386 }
387 OFFLOAD_CHECK(offloadGetLastError());
388}
389
390#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
391// EOF
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
Definition dbm_miniapp.c:38
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
grid_func
@ GRID_FUNC_AB
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static GRID_DEVICE void prepare_pab(const enum grid_func func, const orbital a, const orbital b, const double zeta, const double zetb, const double pab_val, cab_store *cab)
Transforms a given element of the density matrix according to func.
static prepare_ldiffs prepare_get_ldiffs(const enum grid_func func)
Returns difference in angular momentum range for given func.
static void cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid.
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
real(dp), dimension(3) a
real(dp), dimension(3) b
integer, dimension(:), allocatable, public ncoset
integer, parameter, public gaussian
__constant__ orbital coset_inv[1330]
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
Cab matrix container to be passed through get_force/virial to cab_get.
const double * data
Orbital angular momentum.
Differences in angular momentum.