(git:34ef472)
grid_gpu_collocate.cu
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: BSD-3-Clause */
6 /*----------------------------------------------------------------------------*/
7 
8 #include "../../offload/offload_runtime.h"
9 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
10 
11 #include <algorithm>
12 #include <assert.h>
13 #include <limits.h>
14 #include <math.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 
19 #define GRID_DO_COLLOCATE 1
20 #include "../common/grid_common.h"
21 #include "grid_gpu_collint.h"
22 #include "grid_gpu_collocate.h"
23 
24 // This has to be included after grid_gpu_collint.h
25 #include "../common/grid_prepare_pab.h"
26 
27 #if defined(_OMP_H)
28 #error "OpenMP should not be used in .cu files to accommodate HIP."
29 #endif
30 
31 /*******************************************************************************
32  * \brief Collocate a single grid point with distance d{xyz} from center.
33  * \author Ole Schuett
34  ******************************************************************************/
35 __device__ static void cxyz_to_gridpoint(const double dx, const double dy,
36  const double dz, const double zetp,
37  const int lp, const cxyz_store *cxyz,
38  double *gridpoint) {
39 
40  // Squared distance of point from center.
41  const double r2 = dx * dx + dy * dy + dz * dz;
42  const double gaussian = exp(-zetp * r2);
43 
44  // accumulate into register
45  double gridpoint_reg = 0.0;
46 
47  // Manually unrolled loops based on terms in coset_inv.
48  // For lp > 6 the register usage increases and the kernel becomes slower.
49  gridpoint_reg += cxyz[0];
50 
51  if (lp >= 1) {
52  gridpoint_reg += cxyz[1] * dx;
53  gridpoint_reg += cxyz[2] * dy;
54  gridpoint_reg += cxyz[3] * dz;
55  if (lp >= 2) {
56  const double dx2 = dx * dx;
57  const double dy2 = dy * dy;
58  const double dz2 = dz * dz;
59  gridpoint_reg += cxyz[4] * dx2;
60  gridpoint_reg += cxyz[5] * dx * dy;
61  gridpoint_reg += cxyz[6] * dx * dz;
62  gridpoint_reg += cxyz[7] * dy2;
63  gridpoint_reg += cxyz[8] * dy * dz;
64  gridpoint_reg += cxyz[9] * dz2;
65  if (lp >= 3) {
66  const double dx3 = dx2 * dx;
67  const double dy3 = dy2 * dy;
68  const double dz3 = dz2 * dz;
69  gridpoint_reg += cxyz[10] * dx3;
70  gridpoint_reg += cxyz[11] * dx2 * dy;
71  gridpoint_reg += cxyz[12] * dx2 * dz;
72  gridpoint_reg += cxyz[13] * dx * dy2;
73  gridpoint_reg += cxyz[14] * dx * dy * dz;
74  gridpoint_reg += cxyz[15] * dx * dz2;
75  gridpoint_reg += cxyz[16] * dy3;
76  gridpoint_reg += cxyz[17] * dy2 * dz;
77  gridpoint_reg += cxyz[18] * dy * dz2;
78  gridpoint_reg += cxyz[19] * dz3;
79  if (lp >= 4) {
80  const double dx4 = dx3 * dx;
81  const double dy4 = dy3 * dy;
82  const double dz4 = dz3 * dz;
83  gridpoint_reg += cxyz[20] * dx4;
84  gridpoint_reg += cxyz[21] * dx3 * dy;
85  gridpoint_reg += cxyz[22] * dx3 * dz;
86  gridpoint_reg += cxyz[23] * dx2 * dy2;
87  gridpoint_reg += cxyz[24] * dx2 * dy * dz;
88  gridpoint_reg += cxyz[25] * dx2 * dz2;
89  gridpoint_reg += cxyz[26] * dx * dy3;
90  gridpoint_reg += cxyz[27] * dx * dy2 * dz;
91  gridpoint_reg += cxyz[28] * dx * dy * dz2;
92  gridpoint_reg += cxyz[29] * dx * dz3;
93  gridpoint_reg += cxyz[30] * dy4;
94  gridpoint_reg += cxyz[31] * dy3 * dz;
95  gridpoint_reg += cxyz[32] * dy2 * dz2;
96  gridpoint_reg += cxyz[33] * dy * dz3;
97  gridpoint_reg += cxyz[34] * dz4;
98  if (lp >= 5) {
99  const double dx5 = dx4 * dx;
100  const double dy5 = dy4 * dy;
101  const double dz5 = dz4 * dz;
102  gridpoint_reg += cxyz[35] * dx5;
103  gridpoint_reg += cxyz[36] * dx4 * dy;
104  gridpoint_reg += cxyz[37] * dx4 * dz;
105  gridpoint_reg += cxyz[38] * dx3 * dy2;
106  gridpoint_reg += cxyz[39] * dx3 * dy * dz;
107  gridpoint_reg += cxyz[40] * dx3 * dz2;
108  gridpoint_reg += cxyz[41] * dx2 * dy3;
109  gridpoint_reg += cxyz[42] * dx2 * dy2 * dz;
110  gridpoint_reg += cxyz[43] * dx2 * dy * dz2;
111  gridpoint_reg += cxyz[44] * dx2 * dz3;
112  gridpoint_reg += cxyz[45] * dx * dy4;
113  gridpoint_reg += cxyz[46] * dx * dy3 * dz;
114  gridpoint_reg += cxyz[47] * dx * dy2 * dz2;
115  gridpoint_reg += cxyz[48] * dx * dy * dz3;
116  gridpoint_reg += cxyz[49] * dx * dz4;
117  gridpoint_reg += cxyz[50] * dy5;
118  gridpoint_reg += cxyz[51] * dy4 * dz;
119  gridpoint_reg += cxyz[52] * dy3 * dz2;
120  gridpoint_reg += cxyz[53] * dy2 * dz3;
121  gridpoint_reg += cxyz[54] * dy * dz4;
122  gridpoint_reg += cxyz[55] * dz5;
123  if (lp >= 6) {
124  const double dx6 = dx5 * dx;
125  const double dy6 = dy5 * dy;
126  const double dz6 = dz5 * dz;
127  gridpoint_reg += cxyz[56] * dx6;
128  gridpoint_reg += cxyz[57] * dx5 * dy;
129  gridpoint_reg += cxyz[58] * dx5 * dz;
130  gridpoint_reg += cxyz[59] * dx4 * dy2;
131  gridpoint_reg += cxyz[60] * dx4 * dy * dz;
132  gridpoint_reg += cxyz[61] * dx4 * dz2;
133  gridpoint_reg += cxyz[62] * dx3 * dy3;
134  gridpoint_reg += cxyz[63] * dx3 * dy2 * dz;
135  gridpoint_reg += cxyz[64] * dx3 * dy * dz2;
136  gridpoint_reg += cxyz[65] * dx3 * dz3;
137  gridpoint_reg += cxyz[66] * dx2 * dy4;
138  gridpoint_reg += cxyz[67] * dx2 * dy3 * dz;
139  gridpoint_reg += cxyz[68] * dx2 * dy2 * dz2;
140  gridpoint_reg += cxyz[69] * dx2 * dy * dz3;
141  gridpoint_reg += cxyz[70] * dx2 * dz4;
142  gridpoint_reg += cxyz[71] * dx * dy5;
143  gridpoint_reg += cxyz[72] * dx * dy4 * dz;
144  gridpoint_reg += cxyz[73] * dx * dy3 * dz2;
145  gridpoint_reg += cxyz[74] * dx * dy2 * dz3;
146  gridpoint_reg += cxyz[75] * dx * dy * dz4;
147  gridpoint_reg += cxyz[76] * dx * dz5;
148  gridpoint_reg += cxyz[77] * dy6;
149  gridpoint_reg += cxyz[78] * dy5 * dz;
150  gridpoint_reg += cxyz[79] * dy4 * dz2;
151  gridpoint_reg += cxyz[80] * dy3 * dz3;
152  gridpoint_reg += cxyz[81] * dy2 * dz4;
153  gridpoint_reg += cxyz[82] * dy * dz5;
154  gridpoint_reg += cxyz[83] * dz6;
155  }
156  }
157  }
158  }
159  }
160  }
161 
162  // Handle higher values of lp.
163  if (lp >= 7) {
164  for (int i = 84; i < ncoset(lp); i++) {
165  double val = cxyz[i];
166  const orbital a = coset_inv[i];
167  for (int j = 0; j < a.l[0]; j++) {
168  val *= dx;
169  }
170  for (int j = 0; j < a.l[1]; j++) {
171  val *= dy;
172  }
173  for (int j = 0; j < a.l[2]; j++) {
174  val *= dz;
175  }
176  gridpoint_reg += val;
177  }
178  }
179 
180  atomicAddDouble(gridpoint, gridpoint_reg * gaussian);
181 }
182 
183 /*******************************************************************************
184  * \brief Collocates coefficients C_xyz onto the grid.
185  * \author Ole Schuett
186  ******************************************************************************/
187 __device__ static void cxyz_to_grid(const kernel_params *params,
188  const smem_task *task, const double *cxyz,
189  double *grid) {
190 
191  if (task->use_orthorhombic_kernel) {
192  ortho_cxyz_to_grid(params, task, cxyz, grid);
193  } else {
194  general_cxyz_to_grid(params, task, cxyz, grid);
195  }
196 }
197 
198 /*******************************************************************************
199  * \brief Decontracts the subblock, going from spherical to cartesian harmonics.
200  * \author Ole Schuett
201  ******************************************************************************/
202 template <bool IS_FUNC_AB>
203 __device__ static void block_to_cab(const kernel_params *params,
204  const smem_task *task, cab_store *cab) {
205 
206  // The spherical index runs over angular momentum and then over contractions.
207  // The carthesian index runs over exponents and then over angular momentum.
208 
209  // Decontract block, apply prepare_pab, and store in cab.
210  // This is a double matrix product. Since the pab block can be quite large the
211  // two products are fused to conserve shared memory.
212  for (int i = threadIdx.z; i < task->nsgf_setb; i += blockDim.z) {
213  for (int j = threadIdx.y; j < task->nsgf_seta; j += blockDim.y) {
214  double block_val;
215  if (task->block_transposed) {
216  block_val = task->pab_block[j * task->nsgfb + i] * task->off_diag_twice;
217  } else {
218  block_val = task->pab_block[i * task->nsgfa + j] * task->off_diag_twice;
219  }
220 
221  if (IS_FUNC_AB) {
222  // fast path for common case
223  const int jco_start = ncoset(task->lb_min_basis - 1) + threadIdx.x;
224  const int jco_end = ncoset(task->lb_max_basis);
225  for (int jco = jco_start; jco < jco_end; jco += blockDim.x) {
226  const orbital b = coset_inv[jco];
227  const double sphib = task->sphib[i * task->maxcob + jco];
228  const int ico_start = ncoset(task->la_min_basis - 1);
229  const int ico_end = ncoset(task->la_max_basis);
230  for (int ico = ico_start; ico < ico_end; ico++) {
231  const orbital a = coset_inv[ico];
232  const double sphia = task->sphia[j * task->maxcoa + ico];
233  const double pab_val = block_val * sphia * sphib;
234  prepare_pab(GRID_FUNC_AB, a, b, task->zeta, task->zetb, pab_val,
235  cab);
236  }
237  }
238  } else {
239  // TODO find if we can make better bounds for ico and jco because
240  // we only need a submatrix of cab.
241  // Since prepare_pab is a register hog we use it only when really needed
242  const int jco_start = ncoset(task->lb_min_basis - 1) + threadIdx.x;
243  const int jco_end = ncoset(task->lb_max_basis);
244  for (int jco = jco_start; jco < jco_end; jco += blockDim.x) {
245  const orbital b = coset_inv[jco];
246  const int ico_start = ncoset(task->la_min_basis - 1);
247  const int ico_end = ncoset(task->la_max_basis);
248  for (int ico = ico_start; ico < ico_end; ico++) {
249  const orbital a = coset_inv[ico];
250  const double sphia = task->sphia[j * task->maxcoa + idx(a)];
251  const double sphib = task->sphib[i * task->maxcob + idx(b)];
252  const double pab_val = block_val * sphia * sphib;
253  prepare_pab(params->func, a, b, task->zeta, task->zetb, pab_val,
254  cab);
255  }
256  }
257  }
258  }
259  }
260  __syncthreads(); // because of concurrent writes to cab
261 }
262 
263 /*******************************************************************************
264  * \brief Cuda kernel for collocating all tasks of one grid level.
265  * \author Ole Schuett
266  ******************************************************************************/
267 template <bool IS_FUNC_AB>
268 __device__ static void collocate_kernel(const kernel_params *params) {
269 
270  // Copy task from global to shared memory and precompute some stuff.
271  __shared__ smem_task task;
272  load_task(params, &task);
273 
274  // Check if radius is below the resolution of the grid.
275  if (2.0 * task.radius < task.dh_max) {
276  return; // nothing to do
277  }
278 
279  // Allot dynamic shared memory.
280  extern __shared__ double shared_memory[];
281  double *smem_cab = &shared_memory[params->smem_cab_offset];
282  double *smem_alpha = &shared_memory[params->smem_alpha_offset];
283  double *smem_cxyz = &shared_memory[params->smem_cxyz_offset];
284 
285  // Allocate Cab from global memory if it does not fit into shared memory.
286  cab_store cab = {.data = NULL, .n1 = task.n1};
287  if (params->smem_cab_length < task.n1 * task.n2) {
288  cab.data = malloc_cab(&task);
289  } else {
290  cab.data = smem_cab;
291  }
292 
293  zero_cab(&cab, task.n1 * task.n2);
294  compute_alpha(&task, smem_alpha);
295 
296  block_to_cab<IS_FUNC_AB>(params, &task, &cab);
297  cab_to_cxyz(&task, smem_alpha, &cab, smem_cxyz);
298  cxyz_to_grid(params, &task, smem_cxyz, params->grid);
299 
300  if (params->smem_cab_length < task.n1 * task.n2) {
301  free_cab(cab.data);
302  }
303 }
304 
305 /*******************************************************************************
306  * \brief Specialized Cuda kernel that can only collocate GRID_FUNC_AB.
307  * \author Ole Schuett
308  ******************************************************************************/
309 __global__ static void collocate_kernel_density(const kernel_params params) {
310  collocate_kernel<true>(&params);
311 }
312 
313 /*******************************************************************************
314  * \brief Cuda kernel that can collocate any function, ie. GRID_FUNC_*.
315  * \author Ole Schuett
316  ******************************************************************************/
317 __global__ static void collocate_kernel_anyfunc(const kernel_params params) {
318  collocate_kernel<false>(&params);
319 }
320 
321 /*******************************************************************************
322  * \brief Launches the Cuda kernel that collocates all tasks of one grid level.
323  * \author Ole Schuett
324  ******************************************************************************/
325 void grid_gpu_collocate_one_grid_level(
326  const grid_gpu_task_list *task_list, const int first_task,
327  const int last_task, const enum grid_func func,
328  const grid_gpu_layout *layout, const offloadStream_t stream,
329  const double *pab_blocks_dev, double *grid_dev, int *lp_diff) {
330 
331  // Compute max angular momentum.
332  const prepare_ldiffs ldiffs = prepare_get_ldiffs(func);
333  *lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // for reporting stats
334  const int la_max = task_list->lmax + ldiffs.la_max_diff;
335  const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
336  const int lp_max = la_max + lb_max;
337 
338  const int ntasks = last_task - first_task + 1;
339  if (ntasks == 0) {
340  return; // Nothing to do and lp_diff already set.
341  }
342 
344 
345  // Small Cab blocks are stored in shared mem, larger ones in global memory.
346  const int CAB_SMEM_LIMIT = ncoset(5) * ncoset(5); // = 56 * 56 = 3136
347 
348  // Compute required shared memory.
349  const int alpha_len = 3 * (lb_max + 1) * (la_max + 1) * (lp_max + 1);
350  const int cxyz_len = ncoset(lp_max);
351  const int cab_len = imin(CAB_SMEM_LIMIT, ncoset(lb_max) * ncoset(la_max));
352  const size_t smem_per_block =
353  (alpha_len + cxyz_len + cab_len) * sizeof(double);
354 
355  // kernel parameters
356  kernel_params params;
357  params.smem_cab_length = cab_len;
358  params.smem_cab_offset = 0;
359  params.smem_alpha_offset = params.smem_cab_offset + cab_len;
360  params.smem_cxyz_offset = params.smem_alpha_offset + alpha_len;
361  params.first_task = first_task;
362  params.func = func;
363  params.grid = grid_dev;
364  params.la_min_diff = ldiffs.la_min_diff;
365  params.lb_min_diff = ldiffs.lb_min_diff;
366  params.la_max_diff = ldiffs.la_max_diff;
367  params.lb_max_diff = ldiffs.lb_max_diff;
368  params.tasks = task_list->tasks_dev;
369  params.pab_blocks = pab_blocks_dev;
370  memcpy(params.dh, layout->dh, 9 * sizeof(double));
371  memcpy(params.dh_inv, layout->dh_inv, 9 * sizeof(double));
372  memcpy(params.npts_global, layout->npts_global, 3 * sizeof(int));
373  memcpy(params.npts_local, layout->npts_local, 3 * sizeof(int));
374  memcpy(params.shift_local, layout->shift_local, 3 * sizeof(int));
375 
376  // Launch !
377  const int nblocks = ntasks;
378  const dim3 threads_per_block(4, 4, 4);
379 
380  if (func == GRID_FUNC_AB) {
381  collocate_kernel_density<<<nblocks, threads_per_block, smem_per_block,
382  stream>>>(params);
383  } else {
384  collocate_kernel_anyfunc<<<nblocks, threads_per_block, smem_per_block,
385  stream>>>(params);
386  }
387  OFFLOAD_CHECK(offloadGetLastError());
388 }
389 
390 #endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
391 // EOF
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
Definition: dbm_miniapp.c:38
static GRID_HOST_DEVICE int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
Definition: grid_common.h:73
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
Definition: grid_common.h:153
grid_func
@ GRID_FUNC_AB
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static GRID_DEVICE void prepare_pab(const enum grid_func func, const orbital a, const orbital b, const double zeta, const double zetb, const double pab_val, cab_store *cab)
Transforms a given element of the density matrix according to func.
static prepare_ldiffs prepare_get_ldiffs(const enum grid_func func)
Returns difference in angular momentum range for given func.
static void cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid.
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
real(dp), dimension(3) a
Definition: ai_eri_debug.F:31
real(dp), dimension(3) b
Definition: ai_eri_debug.F:31
integer, parameter, public gaussian
static void init_constant_memory()
Initializes the device's constant memory.
__constant__ orbital coset_inv[1330]
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
Cab matrix container to be passed through get_force/virial to cab_get.
const double * data
Orbital angular momentum.
Definition: grid_common.h:125
Differences in angular momentum.