grid_gpu_collint.h
/*----------------------------------------------------------------------------*/
/*  CP2K: A general program to perform molecular dynamics simulations        */
/*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
/*                                                                            */
/*  SPDX-License-Identifier: BSD-3-Clause                                     */
/*----------------------------------------------------------------------------*/

#include "../../offload/offload_runtime.h"

#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)

#include <algorithm>
#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "../common/grid_basis_set.h"
#include "../common/grid_common.h"
#include "grid_gpu_task_list.h"

#if (GRID_DO_COLLOCATE)
#define GRID_CONST_WHEN_COLLOCATE const
#define GRID_CONST_WHEN_INTEGRATE
#else
#define GRID_CONST_WHEN_COLLOCATE
#define GRID_CONST_WHEN_INTEGRATE const
#endif

/*******************************************************************************
 * \brief Forward declarations for inner-most loop bodies.
 *        Implementations are in grid_gpu_collocate.cu / grid_gpu_integrate.cu.
 * \author Ole Schuett
 ******************************************************************************/
#if (GRID_DO_COLLOCATE)
// collocate
typedef double cxyz_store;

__device__ static void cxyz_to_gridpoint(const double dx, const double dy,
                                         const double dz, const double zetp,
                                         const int lp, const cxyz_store *cxyz,
                                         double *gridpoint);
#else
// integrate
typedef struct {
  double *regs;
  int offset;
} cxyz_store;

__device__ static void gridpoint_to_cxyz(const double dx, const double dy,
                                         const double dz, const double zetp,
                                         const int lp, const double *gridpoint,
                                         cxyz_store *store);
#endif

/*******************************************************************************
 * \brief Atomic add for doubles that also works prior to compute capability 6.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void atomicAddDouble(double *address, double val) {
  if (val == 0.0)
    return;

#if __CUDA_ARCH__ >= 600
  atomicAdd(address, val); // part of cuda library
#else
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));

    // Uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);

#endif
}
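
// Usage sketch (hypothetical call site): many threads of a block may
// accumulate into the same grid point concurrently, e.g.
//   atomicAddDouble(&grid[grid_index], value);
// On compute capability 6.0+ this maps to the native double-precision
// atomicAdd; older devices emulate it by reinterpreting the 64-bit pattern
// for atomicCAS, and the integer compare also terminates the loop correctly
// when the stored value is NaN.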

/*******************************************************************************
 * \brief Cab matrix container to be passed through prepare_pab to cab_add.
 * \author Ole Schuett
 ******************************************************************************/
typedef struct {
  double *data;
  const int n1;
} cab_store;

/*******************************************************************************
 * \brief Returns matrix element cab[idx(b)][idx(a)].
 * \author Ole Schuett
 ******************************************************************************/
__device__ static inline double cab_get(const cab_store *cab, const orbital a,
                                        const orbital b) {
  const int i = idx(b) * cab->n1 + idx(a);
  return cab->data[i];
}

/*******************************************************************************
 * \brief Adds given value to matrix element cab[idx(b)][idx(a)].
 * \author Ole Schuett
 ******************************************************************************/
__device__ static inline void cab_add(cab_store *cab, const orbital a,
                                      const orbital b, const double value) {
  const int i = idx(b) * cab->n1 + idx(a);
  atomicAddDouble(&cab->data[i], value);
}
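
// Worked layout example (hypothetical sizes): the matrix is stored row-major
// with leading dimension n1 = ncoset(la_max). For la_max = lb_max = 1 one has
// n1 = 4, and the element for a = (1,0,0), b = (0,1,0) lives at
//   data[idx(b) * n1 + idx(a)] = data[2 * 4 + 1] = data[9].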

/*******************************************************************************
 * \brief Parameters of the collocate kernel.
 * \author Ole Schuett
 ******************************************************************************/
typedef struct {
  int smem_cab_length;
  int smem_cab_offset;
  int smem_alpha_offset;
  int smem_cxyz_offset;
  int first_task;
  int npts_global[3];
  int npts_local[3];
  int shift_local[3];
  double dh[3][3];
  double dh_inv[3][3];
  const grid_gpu_task *tasks;
  const double *pab_blocks;
  GRID_CONST_WHEN_INTEGRATE double *grid;
  int la_min_diff;
  int lb_min_diff;
  int la_max_diff;
  int lb_max_diff;

#if (GRID_DO_COLLOCATE)
  // collocate
  enum grid_func func;
#else
  // integrate
  double *hab_blocks;
  double *forces;
  double *virial;
#endif
} kernel_params;

/*******************************************************************************
 * \brief Shared memory representation of a task.
 * \author Ole Schuett
 ******************************************************************************/
typedef struct smem_task_struct : grid_gpu_task_struct {
  // angular momentum range of actual collocate / integrate operation
  int la_max;
  int lb_max;
  int la_min;
  int lb_min;
  int lp;

  // size of the cab matrix
  int n1;
  int n2;

  const double *pab_block;

#if (!GRID_DO_COLLOCATE)
  // integrate
  double *hab_block;
  double *forces_a;
  double *forces_b;
#endif
} smem_task;

/*******************************************************************************
 * \brief Tabulated functions kept in constant memory to reduce register count.
 * \author Ole Schuett
 ******************************************************************************/
__constant__ orbital coset_inv[1330];
__constant__ double binomial_coef[19][19];

/*******************************************************************************
 * \brief Initializes the device's constant memory.
 * \author Ole Schuett
 ******************************************************************************/
static void init_constant_memory() {
  static bool initialized = false;
  if (initialized) {
    return; // constant memory has to be initialized only once
  }

  // Inverse coset mapping
  orbital coset_inv_host[1330];
  for (int lx = 0; lx <= 18; lx++) {
    for (int ly = 0; ly <= 18 - lx; ly++) {
      for (int lz = 0; lz <= 18 - lx - ly; lz++) {
        const int i = coset(lx, ly, lz);
        coset_inv_host[i] = {{lx, ly, lz}};
      }
    }
  }
  offloadMemcpyToSymbol(coset_inv, &coset_inv_host, sizeof(coset_inv_host));

  // Binomial coefficient
  double binomial_coef_host[19][19];
  memset(binomial_coef_host, 0, sizeof(binomial_coef_host));
  for (int n = 0; n <= 18; n++) {
    for (int k = 0; k <= n; k++) {
      binomial_coef_host[n][k] = fac(n) / fac(k) / fac(n - k);
    }
  }
  offloadMemcpyToSymbol(binomial_coef, &binomial_coef_host,
                        sizeof(binomial_coef_host));

  initialized = true;
}
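
// Sanity checks for the tables above (worked examples): the array size 1330
// equals ncoset(18) = (19 * 20 * 21) / 6, the number of Cartesian orbitals up
// to l = 18, and binomial_coef_host[4][2] = 4! / (2! * 2!) = 6, while entries
// with k > n stay zero thanks to the memset.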

/*******************************************************************************
 * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void
ortho_cxyz_to_grid(const kernel_params *params, const smem_task *task,
                   GRID_CONST_WHEN_COLLOCATE cxyz_store *cxyz,
                   GRID_CONST_WHEN_INTEGRATE double *grid) {

  // The cube contains an even number of grid points in each direction and
  // collocation is always performed on a pair of two opposing grid points.
  // Hence, the points with index 0 and 1 are both assigned distance zero via
  // the formula distance=(2*index-1)/2.
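  // Worked example (integer division): index 0 -> (2*0-1)/2 = 0 and
  // index 1 -> (2*1-1)/2 = 0, so the innermost pair shares distance zero,
  // while index 2 -> 1 and index -1 -> -1 mirror each other.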
  const int kmin = ceil(-1e-8 - task->disr_radius * params->dh_inv[2][2]);
  for (int k = threadIdx.z + kmin; k <= 1 - kmin; k += blockDim.z) {
    const int ka = task->cube_center_shifted[2] + k;
    const int kg =
        modulo(ka, params->npts_global[2]); // target location on the grid
    const int kd = (2 * k - 1) / 2; // distance from center in grid points
    const double kr = kd * params->dh[2][2]; // distance from center in a.u.
    const double kremain = task->disr_radius * task->disr_radius - kr * kr;
    const int jmin =
        ceil(-1e-8 - sqrt(fmax(0.0, kremain)) * params->dh_inv[1][1]);
    for (int j = threadIdx.y + jmin; j <= 1 - jmin; j += blockDim.y) {
      const int ja = task->cube_center_shifted[1] + j;
      const int jg =
          modulo(ja, params->npts_global[1]); // target location on the grid
      const int jd = (2 * j - 1) / 2; // distance from center in grid points
      const double jr = jd * params->dh[1][1]; // distance from center in a.u.
      const double jremain = kremain - jr * jr;
      const int imin =
          ceil(-1e-8 - sqrt(fmax(0.0, jremain)) * params->dh_inv[0][0]);

      for (int i = threadIdx.x + imin; i <= 1 - imin; i += blockDim.x) {
        const int ia = task->cube_center_shifted[0] + i;
        const int ig =
            modulo(ia, params->npts_global[0]); // target location on the grid

        // The distances above (kd, jd) do not take roffset into account,
        // i.e. they always snap to the next grid point. This allowed the
        // legacy implementation to cache the loop bounds.
        // For the calculation of the grid value we'll use the true distance.
        const double dx = i * params->dh[0][0] + task->cube_offset[0];
        const double dy = j * params->dh[1][1] + task->cube_offset[1];
        const double dz = k * params->dh[2][2] + task->cube_offset[2];

        const int grid_index =
            kg * params->npts_local[1] * params->npts_local[0] +
            jg * params->npts_local[0] + ig;

#if (GRID_DO_COLLOCATE)
        // collocate
        cxyz_to_gridpoint(dx, dy, dz, task->zetp, task->lp, cxyz,
                          &grid[grid_index]);
#else
        // integrate
        gridpoint_to_cxyz(dx, dy, dz, task->zetp, task->lp, &grid[grid_index],
                          cxyz);
#endif
      }
    }
  }

  __syncthreads(); // because of concurrent writes to grid / cxyz
}

/*******************************************************************************
 * \brief Collocates coefficients C_xyz onto the grid for general case.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void
general_cxyz_to_grid(const kernel_params *params, const smem_task *task,
                     GRID_CONST_WHEN_COLLOCATE cxyz_store *cxyz,
                     GRID_CONST_WHEN_INTEGRATE double *grid) {

  // Go over the grid
  const int k_start = threadIdx.z + task->index_min[2];
  for (int k = k_start; k <= task->index_max[2]; k += blockDim.z) {
    const int kg = modulo(k - params->shift_local[2], params->npts_global[2]);
    if (kg < task->bounds_k[0] || task->bounds_k[1] < kg) {
      continue;
    }
    const int j_start = threadIdx.y + task->index_min[1];
    for (int j = j_start; j <= task->index_max[1]; j += blockDim.y) {
      const int jg = modulo(j - params->shift_local[1], params->npts_global[1]);
      if (jg < task->bounds_j[0] || task->bounds_j[1] < jg) {
        continue;
      }

      const int i_start = threadIdx.x + task->index_min[0];
      for (int i = i_start; i <= task->index_max[0]; i += blockDim.x) {
        const int ig =
            modulo(i - params->shift_local[0], params->npts_global[0]);
        if (ig < task->bounds_i[0] || task->bounds_i[1] < ig) {
          continue;
        }
        // Compute distance of current grid point from center of Gaussian.
        const double di = i - task->gp[0];
        double dx = di * params->dh[0][0];
        double dy = di * params->dh[0][1];
        double dz = di * params->dh[0][2];
        const double dj = j - task->gp[1];
        dx += dj * params->dh[1][0];
        dy += dj * params->dh[1][1];
        dz += dj * params->dh[1][2];
        const double dk = k - task->gp[2];
        dx += dk * params->dh[2][0];
        dy += dk * params->dh[2][1];
        dz += dk * params->dh[2][2];
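        // The three sums above amount to (dx, dy, dz) = (di, dj, dk) * dh,
        // i.e. the index offset from the Gaussian's center gp mapped to
        // Cartesian coordinates by the cell vectors stored in the rows of dh.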

        // Cycle if the point is not within the radius.
        const double r2 = dx * dx + dy * dy + dz * dz;
        if (r2 >= task->radius2) {
          continue;
        }

        const int stride = params->npts_local[1] * params->npts_local[0];
        const int grid_index = kg * stride + jg * params->npts_local[0] + ig;

#if (GRID_DO_COLLOCATE)
        // collocate
        cxyz_to_gridpoint(dx, dy, dz, task->zetp, task->lp, cxyz,
                          &grid[grid_index]);
#else
        // integrate
        gridpoint_to_cxyz(dx, dy, dz, task->zetp, task->lp, &grid[grid_index],
                          cxyz);
#endif
      }
    }
  }

  __syncthreads(); // because of concurrent writes to grid / cxyz
}

/*******************************************************************************
 * \brief Computes the polynomial expansion coefficients:
 *        (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void compute_alpha(const smem_task *task, double *alpha) {
  // strides for accessing alpha
  const int s3 = (task->lp + 1);
  const int s2 = (task->la_max + 1) * s3;
  const int s1 = (task->lb_max + 1) * s2;

  for (int idir = threadIdx.z; idir < 3; idir += blockDim.z) {
    const double drpa = task->rp[idir] - task->ra[idir];
    const double drpb = task->rp[idir] - task->rb[idir];
    for (int la = threadIdx.y; la <= task->la_max; la += blockDim.y) {
      for (int lb = threadIdx.x; lb <= task->lb_max; lb += blockDim.x) {
        for (int i = 0; i <= task->lp; i++) {
          const int base = idir * s1 + lb * s2 + la * s3;
          alpha[base + i] = 0.0;
        }
        double a = 1.0;
        for (int k = 0; k <= la; k++) {
          double b = 1.0;
          for (int l = 0; l <= lb; l++) {
            const int base = idir * s1 + lb * s2 + la * s3;
            alpha[base + la - l + lb - k] +=
                a * b * binomial_coef[la][k] * binomial_coef[lb][l];
            b *= drpb;
          }
          a *= drpa;
        }
      }
    }
  }
  __syncthreads(); // because of concurrent writes to alpha
}
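
// Worked example for the expansion above (la = lb = 1): with drpa = p - a and
// drpb = p - b,
//   (x-a)(x-b) = ((x-p) + drpa) * ((x-p) + drpb)
//              = (x-p)^2 + (drpa + drpb)*(x-p) + drpa*drpb,
// so alpha holds the coefficients 1, drpa + drpb and drpa*drpb for
// ls = 2, 1, 0 in that direction.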

/*******************************************************************************
 * \brief Transforms coefficients C_ab into C_xyz.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void cab_to_cxyz(const smem_task *task, const double *alpha,
                                   GRID_CONST_WHEN_COLLOCATE cab_store *cab,
                                   GRID_CONST_WHEN_INTEGRATE double *cxyz) {

  // *** initialise the coefficient matrix, we transform the sum
  //
  // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
  //     (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-b_y)**lyb
  //     (z-b_z)**lzb
  //
  // into
  //
  // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
  //
  // where p is the center of the product Gaussian, and lp = la_max + lb_max
  // (current implementation is l**7)

  // strides for accessing alpha
  const int s3 = (task->lp + 1);
  const int s2 = (task->la_max + 1) * s3;
  const int s1 = (task->lb_max + 1) * s2;

  // TODO: Maybe we can transpose alpha to index it directly with ico and jco.

#if (GRID_DO_COLLOCATE)
  // collocate
  for (int lzp = threadIdx.z; lzp <= task->lp; lzp += blockDim.z) {
    for (int lyp = threadIdx.y; lyp <= task->lp - lzp; lyp += blockDim.y) {
      for (int lxp = threadIdx.x; lxp <= task->lp - lzp - lyp;
           lxp += blockDim.x) {
        double reg = 0.0; // accumulate into a register
        const int jco_start = ncoset(task->lb_min - 1);
        const int jco_end = ncoset(task->lb_max);
        for (int jco = jco_start; jco < jco_end; jco++) {
          const orbital b = coset_inv[jco];
          const int ico_start = ncoset(task->la_min - 1);
          const int ico_end = ncoset(task->la_max);
          for (int ico = ico_start; ico < ico_end; ico++) {
            const orbital a = coset_inv[ico];
#else
  // integrate
  if (threadIdx.z == 0) { // TODO: How bad is this?
    const int jco_start = ncoset(task->lb_min - 1) + threadIdx.y;
    const int jco_end = ncoset(task->lb_max);
    for (int jco = jco_start; jco < jco_end; jco += blockDim.y) {
      const orbital b = coset_inv[jco];
      const int ico_start = ncoset(task->la_min - 1) + threadIdx.x;
      const int ico_end = ncoset(task->la_max);
      for (int ico = ico_start; ico < ico_end; ico += blockDim.x) {
        const orbital a = coset_inv[ico];
        double reg = 0.0; // accumulate into a register
        for (int lzp = 0; lzp <= task->lp; lzp++) {
          for (int lyp = 0; lyp <= task->lp - lzp; lyp++) {
            for (int lxp = 0; lxp <= task->lp - lzp - lyp; lxp++) {
#endif

              const double p = task->prefactor *
                               alpha[0 * s1 + b.l[0] * s2 + a.l[0] * s3 + lxp] *
                               alpha[1 * s1 + b.l[1] * s2 + a.l[1] * s3 + lyp] *
                               alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + lzp];

#if (GRID_DO_COLLOCATE)
              reg += p * cab_get(cab, a, b); // collocate
#else
              reg += p * cxyz[coset(lxp, lyp, lzp)]; // integrate
#endif
            }
          }

#if (GRID_DO_COLLOCATE)
        // collocate
        cxyz[coset(lxp, lyp, lzp)] = reg; // overwrite - no zeroing needed.
      }
#else
          // integrate
        }
        cab_add(cab, a, b, reg); // partial loop coverage -> zero it
      }
#endif
    }
  }
  __syncthreads(); // because of concurrent writes to cxyz / cab
}
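
// Degenerate worked example (la_max = lb_max = 0): lp = 0, each alpha factor
// is 1.0, and the transform collapses to cxyz[0] = prefactor * cab[0] when
// collocating, or cab[0] += prefactor * cxyz[0] when integrating, i.e. a pure
// rescaling by the Gaussian-product prefactor.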

/*******************************************************************************
 * \brief Allocates Cab block from global memory.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static double *malloc_cab(const smem_task *task) {
  __shared__ double *gmem_cab;
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    gmem_cab = (double *)malloc(task->n1 * task->n2 * sizeof(double));
    assert(gmem_cab != NULL &&
           "MallocHeapSize too low, please increase it in grid_library_init()");
  }
  __syncthreads(); // wait for write to shared gmem_cab variable
  return gmem_cab;
}

/*******************************************************************************
 * \brief Frees Cab block. Only needed if it was allocated from global memory.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void free_cab(double *gmem_cab) {
  __syncthreads(); // Ensure that all threads have finished using cab.
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    free(gmem_cab);
  }
}

/*******************************************************************************
 * \brief Initializes the cab matrix with zeros.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void zero_cab(cab_store *cab, const int cab_len) {
  if (threadIdx.z == 0 && threadIdx.y == 0) {
    for (int i = threadIdx.x; i < cab_len; i += blockDim.x) {
      cab->data[i] = 0.0;
    }
  }
  __syncthreads(); // because of concurrent writes to cab
}
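
// Hedged usage sketch (hypothetical kernel body, names assumed): when shared
// memory cannot hold the cab matrix, a block can fall back to the device heap:
//   double *gmem_cab = malloc_cab(task); // one allocation per block
//   cab_store cab = {gmem_cab, task->n1};
//   zero_cab(&cab, task->n1 * task->n2);
//   /* ... collocate / integrate via cab_get / cab_add ... */
//   free_cab(gmem_cab);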

/*******************************************************************************
 * \brief Copies a task from global to shared memory and does precomputations.
 * \author Ole Schuett
 ******************************************************************************/
__device__ static void load_task(const kernel_params *params, smem_task *task) {

  // Parallel copy of base task from global to shared memory.
  if (threadIdx.y == 0 && threadIdx.z == 0) {
    const int itask = params->first_task + blockIdx.x;
    const grid_gpu_task *src_task = &params->tasks[itask];
    grid_gpu_task *dest_task = task; // Upcast to base struct.
    for (int i = threadIdx.x; i < (int)sizeof(grid_gpu_task);
         i += blockDim.x) {
      ((char *)dest_task)[i] = ((const char *)src_task)[i];
    }
  }
  __syncthreads(); // because of concurrent writes to task

  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // angular momentum range for the actual collocate / integrate operation.
    task->la_max = task->la_max_basis + params->la_max_diff;
    task->lb_max = task->lb_max_basis + params->lb_max_diff;
    task->la_min = imax(task->la_min_basis + params->la_min_diff, 0);
    task->lb_min = imax(task->lb_min_basis + params->lb_min_diff, 0);
    task->lp = task->la_max + task->lb_max;

    // size of the cab matrix
    task->n1 = ncoset(task->la_max);
    task->n2 = ncoset(task->lb_max);

    task->pab_block = &params->pab_blocks[task->ab_block_offset];

    // integrate
#if (!GRID_DO_COLLOCATE)
    task->hab_block = &params->hab_blocks[task->ab_block_offset];
    if (params->forces != NULL) {
      task->forces_a = &params->forces[3 * task->iatom];
      task->forces_b = &params->forces[3 * task->jatom];
    }
#endif
  }
  __syncthreads(); // because of concurrent writes to task
}
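
// Worked sizing example (hypothetical task, zero l-diffs): for la_max = 2 and
// lb_max = 1 this yields lp = 3, n1 = ncoset(2) = 10 and n2 = ncoset(1) = 4,
// i.e. a 4 x 10 row-major cab matrix indexed as cab[idx(b)][idx(a)].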

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
// EOF