(git:374b731)
Loading...
Searching...
No Matches
grid_gpu_collint.h
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include "../../offload/offload_runtime.h"
9
10#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
11
12#include <algorithm>
13#include <assert.h>
14#include <limits.h>
15#include <math.h>
16#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
19
20#include "../common/grid_basis_set.h"
21#include "../common/grid_common.h"
22#include "grid_gpu_task_list.h"
23
24#if (GRID_DO_COLLOCATE)
25#define GRID_CONST_WHEN_COLLOCATE const
26#define GRID_CONST_WHEN_INTEGRATE
27#else
28#define GRID_CONST_WHEN_COLLOCATE
29#define GRID_CONST_WHEN_INTEGRATE const
30#endif
31
32/*******************************************************************************
33 * \brief Forward declarations for inner-most loop bodies.
34 * Implementations are in grid_gpu_collocate.cu / grid_gpu_integrate.cu.
35 * \author Ole Schuett
36 ******************************************************************************/
37#if (GRID_DO_COLLOCATE)
38// collocate
39typedef double cxyz_store;
40
41__device__ static void cxyz_to_gridpoint(const double dx, const double dy,
42 const double dz, const double zetp,
43 const int lp, const cxyz_store *cxyz,
44 double *gridpoint);
45#else
46// integrate
47typedef struct {
48 double *regs;
49 int offset;
50} cxyz_store;
51
52__device__ static void gridpoint_to_cxyz(const double dx, const double dy,
53 const double dz, const double zetp,
54 const int lp, const double *gridpoint,
55 cxyz_store *store);
56#endif
57
58/*******************************************************************************
59 * \brief Atomic add for doubles that also works prior to compute capability 6.
60 * \author Ole Schuett
61 ******************************************************************************/
62__device__ static void atomicAddDouble(double *address, double val) {
63 if (val == 0.0)
64 return;
65
66#if __CUDA_ARCH__ >= 600
67 atomicAdd(address, val); // part of cuda library
68#else
69 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
70 unsigned long long int *address_as_ull = (unsigned long long int *)address;
71 unsigned long long int old = *address_as_ull, assumed;
72
73 do {
74 assumed = old;
75 old = atomicCAS(address_as_ull, assumed,
76 __double_as_longlong(val + __longlong_as_double(assumed)));
77
78 // Uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
79 } while (assumed != old);
80
81#endif
82}
83
84/*******************************************************************************
85 * \brief Cab matrix container to be passed through prepare_pab to cab_add.
86 * \author Ole Schuett
87 ******************************************************************************/
88typedef struct {
89 double *data;
90 const int n1;
91} cab_store;
92
93/*******************************************************************************
94 * \brief Returns matrix element cab[idx(b)][idx(a)].
95 * \author Ole Schuett
96 ******************************************************************************/
97__device__ static inline double cab_get(const cab_store *cab, const orbital a,
98 const orbital b) {
99 const int i = idx(b) * cab->n1 + idx(a);
100 return cab->data[i];
101}
102
103/*******************************************************************************
104 * \brief Adds given value to matrix element cab[idx(b)][idx(a)].
105 * \author Ole Schuett
106 ******************************************************************************/
107__device__ static inline void cab_add(cab_store *cab, const orbital a,
108 const orbital b, const double value) {
109 const int i = idx(b) * cab->n1 + idx(a);
110 atomicAddDouble(&cab->data[i], value);
111}
112
113/*******************************************************************************
114 * \brief Parameters of the collocate kernel.
115 * \author Ole Schuett
116 ******************************************************************************/
117typedef struct {
118 int smem_cab_length;
119 int smem_cab_offset;
120 int smem_alpha_offset;
121 int smem_cxyz_offset;
122 int first_task;
123 int npts_global[3];
124 int npts_local[3];
125 int shift_local[3];
126 double dh[3][3];
127 double dh_inv[3][3];
128 const grid_gpu_task *tasks;
129 const double *pab_blocks;
131 int la_min_diff;
132 int lb_min_diff;
133 int la_max_diff;
134 int lb_max_diff;
135
136#if (GRID_DO_COLLOCATE)
137 // collocate
138 enum grid_func func;
139#else
140 // integrate
141 double *hab_blocks;
142 double *forces;
143 double *virial;
144#endif
145} kernel_params;
146
147/*******************************************************************************
148 * \brief Shared memory representation of a task.
149 * \author Ole Schuett
150 ******************************************************************************/
151typedef struct smem_task_struct : grid_gpu_task_struct {
152 // angular momentum range of actual collocate / integrate operation
153 int la_max;
154 int lb_max;
155 int la_min;
156 int lb_min;
157 int lp;
158
159 // size of the cab matrix
160 int n1;
161 int n2;
162
163 const double *pab_block;
164
165#if (!GRID_DO_COLLOCATE)
166 // integrate
167 double *hab_block;
168 double *forces_a;
169 double *forces_b;
170#endif
171} smem_task;
172
173/*******************************************************************************
174 * \brief Tabulated functions kept in constant memory to reduce register count.
175 * \author Ole Schuett
176 ******************************************************************************/
177__constant__ orbital coset_inv[1330];
178__constant__ double binomial_coef[19][19];
179
180/*******************************************************************************
181 * \brief Initializes the device's constant memory.
182 * \author Ole Schuett
183 ******************************************************************************/
184static void init_constant_memory() {
185 static bool initialized = false;
186 if (initialized) {
187 return; // constant memory has to be initialized only once
188 }
189
190 // Inverse coset mapping
191 orbital coset_inv_host[1330];
192 for (int lx = 0; lx <= 18; lx++) {
193 for (int ly = 0; ly <= 18 - lx; ly++) {
194 for (int lz = 0; lz <= 18 - lx - ly; lz++) {
195 const int i = coset(lx, ly, lz);
196 coset_inv_host[i] = {{lx, ly, lz}};
197 }
198 }
199 }
200 offloadMemcpyToSymbol(coset_inv, &coset_inv_host, sizeof(coset_inv_host));
201
202 // Binomial coefficient
203 double binomial_coef_host[19][19];
204 memset(binomial_coef_host, 0, sizeof(binomial_coef_host));
205 for (int n = 0; n <= 18; n++) {
206 for (int k = 0; k <= n; k++) {
207 binomial_coef_host[n][k] = fac(n) / fac(k) / fac(n - k);
208 }
209 }
210 offloadMemcpyToSymbol(binomial_coef, &binomial_coef_host,
211 sizeof(binomial_coef_host));
212
213 initialized = true;
214}
215
216/*******************************************************************************
217 * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
218 * \author Ole Schuett
219 ******************************************************************************/
220__device__ static void
221ortho_cxyz_to_grid(const kernel_params *params, const smem_task *task,
222 GRID_CONST_WHEN_COLLOCATE cxyz_store *cxyz,
224
225 // The cube contains an even number of grid points in each direction and
226 // collocation is always performed on a pair of two opposing grid points.
227 // Hence, the points with index 0 and 1 are both assigned distance zero via
228 // the formular distance=(2*index-1)/2.
229 const int kmin = ceil(-1e-8 - task->disr_radius * params->dh_inv[2][2]);
230 for (int k = threadIdx.z + kmin; k <= 1 - kmin; k += blockDim.z) {
231 const int ka = task->cube_center_shifted[2] + k;
232 const int kg =
233 modulo(ka, params->npts_global[2]); // target location on the grid
234 const int kd = (2 * k - 1) / 2; // distance from center in grid points
235 const double kr = kd * params->dh[2][2]; // distance from center in a.u.
236 const double kremain = task->disr_radius * task->disr_radius - kr * kr;
237 const int jmin =
238 ceil(-1e-8 - sqrt(fmax(0.0, kremain)) * params->dh_inv[1][1]);
239 for (int j = threadIdx.y + jmin; j <= 1 - jmin; j += blockDim.y) {
240 const int ja = task->cube_center_shifted[1] + j;
241 const int jg =
242 modulo(ja, params->npts_global[1]); // target location on the grid
243 const int jd = (2 * j - 1) / 2; // distance from center in grid points
244 const double jr = jd * params->dh[1][1]; // distance from center in a.u.
245 const double jremain = kremain - jr * jr;
246 const int imin =
247 ceil(-1e-8 - sqrt(fmax(0.0, jremain)) * params->dh_inv[0][0]);
248
249 for (int i = threadIdx.x + imin; i <= 1 - imin; i += blockDim.x) {
250 const int ia = task->cube_center_shifted[0] + i;
251 const int ig =
252 modulo(ia, params->npts_global[0]); // target location on the grid
253
254 // The distances above (kd, jd) do not take roffset into account,
255 // ie. they always snap to the next grid point. This allowed the legacy
256 // implementation to cache the loop bounds.
257 // For the calculation of the grid value we'll use the true distance.
258 const double dx = i * params->dh[0][0] + task->cube_offset[0];
259 const double dy = j * params->dh[1][1] + task->cube_offset[1];
260 const double dz = k * params->dh[2][2] + task->cube_offset[2];
261
262 const int grid_index =
263 kg * params->npts_local[1] * params->npts_local[0] +
264 jg * params->npts_local[0] + ig;
265
266#if (GRID_DO_COLLOCATE)
267 // collocate
268 cxyz_to_gridpoint(dx, dy, dz, task->zetp, task->lp, cxyz,
269 &grid[grid_index]);
270#else
271 // integrate
272 gridpoint_to_cxyz(dx, dy, dz, task->zetp, task->lp, &grid[grid_index],
273 cxyz);
274#endif
275 }
276 }
277 }
278
279 __syncthreads(); // because of concurrent writes to grid / cxyz
280}
281
282/*******************************************************************************
283 * \brief Collocates coefficients C_xyz onto the grid for general case.
284 * \author Ole Schuett
285 ******************************************************************************/
286__device__ static void
287general_cxyz_to_grid(const kernel_params *params, const smem_task *task,
288 GRID_CONST_WHEN_COLLOCATE cxyz_store *cxyz,
290
291 // Go over the grid
292 const int k_start = threadIdx.z + task->index_min[2];
293 for (int k = k_start; k <= task->index_max[2]; k += blockDim.z) {
294 const int kg = modulo(k - params->shift_local[2], params->npts_global[2]);
295 if (kg < task->bounds_k[0] || task->bounds_k[1] < kg) {
296 continue;
297 }
298 const int j_start = threadIdx.y + task->index_min[1];
299 for (int j = j_start; j <= task->index_max[1]; j += blockDim.y) {
300 const int jg = modulo(j - params->shift_local[1], params->npts_global[1]);
301 if (jg < task->bounds_j[0] || task->bounds_j[1] < jg) {
302 continue;
303 }
304
305 const int i_start = threadIdx.x + task->index_min[0];
306 for (int i = i_start; i <= task->index_max[0]; i += blockDim.x) {
307 const int ig =
308 modulo(i - params->shift_local[0], params->npts_global[0]);
309 if (ig < task->bounds_i[0] || task->bounds_i[1] < ig) {
310 continue;
311 }
312 // Compute distance of current grid point from center of Gaussian.
313 const double di = i - task->gp[0];
314 double dx = di * params->dh[0][0];
315 double dy = di * params->dh[0][1];
316 double dz = di * params->dh[0][2];
317 const double dj = j - task->gp[1];
318 dx += dj * params->dh[1][0];
319 dy += dj * params->dh[1][1];
320 dz += dj * params->dh[1][2];
321 const double dk = k - task->gp[2];
322 dx += dk * params->dh[2][0];
323 dy += dk * params->dh[2][1];
324 dz += dk * params->dh[2][2];
325
326 // Cycle if the point is not within the radius.
327 const double r2 = dx * dx + dy * dy + dz * dz;
328 if (r2 >= task->radius2) {
329 continue;
330 }
331
332 const int stride = params->npts_local[1] * params->npts_local[0];
333 const int grid_index = kg * stride + jg * params->npts_local[0] + ig;
334
335#if (GRID_DO_COLLOCATE)
336 // collocate
337 cxyz_to_gridpoint(dx, dy, dz, task->zetp, task->lp, cxyz,
338 &grid[grid_index]);
339#else
340 // integrate
341 gridpoint_to_cxyz(dx, dy, dz, task->zetp, task->lp, &grid[grid_index],
342 cxyz);
343#endif
344 }
345 }
346 }
347
348 __syncthreads(); // because of concurrent writes to grid / cxyz
349}
350
351/*******************************************************************************
352 * \brief Computes the polynomial expansion coefficients:
353 * (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
354 * \author Ole Schuett
355 ******************************************************************************/
356__device__ static void compute_alpha(const smem_task *task, double *alpha) {
357 // strides for accessing alpha
358 const int s3 = (task->lp + 1);
359 const int s2 = (task->la_max + 1) * s3;
360 const int s1 = (task->lb_max + 1) * s2;
361
362 for (int idir = threadIdx.z; idir < 3; idir += blockDim.z) {
363 const double drpa = task->rp[idir] - task->ra[idir];
364 const double drpb = task->rp[idir] - task->rb[idir];
365 for (int la = threadIdx.y; la <= task->la_max; la += blockDim.y) {
366 for (int lb = threadIdx.x; lb <= task->lb_max; lb += blockDim.x) {
367 for (int i = 0; i <= task->lp; i++) {
368 const int base = idir * s1 + lb * s2 + la * s3;
369 alpha[base + i] = 0.0;
370 }
371 double a = 1.0;
372 for (int k = 0; k <= la; k++) {
373 double b = 1.0;
374 for (int l = 0; l <= lb; l++) {
375 const int base = idir * s1 + lb * s2 + la * s3;
376 alpha[base + la - l + lb - k] +=
377 a * b * binomial_coef[la][k] * binomial_coef[lb][l];
378 b *= drpb;
379 }
380 a *= drpa;
381 }
382 }
383 }
384 }
385 __syncthreads(); // because of concurrent writes to alpha
386}
387
388/*******************************************************************************
389 * \brief Transforms coefficients C_ab into C_xyz.
390 * \author Ole Schuett
391 ******************************************************************************/
392__device__ static void cab_to_cxyz(const smem_task *task, const double *alpha,
394 GRID_CONST_WHEN_INTEGRATE double *cxyz) {
395
396 // *** initialise the coefficient matrix, we transform the sum
397 //
398 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
399 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
400 // (z-a_z)**lza
401 //
402 // into
403 //
404 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
405 //
406 // where p is center of the product gaussian, and lp = la_max + lb_max
407 // (current implementation is l**7)
408
409 // strides for accessing alpha
410 const int s3 = (task->lp + 1);
411 const int s2 = (task->la_max + 1) * s3;
412 const int s1 = (task->lb_max + 1) * s2;
413
414 // TODO: Maybe we can transpose alpha to index it directly with ico and jco.
415
416#if (GRID_DO_COLLOCATE)
417 // collocate
418 for (int lzp = threadIdx.z; lzp <= task->lp; lzp += blockDim.z) {
419 for (int lyp = threadIdx.y; lyp <= task->lp - lzp; lyp += blockDim.y) {
420 for (int lxp = threadIdx.x; lxp <= task->lp - lzp - lyp;
421 lxp += blockDim.x) {
422 double reg = 0.0; // accumulate into a register
423 const int jco_start = ncoset(task->lb_min - 1);
424 const int jco_end = ncoset(task->lb_max);
425 for (int jco = jco_start; jco < jco_end; jco++) {
426 const orbital b = coset_inv[jco];
427 const int ico_start = ncoset(task->la_min - 1);
428 const int ico_end = ncoset(task->la_max);
429 for (int ico = ico_start; ico < ico_end; ico++) {
430 const orbital a = coset_inv[ico];
431#else
432 // integrate
433 if (threadIdx.z == 0) { // TODO: How bad is this?
434 const int jco_start = ncoset(task->lb_min - 1) + threadIdx.y;
435 const int jco_end = ncoset(task->lb_max);
436 for (int jco = jco_start; jco < jco_end; jco += blockDim.y) {
437 const orbital b = coset_inv[jco];
438 const int ico_start = ncoset(task->la_min - 1) + threadIdx.x;
439 const int ico_end = ncoset(task->la_max);
440 for (int ico = ico_start; ico < ico_end; ico += blockDim.x) {
441 const orbital a = coset_inv[ico];
442 double reg = 0.0; // accumulate into a register
443 for (int lzp = 0; lzp <= task->lp; lzp++) {
444 for (int lyp = 0; lyp <= task->lp - lzp; lyp++) {
445 for (int lxp = 0; lxp <= task->lp - lzp - lyp; lxp++) {
446#endif
447
448 const double p = task->prefactor *
449 alpha[0 * s1 + b.l[0] * s2 + a.l[0] * s3 + lxp] *
450 alpha[1 * s1 + b.l[1] * s2 + a.l[1] * s3 + lyp] *
451 alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + lzp];
452
453#if (GRID_DO_COLLOCATE)
454 reg += p * cab_get(cab, a, b); // collocate
455#else
456 reg += p * cxyz[coset(lxp, lyp, lzp)]; // integrate
457#endif
458 }
459 }
460
461#if (GRID_DO_COLLOCATE)
462 // collocate
463 cxyz[coset(lxp, lyp, lzp)] = reg; // overwrite - no zeroing needed.
464 }
465#else
466 // integrate
467 }
468 cab_add(cab, a, b, reg); // partial loop coverage -> zero it
469 }
470#endif
471 }
472 }
473 __syncthreads(); // because of concurrent writes to cxyz / cab
474}
475
476/*******************************************************************************
477 * \brief Allocates Cab block from global memory.
478 * \author Ole Schuett
479 ******************************************************************************/
480__device__ static double *malloc_cab(const smem_task *task) {
481 __shared__ double *gmem_cab;
482 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
483 gmem_cab = (double *)malloc(task->n1 * task->n2 * sizeof(double));
484 assert(gmem_cab != NULL &&
485 "MallocHeapSize too low, please increase it in grid_library_init()");
486 }
487 __syncthreads(); // wait for write to shared gmem_cab variable
488 return gmem_cab;
489}
490
491/*******************************************************************************
492 * \brief Frees Cab block. Only needed if it was allocated from global memory.
493 * \author Ole Schuett
494 ******************************************************************************/
495__device__ static void free_cab(double *gmem_cab) {
496 __syncthreads(); // Ensure that all threads have finished using cab.
497 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
498 free(gmem_cab);
499 }
500}
501
502/*******************************************************************************
503 * \brief Initializes the cab matrix with zeros.
504 * \author Ole Schuett
505 ******************************************************************************/
506__device__ static void zero_cab(cab_store *cab, const int cab_len) {
507 if (threadIdx.z == 0 && threadIdx.y == 0) {
508 for (int i = threadIdx.x; i < cab_len; i += blockDim.x) {
509 cab->data[i] = 0.0;
510 }
511 }
512 __syncthreads(); // because of concurrent writes to cab
513}
514
515/*******************************************************************************
516 * \brief Copies a task from global to shared memory and does precomputations.
517 * \author Ole Schuett
518 ******************************************************************************/
519__device__ static void load_task(const kernel_params *params, smem_task *task) {
520
521 // Parallel copy of base task from global to shared memory.
522 if (threadIdx.y == 0 && threadIdx.z == 0) {
523 const int itask = params->first_task + blockIdx.x;
524 const grid_gpu_task *src_task = &params->tasks[itask];
525 grid_gpu_task *dest_task = task; // Upcast to base struct.
526 for (int i = threadIdx.x; i < (int)sizeof(grid_gpu_task); i += blockDim.x) {
527 ((char *)dest_task)[i] = ((const char *)src_task)[i];
528 }
529 }
530 __syncthreads(); // because of concurrent writes to task
531
532 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
533 // angular momentum range for the actual collocate/integrate opteration.
534 task->la_max = task->la_max_basis + params->la_max_diff;
535 task->lb_max = task->lb_max_basis + params->lb_max_diff;
536 task->la_min = imax(task->la_min_basis + params->la_min_diff, 0);
537 task->lb_min = imax(task->lb_min_basis + params->lb_min_diff, 0);
538 task->lp = task->la_max + task->lb_max;
539
540 // size of the cab matrix
541 task->n1 = ncoset(task->la_max);
542 task->n2 = ncoset(task->lb_max);
543
544 task->pab_block = &params->pab_blocks[task->ab_block_offset];
545
546 // integrate
547#if (!GRID_DO_COLLOCATE)
548 task->hab_block = &params->hab_blocks[task->ab_block_offset];
549 if (params->forces != NULL) {
550 task->forces_a = &params->forces[3 * task->iatom];
551 task->forces_b = &params->forces[3 * task->jatom];
552 }
553#endif
554 }
555 __syncthreads(); // because of concurrent writes to task
556}
557
558#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
559// EOF
static int imax(int x, int y)
Returns the larger of two given integer (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
Definition dbm_miniapp.c:38
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
grid_func
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
#define GRID_CONST_WHEN_COLLOCATE
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
#define GRID_CONST_WHEN_INTEGRATE
static GRID_DEVICE void cab_add(cab_store *cab, const orbital a, const orbital b, const double value)
Adds given value to matrix element cab[idx(b)][idx(a)]. This function has to be implemented by the im...
static GRID_DEVICE double cab_get(const cab_store *cab, const orbital a, const orbital b)
Returns matrix element cab[idx(b)][idx(a)]. This function has to be implemented by the importing comp...
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
real(dp), dimension(3) a
real(dp), dimension(3) b
real(dp), dimension(3) p
real(kind=dp), dimension(0:maxfac), parameter, public fac
integer, dimension(:), allocatable, public ncoset
integer, dimension(:, :, :), allocatable, public coset
__constant__ orbital coset_inv[1330]
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
__constant__ int binomial_coef[19][19]
Cab matrix container to be passed through get_force/virial to cab_get.
const double * data
Orbital angular momentum.