#include "../../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)

#include "../common/grid_basis_set.h"
#include "../common/grid_common.h"

#if (GRID_DO_COLLOCATE)
#define GRID_CONST_WHEN_COLLOCATE const
#define GRID_CONST_WHEN_INTEGRATE
#else
#define GRID_CONST_WHEN_COLLOCATE
#define GRID_CONST_WHEN_INTEGRATE const
#endif
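// Illustration (not part of the original file): these macros let one header
// implement both data-flow directions. A shared prototype such as
//
//   void cab_to_cxyz(..., GRID_CONST_WHEN_COLLOCATE cab_store *cab,
//                    GRID_CONST_WHEN_INTEGRATE cxyz_store *cxyz);
//
// makes cab read-only and cxyz writable when GRID_DO_COLLOCATE is set, and
// the reverse when it is not. The header is compiled once per value of
// GRID_DO_COLLOCATE to generate the collocate and the integrate variants.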
#if (GRID_DO_COLLOCATE)
typedef double cxyz_store;

// collocate: adds the contribution of the Cxyz coefficients to one grid point.
__device__ static void cxyz_to_gridpoint(const double dx, const double dy,
                                         const double dz, const double zetp,
                                         const int lp, const cxyz_store *cxyz,
                                         double *gridpoint) {
  /* ... body elided in this excerpt ... */
}

#else
/* ... integrate-branch definition of cxyz_store elided in this excerpt ... */

// integrate: accumulates one grid point's value into the Cxyz coefficients.
__device__ static void gridpoint_to_cxyz(const double dx, const double dy,
                                         const double dz, const double zetp,
                                         const int lp, const double *gridpoint,
                                         cxyz_store *cxyz) {
  /* ... body elided in this excerpt ... */
}

#endif
// Adds a value to a double in global memory, atomically.
__device__ static void atomicAddDouble(double *address, double val) {
#if __CUDA_ARCH__ >= 600
  atomicAdd(address, val); // native double atomicAdd on Pascal and newer
#else
  // Emulation via compare-and-swap, see the CUDA C Programming Guide.
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Integer comparison avoids an endless loop in case of NaN (NaN != NaN).
  } while (assumed != old);
#endif
}
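// Usage sketch (illustrative only, not from this file): every thread of a
// launch adds into a single accumulator; the atomic makes the concurrent
// read-modify-write sequences safe on any architecture.
#if 0
__global__ void example_accumulate(double *acc) {
  atomicAddDouble(acc, 1.0); // afterwards, for a 1-D launch,
                             // *acc == gridDim.x * blockDim.x
}
#endif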
// Adds the given value to matrix element cab[idx(b)][idx(a)].
__device__ static inline void cab_add(cab_store *cab, const orbital a,
                                      const orbital b, const double value) {
  const int i = idx(b) * cab->n1 + idx(a);
  atomicAddDouble(&cab->data[i], value);
}
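// Layout note (inferred from malloc_cab() and load_task() below): cab->data
// holds an n2 x n1 matrix with leading dimension n1 = ncoset(la_max), so
// element (a, b) lives at idx(b) * n1 + idx(a). For la_max = lb_max = 1 this
// is a 4 x 4 block, since ncoset(1) = 4 (s, px, py, pz).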
// Parameters of the collocate / integrate kernel. Only the members visible
// in this excerpt are listed; further members are elided.
typedef struct {
  /* ... */
  int smem_alpha_offset;
  int smem_cxyz_offset;
  /* ... */
  const grid_gpu_task *tasks;
  const double *pab_blocks;
  /* ... */
#if (GRID_DO_COLLOCATE)
  /* ... */
#else
  double *hab_blocks; // referenced by load_task() below
  double *forces;
#endif
} kernel_params;
// Task data in shared memory, extending the plain task with derived fields.
typedef struct smem_task_struct : grid_gpu_task_struct {
  int la_max, lb_max, la_min, lb_min, lp; // set by load_task() below
  int n1, n2;                             // cab matrix dimensions
  const double *pab_block;
#if (!GRID_DO_COLLOCATE)
  double *hab_block; // integrate
  double *forces_a;
  double *forces_b;
#endif
} smem_task;
// Initializes the device's constant memory. Called from the host, once.
static void init_constant_memory() {
  static bool initialized = false;
  if (initialized) {
    return; // constants survive between kernel launches
  }

  // Inverse coset mapping: flat combinatorial index -> (lx, ly, lz).
  orbital coset_inv_host[1330];
  for (int lx = 0; lx <= 18; lx++) {
    for (int ly = 0; ly <= 18 - lx; ly++) {
      for (int lz = 0; lz <= 18 - lx - ly; lz++) {
        const int i = coset(lx, ly, lz);
        coset_inv_host[i] = {{lx, ly, lz}};
      }
    }
  }
  offloadMemcpyToSymbol(coset_inv, &coset_inv_host, sizeof(coset_inv_host));

  // Table of binomial coefficients, binomial_coef[n][k] = n! / (k! (n-k)!).
  double binomial_coef_host[19][19];
  memset(binomial_coef_host, 0, sizeof(binomial_coef_host));
  for (int n = 0; n <= 18; n++) {
    for (int k = 0; k <= n; k++) {
      binomial_coef_host[n][k] = fac(n) / fac(k) / fac(n - k);
    }
  }
  offloadMemcpyToSymbol(binomial_coef, &binomial_coef_host,
                        sizeof(binomial_coef_host));

  initialized = true;
}
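// Sanity-check sketch (hypothetical, not part of the file): the table obeys
// Pascal's rule exactly, because every value up to n = 18 is an integer that
// a double represents without rounding.
#if 0
static void check_binomial_table(const double t[19][19]) {
  for (int n = 2; n <= 18; n++)
    for (int k = 1; k < n; k++)
      assert(t[n][k] == t[n - 1][k - 1] + t[n - 1][k]); // e.g. C(4,2) = 3 + 3
}
#endif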
// Maps the cube of grid points onto the grid for the orthorhombic case, i.e.
// the grid vectors are axis-aligned and only the diagonal of dh enters.
__device__ static void
ortho_cxyz_to_grid(const kernel_params *params, const smem_task *task,
                   GRID_CONST_WHEN_COLLOCATE cxyz_store *cxyz,
                   GRID_CONST_WHEN_INTEGRATE double *grid) {
  // The cube contains an even number of grid points in each direction and
  // ranges from kmin to 1 - kmin.
  const int kmin = ceil(-1e-8 - task->disr_radius * params->dh_inv[2][2]);
  for (int k = threadIdx.z + kmin; k <= 1 - kmin; k += blockDim.z) {
    const int ka = task->cube_center_shifted[2] + k;
    // target location on the grid, wrapped into the periodic box
    const int kg = modulo(ka, params->npts_global[2]);
    const int kd = (2 * k - 1) / 2; // distance from center in grid points
    const double kr = kd * params->dh[2][2]; // distance from center in a.u.
    const double kremain = task->disr_radius * task->disr_radius - kr * kr;
    const int jmin =
        ceil(-1e-8 - sqrt(fmax(0.0, kremain)) * params->dh_inv[1][1]);
    for (int j = threadIdx.y + jmin; j <= 1 - jmin; j += blockDim.y) {
      const int ja = task->cube_center_shifted[1] + j;
      const int jg = modulo(ja, params->npts_global[1]);
      const int jd = (2 * j - 1) / 2;
      const double jr = jd * params->dh[1][1];
      const double jremain = kremain - jr * jr;
      const int imin =
          ceil(-1e-8 - sqrt(fmax(0.0, jremain)) * params->dh_inv[0][0]);
      for (int i = threadIdx.x + imin; i <= 1 - imin; i += blockDim.x) {
        const int ia = task->cube_center_shifted[0] + i;
        const int ig = modulo(ia, params->npts_global[0]);

        // The distances dx, dy, dz are relative to the cube's center.
        const double dx = i * params->dh[0][0] + task->cube_offset[0];
        const double dy = j * params->dh[1][1] + task->cube_offset[1];
        const double dz = k * params->dh[2][2] + task->cube_offset[2];

        const int grid_index =
            kg * params->npts_local[1] * params->npts_local[0] +
            jg * params->npts_local[0] + ig;

#if (GRID_DO_COLLOCATE)
        // collocate
        cxyz_to_gridpoint(dx, dy, dz, task->zetp, task->lp, cxyz,
                          &grid[grid_index]);
#else
        // integrate
        gridpoint_to_cxyz(dx, dy, dz, task->zetp, task->lp, &grid[grid_index],
                          cxyz);
#endif
      }
    }
  }
}
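// Worked example (illustrative): with disr_radius = 2.5 and
// dh_inv[2][2] = 2.0 (grid spacing 0.5), kmin = ceil(-1e-8 - 5.0) = -5, so k
// runs from -5 to 6, an even count of 12 points. The -1e-8 keeps ceil() from
// rounding up when the product happens to land exactly on an integer.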
// Maps the bounding box onto the grid for the general case, i.e. the grid
// vectors may be non-orthogonal and all nine entries of dh enter.
__device__ static void
general_cxyz_to_grid(const kernel_params *params, const smem_task *task,
                     GRID_CONST_WHEN_COLLOCATE cxyz_store *cxyz,
                     GRID_CONST_WHEN_INTEGRATE double *grid) {
  const int k_start = threadIdx.z + task->index_min[2];
  for (int k = k_start; k <= task->index_max[2]; k += blockDim.z) {
    const int kg = modulo(k - params->shift_local[2], params->npts_global[2]);
    if (kg < task->bounds_k[0] || task->bounds_k[1] < kg) {
      continue;
    }
    const int j_start = threadIdx.y + task->index_min[1];
    for (int j = j_start; j <= task->index_max[1]; j += blockDim.y) {
      const int jg =
          modulo(j - params->shift_local[1], params->npts_global[1]);
      if (jg < task->bounds_j[0] || task->bounds_j[1] < jg) {
        continue;
      }
      const int i_start = threadIdx.x + task->index_min[0];
      for (int i = i_start; i <= task->index_max[0]; i += blockDim.x) {
        const int ig =
            modulo(i - params->shift_local[0], params->npts_global[0]);
        if (ig < task->bounds_i[0] || task->bounds_i[1] < ig) {
          continue;
        }
        // Distance of the current grid point from the Gaussian's center,
        // accumulated one lattice direction at a time.
        const double di = i - task->gp[0];
        double dx = di * params->dh[0][0];
        double dy = di * params->dh[0][1];
        double dz = di * params->dh[0][2];
        const double dj = j - task->gp[1];
        dx += dj * params->dh[1][0];
        dy += dj * params->dh[1][1];
        dz += dj * params->dh[1][2];
        const double dk = k - task->gp[2];
        dx += dk * params->dh[2][0];
        dy += dk * params->dh[2][1];
        dz += dk * params->dh[2][2];

        // Skip points outside the Gaussian's radius.
        const double r2 = dx * dx + dy * dy + dz * dz;
        if (r2 >= task->radius2) {
          continue;
        }

        const int stride = params->npts_local[1] * params->npts_local[0];
        const int grid_index = kg * stride + jg * params->npts_local[0] + ig;

#if (GRID_DO_COLLOCATE)
        // collocate
        cxyz_to_gridpoint(dx, dy, dz, task->zetp, task->lp, cxyz,
                          &grid[grid_index]);
#else
        // integrate
        gridpoint_to_cxyz(dx, dy, dz, task->zetp, task->lp, &grid[grid_index],
                          cxyz);
#endif
      }
    }
  }
}
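// Note (illustrative): modulo() mimics Fortran's MODULO and always returns a
// non-negative result for a positive modulus, e.g. modulo(-3, 10) == 7,
// whereas C's -3 % 10 == -3. This is what wraps shifted indices back into
// the periodic global grid above.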
// Computes the polynomial expansion coefficients:
//   (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls, lxa, lxb) * (x-p)**ls
__device__ static void compute_alpha(const smem_task *task, double *alpha) {
  // strides for accessing alpha
  const int s3 = (task->lp + 1);
  const int s2 = (task->la_max + 1) * s3;
  const int s1 = (task->lb_max + 1) * s2;

  for (int idir = threadIdx.z; idir < 3; idir += blockDim.z) {
    const double drpa = task->rp[idir] - task->ra[idir];
    const double drpb = task->rp[idir] - task->rb[idir];
    for (int la = threadIdx.y; la <= task->la_max; la += blockDim.y) {
      for (int lb = threadIdx.x; lb <= task->lb_max; lb += blockDim.x) {
        // Zero the output range before accumulating into it.
        for (int i = 0; i <= task->lp; i++) {
          const int base = idir * s1 + lb * s2 + la * s3;
          alpha[base + i] = 0.0;
        }
        // Binomial expansion of (x-a)^la (x-b)^lb around x-p.
        for (int k = 0; k <= la; k++) {
          for (int l = 0; l <= lb; l++) {
            const int base = idir * s1 + lb * s2 + la * s3;
            alpha[base + la - l + lb - k] +=
                binomial_coef[la][k] * binomial_coef[lb][l] * pow(drpa, k) *
                pow(drpb, l);
          }
        }
      }
    }
  }
  __syncthreads(); // all threads must see the complete alpha array
}
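// Worked example (illustrative): in one direction, with la = lb = 1 and
// drpa = p - a, drpb = p - b,
//   (x - a)(x - b) = ((x-p) + drpa)((x-p) + drpb)
//                  = (x-p)^2 + (drpa + drpb)*(x-p) + drpa*drpb,
// i.e. alpha(2) = 1, alpha(1) = drpa + drpb, alpha(0) = drpa * drpb, which is
// exactly what the binomial accumulation above produces.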
// Transforms the coefficients C_ab into C_xyz (collocate) or back (integrate)
// by contracting with the expansion coefficients from compute_alpha().
__device__ static void cab_to_cxyz(const smem_task *task, const double *alpha,
                                   GRID_CONST_WHEN_COLLOCATE cab_store *cab,
                                   GRID_CONST_WHEN_INTEGRATE cxyz_store *cxyz) {

  // strides for accessing alpha
  const int s3 = (task->lp + 1);
  const int s2 = (task->la_max + 1) * s3;
  const int s1 = (task->lb_max + 1) * s2;

#if (GRID_DO_COLLOCATE)
  // collocate: one thread per output coefficient cxyz(lxp, lyp, lzp)
  for (int lzp = threadIdx.z; lzp <= task->lp; lzp += blockDim.z) {
    for (int lyp = threadIdx.y; lyp <= task->lp - lzp; lyp += blockDim.y) {
      for (int lxp = threadIdx.x; lxp <= task->lp - lzp - lyp;
           lxp += blockDim.x) {
        double reg = 0.0; // accumulate into a register
        const int jco_start = ncoset(task->lb_min - 1);
        const int jco_end = ncoset(task->lb_max);
        for (int jco = jco_start; jco < jco_end; jco++) {
          const orbital b = coset_inv[jco];
          const int ico_start = ncoset(task->la_min - 1);
          const int ico_end = ncoset(task->la_max);
          for (int ico = ico_start; ico < ico_end; ico++) {
            const orbital a = coset_inv[ico];
#else
  // integrate: one thread per matrix element cab(a, b)
  if (threadIdx.z == 0) {
    const int jco_start = ncoset(task->lb_min - 1) + threadIdx.y;
    const int jco_end = ncoset(task->lb_max);
    for (int jco = jco_start; jco < jco_end; jco += blockDim.y) {
      const orbital b = coset_inv[jco];
      const int ico_start = ncoset(task->la_min - 1) + threadIdx.x;
      const int ico_end = ncoset(task->la_max);
      for (int ico = ico_start; ico < ico_end; ico += blockDim.x) {
        const orbital a = coset_inv[ico];
        double reg = 0.0; // accumulate into a register
        for (int lzp = 0; lzp <= task->lp; lzp++) {
          for (int lyp = 0; lyp <= task->lp - lzp; lyp++) {
            for (int lxp = 0; lxp <= task->lp - lzp - lyp; lxp++) {
#endif

              // The expansion is separable in x, y, and z.
              const double p = task->prefactor *
                               alpha[0 * s1 + b.l[0] * s2 + a.l[0] * s3 + lxp] *
                               alpha[1 * s1 + b.l[1] * s2 + a.l[1] * s3 + lyp] *
                               alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + lzp];

#if (GRID_DO_COLLOCATE)
              reg += p * cab_get(cab, a, b); // collocate
          }
        }
        cxyz[coset(lxp, lyp, lzp)] = reg;
      }
    }
  }
#else
              reg += p * cxyz[coset(lxp, lyp, lzp)]; // integrate
            }
          }
        }
        cab_add(cab, a, b, reg);
      }
    }
  }
#endif

  __syncthreads(); // ensure all writes to cxyz / cab are complete
}
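// The contraction above in formula form (restating the code, P = prefactor):
//   cxyz(lxp, lyp, lzp) = P * sum_{a,b} alpha_x(bx, ax, lxp) *
//                         alpha_y(by, ay, lyp) * alpha_z(bz, az, lzp) * cab(a, b)
// for collocation; integration runs the same contraction with the roles of
// cxyz and cab exchanged (reading cxyz, accumulating into cab).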
// Allocates the Cab matrix from the device-side heap in global memory.
__device__ static double *malloc_cab(const smem_task *task) {
  __shared__ double *gmem_cab;
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    gmem_cab = (double *)malloc(task->n1 * task->n2 * sizeof(double));
    assert(gmem_cab != NULL &&
           "MallocHeapSize too low, please increase it in grid_library_init()");
  }
  __syncthreads(); // wait for the write to the shared pointer gmem_cab
  return gmem_cab;
}
// Frees the Cab matrix allocated by malloc_cab().
__device__ static void free_cab(double *gmem_cab) {
  __syncthreads(); // ensure all threads are done with gmem_cab
  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    free(gmem_cab);
  }
}
// Initializes the Cab matrix with zeros.
__device__ static void zero_cab(cab_store *cab, const int cab_len) {
  if (threadIdx.z == 0 && threadIdx.y == 0) {
    for (int i = threadIdx.x; i < cab_len; i += blockDim.x) {
      cab->data[i] = 0.0;
    }
  }
  __syncthreads();
}
// Copies a task from global to shared memory and does precomputations.
__device__ static void load_task(const kernel_params *params,
                                 smem_task *task) {
  // Parallel copy of the base task struct, one byte per thread.
  if (threadIdx.y == 0 && threadIdx.z == 0) {
    const int itask = params->first_task + blockIdx.x;
    const grid_gpu_task *src_task = &params->tasks[itask];
    grid_gpu_task *dest_task = task; // copy only the base struct
    for (int i = threadIdx.x; i < (int)sizeof(grid_gpu_task);
         i += blockDim.x) {
      ((char *)dest_task)[i] = ((const char *)src_task)[i];
    }
  }
  __syncthreads(); // wait for the copy to complete

  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    // angular momentum range for the actual collocate / integrate operation
    task->la_max = task->la_max_basis + params->la_max_diff;
    task->lb_max = task->lb_max_basis + params->lb_max_diff;
    task->la_min = imax(task->la_min_basis + params->la_min_diff, 0);
    task->lb_min = imax(task->lb_min_basis + params->lb_min_diff, 0);
    task->lp = task->la_max + task->lb_max;

    // size of the cab matrix
    task->n1 = ncoset(task->la_max);
    task->n2 = ncoset(task->lb_max);

    task->pab_block = &params->pab_blocks[task->ab_block_offset];

#if (!GRID_DO_COLLOCATE)
    // integrate
    task->hab_block = &params->hab_blocks[task->ab_block_offset];
    if (params->forces != NULL) {
      task->forces_a = &params->forces[3 * task->iatom];
      task->forces_b = &params->forces[3 * task->jatom];
    }
#endif
  }
  __syncthreads(); // ensure all threads see the initialized task
}
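// Usage sketch (hypothetical kernel frame, not part of this excerpt): each
// thread block handles the task params->first_task + blockIdx.x, so a kernel
// typically begins with
#if 0
__shared__ smem_task task;
load_task(params, &task);
#endif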