14 #ifndef GRID_HIP_INTERNAL_HEADER_H
15 #define GRID_HIP_INTERNAL_HEADER_H
19 #include <hip/hip_runtime.h>
26 #define GRID_DEVICE __device__
28 #include "../common/grid_basis_set.h"
29 #include "../common/grid_constants.h"
36 #if defined(__HIP_PLATFORM_NVIDIA__)
37 #if __CUDA_ARCH__ < 600
// Software fallback for double-precision atomicAdd on NVIDIA devices with
// compute capability < 6.0 (guarded by the surrounding #if). Implements the
// canonical compare-and-swap retry loop from the CUDA programming guide.
// Returns the value stored at *address before the addition.
__device__ __inline__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hanging in case of NaN
    // (NaN != NaN would never terminate a floating-point comparison loop).
  } while (assumed != old);
  return __longlong_as_double(old);
}
// Matrix dimensions of the cab coefficient block; n1 is used as the row
// stride in `cab[jco * task.n1 + ico]` below.
// NOTE(review): the enclosing struct definition is not visible in this
// chunk — confirm member semantics against the full file.
96 unsigned short int n1;
97 unsigned short int n2;
// Factorial function, e.g. fac(5) = 5! = 120.
// Valid for 0 <= i <= 30 (table size); no bounds checking is performed.
__device__ __inline__ double fac(const int i) {
  static const double table[] = {
      0.10000000000000000000E+01, 0.10000000000000000000E+01,
      0.20000000000000000000E+01, 0.60000000000000000000E+01,
      0.24000000000000000000E+02, 0.12000000000000000000E+03,
      0.72000000000000000000E+03, 0.50400000000000000000E+04,
      0.40320000000000000000E+05, 0.36288000000000000000E+06,
      0.36288000000000000000E+07, 0.39916800000000000000E+08,
      0.47900160000000000000E+09, 0.62270208000000000000E+10,
      0.87178291200000000000E+11, 0.13076743680000000000E+13,
      0.20922789888000000000E+14, 0.35568742809600000000E+15,
      0.64023737057280000000E+16, 0.12164510040883200000E+18,
      0.24329020081766400000E+19, 0.51090942171709440000E+20,
      0.11240007277776076800E+22, 0.25852016738884976640E+23,
      0.62044840173323943936E+24, 0.15511210043330985984E+26,
      0.40329146112660563558E+27, 0.10888869450418352161E+29,
      0.30488834461171386050E+30, 0.88417619937397019545E+31,
      0.26525285981219105864E+33};
  return table[i];
}
// Number of Cartesian orbitals up to given angular momentum quantum number,
// i.e. ncoset(l) == (l+1)(l+2)(l+3)/6. Valid for 0 <= l <= 18; no bounds
// checking is performed.
__host__ __device__ __inline__ int ncoset(const int l) {
  static const int table[] = {1,   4,   10,  20,  35,  56,  84,
                              120, 165, 220, 286, 364, 455, 560,
                              680, 816, 969, 1140, 1330};
  return table[l];
}
// Maps three angular momentum components (lx, ly, lz) to a single
// zero-based index within the coset ordering. The inverse of this mapping
// is stored in the constant-memory table coset_inv.
__host__ __device__ __inline__ int coset(int lx, int ly, int lz) {
  const int l = lx + ly + lz;
  if (l == 0) {
    // Guard: without this, ncoset(-1) below would read out of bounds.
    return 0;
  } else {
    return ncoset(l - 1) + ((l - lx) * (l - lx + 1)) / 2 + lz;
  }
}
// Fragment of down(i, a): clamp-decrement of the i'th angular momentum
// component (Doxygen: "Decrease i'th component of given orbital angular
// momentum"). NOTE(review): the function signature and the copy of `a`
// into `b` are not visible in this chunk — consult the full file.
199 b.l[
i] = max(0,
a.l[
i] - 1);
// Fragment of idx(a): returns the coset index of orbital a's angular
// momentum components. NOTE(review): enclosing signature not visible here.
207 return coset(
a.l[0],
a.l[1],
a.l[2]);
// Returns x raised to the non-negative integer power expo by repeated
// multiplication; power(x, 0) == 1.0.
__device__ __inline__ double power(const double x, const int expo) {
  double result = 1.0;
  for (int i = 1; i <= expo; i++)
    result *= x;
  return result;
}
// Atomically adds the given value to matrix element cab[idx(b)][idx(a)],
// where n is the leading dimension (row stride) of cab. The atomic update
// is required because multiple threads may target the same element.
template <typename T = double>
__device__ __inline__ void prep_term(const orbital a, const orbital b,
                                     const T value, const int n, T *cab) {
  atomicAdd(&cab[idx(b) * n + idx(a)], value);
}
// Initializes the device's constant memory (coset_inv and binomial_coef
// lookup tables). Host-only; idempotent — only the first call copies data.
inline static void init_constant_memory() {
  static bool initialized = false;
  if (initialized)
    return; // constant memory needs to be initialized only once

  // Inverse coset mapping: flat coset index -> {lx, ly, lz} triple,
  // covering all combinations with lx + ly + lz <= 18 (ncoset(18) == 1330).
  orbital coset_inv_host[1330];
  for (int lx = 0; lx <= 18; lx++) {
    for (int ly = 0; ly <= 18 - lx; ly++) {
      for (int lz = 0; lz <= 18 - lx - ly; lz++) {
        const int i = coset(lx, ly, lz);
        coset_inv_host[i] = {{lx, ly, lz}};
      }
    }
  }
  hipError_t error =
      hipMemcpyToSymbol(coset_inv, &coset_inv_host, sizeof(coset_inv_host), 0,
                        hipMemcpyHostToDevice);
  assert(error == hipSuccess);

  // Binomial coefficients C(n, k) for 0 <= k <= n <= 18 (Pascal's triangle,
  // rows padded with zeros to a fixed 19x19 layout).
  int binomial_coef_host[19][19] = {
      {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 4, 6, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 5, 10, 10, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 6, 15, 20, 15, 6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 7, 21, 35, 35, 21, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 8, 28, 56, 70, 56, 28, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 9, 36, 84, 126, 126, 84, 36, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0},
      {1, 11, 55, 165, 330, 462, 462, 330, 165, 55, 11, 1, 0, 0, 0, 0, 0, 0, 0},
      {1, 12, 66, 220, 495, 792, 924, 792, 495, 220, 66, 12, 1, 0, 0, 0, 0, 0,
       0},
      {1, 13, 78, 286, 715, 1287, 1716, 1716, 1287, 715, 286, 78, 13, 1, 0, 0,
       0, 0, 0},
      {1, 14, 91, 364, 1001, 2002, 3003, 3432, 3003, 2002, 1001, 364, 91, 14, 1,
       0, 0, 0, 0},
      {1, 15, 105, 455, 1365, 3003, 5005, 6435, 6435, 5005, 3003, 1365, 455,
       105, 15, 1, 0, 0, 0},
      {1, 16, 120, 560, 1820, 4368, 8008, 11440, 12870, 11440, 8008, 4368, 1820,
       560, 120, 16, 1, 0, 0},
      {1, 17, 136, 680, 2380, 6188, 12376, 19448, 24310, 24310, 19448, 12376,
       6188, 2380, 680, 136, 17, 1, 0},
      {1, 18, 153, 816, 3060, 8568, 18564, 31824, 43758, 48620, 43758, 31824,
       18564, 8568, 3060, 816, 153, 18, 1}};
  error = hipMemcpyToSymbol(binomial_coef, &binomial_coef_host,
                            sizeof(binomial_coef_host), 0,
                            hipMemcpyHostToDevice);
  assert(error == hipSuccess);

  initialized = true;
}
// Maps the point (x, y, z) through the 3x3 matrix dh_ (stored row-wise as a
// flat array of 9 doubles) and returns the resulting coordinates.
__inline__ __device__ double3 compute_coordinates(const double *__restrict__ dh_,
                                                  const double x,
                                                  const double y,
                                                  const double z) {
  double3 r3;
  r3.x = z * dh_[6] + y * dh_[3] + x * dh_[0];
  r3.y = z * dh_[7] + y * dh_[4] + x * dh_[1];
  r3.z = z * dh_[8] + y * dh_[5] + x * dh_[2];
  return r3;
}
// Single-precision overload of compute_coordinates: maps (x, y, z) through
// the 3x3 matrix dh_ (flat array of 9 elements, same layout as the double
// version). NOTE(review): the signature start was lost in extraction — the
// parameter/return types were reconstructed by symmetry; confirm.
__inline__ __device__ float3 compute_coordinates(const float *__restrict__ dh_,
                                                 const float x, const float y,
                                                 const float z) {
  float3 r3;
  r3.x = z * dh_[6] + y * dh_[3] + x * dh_[0];
  r3.y = z * dh_[7] + y * dh_[4] + x * dh_[1];
  r3.z = z * dh_[8] + y * dh_[5] + x * dh_[2];
  return r3;
}
// Computes the polynomial expansion coefficients (per Doxygen summary):
//   (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls, lxa, lxb, 1) (x-p)**ls
// alpha is laid out as [3 directions][lb_max+1][la_max+1][lp+1] using the
// strides s1/s2/s3 computed below; work is distributed over the thread
// block via threadIdx.{x,y,z}.
// NOTE(review): several original lines are missing from this chunk (the
// function name line, the `tid` declaration, the zero-initialization body,
// and the right-hand side of the accumulation) — do not edit this block
// without consulting the complete file.
319 template <
typename T>
321 T *__restrict__ alpha) {
// Strides of the flattened 4D alpha array.
323 const int s3 = (task.
lp + 1);
324 const int s2 = (task.
la_max + 1) * s3;
325 const int s1 = (task.
lb_max + 1) * s2;
// Flat thread index within the block (assignment target lost in extraction).
327 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
// Grid-stride-style zeroing of all 3*s1 coefficients across the block.
328 for (
int i = tid;
i < 3 * s1;
i += blockDim.x * blockDim.y * blockDim.z)
// One direction per threadIdx.z slice.
333 for (
int idir = threadIdx.z; idir < 3; idir += blockDim.z) {
// Displacements of centers a and b from the product center p.
334 const T drpa = task.
rp[idir] - task.
ra[idir];
335 const T drpb = task.
rp[idir] - task.
rb[idir];
336 for (
int la = threadIdx.y; la <= task.
la_max; la += blockDim.y) {
337 for (
int lb = threadIdx.x; lb <= task.
lb_max; lb += blockDim.x) {
339 for (
int k = 0; k <= la; k++) {
341 const int base = idir * s1 + lb * s2 + la * s3;
342 for (
int l = 0; l <= lb; l++) {
// Accumulation right-hand side missing from this chunk.
343 alpha[base + la - l + lb - k] +=
// Transforms Cartesian coordinates rp into lattice (grid) coordinates rp_c
// by multiplying with the 3x3 inverse cell matrix dh_inv_ (flat array of 9
// doubles; note the transposed index pattern relative to
// convert_from_lattice_coordinates_to_cartesian below).
__host__ __inline__ __device__ void
convert_to_lattice_coordinates(const double *dh_inv_,
                               const double3 *__restrict__ const rp,
                               double3 *__restrict__ rp_c) {
  rp_c->x = dh_inv_[0] * rp->x + dh_inv_[3] * rp->y + dh_inv_[6] * rp->z;
  rp_c->y = dh_inv_[1] * rp->x + dh_inv_[4] * rp->y + dh_inv_[7] * rp->z;
  rp_c->z = dh_inv_[2] * rp->x + dh_inv_[5] * rp->y + dh_inv_[8] * rp->z;
}
// Transforms lattice (grid) coordinates rp back into Cartesian coordinates
// rp_c by multiplying with the 3x3 cell matrix dh_ (flat array of 9
// doubles).
__host__ __inline__ __device__ void
convert_from_lattice_coordinates_to_cartesian(const double *__restrict__ dh_,
                                              const double3 *__restrict__
                                              const rp,
                                              double3 *__restrict__ rp_c) {
  rp_c->x = dh_[0] * rp->x + dh_[3] * rp->y + dh_[6] * rp->z;
  rp_c->y = dh_[1] * rp->x + dh_[4] * rp->y + dh_[7] * rp->z;
  rp_c->z = dh_[2] * rp->x + dh_[5] * rp->y + dh_[8] * rp->z;
}
// Single-precision overload of convert_to_lattice_coordinates: transforms
// Cartesian coordinates rp into lattice coordinates rp_c using the inverse
// cell matrix dh_inv_ (flat array of 9 floats).
// NOTE(review): name/first-parameter line reconstructed by symmetry with
// the double version — confirm against the full file.
__host__ __inline__ __device__ void
convert_to_lattice_coordinates(const float *dh_inv_,
                               const float3 *__restrict__ const rp,
                               float3 *__restrict__ rp_c) {
  rp_c->x = dh_inv_[0] * rp->x + dh_inv_[3] * rp->y + dh_inv_[6] * rp->z;
  rp_c->y = dh_inv_[1] * rp->x + dh_inv_[4] * rp->y + dh_inv_[7] * rp->z;
  rp_c->z = dh_inv_[2] * rp->x + dh_inv_[5] * rp->y + dh_inv_[8] * rp->z;
}
// Single-precision overload of
// convert_from_lattice_coordinates_to_cartesian: transforms lattice
// coordinates rp back into Cartesian coordinates rp_c using the cell matrix
// dh_ (flat array of 9 floats).
// NOTE(review): name line reconstructed by symmetry with the double
// version — confirm against the full file.
__host__ __inline__ __device__ void
convert_from_lattice_coordinates_to_cartesian(const float *__restrict__ dh_,
                                              const float3 *__restrict__
                                              const rp,
                                              float3 *__restrict__ rp_c) {
  rp_c->x = dh_[0] * rp->x + dh_[3] * rp->y + dh_[6] * rp->z;
  rp_c->y = dh_[1] * rp->x + dh_[4] * rp->y + dh_[7] * rp->z;
  rp_c->z = dh_[2] * rp->x + dh_[5] * rp->y + dh_[8] * rp->z;
}
// Computes the extent (cube_size), lower bound (lb_cube), center
// (cubecenter) and offset (roffset) of the grid cube covering a sphere of
// the given radius around point rp, in lattice coordinates. The
// orthorhombic_ template flag selects between a fast analytic path and a
// general path that scans the 27 corner displacements.
// NOTE(review): many original lines are missing from this chunk (rp1/rp2/
// rp3 setup, branch keywords, the ub_cube declaration and the return
// statement) — do not edit this block without consulting the complete file.
391 template <
typename T,
typename T3,
bool orthorhombic_>
393 const T radius,
const T *
const __restrict__ dh_,
394 const T *
const __restrict__ dh_inv_,
const T3 *__restrict__ rp,
395 T3 *__restrict__ roffset, int3 *__restrict__ cubecenter,
396 int3 *__restrict__ lb_cube, int3 *__restrict__ cube_size) {
// Nearest grid point to rp (rp1 presumably rp in lattice coordinates —
// its computation is not visible here).
405 cubecenter->x = std::floor(rp1.x);
406 cubecenter->y = std::floor(rp1.y);
407 cubecenter->z = std::floor(rp1.z);
// Euclidean lengths of the three lattice vectors (rows of dh_).
424 T norm1, norm2, norm3;
425 norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
426 norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
427 norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];
429 norm1 = std::sqrt(norm1);
430 norm2 = std::sqrt(norm2);
431 norm3 = std::sqrt(norm3);
// Discretized radius: radius rounded up to a whole number of grid
// spacings along the shortest lattice vector.
433 const T disr_radius =
434 std::min(norm1, std::min(norm2, norm3)) *
436 (
int)ceil(radius / std::min(norm1, std::min(norm2, norm3))));
438 rp2.x = cubecenter->x;
439 rp2.y = cubecenter->y;
440 rp2.z = cubecenter->z;
// Lower cube bound; -1e-8 guards against the point landing exactly on a
// grid plane.
456 lb_cube->x = std::ceil(-1e-8 - rp3.x);
457 lb_cube->y = std::ceil(-1e-8 - rp3.y);
458 lb_cube->z = std::ceil(-1e-8 - rp3.z);
// Symmetric cube: size spans from lb to -lb plus the two boundary points.
471 cube_size->x = 2 - 2 * lb_cube->x;
472 cube_size->y = 2 - 2 * lb_cube->y;
473 cube_size->z = 2 - 2 * lb_cube->z;
// General (non-orthorhombic) path: initialize bounds, then scan the 27
// displaced corner points of the bounding box around the sphere.
478 lb_cube->x = INT_MAX;
480 lb_cube->y = INT_MAX;
482 lb_cube->z = INT_MAX;
485 for (
int i = -1;
i <= 1;
i++) {
486 for (
int j = -1; j <= 1; j++) {
487 for (
int k = -1; k <= 1; k++) {
489 r.x = rp->x + ((T)
i) * radius;
490 r.y = rp->y + ((T)j) * radius;
491 r.z = rp->z + ((T)k) * radius;
// Grow lb_cube / ub_cube to enclose this corner (in lattice coords).
494 lb_cube->x = std::min(lb_cube->x, (
int)std::floor(roffset->x));
495 ub_cube.x = std::max(ub_cube.x, (
int)std::ceil(roffset->x));
497 lb_cube->y = std::min(lb_cube->y, (
int)std::floor(roffset->y));
498 ub_cube.y = std::max(ub_cube.y, (
int)std::ceil(roffset->y));
500 lb_cube->z = std::min(lb_cube->z, (
int)std::floor(roffset->z));
501 ub_cube.z = std::max(ub_cube.z, (
int)std::ceil(roffset->z));
506 cube_size->x = ub_cube.x - lb_cube->x;
507 cube_size->y = ub_cube.y - lb_cube->y;
508 cube_size->z = ub_cube.z - lb_cube->z;
// Offset of the cube center from the point's lattice coordinates.
512 roffset->x = cubecenter->x - rp1.x;
513 roffset->y = cubecenter->y - rp1.y;
514 roffset->z = cubecenter->z - rp1.z;
// Re-express the lower bound relative to the cube center.
518 lb_cube->x -= cubecenter->x;
519 lb_cube->y -= cubecenter->y;
520 lb_cube->z -= cubecenter->z;
// Computes the valid index window [window_shift, window_size] of the local
// grid given the halo border widths and a bit mask selecting which of the
// six faces are borders (bit i set => shrink that face). Note grid_size is
// indexed in reverse (z, y, x) order relative to the int3 components.
// NOTE(review): the signature start (grid_size parameter and function
// name) and the initialization of window_shift are not visible in this
// chunk — see the Doxygen summary for the full signature.
527 const int border_mask,
528 const int *border_width,
529 int3 *
const window_size,
530 int3 *
const window_shift) {
535 window_size->x = grid_size[2] - 1;
536 window_size->y = grid_size[1] - 1;
537 window_size->z = grid_size[0] - 1;
// Bits 0/1: low/high x face; 2/3: y; 4/5: z.
539 if (border_mask & (1 << 0))
540 window_shift->x += border_width[2];
541 if (border_mask & (1 << 1))
542 window_size->x -= border_width[2];
543 if (border_mask & (1 << 2))
544 window_shift->y += border_width[1];
545 if (border_mask & (1 << 3))
546 window_size->y -= border_width[1];
547 if (border_mask & (1 << 4))
548 window_shift->z += border_width[0];
549 if (border_mask & (1 << 5))
550 window_size->z -= border_width[0];
// Transforms coefficients C_ab into C_xyz (per Doxygen summary) using the
// precomputed alpha expansion coefficients; alpha is laid out with strides
// s1/s2/s3 as in compute_alpha. Work is distributed over the thread block.
// NOTE(review): this chunk is missing the function name line, the loop
// headers around the reduction, and the write of `reg` into cxyz — do not
// edit without consulting the complete file.
556 template <
typename T>
557 __device__ __inline__
static void
559 const T *__restrict__ cab, T *__restrict__ cxyz) {
575 const int s3 = (task.
lp + 1);
576 const int s2 = (task.
la_max + 1) * s3;
577 const int s1 = (task.
lb_max + 1) * s2;
// Flat thread index (assignment target lost in extraction).
581 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
584 i += blockDim.x * blockDim.y * blockDim.z) {
// Product of the three per-direction alpha coefficients for this
// (a, b, co) index combination.
592 alpha[0 * s1 +
b.l[0] * s2 +
a.l[0] * s3 +
co.l[0]] *
593 alpha[1 * s1 +
b.l[1] * s2 +
a.l[1] * s3 +
co.l[1]] *
594 alpha[2 * s1 +
b.l[2] * s2 +
a.l[2] * s3 +
co.l[2]];
// Accumulate the weighted cab element (n1 is cab's row stride).
595 reg +=
p * cab[jco * task.
n1 + ico];
// Transforms coefficients C_xyz into C_ab (per Doxygen summary) — the
// inverse direction of cab_to_cxyz, again via the alpha coefficients with
// strides s1/s2/s3. The jco/ico loops are strided by 8 across the flat
// thread index tid.
// NOTE(review): this chunk is missing the function name line, the tid
// declaration target, and the body that accumulates into `reg` — do not
// edit without consulting the complete file.
607 template <
typename T>
608 __device__ __inline__
static void
610 const T *__restrict__ cxyz, T *__restrict__ cab) {
626 const int s3 = (task.
lp + 1);
627 const int s2 = (task.
la_max + 1) * s3;
628 const int s1 = (task.
lb_max + 1) * s2;
// Flat thread index (assignment target lost in extraction).
631 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
632 for (
int jco = tid / 8; jco <
ncoset(task.
lb_max); jco += 8) {
634 for (
int ico = tid % 8; ico <
ncoset(task.
la_max); ico += 8) {
637 for (
int ic = 0; ic <
ncoset(task.
lp); ic++) {
// Product of the three per-direction alpha coefficients.
640 alpha[
b.l[0] * s2 +
a.l[0] * s3 +
co.l[0]] *
641 alpha[s1 +
b.l[1] * s2 +
a.l[1] * s3 +
co.l[1]] *
642 alpha[2 * s1 +
b.l[2] * s2 +
a.l[2] * s3 +
co.l[2]];
// Store the reduced value (n1 is cab's row stride).
646 cab[jco * task.
n1 + ico] = reg;
// Copies a task from global to shared memory (per Doxygen summary),
// reduced variant: only the fields needed by the grid kernels. A single
// thread (0,0,0) performs the copy; the caller is responsible for the
// subsequent block synchronization.
// NOTE(review): the function name line and several copied fields are
// missing from this chunk — do not edit without consulting the complete
// file.
659 template <
typename T,
typename T3>
660 __device__ __inline__
void
663 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
664 const auto &glb_task = dev.
tasks[task_id];
// Combined exponent of the product Gaussian.
665 task.
zetp = glb_task.zeta + glb_task.zetb;
666 task.
radius = glb_task.radius;
681 task.
roffset.x = glb_task.roffset.x;
682 task.
roffset.y = glb_task.roffset.y;
683 task.
roffset.z = glb_task.roffset.z;
685 task.
lb_cube.x = glb_task.lb_cube.x;
686 task.
lb_cube.y = glb_task.lb_cube.y;
687 task.
lb_cube.z = glb_task.lb_cube.z;
// Copies a task from global to shared memory and does precomputations (per
// Doxygen summary) — full variant including basis-set metadata and block
// offsets. A single thread (0,0,0) performs the copy; the caller is
// responsible for the subsequent block synchronization.
// NOTE(review): the function name line and many intermediate lines
// (rb computation, la/lb adjustments, uses of la/lb_*_basis and
// block_offset, the ptr_dev branch bodies) are missing from this chunk —
// do not edit without consulting the complete file.
700 template <
typename T>
704 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
705 const auto &glb_task = dev.
tasks[task_id];
706 const int iatom = glb_task.
iatom;
707 const int jatom = glb_task.jatom;
708 task.
zeta = glb_task.zeta;
709 task.
zetb = glb_task.zetb;
// Copy atom positions and their separation vector component-wise.
711 for (
int i = 0;
i < 3;
i++) {
712 task.
rab[
i] = glb_task.rab[
i];
713 task.
ra[
i] = glb_task.ra[
i];
// Angular momentum bounds from the basis sets (later adjusted — the
// adjustment lines are not visible in this chunk).
725 const int la_max_basis = glb_task.la_max;
726 const int lb_max_basis = glb_task.lb_max;
727 const int la_min_basis = glb_task.la_min;
728 const int lb_min_basis = glb_task.lb_min;
749 task.
nsgfa = glb_task.nsgfa;
750 task.
nsgfb = glb_task.nsgfb;
757 task.
maxcoa = glb_task.maxcoa;
758 task.
maxcob = glb_task.maxcob;
// Offset of this task's matrix block within the packed block buffer.
767 const int block_offset = dev.
block_offsets[glb_task.block_num];
// Optional device buffers; branch bodies not visible in this chunk.
771 if (dev.
ptr_dev[3] !=
nullptr) {
773 if (dev.
ptr_dev[4] !=
nullptr) {
// Fragment of the smem_parameters helper class: computes the shared-memory
// budget per block for the collocate/integrate kernels from the maximum
// angular momenta, and aborts with a diagnostic if it exceeds the 64 KiB
// hardware limit.
// NOTE(review): the class header, constructor signature and several member
// declarations are missing from this chunk — do not edit without
// consulting the complete file.
786 int smem_per_block_{0};
// Highest combined angular momentum of the product Gaussian.
799 lp_max_ = la_max_ + lb_max_;
// Size of the alpha coefficient array: [3][lb_max+1][la_max+1][lp_max+1].
802 alpha_len_ = 3 * (lb_max_ + 1) * (la_max_ + 1) * (lp_max_ + 1);
803 smem_per_block_ = std::max(alpha_len_, 64) *
sizeof(double);
// 64 KiB is the per-block shared memory limit assumed here.
805 if (smem_per_block_ > 64 * 1024) {
807 "ERROR: Not enough shared memory in grid_gpu_collocate.\n");
808 fprintf(stderr,
"cab_len: %i, ", cab_len_);
809 fprintf(stderr,
"alpha_len: %i, ", alpha_len_);
810 fprintf(stderr,
"total smem_per_block: %f kb\n\n",
811 smem_per_block_ / 1024.0);
// Accessors for the computed limits.
828 inline int lp_diff()
const {
return lp_diff_; }
832 inline int lp_max()
const {
return lp_max_; }
int smem_alpha_offset() const
int smem_per_block() const
smem_parameters(const ldiffs_value ldiffs, const int lmax)
ldiffs_value ldiffs() const
int smem_cab_offset() const
int smem_cxyz_offset() const
static void const int const int i
integer, dimension(:, :, :), allocatable, public co
__inline__ __device__ void compute_window_size(const int *const grid_size, const int border_mask, const int *border_width, int3 *const window_size, int3 *const window_shift)
__host__ __inline__ __device__ void convert_from_lattice_coordinates_to_cartesian(const double *__restrict__ dh_, const double3 *__restrict__ const rp, double3 *__restrict__ rp_c)
__device__ static __inline__ void cab_to_cxyz(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cab, T *__restrict__ cxyz)
Transforms coefficients C_ab into C_xyz.
__host__ __device__ __inline__ int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
__device__ __inline__ void prep_term(const orbital a, const orbital b, const T value, const int n, T *cab)
Adds given value to matrix element cab[idx(b)][idx(a)].
__device__ __inline__ void fill_smem_task_coef(const kernel_params &dev, const int task_id, smem_task< T > &task)
Copies a task from global to shared memory and does precomputations.
__device__ __inline__ double fac(const int i)
Factorial function, e.g. fac(5) = 5! = 120.
__device__ __inline__ double power(const double x, const int expo)
__host__ __device__ __inline__ int coset(int lx, int ly, int lz)
Maps three angular momentum components to a single zero based index.
__inline__ __device__ orbital down(const int i, const orbital &a)
Decrease i'th component of given orbital angular momentum.
__host__ __inline__ __device__ void convert_to_lattice_coordinates(const double *dh_inv_, const double3 *__restrict__ const rp, double3 *__restrict__ rp_c)
static void init_constant_memory()
Initializes the device's constant memory.
__device__ static __inline__ void cxyz_to_cab(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cxyz, T *__restrict__ cab)
Transforms coefficients C_xyz into C_ab.
__inline__ __device__ int idx(const orbital a)
Return coset index of given orbital angular momentum.
__constant__ orbital coset_inv[1330]
__device__ __inline__ orbital up(const int i, const orbital &a)
Increase i'th component of given orbital angular momentum.
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
__device__ __inline__ void fill_smem_task_reduced(const kernel_params &dev, const int task_id, smem_task_reduced< T, T3 > &task)
Copies a task from global to shared memory.
__inline__ __device__ double3 compute_coordinates(const double *__restrict__ dh_, const double x, const double y, const double z)
__constant__ int binomial_coef[19][19]
__inline__ T compute_cube_properties(const T radius, const T *const __restrict__ dh_, const T *const __restrict__ dh_inv_, const T3 *__restrict__ rp, T3 *__restrict__ roffset, int3 *__restrict__ cubecenter, int3 *__restrict__ lb_cube, int3 *__restrict__ cube_size)
Parameters of the collocate kernel.
Differences in angular momentum.
Orbital angular momentum.
data needed for collocate and integrate kernels
data needed for calculating the coefficients, forces, and stress
unsigned short int nsgf_seta
unsigned short int nsgf_setb