15#if defined(__AVX2__) && defined(__FMA__)
19#include "../common/grid_common.h"
20#include "../common/grid_library.h"
21#include "../common/grid_sphere_cache.h"
23#define GRID_MAX_LP_OPTIMIZED 9
25#if (GRID_DO_COLLOCATE)
26#define GRID_CONST_WHEN_COLLOCATE const
27#define GRID_CONST_WHEN_INTEGRATE
29#define GRID_CONST_WHEN_COLLOCATE
30#define GRID_CONST_WHEN_INTEGRATE const
38ortho_cx_to_grid_scalar(const
int lp, const
int cmax, const
int i,
39 const
double pol[3][lp + 1][2 *
cmax + 1],
46#if (GRID_DO_COLLOCATE)
48 double reg[4] = {0.0, 0.0, 0.0, 0.0};
49#pragma omp simd reduction(+ : reg)
50 for (
int lxp = 0; lxp <= lp; lxp++) {
51 const double p =
pol[0][lxp][
i +
cmax];
52 reg[0] +=
cx[lxp * 4 + 0] * p;
53 reg[1] +=
cx[lxp * 4 + 1] * p;
54 reg[2] +=
cx[lxp * 4 + 2] * p;
55 reg[3] +=
cx[lxp * 4 + 3] * p;
66 for (
int lxp = 0; lxp <= lp; lxp++) {
67 const double p =
pol[0][lxp][
i +
cmax];
68 cx[lxp * 4 + 0] += reg[0] * p;
69 cx[lxp * 4 + 1] += reg[1] * p;
70 cx[lxp * 4 + 2] += reg[2] * p;
71 cx[lxp * 4 + 3] += reg[3] * p;
81#if defined(__AVX2__) && defined(__FMA__)
83ortho_cx_to_grid_avx2(
const int lp,
const int cmax,
const int i,
84 const double pol[3][lp + 1][2 *
cmax + 1],
91 const int icmax =
i +
cmax;
93#if (GRID_DO_COLLOCATE)
96 __m256d p_vec = _mm256_loadu_pd(&
pol[0][0][icmax]);
97 __m256d r_vec_0 = _mm256_mul_pd(p_vec, _mm256_set1_pd(
cx[0]));
98 __m256d r_vec_1 = _mm256_mul_pd(p_vec, _mm256_set1_pd(
cx[1]));
99 __m256d r_vec_2 = _mm256_mul_pd(p_vec, _mm256_set1_pd(
cx[2]));
100 __m256d r_vec_3 = _mm256_mul_pd(p_vec, _mm256_set1_pd(
cx[3]));
104 for (
int lxp = 1; lxp <= lp; lxp++) {
105 const double *cx_base = &
cx[lxp * 4];
106 p_vec = _mm256_loadu_pd(&
pol[0][lxp][icmax]);
107 r_vec_0 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[0]), r_vec_0);
108 r_vec_1 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[1]), r_vec_1);
109 r_vec_2 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[2]), r_vec_2);
110 r_vec_3 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[3]), r_vec_3);
114 _mm256_storeu_pd(
grid_0, _mm256_add_pd(_mm256_loadu_pd(
grid_0), r_vec_0));
115 _mm256_storeu_pd(
grid_1, _mm256_add_pd(_mm256_loadu_pd(
grid_1), r_vec_1));
116 _mm256_storeu_pd(
grid_2, _mm256_add_pd(_mm256_loadu_pd(
grid_2), r_vec_2));
117 _mm256_storeu_pd(
grid_3, _mm256_add_pd(_mm256_loadu_pd(
grid_3), r_vec_3));
121 __m256d grid_vec_0 = _mm256_loadu_pd(
grid_0);
122 __m256d grid_vec_1 = _mm256_loadu_pd(
grid_1);
123 __m256d grid_vec_2 = _mm256_loadu_pd(
grid_2);
124 __m256d grid_vec_3 = _mm256_loadu_pd(
grid_3);
127 for (
int lxp = 0; lxp <= lp; lxp++) {
128 __m256d p_vec = _mm256_loadu_pd(&
pol[0][lxp][icmax]);
131 __m256d xy0 = _mm256_mul_pd(p_vec, grid_vec_0);
132 __m256d xy1 = _mm256_mul_pd(p_vec, grid_vec_1);
133 __m256d xy2 = _mm256_mul_pd(p_vec, grid_vec_2);
134 __m256d xy3 = _mm256_mul_pd(p_vec, grid_vec_3);
137 __m256d temp01 = _mm256_hadd_pd(xy0, xy1);
140 __m256d temp23 = _mm256_hadd_pd(xy2, xy3);
143 __m256d swapped = _mm256_permute2f128_pd(temp01, temp23, 0x21);
146 __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);
148 __m256d r_vec = _mm256_add_pd(swapped, blended);
151 double *cx_base = &
cx[lxp * 4];
152 _mm256_storeu_pd(cx_base, _mm256_add_pd(r_vec, _mm256_loadu_pd(cx_base)));
165 const
double pol[3][lp + 1][2 *
cmax + 1],
172 const int lb = *((*sphere_bounds_iter)++);
173 const int ub = 1 - lb;
178 for (
int istart = lb; istart <=
ub; istart++) {
180 const int cube2grid =
map[0][istart +
cmax] - istart;
193#if defined(__AVX2__) && defined(__FMA__)
194 const int istop_vec = istart + 4 * ((istop - istart + 1) / 4) - 1;
195 for (
int i = istart;
i <= istop_vec;
i += 4) {
196 const int ig =
i + cube2grid;
197 ortho_cx_to_grid_avx2(lp,
cmax,
i,
pol,
cx, &grid_base_0[ig],
198 &grid_base_1[ig], &grid_base_2[ig],
201 istart = istop_vec + 1;
205 for (
int i = istart;
i <= istop;
i++) {
206 const int ig =
i + cube2grid;
207 ortho_cx_to_grid_scalar(lp,
cmax,
i,
pol,
cx, &grid_base_0[ig],
208 &grid_base_1[ig], &grid_base_2[ig],
221 const
double pol[3][lp + 1][2 *
cmax + 1],
225 for (
int lyp = 0; lyp <= lp; lyp++) {
226 for (
int lxp = 0; lxp <= lp - lyp; lxp++) {
227 const double p1 =
pol[1][lyp][
j1 +
cmax];
228 const double p2 =
pol[1][lyp][
j2 +
cmax];
229 const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2;
231#if (GRID_DO_COLLOCATE)
233 cx[lxp * 4 + 0] +=
cxy[cxy_index + 0] * p1;
234 cx[lxp * 4 + 1] +=
cxy[cxy_index + 1] * p1;
235 cx[lxp * 4 + 2] +=
cxy[cxy_index + 0] * p2;
236 cx[lxp * 4 + 3] +=
cxy[cxy_index + 1] * p2;
239 cxy[cxy_index + 0] +=
cx[lxp * 4 + 0] * p1;
240 cxy[cxy_index + 1] +=
cx[lxp * 4 + 1] * p1;
241 cxy[cxy_index + 0] +=
cx[lxp * 4 + 2] * p2;
242 cxy[cxy_index + 1] +=
cx[lxp * 4 + 3] * p2;
252static inline void __attribute__((always_inline)) ortho_cxy_to_grid_low(
253 const int lp,
const int j1,
const int j2,
const int kg1,
const int kg2,
255 const double pol[3][lp + 1][2 *
cmax + 1],
const int map[3][2 *
cmax + 1],
260#if (GRID_DO_COLLOCATE)
263 ortho_cx_to_grid(lp,
kg1,
kg2,
jg1,
jg2,
cmax,
pol,
map,
sections,
npts_local,
267 ortho_cx_to_grid(lp,
kg1,
kg2,
jg1,
jg2,
cmax,
pol,
map,
sections,
npts_local,
278 const int lp,
const int kg1,
const int kg2,
const int cmax,
279 const double pol[3][lp + 1][2 *
cmax + 1],
const int map[3][2 *
cmax + 1],
289 const int jstart = *((*sphere_bounds_iter)++);
290 const size_t cx_size = (lp + 1) * 4;
292 for (
int j1 = jstart;
j1 <= 0;
j1++) {
293 const int j2 = 1 -
j1;
297 memset(
cx, 0, cx_size *
sizeof(
double));
304 ortho_cxy_to_grid_low(ilp,
j1,
j2,
kg1,
kg2,
jg1,
jg2,
cmax,
pol,
map,
310 ortho_cxy_to_grid_low(lp,
j1,
j2,
kg1,
kg2,
jg1,
jg2,
cmax,
pol,
map,
323 const double pol[3][lp + 1][2 *
cmax + 1],
327 for (
int lzp = 0; lzp <= lp; lzp++) {
328 for (
int lyp = 0; lyp <= lp - lzp; lyp++) {
329 for (
int lxp = 0; lxp <= lp - lzp - lyp; lxp++) {
330 const double p1 =
pol[2][lzp][k1 +
cmax];
331 const double p2 =
pol[2][lzp][k2 +
cmax];
332 const int cxyz_index =
333 lzp * (lp + 1) * (lp + 1) + lyp * (lp + 1) + lxp;
334 const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2;
336#if (GRID_DO_COLLOCATE)
338 cxy[cxy_index + 0] += cxyz[cxyz_index] * p1;
339 cxy[cxy_index + 1] += cxyz[cxyz_index] * p2;
342 cxyz[cxyz_index] +=
cxy[cxy_index + 0] * p1;
343 cxyz[cxyz_index] +=
cxy[cxy_index + 1] * p2;
356 const double dh_inv[3][3],
const double rp[3],
357 const int npts_global[3],
const int npts_local[3],
358 const int shift_local[3],
const double radius,
372 for (
int i = 0;
i < 3;
i++) {
373 double dh_inv_rp = 0.0;
374 for (
int j = 0; j < 3; j++) {
375 dh_inv_rp += dh_inv[j][
i] * rp[j];
377 cubecenter[
i] = (int)floor(dh_inv_rp);
381 for (
int i = 0;
i < 3;
i++) {
382 roffset[
i] = rp[
i] - ((double)cubecenter[
i]) * dh[
i][
i];
392 int lb_cube[3], ub_cube[3];
393 for (
int i = 0;
i < 3;
i++) {
394 lb_cube[
i] = (int)ceil(-1e-8 - disr_radius * dh_inv[
i][
i]);
395 ub_cube[
i] = 1 - lb_cube[
i];
399 modulo(cubecenter[
i] + lb_cube[
i] - shift_local[
i], npts_global[
i]) -
402 assert(offset + lb_cube[
i] >= 0);
407 const int cmax =
imax(
imax(ub_cube[0], ub_cube[1]), ub_cube[2]);
410 double pol_mutable[3][lp + 1][2 *
cmax + 1];
411 for (
int idir = 0; idir < 3; idir++) {
412 const double dr = dh[idir][idir];
413 const double ro = roffset[idir];
417 const double t_exp_1 = exp(-zetp * pow(dr, 2));
418 const double t_exp_2 = pow(t_exp_1, 2);
419 double t_exp_min_1 = exp(-zetp * pow(+dr - ro, 2));
420 double t_exp_min_2 = exp(-2 * zetp * (+dr - ro) * (-dr));
421 for (
int ig = 0; ig >= lb_cube[idir]; ig--) {
422 const double rpg = ig * dr - ro;
423 t_exp_min_1 *= t_exp_min_2 * t_exp_1;
424 t_exp_min_2 *= t_exp_2;
425 double pg = t_exp_min_1;
426 for (
int icoef = 0; icoef <= lp; icoef++) {
427 pol_mutable[idir][icoef][ig +
cmax] = pg;
431 double t_exp_plus_1 = exp(-zetp * pow(-ro, 2));
432 double t_exp_plus_2 = exp(-2 * zetp * (-ro) * (+dr));
433 for (
int ig = 0; ig >= lb_cube[idir]; ig--) {
434 const double rpg = (1 - ig) * dr - ro;
435 t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
436 t_exp_plus_2 *= t_exp_2;
437 double pg = t_exp_plus_1;
438 for (
int icoef = 0; icoef <= lp; icoef++) {
439 pol_mutable[idir][icoef][1 - ig +
cmax] = pg;
444 const double(*
pol)[lp + 1][2 *
cmax + 1] =
445 (
const double(*)[lp + 1][2 *
cmax + 1]) pol_mutable;
448 int map_mutable[3][2 *
cmax + 1];
449 for (
int i = 0;
i < 3;
i++) {
450 for (
int k = -
cmax; k <= +
cmax; k++) {
451 map_mutable[
i][k +
cmax] =
452 modulo(cubecenter[
i] + k - shift_local[
i], npts_global[
i]);
455 const int(*
map)[2 *
cmax + 1] = (
const int(*)[2 *
cmax + 1]) map_mutable;
458 int sections_mutable[3][2 *
cmax + 1];
459 for (
int i = 0;
i < 3;
i++) {
460 for (
int kg = 2 *
cmax; kg >= 0; kg--) {
461 if (kg == 2 *
cmax ||
map[
i][kg] !=
map[
i][kg + 1] - 1) {
462 sections_mutable[
i][kg] = 0;
464 sections_mutable[
i][kg] = sections_mutable[
i][kg + 1] + 1;
469 (
const int(*)[2 *
cmax + 1]) sections_mutable;
472 const int kstart = *((*sphere_bounds_iter)++);
473 const size_t cxy_size = (lp + 1) * (lp + 1) * 2;
474 double cxy[cxy_size];
475 for (
int k1 = kstart; k1 <= 0; k1++) {
476 const int k2 = 1 - k1;
480 memset(
cxy, 0, cxy_size *
sizeof(
double));
482#if (GRID_DO_COLLOCATE)
501 const int lp,
const int jg,
const int kg,
const int ismin,
const int ismax,
502 const int npts_local[3],
const int index_min[3],
const int index_max[3],
503 const int map_i[],
const int sections_i[],
const double gp[3],
const int k,
504 const int j,
const double exp_ij[],
const double exp_jk[],
513 for (
int istart = ismin; istart <= ismax; istart++) {
514 const int istop =
imin(ismax, istart + sections_i[istart - index_min[0]]);
515 if (map_i[istart - index_min[0]] < 0) {
520 const int cube2grid = map_i[istart - index_min[0]] - istart;
521 for (
int i = istart;
i <= istop;
i++) {
522 const int ig =
i + cube2grid;
523 const double di =
i - gp[0];
525 const int stride_i = index_max[0] - index_min[0] + 1;
526 const int stride_j = index_max[1] - index_min[1] + 1;
527 const int stride_k = index_max[2] - index_min[2] + 1;
528 const int idx_ij = (j - index_min[1]) * stride_i +
i - index_min[0];
529 const int idx_jk = (k - index_min[2]) * stride_j + j - index_min[1];
530 const int idx_ki = (
i - index_min[0]) * stride_k + k - index_min[2];
541 const double gaussian =
exp_ij[idx_ij] * exp_jk[idx_jk] * exp_ki[idx_ki];
543 const int grid_index = base + ig;
546#if (GRID_DO_COLLOCATE)
549 for (
int il = 0; il <= lp; il++) {
553 grid[grid_index] += reg;
556 const double reg =
grid[grid_index];
557 for (
int il = 0; il <= lp; il++) {
576 for (
int jl = 0; jl <= lp; jl++) {
577 for (
int il = 0; il <= lp - jl; il++) {
578 const int cij_index = jl * (lp + 1) + il;
579#if (GRID_DO_COLLOCATE)
580 ci[il] += cij[cij_index] * djp;
582 cij[cij_index] += ci[il] * djp;
593static inline void __attribute__((always_inline)) general_cij_to_grid_low(
594 const int lp,
const int jg,
const int kg,
const int ismin,
const int ismax,
595 const int npts_local[3],
const int index_min[3],
const int index_max[3],
596 const int map_i[],
const int sections_i[],
const double gp[3],
const int k,
597 const int j,
const double exp_ij[],
const double exp_jk[],
598 const double exp_ki[],
const double dj,
double *ci,
602#if (GRID_DO_COLLOCATE)
606 map_i, sections_i, gp, k, j,
exp_ij, exp_jk, exp_ki, ci,
611 map_i, sections_i, gp, k, j,
exp_ij, exp_jk, exp_ki, ci,
622 const int lp,
const int k,
const int kg,
const int npts_local[3],
623 const int index_min[3],
const int index_max[3],
const int map_i[],
624 const int map_j[],
const int sections_i[],
const int sections_j[],
625 const double dh[3][3],
const double gp[3],
const double radius,
626 const double exp_ij[],
const double exp_jk[],
const double exp_ki[],
630 for (
int j = index_min[1]; j <= index_max[1]; j++) {
631 const int jg = map_j[j - index_min[1]];
633 j += sections_j[j - index_min[1]];
656 double a = 0.0,
b = 0.0,
c = 0.0;
657 for (
int i = 0;
i < 3;
i++) {
658 const double v = (0 - gp[0]) * dh[0][
i] + (j - gp[1]) * dh[1][
i] +
659 (k - gp[2]) * dh[2][
i];
660 a += dh[0][
i] * dh[0][
i];
661 b += 2.0 * v * dh[0][
i];
664 const double d =
b *
b - 4.0 *
a * (
c - radius * radius);
667 const double sqrt_d = sqrt(d);
668 const double inv_2a = 1.0 / (2.0 *
a);
669 const int ismin = (int)ceil((-b - sqrt_d) * inv_2a);
670 const int ismax = (int)floor((-b + sqrt_d) * inv_2a);
671 const double dj = j - gp[1];
674 memset(ci, 0,
sizeof(ci));
681 general_cij_to_grid_low(ilp, jg, kg, ismin, ismax,
npts_local,
682 index_min, index_max, map_i, sections_i, gp,
683 k, j,
exp_ij, exp_jk, exp_ki, dj, ci, cij,
688 general_cij_to_grid_low(lp, jg, kg, ismin, ismax,
npts_local, index_min,
689 index_max, map_i, sections_i, gp, k, j,
exp_ij,
690 exp_jk, exp_ki, dj, ci, cij,
grid);
704 for (
int kl = 0; kl <= lp; kl++) {
705 for (
int jl = 0; jl <= lp - kl; jl++) {
706 for (
int il = 0; il <= lp - kl - jl; il++) {
707 const int cij_index = jl * (lp + 1) + il;
708 const int cijk_index =
709 kl * (lp + 1) * (lp + 1) + jl * (lp + 1) + il;
710#if (GRID_DO_COLLOCATE)
711 cij[cij_index] += cijk[cijk_index] * dkp;
713 cijk[cijk_index] += cij[cij_index] * dkp;
727 const int shift_local,
const int npts_global,
731 for (
int k = index_min; k <= index_max; k++) {
732 const int kg =
modulo(k - shift_local, npts_global);
733 if (bounds[0] <= kg && kg <= bounds[1]) {
734 map[k - index_min] = kg;
736 map[k - index_min] = INT_MIN;
741 const int range = index_max - index_min + 1;
742 for (
int kg = range - 1; kg >= 0; kg--) {
743 if (kg == range - 1 ||
map[kg] !=
map[kg + 1] - 1) {
757 const int index_max[3],
const double zetp,
758 const double dh[3][3],
const double gp[3],
759 double exp_table[]) {
761 const int stride_i = index_max[idir] - index_min[idir] + 1;
762 const double h_ii = dh[idir][0] * dh[idir][0] + dh[idir][1] * dh[idir][1] +
763 dh[idir][2] * dh[idir][2];
764 const double h_ij = dh[idir][0] * dh[jdir][0] + dh[idir][1] * dh[jdir][1] +
765 dh[idir][2] * dh[jdir][2];
767 for (
int i = index_min[idir];
i <= index_max[idir];
i++) {
768 const double di =
i - gp[idir];
769 const double rii = di * di * h_ii;
770 const double rij_unit = di * h_ij;
771 const double exp_ij_unit = exp(-zetp * 2.0 * rij_unit);
774 const int j_center = (int)gp[jdir];
775 const double dj_center = j_center - gp[jdir];
776 const double rij_center = dj_center * rij_unit;
777 const double exp_ij_center = exp(-zetp * (rii + 2.0 * rij_center));
780 double exp_ij = exp_ij_center;
781 for (
int j = j_center; j <= index_max[jdir]; j++) {
782 const int idx = (j - index_min[jdir]) * stride_i +
i - index_min[idir];
788 const double exp_ij_unit_inv = 1.0 / exp_ij_unit;
789 exp_ij = exp_ij_center * exp_ij_unit_inv;
790 for (
int j = j_center - 1; j >= index_min[jdir]; j--) {
791 const int idx = (j - index_min[jdir]) * stride_i +
i - index_min[idir];
793 exp_ij *= exp_ij_unit_inv;
804 const double dh[3][3],
const double dh_inv[3][3],
805 const double rp[3],
const int npts_global[3],
806 const int npts_local[3],
const int shift_local[3],
807 const int border_width[3],
const double radius,
818 if (border_mask & (1 << 0))
819 bounds_i[0] += border_width[0];
820 if (border_mask & (1 << 1))
821 bounds_i[1] -= border_width[0];
822 if (border_mask & (1 << 2))
823 bounds_j[0] += border_width[1];
824 if (border_mask & (1 << 3))
825 bounds_j[1] -= border_width[1];
826 if (border_mask & (1 << 4))
827 bounds_k[0] += border_width[2];
828 if (border_mask & (1 << 5))
829 bounds_k[1] -= border_width[2];
833 double gp[3] = {0.0, 0.0, 0.0};
834 for (
int i = 0;
i < 3;
i++) {
835 for (
int j = 0; j < 3; j++) {
836 gp[
i] += dh_inv[j][
i] * rp[j];
844 int index_min[3] = {INT_MAX, INT_MAX, INT_MAX};
845 int index_max[3] = {INT_MIN, INT_MIN, INT_MIN};
846 for (
int i = -1;
i <= 1;
i++) {
847 for (
int j = -1; j <= 1; j++) {
848 for (
int k = -1; k <= 1; k++) {
849 const double x = rp[0] +
i * radius;
850 const double y = rp[1] + j * radius;
851 const double z = rp[2] + k * radius;
852 for (
int idir = 0; idir < 3; idir++) {
854 dh_inv[0][idir] * x + dh_inv[1][idir] * y + dh_inv[2][idir] * z;
855 index_min[idir] =
imin(index_min[idir], (
int)floor(resc));
856 index_max[idir] =
imax(index_max[idir], (
int)ceil(resc));
863 const int range_i = index_max[0] - index_min[0] + 1;
864 int map_i[range_i], sections_i[range_i];
866 npts_global[0], bounds_i, map_i, sections_i);
867 const int range_j = index_max[1] - index_min[1] + 1;
868 int map_j[range_j], sections_j[range_j];
870 npts_global[1], bounds_j, map_j, sections_j);
871 const int range_k = index_max[2] - index_min[2] + 1;
872 int map_k[range_k], sections_k[range_k];
874 npts_global[2], bounds_k, map_k, sections_k);
877 double exp_ij[range_i * range_j];
879 double exp_jk[range_j * range_k];
881 double exp_ki[range_k * range_i];
885 const int cij_size = (lp + 1) * (lp + 1);
886 double cij[cij_size];
887 for (
int k = index_min[2]; k <= index_max[2]; k++) {
888 const int kg = map_k[k - index_min[2]];
890 k += sections_k[k - index_min[2]];
895 memset(cij, 0, cij_size *
sizeof(
double));
897#if (GRID_DO_COLLOCATE)
901 map_j, sections_i, sections_j, dh, gp, radius,
exp_ij,
902 exp_jk, exp_ki, cij,
grid);
906 map_j, sections_i, sections_j, dh, gp, radius,
exp_ij,
907 exp_jk, exp_ki, cij,
grid);
927 double hmatgridp[lp + 1][3][3];
928 for (
int i = 0;
i < 3;
i++) {
929 for (
int j = 0; j < 3; j++) {
930 hmatgridp[0][j][
i] = 1.0;
931 for (
int k = 1; k <= lp; k++) {
932 hmatgridp[k][j][
i] = hmatgridp[k - 1][j][
i] * dh[j][
i];
938 for (
int klx = 0; klx <= lpx; klx++) {
939 for (
int jlx = 0; jlx <= lpx - klx; jlx++) {
940 for (
int ilx = 0; ilx <= lpx - klx - jlx; ilx++) {
941 const int lx = ilx + jlx + klx;
942 const int lpy = lp - lx;
943 for (
int kly = 0; kly <= lpy; kly++) {
944 for (
int jly = 0; jly <= lpy - kly; jly++) {
945 for (
int ily = 0; ily <= lpy - kly - jly; ily++) {
946 const int ly = ily + jly + kly;
947 const int lpz = lp - lx - ly;
948 for (
int klz = 0; klz <= lpz; klz++) {
949 for (
int jlz = 0; jlz <= lpz - klz; jlz++) {
950 for (
int ilz = 0; ilz <= lpz - klz - jlz; ilz++) {
951 const int lz = ilz + jlz + klz;
952 const int il = ilx + ily + ilz;
953 const int jl = jlx + jly + jlz;
954 const int kl = klx + kly + klz;
955 const int lp1 = lp + 1;
956 const int cijk_index =
957 kl * lp1 * lp1 + jl * lp1 + il;
958 const int cxyz_index =
959 lz * lp1 * lp1 + ly * lp1 + lx;
961 hmatgridp[ilx][0][0] * hmatgridp[jlx][1][0] *
962 hmatgridp[klx][2][0] * hmatgridp[ily][0][1] *
963 hmatgridp[jly][1][1] * hmatgridp[kly][2][1] *
964 hmatgridp[ilz][0][2] * hmatgridp[jlz][1][2] *
965 hmatgridp[klz][2][2] *
fac(lx) *
fac(ly) *
fac(lz) /
968#if (GRID_DO_COLLOCATE)
969 cijk[cijk_index] += cxyz[cxyz_index] *
p;
971 cxyz[cxyz_index] += cijk[cijk_index] *
p;
990 const double dh[3][3],
const double dh_inv[3][3],
991 const double rp[3],
const int npts_global[3],
992 const int npts_local[3],
const int shift_local[3],
993 const int border_width[3],
const double radius,
997 const size_t cijk_size = (lp + 1) * (lp + 1) * (lp + 1);
998 double cijk[cijk_size];
999 memset(cijk, 0, cijk_size *
sizeof(
double));
1001#if (GRID_DO_COLLOCATE)
1005 npts_local, shift_local, border_width, radius, cijk,
1010 npts_local, shift_local, border_width, radius, cijk,
1021cxyz_to_grid(
const bool orthorhombic,
const int border_mask,
const int lp,
1022 const double zetp,
const double dh[3][3],
1023 const double dh_inv[3][3],
const double rp[3],
1024 const int npts_global[3],
const int npts_local[3],
1025 const int shift_local[3],
const int border_width[3],
1030 if (orthorhombic && border_mask == 0) {
1033 shift_local, radius, cxyz,
grid);
1037 npts_local, shift_local, border_width, radius, cxyz,
1047static inline void cab_to_cxyz(
const int la_max,
const int la_min,
1048 const int lb_max,
const int lb_min,
1049 const double prefactor,
const double ra[3],
1050 const double rb[3],
const double rp[3],
1056 const int lp = la_max + lb_max;
1057 double alpha[3][lb_max + 1][la_max + 1][lp + 1];
1058 memset(alpha, 0, 3 * (lb_max + 1) * (la_max + 1) * (lp + 1) *
sizeof(
double));
1060 for (
int i = 0;
i < 3;
i++) {
1061 const double drpa = rp[
i] - ra[
i];
1062 const double drpb = rp[
i] - rb[
i];
1063 for (
int lxa = 0; lxa <= la_max; lxa++) {
1064 for (
int lxb = 0; lxb <= lb_max; lxb++) {
1065 double binomial_k_lxa = 1.0;
1067 for (
int k = 0; k <= lxa; k++) {
1068 double binomial_l_lxb = 1.0;
1070 for (
int l = 0; l <= lxb; l++) {
1071 alpha[
i][lxb][lxa][lxa - l + lxb - k] +=
1072 binomial_k_lxa * binomial_l_lxb *
a *
b;
1073 binomial_l_lxb *= ((double)(lxb - l)) / ((
double)(l + 1));
1076 binomial_k_lxa *= ((double)(lxa - k)) / ((
double)(k + 1));
1097 for (
int lzb = 0; lzb <= lb_max; lzb++) {
1098 for (
int lza = 0; lza <= la_max; lza++) {
1099 for (
int lyb = 0; lyb <= lb_max - lzb; lyb++) {
1100 for (
int lya = 0; lya <= la_max - lza; lya++) {
1101 const int lxb_min =
imax(lb_min - lzb - lyb, 0);
1102 const int lxa_min =
imax(la_min - lza - lya, 0);
1103 for (
int lxb = lxb_min; lxb <= lb_max - lzb - lyb; lxb++) {
1104 for (
int lxa = lxa_min; lxa <= la_max - lza - lya; lxa++) {
1105 const int ico =
coset(lxa, lya, lza);
1106 const int jco =
coset(lxb, lyb, lzb);
1107 const int cab_index = jco *
ncoset(la_max) + ico;
1108 for (
int lzp = 0; lzp <= lza + lzb; lzp++) {
1109 for (
int lyp = 0; lyp <= lp - lza - lzb; lyp++) {
1110 for (
int lxp = 0; lxp <= lp - lza - lzb - lyp; lxp++) {
1111 const double p = alpha[0][lxb][lxa][lxp] *
1112 alpha[1][lyb][lya][lyp] *
1113 alpha[2][lzb][lza][lzp] * prefactor;
1114 const int lp1 = lp + 1;
1115 const int cxyz_index =
1116 lzp * lp1 * lp1 + lyp * lp1 + lxp;
1117#if (GRID_DO_COLLOCATE)
1118 cxyz[cxyz_index] += cab[cab_index] *
p;
1120 cab[cab_index] += cxyz[cxyz_index] *
p;
1138cab_to_grid(
const bool orthorhombic,
const int border_mask,
const int la_max,
1139 const int la_min,
const int lb_max,
const int lb_min,
1140 const double zeta,
const double zetb,
const double rscale,
1141 const double dh[3][3],
const double dh_inv[3][3],
1142 const double ra[3],
const double rab[3],
const int npts_global[3],
1143 const int npts_local[3],
const int shift_local[3],
1144 const int border_width[3],
const double radius,
1149 double dh_max = 0.0;
1150 for (
int i = 0;
i < 3;
i++) {
1151 for (
int j = 0; j < 3; j++) {
1152 dh_max = fmax(dh_max, fabs(dh[
i][j]));
1155 if (2.0 * radius < dh_max) {
1159 const double zetp =
zeta + zetb;
1160 const double f = zetb / zetp;
1161 const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
1162 const double prefactor = rscale * exp(-zeta * f * rab2);
1163 double rp[3], rb[3];
1164 for (
int i = 0;
i < 3;
i++) {
1165 rp[
i] = ra[
i] + f * rab[
i];
1166 rb[
i] = ra[
i] + rab[
i];
1169 const int lp = la_max + lb_max;
1170 const size_t cxyz_size = (lp + 1) * (lp + 1) * (lp + 1);
1171 double cxyz[cxyz_size];
1172 memset(cxyz, 0, cxyz_size *
sizeof(
double));
1174#if (GRID_DO_COLLOCATE)
1176 cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
1177 cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1181 cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1183 cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
#define GRID_PRAGMA_UNROLL_UP_TO(N)
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
#define GRID_PRAGMA_UNROLL(N)
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
#define GRID_CONST_WHEN_COLLOCATE
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_2
static void const int const int const int jg1
static void const int const int j2
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_1
static void const int const int const int const int const int const double const int const int const int int ** sphere_bounds_iter
static void const int const int kg2
static void const int const int const int const int const int const double const int const int sections[3][2 *cmax+1]
static void const int const int i
static void const int const int const int const double GRID_CONST_WHEN_COLLOCATE double * cxy
static void __attribute__((always_inline)) ortho_cx_to_grid_scalar(const int lp
Simple loop body for ortho_cx_to_grid using plain C.
static void const int cmax
static void const int kg1
static void const int const int const double pol[3][lp+1][2 *cmax+1]
static void const int const int const int const int const int const double const int const int const int npts_local[3]
#define GRID_MAX_LP_OPTIMIZED
static void const int const int const int const int jg2
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid_0
for(int lxp=0;lxp<=lp;lxp++)
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_3
static void const int const int const int const int const int const double const int map[3][2 *cmax+1]
#define GRID_CONST_WHEN_INTEGRATE
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double * cx
#define GRID_DO_COLLOCATE
void exp_ij(const double alpha, const int offset_i, const int imin, const int imax, const int offset_j, const int jmin, const int jmax, tensor *exp_ij_)
void grid_library_counter_add(const int lp, const enum grid_backend backend, const enum grid_library_kernel kernel, const int increment)
Adds given increment to counter specified by lp, backend, and kernel.
grid_library_kernel
Various kernels provided by the grid library.
static void cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid.
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
static void ortho_cxy_to_grid(const int lp, const int k1, const int k2, const int cmax, const double pol[3][lp+1][2 *cmax+1], const int map[3][2 *cmax+1], const double dh[3][3], const double dh_inv[3][3], const double disr_radius, const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cxy, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xy onto the grid for orthorhombic case.
static void general_cij_to_ci(const int lp, const double dj, GRID_CONST_WHEN_COLLOCATE double *cij, GRID_CONST_WHEN_INTEGRATE double *ci)
Transforms coefficients C_ij into C_i by fixing grid index j.
static void general_cijk_to_cij(const int lp, const double dk, GRID_CONST_WHEN_COLLOCATE double *cijk, GRID_CONST_WHEN_INTEGRATE double *cij)
Transforms coefficients C_ijk into C_ij by fixing grid index k.
static void general_precompute_mapping(const int index_min, const int index_max, const int shift_local, const int npts_global, const int bounds[2], int map[])
Precompute mapping of grid indices for general case.
static void ortho_cxy_to_cx(const int lp, const int j1, const int j2, const int cmax, const double pol[3][lp+1][2 *cmax+1], GRID_CONST_WHEN_COLLOCATE double *cxy, GRID_CONST_WHEN_INTEGRATE double *cx)
Transforms coefficients C_xy into C_x by fixing grid index j.
static void general_cxyz_to_cijk(const int lp, const double dh[3][3], GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *cijk)
Transforms coefficients C_xyz into C_ijk.
static void general_cij_to_grid(const int lp, const int k, const int kg, const int npts_local[3], const int index_min[3], const int index_max[3], const int map_i[], const int map_j[], const double dh[3][3], const double gp[3], const double radius, const double exp_ij[], const double exp_jk[], const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *cij, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ij onto the grid for general case.
static void general_cijk_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cijk, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ijk onto the grid for general case.
static void cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ab onto the grid.
static void general_ci_to_grid(const int lp, const int jg, const int kg, const int ismin, const int ismax, const int npts_local[3], const int index_min[3], const int index_max[3], const int map_i[], const double gp[3], const int k, const int j, const double exp_ij[], const double exp_jk[], const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_i onto the grid for general case.
static void ortho_cx_to_grid(const int lp, const int k1, const int k2, const int j1, const int j2, const int cmax, const double pol[3][lp+1][2 *cmax+1], const int map[3][2 *cmax+1], const double dh[3][3], const double dh_inv[3][3], const double kremain, const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cx, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_x onto the grid for orthorhombic case.
static void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2, const int cmax, const double pol[3][lp+1][2 *cmax+1], GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *cxy)
Transforms coefficients C_xyz into C_xz by fixing grid index k.
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
static void general_fill_exp_table(const int idir, const int jdir, const int index_min[3], const int index_max[3], const double zetp, const double dh[3][3], const double gp[3], double exp_table[])
Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
void grid_sphere_cache_lookup(const double radius, const double dh[3][3], const double dh_inv[3][3], int **sphere_bounds, double *discr_radius)
Lookup the sphere bound from cache and compute them as needed. See grid_sphere_cache....
real(kind=dp), dimension(0:maxfac), parameter, public fac
integer, dimension(:), allocatable, public ncoset
integer, dimension(:, :, :), allocatable, public coset
integer, parameter, public gaussian