(git:6a2e663)
grid_cpu_collint.h
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: BSD-3-Clause */
6 /*----------------------------------------------------------------------------*/
7 
8 #include <assert.h>
9 #include <limits.h>
10 #include <math.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #if defined(__AVX2__) && defined(__FMA__)
16 #include <immintrin.h>
17 #endif
18 
19 #include "../common/grid_common.h"
20 #include "../common/grid_library.h"
21 #include "../common/grid_sphere_cache.h"
22 
23 #define GRID_MAX_LP_OPTIMIZED 9
24 
25 #if (GRID_DO_COLLOCATE)
26 #define GRID_CONST_WHEN_COLLOCATE const
27 #define GRID_CONST_WHEN_INTEGRATE
28 #else
29 #define GRID_CONST_WHEN_COLLOCATE
30 #define GRID_CONST_WHEN_INTEGRATE const
31 #endif
32 
33 /*******************************************************************************
34  * \brief Simple loop body for ortho_cx_to_grid using plain C.
35  * \author Ole Schuett
36  ******************************************************************************/
37 static inline void __attribute__((always_inline))
38 ortho_cx_to_grid_scalar(const int lp, const int cmax, const int i,
39  const double pol[3][lp + 1][2 * cmax + 1],
45 
46 #if (GRID_DO_COLLOCATE)
47  // collocate
48  double reg[4] = {0.0, 0.0, 0.0, 0.0};
49 #pragma omp simd reduction(+ : reg)
50  for (int lxp = 0; lxp <= lp; lxp++) {
51  const double p = pol[0][lxp][i + cmax];
52  reg[0] += cx[lxp * 4 + 0] * p;
53  reg[1] += cx[lxp * 4 + 1] * p;
54  reg[2] += cx[lxp * 4 + 2] * p;
55  reg[3] += cx[lxp * 4 + 3] * p;
56  }
57  *grid_0 += reg[0];
58  *grid_1 += reg[1];
59  *grid_2 += reg[2];
60  *grid_3 += reg[3];
61 
62 #else
63  // integrate
64  const double reg[4] = {*grid_0, *grid_1, *grid_2, *grid_3};
65 #pragma omp simd
66  for (int lxp = 0; lxp <= lp; lxp++) {
67  const double p = pol[0][lxp][i + cmax];
68  cx[lxp * 4 + 0] += reg[0] * p;
69  cx[lxp * 4 + 1] += reg[1] * p;
70  cx[lxp * 4 + 2] += reg[2] * p;
71  cx[lxp * 4 + 3] += reg[3] * p;
72  }
73 #endif
74 }
75 
76 /*******************************************************************************
77  * \brief Optimized loop body for ortho_cx_to_grid using AVX2 Intel Intrinsics.
78  * This routine always processes four consecutive grid elements at once.
79  * \author Ole Schuett
80  ******************************************************************************/
81 #if defined(__AVX2__) && defined(__FMA__)
82 static inline void __attribute__((always_inline))
83 ortho_cx_to_grid_avx2(const int lp, const int cmax, const int i,
84  const double pol[3][lp + 1][2 * cmax + 1],
90 
91  const int icmax = i + cmax;
92 
93 #if (GRID_DO_COLLOCATE)
94  // collocate
95  // First iteration for lxp == 0 does not need add instructions.
96  __m256d p_vec = _mm256_loadu_pd(&pol[0][0][icmax]);
97  __m256d r_vec_0 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[0]));
98  __m256d r_vec_1 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[1]));
99  __m256d r_vec_2 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[2]));
100  __m256d r_vec_3 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[3]));
101 
102  // Remaining iterations for lxp > 0 use fused multiply adds.
104  for (int lxp = 1; lxp <= lp; lxp++) {
105  const double *cx_base = &cx[lxp * 4];
106  p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
107  r_vec_0 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[0]), r_vec_0);
108  r_vec_1 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[1]), r_vec_1);
109  r_vec_2 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[2]), r_vec_2);
110  r_vec_3 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[3]), r_vec_3);
111  }
112 
113  // Add vectors to grid one at a time, because they can aliase when cube wraps.
114  _mm256_storeu_pd(grid_0, _mm256_add_pd(_mm256_loadu_pd(grid_0), r_vec_0));
115  _mm256_storeu_pd(grid_1, _mm256_add_pd(_mm256_loadu_pd(grid_1), r_vec_1));
116  _mm256_storeu_pd(grid_2, _mm256_add_pd(_mm256_loadu_pd(grid_2), r_vec_2));
117  _mm256_storeu_pd(grid_3, _mm256_add_pd(_mm256_loadu_pd(grid_3), r_vec_3));
118 
119 #else
120  // integrate
121  __m256d grid_vec_0 = _mm256_loadu_pd(grid_0);
122  __m256d grid_vec_1 = _mm256_loadu_pd(grid_1);
123  __m256d grid_vec_2 = _mm256_loadu_pd(grid_2);
124  __m256d grid_vec_3 = _mm256_loadu_pd(grid_3);
125 
127  for (int lxp = 0; lxp <= lp; lxp++) {
128  __m256d p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
129 
130  // Do 4 dot products at once. https://stackoverflow.com/a/10454420
131  __m256d xy0 = _mm256_mul_pd(p_vec, grid_vec_0);
132  __m256d xy1 = _mm256_mul_pd(p_vec, grid_vec_1);
133  __m256d xy2 = _mm256_mul_pd(p_vec, grid_vec_2);
134  __m256d xy3 = _mm256_mul_pd(p_vec, grid_vec_3);
135 
136  // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
137  __m256d temp01 = _mm256_hadd_pd(xy0, xy1);
138 
139  // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
140  __m256d temp23 = _mm256_hadd_pd(xy2, xy3);
141 
142  // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
143  __m256d swapped = _mm256_permute2f128_pd(temp01, temp23, 0x21);
144 
145  // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
146  __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);
147 
148  __m256d r_vec = _mm256_add_pd(swapped, blended);
149 
150  // cx += r_vec
151  double *cx_base = &cx[lxp * 4];
152  _mm256_storeu_pd(cx_base, _mm256_add_pd(r_vec, _mm256_loadu_pd(cx_base)));
153  }
154 #endif
155 }
156 #endif // __AVX2__ && __FMA__
157 
158 /*******************************************************************************
159  * \brief Collocates coefficients C_x onto the grid for orthorhombic case.
160  * \author Ole Schuett
161  ******************************************************************************/
162 static inline void __attribute__((always_inline))
163 ortho_cx_to_grid(const int lp, const int kg1, const int kg2, const int jg1,
164  const int jg2, const int cmax,
165  const double pol[3][lp + 1][2 * cmax + 1],
166  const int map[3][2 * cmax + 1],
167  const int sections[3][2 * cmax + 1], const int npts_local[3],
170 
171  // Lower and upper sphere bounds relative to center, ie. in cube coordinates.
172  const int lb = *((*sphere_bounds_iter)++);
173  const int ub = 1 - lb;
174 
175  // AVX instructions can only load/store from evenly spaced memory locations.
176  // Since the sphere bounds can wrap around due to the grid's periodicity,
177  // the inner loop runs over sections with homogeneous cube to grid mapping.
178  for (int istart = lb; istart <= ub; istart++) {
179  const int istop = imin(ub, istart + sections[0][istart + cmax]);
180  const int cube2grid = map[0][istart + cmax] - istart;
181 
182  const int stride = npts_local[1] * npts_local[0];
183  const int grid_index_0 = kg1 * stride + jg1 * npts_local[0];
184  const int grid_index_1 = kg2 * stride + jg1 * npts_local[0];
185  const int grid_index_2 = kg1 * stride + jg2 * npts_local[0];
186  const int grid_index_3 = kg2 * stride + jg2 * npts_local[0];
187  GRID_CONST_WHEN_INTEGRATE double *grid_base_0 = &grid[grid_index_0];
188  GRID_CONST_WHEN_INTEGRATE double *grid_base_1 = &grid[grid_index_1];
189  GRID_CONST_WHEN_INTEGRATE double *grid_base_2 = &grid[grid_index_2];
190  GRID_CONST_WHEN_INTEGRATE double *grid_base_3 = &grid[grid_index_3];
191 
192  // Use AVX2 to process grid points in chunks of four, ie. 256 bit vectors.
193 #if defined(__AVX2__) && defined(__FMA__)
194  const int istop_vec = istart + 4 * ((istop - istart + 1) / 4) - 1;
195  for (int i = istart; i <= istop_vec; i += 4) {
196  const int ig = i + cube2grid;
197  ortho_cx_to_grid_avx2(lp, cmax, i, pol, cx, &grid_base_0[ig],
198  &grid_base_1[ig], &grid_base_2[ig],
199  &grid_base_3[ig]);
200  }
201  istart = istop_vec + 1;
202 #endif
203 
204  // Process up to 3 remaining points - or everything if AVX2 isn't available.
205  for (int i = istart; i <= istop; i++) {
206  const int ig = i + cube2grid;
207  ortho_cx_to_grid_scalar(lp, cmax, i, pol, cx, &grid_base_0[ig],
208  &grid_base_1[ig], &grid_base_2[ig],
209  &grid_base_3[ig]);
210  }
211  istart = istop;
212  }
213 }
214 
215 /*******************************************************************************
216  * \brief Transforms coefficients C_xy into C_x by fixing grid index j.
217  * \author Ole Schuett
218  ******************************************************************************/
219 static inline void __attribute__((always_inline))
220 ortho_cxy_to_cx(const int lp, const int j1, const int j2, const int cmax,
221  const double pol[3][lp + 1][2 * cmax + 1],
223  GRID_CONST_WHEN_INTEGRATE double *cx) {
224 
225  for (int lyp = 0; lyp <= lp; lyp++) {
226  for (int lxp = 0; lxp <= lp - lyp; lxp++) {
227  const double p1 = pol[1][lyp][j1 + cmax];
228  const double p2 = pol[1][lyp][j2 + cmax];
229  const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
230 
231 #if (GRID_DO_COLLOCATE)
232  // collocate
233  cx[lxp * 4 + 0] += cxy[cxy_index + 0] * p1;
234  cx[lxp * 4 + 1] += cxy[cxy_index + 1] * p1;
235  cx[lxp * 4 + 2] += cxy[cxy_index + 0] * p2;
236  cx[lxp * 4 + 3] += cxy[cxy_index + 1] * p2;
237 #else
238  // integrate
239  cxy[cxy_index + 0] += cx[lxp * 4 + 0] * p1;
240  cxy[cxy_index + 1] += cx[lxp * 4 + 1] * p1;
241  cxy[cxy_index + 0] += cx[lxp * 4 + 2] * p2;
242  cxy[cxy_index + 1] += cx[lxp * 4 + 3] * p2;
243 #endif
244  }
245  }
246 }
247 
248 /*******************************************************************************
249  * \brief Loop body of ortho_cxy_to_grid to be inlined for low values of lp.
250  * \author Ole Schuett
251  ******************************************************************************/
252 static inline void __attribute__((always_inline))
253 ortho_cxy_to_grid_low(const int lp, const int j1, const int j2, const int kg1,
254  const int kg2, const int jg1, const int jg2,
255  const int cmax, const double pol[3][lp + 1][2 * cmax + 1],
256  const int map[3][2 * cmax + 1],
257  const int sections[3][2 * cmax + 1],
258  const int npts_local[3], int **sphere_bounds_iter,
259  double *cx, GRID_CONST_WHEN_COLLOCATE double *cxy,
261 
262 #if (GRID_DO_COLLOCATE)
263  // collocate
264  ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
267 #else
268  // integrate
271  ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
272 #endif
273 }
274 
275 /*******************************************************************************
276  * \brief Collocates coefficients C_xy onto the grid for orthorhombic case.
277  * \author Ole Schuett
278  ******************************************************************************/
279 static inline void ortho_cxy_to_grid(
280  const int lp, const int kg1, const int kg2, const int cmax,
281  const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
282  const int sections[3][2 * cmax + 1], const int npts_local[3],
285 
286  // The cube contains an even number of grid points in each direction and
287  // collocation is always performed on a pair of two opposing grid points.
288  // Hence, the points with index 0 and 1 are both assigned distance zero via
289  // the formular distance=(2*index-1)/2.
290 
291  const int jstart = *((*sphere_bounds_iter)++);
292  const size_t cx_size = (lp + 1) * 4;
293  double cx[cx_size];
294  for (int j1 = jstart; j1 <= 0; j1++) {
295  const int j2 = 1 - j1;
296  const int jg1 = map[1][j1 + cmax];
297  const int jg2 = map[1][j2 + cmax];
298 
299  memset(cx, 0, cx_size * sizeof(double));
300 
301  // Generate separate branches for low values of lp gives up to 30% speedup.
302  if (lp <= GRID_MAX_LP_OPTIMIZED) {
304  for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
305  if (lp == ilp) {
306  ortho_cxy_to_grid_low(ilp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
308  cxy, grid);
309  }
310  }
311  } else {
312  ortho_cxy_to_grid_low(lp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
314  grid);
315  }
316  }
317 }
318 
319 /*******************************************************************************
320  * \brief Transforms coefficients C_xyz into C_xz by fixing grid index k.
321  * \author Ole Schuett
322  ******************************************************************************/
323 static inline void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2,
324  const int cmax,
325  const double pol[3][lp + 1][2 * cmax + 1],
326  GRID_CONST_WHEN_COLLOCATE double *cxyz,
327  GRID_CONST_WHEN_INTEGRATE double *cxy) {
328 
329  for (int lzp = 0; lzp <= lp; lzp++) {
330  for (int lyp = 0; lyp <= lp - lzp; lyp++) {
331  for (int lxp = 0; lxp <= lp - lzp - lyp; lxp++) {
332  const double p1 = pol[2][lzp][k1 + cmax];
333  const double p2 = pol[2][lzp][k2 + cmax];
334  const int cxyz_index =
335  lzp * (lp + 1) * (lp + 1) + lyp * (lp + 1) + lxp; // [lzp, lyp, lxp]
336  const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
337 
338 #if (GRID_DO_COLLOCATE)
339  // collocate
340  cxy[cxy_index + 0] += cxyz[cxyz_index] * p1;
341  cxy[cxy_index + 1] += cxyz[cxyz_index] * p2;
342 #else
343  // integrate
344  cxyz[cxyz_index] += cxy[cxy_index + 0] * p1;
345  cxyz[cxyz_index] += cxy[cxy_index + 1] * p2;
346 #endif
347  }
348  }
349  }
350 }
351 
352 /*******************************************************************************
353  * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
354  * \author Ole Schuett
355  ******************************************************************************/
356 static inline void
357 ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3],
358  const double dh_inv[3][3], const double rp[3],
359  const int npts_global[3], const int npts_local[3],
360  const int shift_local[3], const double radius,
361  GRID_CONST_WHEN_COLLOCATE double *cxyz,
363 
364  // *** position of the gaussian product
365  //
366  // this is the actual definition of the position on the grid
367  // i.e. a point rp(:) gets here grid coordinates
368  // MODULO(rp(:)/dr(:),npts_global(:))+1
369  // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
370  // and (1,1,1) on grid here in C.
371 
372  // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
373  int cubecenter[3];
374  for (int i = 0; i < 3; i++) {
375  double dh_inv_rp = 0.0;
376  for (int j = 0; j < 3; j++) {
377  dh_inv_rp += dh_inv[j][i] * rp[j];
378  }
379  cubecenter[i] = (int)floor(dh_inv_rp);
380  }
381 
382  double roffset[3];
383  for (int i = 0; i < 3; i++) {
384  roffset[i] = rp[i] - ((double)cubecenter[i]) * dh[i][i];
385  }
386 
387  // Lookup loop bounds for spherical cutoff.
388  int *sphere_bounds;
389  double disr_radius;
390  grid_sphere_cache_lookup(radius, dh, dh_inv, &sphere_bounds, &disr_radius);
391  int **sphere_bounds_iter = &sphere_bounds;
392 
393  // Cube bounds.
394  int lb_cube[3], ub_cube[3];
395  for (int i = 0; i < 3; i++) {
396  lb_cube[i] = (int)ceil(-1e-8 - disr_radius * dh_inv[i][i]);
397  ub_cube[i] = 1 - lb_cube[i];
398  // If grid is not period check that cube fits without wrapping.
399  if (npts_global[i] != npts_local[i]) {
400  const int offset =
401  modulo(cubecenter[i] + lb_cube[i] - shift_local[i], npts_global[i]) -
402  lb_cube[i];
403  assert(offset + ub_cube[i] < npts_local[i]);
404  assert(offset + lb_cube[i] >= 0);
405  }
406  }
407 
408  // cmax = MAXVAL(ub_cube)
409  const int cmax = imax(imax(ub_cube[0], ub_cube[1]), ub_cube[2]);
410 
411  // Precompute (x-xp)**lp*exp(..) for each direction.
412  double pol_mutable[3][lp + 1][2 * cmax + 1];
413  for (int idir = 0; idir < 3; idir++) {
414  const double dr = dh[idir][idir];
415  const double ro = roffset[idir];
416  // Reuse the result from the previous gridpoint to avoid to many exps:
417  // exp( -a*(x+d)**2) = exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
418  // exp(-2*a*(x+d)*d) = exp(-2*a*x*d)*exp(-2*a*d**2)
419  const double t_exp_1 = exp(-zetp * pow(dr, 2));
420  const double t_exp_2 = pow(t_exp_1, 2);
421  double t_exp_min_1 = exp(-zetp * pow(+dr - ro, 2));
422  double t_exp_min_2 = exp(-2 * zetp * (+dr - ro) * (-dr));
423  for (int ig = 0; ig >= lb_cube[idir]; ig--) {
424  const double rpg = ig * dr - ro;
425  t_exp_min_1 *= t_exp_min_2 * t_exp_1;
426  t_exp_min_2 *= t_exp_2;
427  double pg = t_exp_min_1;
428  for (int icoef = 0; icoef <= lp; icoef++) {
429  pol_mutable[idir][icoef][ig + cmax] = pg; // exp(-zetp*rpg**2)
430  pg *= rpg;
431  }
432  }
433  double t_exp_plus_1 = exp(-zetp * pow(-ro, 2));
434  double t_exp_plus_2 = exp(-2 * zetp * (-ro) * (+dr));
435  for (int ig = 0; ig >= lb_cube[idir]; ig--) {
436  const double rpg = (1 - ig) * dr - ro;
437  t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
438  t_exp_plus_2 *= t_exp_2;
439  double pg = t_exp_plus_1;
440  for (int icoef = 0; icoef <= lp; icoef++) {
441  pol_mutable[idir][icoef][1 - ig + cmax] = pg; // exp(-zetp*rpg**2)
442  pg *= rpg;
443  }
444  }
445  }
446  const double(*pol)[lp + 1][2 * cmax + 1] =
447  (const double(*)[lp + 1][2 * cmax + 1]) pol_mutable;
448 
449  // Precompute mapping from cube to grid indices for each direction
450  int map_mutable[3][2 * cmax + 1];
451  for (int i = 0; i < 3; i++) {
452  for (int k = -cmax; k <= +cmax; k++) {
453  map_mutable[i][k + cmax] =
454  modulo(cubecenter[i] + k - shift_local[i], npts_global[i]);
455  }
456  }
457  const int(*map)[2 * cmax + 1] = (const int(*)[2 * cmax + 1]) map_mutable;
458 
459  // Precompute length of sections with homogeneous cube to grid mapping.
460  int sections_mutable[3][2 * cmax + 1];
461  for (int i = 0; i < 3; i++) {
462  for (int kg = 2 * cmax; kg >= 0; kg--) {
463  if (kg == 2 * cmax || map[i][kg] != map[i][kg + 1] - 1) {
464  sections_mutable[i][kg] = 0;
465  } else {
466  sections_mutable[i][kg] = sections_mutable[i][kg + 1] + 1;
467  }
468  }
469  }
470  const int(*sections)[2 * cmax + 1] =
471  (const int(*)[2 * cmax + 1]) sections_mutable;
472 
473  // Loop over k dimension of the cube.
474  const int kstart = *((*sphere_bounds_iter)++);
475  const size_t cxy_size = (lp + 1) * (lp + 1) * 2;
476  double cxy[cxy_size];
477  for (int k1 = kstart; k1 <= 0; k1++) {
478  const int k2 = 1 - k1;
479  const int kg1 = map[2][k1 + cmax];
480  const int kg2 = map[2][k2 + cmax];
481 
482  memset(cxy, 0, cxy_size * sizeof(double));
483 
484 #if (GRID_DO_COLLOCATE)
485  // collocate
486  ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
489 #else
490  // integrate
493  ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
494 #endif
495  }
496 }
497 
498 /*******************************************************************************
499  * \brief Collocates coefficients C_i onto the grid for general case.
500  * \author Ole Schuett
501  ******************************************************************************/
502 static inline void __attribute__((always_inline))
503 general_ci_to_grid(const int lp, const int jg, const int kg, const int ismin,
504  const int ismax, const int npts_local[3],
505  const int index_min[3], const int index_max[3],
506  const int map_i[], const int sections_i[],
507  const double gp[3], const int k, const int j,
508  const double exp_ij[], const double exp_jk[],
509  const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci,
511 
512  const int base = kg * npts_local[1] * npts_local[0] + jg * npts_local[0];
513 
514  // AVX instructions can only load/store from evenly spaced memory locations.
515  // Since the cube can wrap around due to the grid's periodicity,
516  // the inner loop runs over sections with homogeneous cube to grid mapping.
517  for (int istart = ismin; istart <= ismax; istart++) {
518  const int istop = imin(ismax, istart + sections_i[istart - index_min[0]]);
519  if (map_i[istart - index_min[0]] < 0) {
520  istart = istop; // skip over out-of-bounds indicies
521  continue;
522  }
523 
524  const int cube2grid = map_i[istart - index_min[0]] - istart;
525  for (int i = istart; i <= istop; i++) {
526  const int ig = i + cube2grid;
527  const double di = i - gp[0];
528 
529  const int stride_i = index_max[0] - index_min[0] + 1;
530  const int stride_j = index_max[1] - index_min[1] + 1;
531  const int stride_k = index_max[2] - index_min[2] + 1;
532  const int idx_ij = (j - index_min[1]) * stride_i + i - index_min[0];
533  const int idx_jk = (k - index_min[2]) * stride_j + j - index_min[1];
534  const int idx_ki = (i - index_min[0]) * stride_k + k - index_min[2];
535 
536  // Mathieu's trick: Calculate 3D Gaussian from three precomputed 2D tables
537  //
538  // r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
539  // = a + b + c
540  //
541  // r**2 = (a + b + c)**2 = a**2 + b**2 + c**2 + 2ab + 2bc + 2ca
542  //
543  // exp(-r**2) = exp(-a(a+2b)) * exp(-b*(b+2c)) * exp(-c*(c+2a))
544  //
545  const double gaussian = exp_ij[idx_ij] * exp_jk[idx_jk] * exp_ki[idx_ki];
546 
547  const int grid_index = base + ig; // [kg, jg, ig]
548  double dip = gaussian;
549 
550 #if (GRID_DO_COLLOCATE)
551  // collocate
552  double reg = 0.0;
553  for (int il = 0; il <= lp; il++) {
554  reg += ci[il] * dip;
555  dip *= di;
556  }
557  grid[grid_index] += reg;
558 #else
559  // integrate
560  const double reg = grid[grid_index];
561  for (int il = 0; il <= lp; il++) {
562  ci[il] += reg * dip;
563  dip *= di;
564  }
565 #endif
566  }
567  istart = istop;
568  }
569 }
570 
571 /*******************************************************************************
572  * \brief Transforms coefficients C_ij into C_i by fixing grid index j.
573  * \author Ole Schuett
574  ******************************************************************************/
575 static inline void __attribute__((always_inline))
576 general_cij_to_ci(const int lp, const double dj,
577  GRID_CONST_WHEN_COLLOCATE double *cij,
578  GRID_CONST_WHEN_INTEGRATE double *ci) {
579  double djp = 1.0;
580  for (int jl = 0; jl <= lp; jl++) {
581  for (int il = 0; il <= lp - jl; il++) {
582  const int cij_index = jl * (lp + 1) + il; // [jl, il]
583 #if (GRID_DO_COLLOCATE)
584  ci[il] += cij[cij_index] * djp; // collocate
585 #else
586  cij[cij_index] += ci[il] * djp; // integrate
587 #endif
588  }
589  djp *= dj;
590  }
591 }
592 
593 /*******************************************************************************
594  * \brief Loop body of general_cij_to_grid to be inlined for low values of lp.
595  * \author Ole Schuett
596  ******************************************************************************/
597 static inline void __attribute__((always_inline)) general_cij_to_grid_low(
598  const int lp, const int jg, const int kg, const int ismin, const int ismax,
599  const int npts_local[3], const int index_min[3], const int index_max[3],
600  const int map_i[], const int sections_i[], const double gp[3], const int k,
601  const int j, const double exp_ij[], const double exp_jk[],
602  const double exp_ki[], const double dj, double *ci,
603  GRID_CONST_WHEN_COLLOCATE double *cij,
605 
606 #if (GRID_DO_COLLOCATE)
607  // collocate
608  general_cij_to_ci(lp, dj, cij, ci);
609  general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
610  map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
611  grid);
612 #else
613  // integrate
614  general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
615  map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
616  grid);
617  general_cij_to_ci(lp, dj, cij, ci);
618 #endif
619 }
620 
621 /*******************************************************************************
622  * \brief Collocates coefficients C_ij onto the grid for general case.
623  * \author Ole Schuett
624  ******************************************************************************/
625 static inline void general_cij_to_grid(
626  const int lp, const int k, const int kg, const int npts_local[3],
627  const int index_min[3], const int index_max[3], const int map_i[],
628  const int map_j[], const int sections_i[], const int sections_j[],
629  const double dh[3][3], const double gp[3], const double radius,
630  const double exp_ij[], const double exp_jk[], const double exp_ki[],
631  GRID_CONST_WHEN_COLLOCATE double *cij,
633 
634  for (int j = index_min[1]; j <= index_max[1]; j++) {
635  const int jg = map_j[j - index_min[1]];
636  if (jg < 0) {
637  j += sections_j[j - index_min[1]]; // skip over out-of-bounds indicies
638  continue;
639  }
640 
641  //--------------------------------------------------------------------
642  // Find bounds for the inner loop based on a quadratic equation in i.
643  //
644  // The real-space vector from the center of the gaussian to the
645  // grid point i,j,k is given by:
646  // r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
647  //
648  // Separating the term that depends on i:
649  // r = i*dh[0,:] - gp[0]*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
650  // = i*dh[0,:] + v
651  //
652  // The squared distance works out to:
653  // r**2 = dh[0,:]**2 * i**2 + 2 * v * dh[0,:] * i + v**2
654  // = a * i**2 + b * i + c
655  //
656  // Solving r**2==radius**2 for i yields:
657  // d = b**2 - 4 * a * (c - radius**2)
658  // i = (-b \pm sqrt(d)) / (2*a)
659  //
660  double a = 0.0, b = 0.0, c = 0.0;
661  for (int i = 0; i < 3; i++) {
662  const double v = (0 - gp[0]) * dh[0][i] + (j - gp[1]) * dh[1][i] +
663  (k - gp[2]) * dh[2][i];
664  a += dh[0][i] * dh[0][i];
665  b += 2.0 * v * dh[0][i];
666  c += v * v;
667  }
668  const double d = b * b - 4.0 * a * (c - radius * radius);
669 
670  if (0.0 < d) {
671  const double sqrt_d = sqrt(d);
672  const double inv_2a = 1.0 / (2.0 * a);
673  const int ismin = (int)ceil((-b - sqrt_d) * inv_2a);
674  const int ismax = (int)floor((-b + sqrt_d) * inv_2a);
675  const double dj = j - gp[1];
676 
677  double ci[lp + 1];
678  memset(ci, 0, sizeof(ci));
679 
680  // Generate separate branches for low values of lp.
681  if (lp <= GRID_MAX_LP_OPTIMIZED) {
683  for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
684  if (lp == ilp) {
685  general_cij_to_grid_low(ilp, jg, kg, ismin, ismax, npts_local,
686  index_min, index_max, map_i, sections_i, gp,
687  k, j, exp_ij, exp_jk, exp_ki, dj, ci, cij,
688  grid);
689  }
690  }
691  } else {
692  general_cij_to_grid_low(lp, jg, kg, ismin, ismax, npts_local, index_min,
693  index_max, map_i, sections_i, gp, k, j, exp_ij,
694  exp_jk, exp_ki, dj, ci, cij, grid);
695  }
696  }
697  }
698 }
699 
700 /*******************************************************************************
701  * \brief Transforms coefficients C_ijk into C_ij by fixing grid index k.
702  * \author Ole Schuett
703  ******************************************************************************/
704 static inline void general_cijk_to_cij(const int lp, const double dk,
705  GRID_CONST_WHEN_COLLOCATE double *cijk,
706  GRID_CONST_WHEN_INTEGRATE double *cij) {
707  double dkp = 1.0;
708  for (int kl = 0; kl <= lp; kl++) {
709  for (int jl = 0; jl <= lp - kl; jl++) {
710  for (int il = 0; il <= lp - kl - jl; il++) {
711  const int cij_index = jl * (lp + 1) + il; // [jl, il]
712  const int cijk_index =
713  kl * (lp + 1) * (lp + 1) + jl * (lp + 1) + il; // [kl, jl, il]
714 #if (GRID_DO_COLLOCATE)
715  cij[cij_index] += cijk[cijk_index] * dkp; // collocate
716 #else
717  cijk[cijk_index] += cij[cij_index] * dkp; // integrate
718 #endif
719  }
720  }
721  dkp *= dk;
722  }
723 }
724 
725 /*******************************************************************************
726  * \brief Precompute mapping of grid indices and its homogeneous sections.
727  * \author Ole Schuett
728  ******************************************************************************/
729 static inline void
730 general_precompute_mapping(const int index_min, const int index_max,
731  const int shift_local, const int npts_global,
732  const int bounds[2], int map[], int sections[]) {
733 
734  // Precompute mapping from continous grid indices to pbc wraped.
735  for (int k = index_min; k <= index_max; k++) {
736  const int kg = modulo(k - shift_local, npts_global);
737  if (bounds[0] <= kg && kg <= bounds[1]) {
738  map[k - index_min] = kg;
739  } else {
740  map[k - index_min] = INT_MIN; // out of bounds - not mapped
741  }
742  }
743 
744  // Precompute length of sections with homogeneous cube to grid mapping.
745  const int range = index_max - index_min + 1;
746  for (int kg = range - 1; kg >= 0; kg--) {
747  if (kg == range - 1 || map[kg] != map[kg + 1] - 1) {
748  sections[kg] = 0;
749  } else {
750  sections[kg] = sections[kg + 1] + 1;
751  }
752  }
753 }
754 
755 /*******************************************************************************
756  * \brief Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
757  * \author Ole Schuett
758  ******************************************************************************/
759 static inline void
760 general_fill_exp_table(const int idir, const int jdir, const int index_min[3],
761  const int index_max[3], const double zetp,
762  const double dh[3][3], const double gp[3],
763  double exp_table[]) {
764 
765  const int stride_i = index_max[idir] - index_min[idir] + 1;
766  const double h_ii = dh[idir][0] * dh[idir][0] + dh[idir][1] * dh[idir][1] +
767  dh[idir][2] * dh[idir][2];
768  const double h_ij = dh[idir][0] * dh[jdir][0] + dh[idir][1] * dh[jdir][1] +
769  dh[idir][2] * dh[jdir][2];
770 
771  for (int i = index_min[idir]; i <= index_max[idir]; i++) {
772  const double di = i - gp[idir];
773  const double rii = di * di * h_ii;
774  const double rij_unit = di * h_ij;
775  const double exp_ij_unit = exp(-zetp * 2.0 * rij_unit);
776 
777  // compute exponentials symmetrically around cube center
778  const int j_center = (int)gp[jdir];
779  const double dj_center = j_center - gp[jdir];
780  const double rij_center = dj_center * rij_unit;
781  const double exp_ij_center = exp(-zetp * (rii + 2.0 * rij_center));
782 
783  // above center
784  double exp_ij = exp_ij_center;
785  for (int j = j_center; j <= index_max[jdir]; j++) {
786  const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
787  exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
788  exp_ij *= exp_ij_unit;
789  }
790 
791  // below center
792  const double exp_ij_unit_inv = 1.0 / exp_ij_unit;
793  exp_ij = exp_ij_center * exp_ij_unit_inv;
794  for (int j = j_center - 1; j >= index_min[jdir]; j--) {
795  const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
796  exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
797  exp_ij *= exp_ij_unit_inv;
798  }
799  }
800 }
801 
802 /*******************************************************************************
803  * \brief Collocates coefficients C_ijk onto the grid for general case.
804  * \author Ole Schuett
805  ******************************************************************************/
806 static inline void
807 general_cijk_to_grid(const int border_mask, const int lp, const double zetp,
808  const double dh[3][3], const double dh_inv[3][3],
809  const double rp[3], const int npts_global[3],
810  const int npts_local[3], const int shift_local[3],
811  const int border_width[3], const double radius,
812  GRID_CONST_WHEN_COLLOCATE double *cijk,
814 
815  // Default for border_mask == 0.
816  int bounds_i[2] = {0, npts_local[0] - 1};
817  int bounds_j[2] = {0, npts_local[1] - 1};
818  int bounds_k[2] = {0, npts_local[2] - 1};
819 
820  // See also rs_find_node() in task_list_methods.F.
821  // If the bit is set then we need to exclude the border in that direction.
822  if (border_mask & (1 << 0))
823  bounds_i[0] += border_width[0];
824  if (border_mask & (1 << 1))
825  bounds_i[1] -= border_width[0];
826  if (border_mask & (1 << 2))
827  bounds_j[0] += border_width[1];
828  if (border_mask & (1 << 3))
829  bounds_j[1] -= border_width[1];
830  if (border_mask & (1 << 4))
831  bounds_k[0] += border_width[2];
832  if (border_mask & (1 << 5))
833  bounds_k[1] -= border_width[2];
834 
835  // center in grid coords
836  // gp = MATMUL(dh_inv, rp)
837  double gp[3] = {0.0, 0.0, 0.0};
838  for (int i = 0; i < 3; i++) {
839  for (int j = 0; j < 3; j++) {
840  gp[i] += dh_inv[j][i] * rp[j];
841  }
842  }
843 
844  // Get the min max indices that contain at least the cube that contains a
845  // sphere around rp of radius radius if the cell is very non-orthogonal this
846  // implies that many useless points are included this estimate can be improved
847  // (i.e. not box but sphere should be used)
848  int index_min[3] = {INT_MAX, INT_MAX, INT_MAX};
849  int index_max[3] = {INT_MIN, INT_MIN, INT_MIN};
850  for (int i = -1; i <= 1; i++) {
851  for (int j = -1; j <= 1; j++) {
852  for (int k = -1; k <= 1; k++) {
853  const double x = rp[0] + i * radius;
854  const double y = rp[1] + j * radius;
855  const double z = rp[2] + k * radius;
856  for (int idir = 0; idir < 3; idir++) {
857  const double resc =
858  dh_inv[0][idir] * x + dh_inv[1][idir] * y + dh_inv[2][idir] * z;
859  index_min[idir] = imin(index_min[idir], (int)floor(resc));
860  index_max[idir] = imax(index_max[idir], (int)ceil(resc));
861  }
862  }
863  }
864  }
865 
866  // Precompute mappings
867  const int range_i = index_max[0] - index_min[0] + 1;
868  int map_i[range_i], sections_i[range_i];
869  general_precompute_mapping(index_min[0], index_max[0], shift_local[0],
870  npts_global[0], bounds_i, map_i, sections_i);
871  const int range_j = index_max[1] - index_min[1] + 1;
872  int map_j[range_j], sections_j[range_j];
873  general_precompute_mapping(index_min[1], index_max[1], shift_local[1],
874  npts_global[1], bounds_j, map_j, sections_j);
875  const int range_k = index_max[2] - index_min[2] + 1;
876  int map_k[range_k], sections_k[range_k];
877  general_precompute_mapping(index_min[2], index_max[2], shift_local[2],
878  npts_global[2], bounds_k, map_k, sections_k);
879 
880  // Precompute exponentials
881  double exp_ij[range_i * range_j];
882  general_fill_exp_table(0, 1, index_min, index_max, zetp, dh, gp, exp_ij);
883  double exp_jk[range_j * range_k];
884  general_fill_exp_table(1, 2, index_min, index_max, zetp, dh, gp, exp_jk);
885  double exp_ki[range_k * range_i];
886  general_fill_exp_table(2, 0, index_min, index_max, zetp, dh, gp, exp_ki);
887 
888  // go over the grid, but cycle if the point is not within the radius
889  const int cij_size = (lp + 1) * (lp + 1);
890  double cij[cij_size];
891  for (int k = index_min[2]; k <= index_max[2]; k++) {
892  const int kg = map_k[k - index_min[2]];
893  if (kg < 0) {
894  k += sections_k[k - index_min[2]]; // skip over out-of-bounds indicies
895  continue;
896  }
897 
898  // zero coef_xyt
899  memset(cij, 0, cij_size * sizeof(double));
900 
901 #if (GRID_DO_COLLOCATE)
902  // collocate
903  general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
904  general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
905  map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
906  exp_jk, exp_ki, cij, grid);
907 #else
908  // integrate
909  general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
910  map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
911  exp_jk, exp_ki, cij, grid);
912  general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
913 #endif
914  }
915 }
916 
917 /*******************************************************************************
918  * \brief Transforms coefficients C_xyz into C_ijk.
919  * \author Ole Schuett
920  ******************************************************************************/
921 static inline void
922 general_cxyz_to_cijk(const int lp, const double dh[3][3],
923  GRID_CONST_WHEN_COLLOCATE double *cxyz,
924  GRID_CONST_WHEN_INTEGRATE double *cijk) {
925 
926  // transform P_{lxp,lyp,lzp} into a P_{lip,ljp,lkp} such that
927  // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-x_p)**lxp (y-y_p)**lyp (z-z_p)**lzp =
928  // sum_{lip,ljp,lkp} P_{lip,ljp,lkp} (i-i_p)**lip (j-j_p)**ljp (k-k_p)**lkp
929 
930  // transform using multinomials
931  double hmatgridp[lp + 1][3][3];
932  for (int i = 0; i < 3; i++) {
933  for (int j = 0; j < 3; j++) {
934  hmatgridp[0][j][i] = 1.0;
935  for (int k = 1; k <= lp; k++) {
936  hmatgridp[k][j][i] = hmatgridp[k - 1][j][i] * dh[j][i];
937  }
938  }
939  }
940 
941  const int lpx = lp;
942  for (int klx = 0; klx <= lpx; klx++) {
943  for (int jlx = 0; jlx <= lpx - klx; jlx++) {
944  for (int ilx = 0; ilx <= lpx - klx - jlx; ilx++) {
945  const int lx = ilx + jlx + klx;
946  const int lpy = lp - lx;
947  for (int kly = 0; kly <= lpy; kly++) {
948  for (int jly = 0; jly <= lpy - kly; jly++) {
949  for (int ily = 0; ily <= lpy - kly - jly; ily++) {
950  const int ly = ily + jly + kly;
951  const int lpz = lp - lx - ly;
952  for (int klz = 0; klz <= lpz; klz++) {
953  for (int jlz = 0; jlz <= lpz - klz; jlz++) {
954  for (int ilz = 0; ilz <= lpz - klz - jlz; ilz++) {
955  const int lz = ilz + jlz + klz;
956  const int il = ilx + ily + ilz;
957  const int jl = jlx + jly + jlz;
958  const int kl = klx + kly + klz;
959  const int lp1 = lp + 1;
960  const int cijk_index =
961  kl * lp1 * lp1 + jl * lp1 + il; // [kl,jl,il]
962  const int cxyz_index =
963  lz * lp1 * lp1 + ly * lp1 + lx; // [lz,ly,lx]
964  const double p =
965  hmatgridp[ilx][0][0] * hmatgridp[jlx][1][0] *
966  hmatgridp[klx][2][0] * hmatgridp[ily][0][1] *
967  hmatgridp[jly][1][1] * hmatgridp[kly][2][1] *
968  hmatgridp[ilz][0][2] * hmatgridp[jlz][1][2] *
969  hmatgridp[klz][2][2] * fac(lx) * fac(ly) * fac(lz) /
970  (fac(ilx) * fac(ily) * fac(ilz) * fac(jlx) * fac(jly) *
971  fac(jlz) * fac(klx) * fac(kly) * fac(klz));
972 #if (GRID_DO_COLLOCATE)
973  cijk[cijk_index] += cxyz[cxyz_index] * p; // collocate
974 #else
975  cxyz[cxyz_index] += cijk[cijk_index] * p; // integrate
976 #endif
977  }
978  }
979  }
980  }
981  }
982  }
983  }
984  }
985  }
986 }
987 
988 /*******************************************************************************
989  * \brief Collocates coefficients C_xyz onto the grid for general case.
990  * \author Ole Schuett
991  ******************************************************************************/
992 static inline void
993 general_cxyz_to_grid(const int border_mask, const int lp, const double zetp,
994  const double dh[3][3], const double dh_inv[3][3],
995  const double rp[3], const int npts_global[3],
996  const int npts_local[3], const int shift_local[3],
997  const int border_width[3], const double radius,
998  GRID_CONST_WHEN_COLLOCATE double *cxyz,
1000 
1001  const size_t cijk_size = (lp + 1) * (lp + 1) * (lp + 1);
1002  double cijk[cijk_size];
1003  memset(cijk, 0, cijk_size * sizeof(double));
1004 
1005 #if (GRID_DO_COLLOCATE)
1006  // collocate
1007  general_cxyz_to_cijk(lp, dh, cxyz, cijk);
1008  general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1009  npts_local, shift_local, border_width, radius, cijk,
1010  grid);
1011 #else
1012  // integrate
1013  general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1014  npts_local, shift_local, border_width, radius, cijk,
1015  grid);
1016  general_cxyz_to_cijk(lp, dh, cxyz, cijk);
1017 #endif
1018 }
1019 
1020 /*******************************************************************************
1021  * \brief Collocates coefficients C_xyz onto the grid.
1022  * \author Ole Schuett
1023  ******************************************************************************/
1024 static inline void
1025 cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
1026  const double zetp, const double dh[3][3],
1027  const double dh_inv[3][3], const double rp[3],
1028  const int npts_global[3], const int npts_local[3],
1029  const int shift_local[3], const int border_width[3],
1030  const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
1031  GRID_CONST_WHEN_INTEGRATE double *grid) {
1032 
1033  enum grid_library_kernel k;
1034  if (orthorhombic && border_mask == 0) {
1036  ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
1037  shift_local, radius, cxyz, grid);
1038  } else {
1040  general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1041  npts_local, shift_local, border_width, radius, cxyz,
1042  grid);
1043  }
1045 }
1046 
1047 /*******************************************************************************
1048  * \brief Transforms coefficients C_ab into C_xyz.
1049  * \author Ole Schuett
1050  ******************************************************************************/
1051 static inline void cab_to_cxyz(const int la_max, const int la_min,
1052  const int lb_max, const int lb_min,
1053  const double prefactor, const double ra[3],
1054  const double rb[3], const double rp[3],
1055  GRID_CONST_WHEN_COLLOCATE double *cab,
1056  GRID_CONST_WHEN_INTEGRATE double *cxyz) {
1057 
1058  // Computes the polynomial expansion coefficients:
1059  // (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
1060  const int lp = la_max + lb_max;
1061  double alpha[3][lb_max + 1][la_max + 1][lp + 1];
1062  memset(alpha, 0, 3 * (lb_max + 1) * (la_max + 1) * (lp + 1) * sizeof(double));
1063 
1064  for (int i = 0; i < 3; i++) {
1065  const double drpa = rp[i] - ra[i];
1066  const double drpb = rp[i] - rb[i];
1067  for (int lxa = 0; lxa <= la_max; lxa++) {
1068  for (int lxb = 0; lxb <= lb_max; lxb++) {
1069  double binomial_k_lxa = 1.0;
1070  double a = 1.0;
1071  for (int k = 0; k <= lxa; k++) {
1072  double binomial_l_lxb = 1.0;
1073  double b = 1.0;
1074  for (int l = 0; l <= lxb; l++) {
1075  alpha[i][lxb][lxa][lxa - l + lxb - k] +=
1076  binomial_k_lxa * binomial_l_lxb * a * b;
1077  binomial_l_lxb *= ((double)(lxb - l)) / ((double)(l + 1));
1078  b *= drpb;
1079  }
1080  binomial_k_lxa *= ((double)(lxa - k)) / ((double)(k + 1));
1081  a *= drpa;
1082  }
1083  }
1084  }
1085  }
1086 
1087  // *** initialise the coefficient matrix, we transform the sum
1088  //
1089  // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
1090  // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
1091  // (z-a_z)**lza
1092  //
1093  // into
1094  //
1095  // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
1096  //
1097  // where p is center of the product gaussian, and lp = la_max + lb_max
1098  // (current implementation is l**7)
1099  //
1100 
1101  for (int lzb = 0; lzb <= lb_max; lzb++) {
1102  for (int lza = 0; lza <= la_max; lza++) {
1103  for (int lyb = 0; lyb <= lb_max - lzb; lyb++) {
1104  for (int lya = 0; lya <= la_max - lza; lya++) {
1105  const int lxb_min = imax(lb_min - lzb - lyb, 0);
1106  const int lxa_min = imax(la_min - lza - lya, 0);
1107  for (int lxb = lxb_min; lxb <= lb_max - lzb - lyb; lxb++) {
1108  for (int lxa = lxa_min; lxa <= la_max - lza - lya; lxa++) {
1109  const int ico = coset(lxa, lya, lza);
1110  const int jco = coset(lxb, lyb, lzb);
1111  const int cab_index = jco * ncoset(la_max) + ico; // [jco, ico]
1112  for (int lzp = 0; lzp <= lza + lzb; lzp++) {
1113  for (int lyp = 0; lyp <= lp - lza - lzb; lyp++) {
1114  for (int lxp = 0; lxp <= lp - lza - lzb - lyp; lxp++) {
1115  const double p = alpha[0][lxb][lxa][lxp] *
1116  alpha[1][lyb][lya][lyp] *
1117  alpha[2][lzb][lza][lzp] * prefactor;
1118  const int lp1 = lp + 1;
1119  const int cxyz_index =
1120  lzp * lp1 * lp1 + lyp * lp1 + lxp; // [lzp, lyp, lxp]
1121 #if (GRID_DO_COLLOCATE)
1122  cxyz[cxyz_index] += cab[cab_index] * p; // collocate
1123 #else
1124  cab[cab_index] += cxyz[cxyz_index] * p; // integrate
1125 #endif
1126  }
1127  }
1128  }
1129  }
1130  }
1131  }
1132  }
1133  }
1134  }
1135 }
1136 
1137 /*******************************************************************************
1138  * \brief Collocates coefficients C_ab onto the grid.
1139  * \author Ole Schuett
1140  ******************************************************************************/
1141 static inline void
1142 cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max,
1143  const int la_min, const int lb_max, const int lb_min,
1144  const double zeta, const double zetb, const double rscale,
1145  const double dh[3][3], const double dh_inv[3][3],
1146  const double ra[3], const double rab[3], const int npts_global[3],
1147  const int npts_local[3], const int shift_local[3],
1148  const int border_width[3], const double radius,
1149  GRID_CONST_WHEN_COLLOCATE double *cab,
1150  GRID_CONST_WHEN_INTEGRATE double *grid) {
1151 
1152  // Check if radius is too small to be mapped onto grid of given resolution.
1153  double dh_max = 0.0;
1154  for (int i = 0; i < 3; i++) {
1155  for (int j = 0; j < 3; j++) {
1156  dh_max = fmax(dh_max, fabs(dh[i][j]));
1157  }
1158  }
1159  if (2.0 * radius < dh_max) {
1160  return;
1161  }
1162 
1163  const double zetp = zeta + zetb;
1164  const double f = zetb / zetp;
1165  const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
1166  const double prefactor = rscale * exp(-zeta * f * rab2);
1167  double rp[3], rb[3];
1168  for (int i = 0; i < 3; i++) {
1169  rp[i] = ra[i] + f * rab[i];
1170  rb[i] = ra[i] + rab[i];
1171  }
1172 
1173  const int lp = la_max + lb_max;
1174  const size_t cxyz_size = (lp + 1) * (lp + 1) * (lp + 1);
1175  double cxyz[cxyz_size];
1176  memset(cxyz, 0, cxyz_size * sizeof(double));
1177 
1178 #if (GRID_DO_COLLOCATE)
1179  // collocate
1180  cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
1181  cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1182  npts_local, shift_local, border_width, radius, cxyz, grid);
1183 #else
1184  // integrate
1185  cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1186  npts_local, shift_local, border_width, radius, cxyz, grid);
1187  cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
1188 #endif
1189 }
1190 
1191 // EOF
static int imax(int x, int y)
Returns the larger of two given integer (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
Definition: dbm_miniapp.c:38
static GRID_HOST_DEVICE int coset(int lx, int ly, int lz)
Maps three angular momentum components to a single zero based index.
Definition: grid_common.h:87
static GRID_HOST_DEVICE int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
Definition: grid_common.h:73
#define GRID_PRAGMA_UNROLL_UP_TO(N)
Definition: grid_common.h:35
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
Definition: grid_common.h:117
static GRID_HOST_DEVICE double fac(const int i)
Factorial function, e.g. fac(5) = 5! = 120.
Definition: grid_common.h:48
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
Definition: grid_common.h:153
#define GRID_PRAGMA_UNROLL(N)
Definition: grid_common.h:34
@ GRID_BACKEND_CPU
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
#define GRID_CONST_WHEN_COLLOCATE
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_2
static void const int j1
static void const int const int const int jg1
static void const int const int j2
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_1
static void const int const int const int const int const int const double const int const int const int int ** sphere_bounds_iter
static void const int const int kg2
static void const int const int const int const int const int const double const int const int sections[3][2 *cmax+1]
static void const int const int i
static void const int const int const int const double GRID_CONST_WHEN_COLLOCATE double * cxy
static void __attribute__((always_inline)) ortho_cx_to_grid_scalar(const int lp
Simple loop body for ortho_cx_to_grid using plain C.
static void const int cmax
static void const int kg1
static void const int const int const double pol[3][lp+1][2 *cmax+1]
const int ub
static void const int const int const int const int const int const double const int const int const int npts_local[3]
#define GRID_MAX_LP_OPTIMIZED
static void const int const int const int const int jg2
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid_0
for(int lxp=0;lxp<=lp;lxp++)
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_3
static void const int const int const int const int const int const double const int map[3][2 *cmax+1]
#define GRID_CONST_WHEN_INTEGRATE
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double * cx
#define GRID_DO_COLLOCATE
void exp_ij(const double alpha, const int offset_i, const int imin, const int imax, const int offset_j, const int jmin, const int jmax, tensor *exp_ij_)
void grid_library_counter_add(const int lp, const enum grid_backend backend, const enum grid_library_kernel kernel, const int increment)
Adds given increment to counter specified by lp, backend, and kernel.
Definition: grid_library.c:129
grid_library_kernel
Various kernels provided by the grid library.
Definition: grid_library.h:65
@ GRID_INTEGRATE_GENERAL
Definition: grid_library.h:69
@ GRID_COLLOCATE_ORTHO
Definition: grid_library.h:66
@ GRID_COLLOCATE_GENERAL
Definition: grid_library.h:68
@ GRID_INTEGRATE_ORTHO
Definition: grid_library.h:67
static void cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid.
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
static void ortho_cxy_to_grid(const int lp, const int k1, const int k2, const int cmax, const double pol[3][lp+1][2 *cmax+1], const int map[3][2 *cmax+1], const double dh[3][3], const double dh_inv[3][3], const double disr_radius, const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cxy, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xy onto the grid for orthorhombic case.
static void general_cij_to_ci(const int lp, const double dj, GRID_CONST_WHEN_COLLOCATE double *cij, GRID_CONST_WHEN_INTEGRATE double *ci)
Transforms coefficients C_ij into C_i by fixing grid index j.
static void general_cijk_to_cij(const int lp, const double dk, GRID_CONST_WHEN_COLLOCATE double *cijk, GRID_CONST_WHEN_INTEGRATE double *cij)
Transforms coefficients C_ijk into C_ij by fixing grid index k.
static void general_precompute_mapping(const int index_min, const int index_max, const int shift_local, const int npts_global, const int bounds[2], int map[])
Precompute mapping of grid indices for general case.
static void ortho_cxy_to_cx(const int lp, const int j1, const int j2, const int cmax, const double pol[3][lp+1][2 *cmax+1], GRID_CONST_WHEN_COLLOCATE double *cxy, GRID_CONST_WHEN_INTEGRATE double *cx)
Transforms coefficients C_xy into C_x by fixing grid index j.
static void general_cxyz_to_cijk(const int lp, const double dh[3][3], GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *cijk)
Transforms coefficients C_xyz into C_ijk.
static void general_cij_to_grid(const int lp, const int k, const int kg, const int npts_local[3], const int index_min[3], const int index_max[3], const int map_i[], const int map_j[], const double dh[3][3], const double gp[3], const double radius, const double exp_ij[], const double exp_jk[], const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *cij, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ij onto the grid for general case.
static void general_cijk_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cijk, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ijk onto the grid for general case.
static void cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ab onto the grid.
static void general_ci_to_grid(const int lp, const int jg, const int kg, const int ismin, const int ismax, const int npts_local[3], const int index_min[3], const int index_max[3], const int map_i[], const double gp[3], const int k, const int j, const double exp_ij[], const double exp_jk[], const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_i onto the grid for general case.
static void ortho_cx_to_grid(const int lp, const int k1, const int k2, const int j1, const int j2, const int cmax, const double pol[3][lp+1][2 *cmax+1], const int map[3][2 *cmax+1], const double dh[3][3], const double dh_inv[3][3], const double kremain, const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cx, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_x onto the grid for orthorhombic case.
static void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2, const int cmax, const double pol[3][lp+1][2 *cmax+1], GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *cxy)
Transforms coefficients C_xyz into C_xz by fixing grid index k.
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
static void general_fill_exp_table(const int idir, const int jdir, const int index_min[3], const int index_max[3], const double zetp, const double dh[3][3], const double gp[3], double exp_table[])
Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
void grid_sphere_cache_lookup(const double radius, const double dh[3][3], const double dh_inv[3][3], int **sphere_bounds, double *discr_radius)
Lookup the sphere bound from cache and compute them as needed. See grid_sphere_cache....
real(dp), dimension(3) c
Definition: ai_eri_debug.F:31
real(dp), dimension(3) a
Definition: ai_eri_debug.F:31
real(dp), dimension(3) d
Definition: ai_eri_debug.F:31
real(dp), dimension(3) b
Definition: ai_eri_debug.F:31
real(dp), dimension(3) p
Definition: ai_eri_debug.F:32
integer, parameter, public gaussian