(git:374b731)
Loading...
Searching...
No Matches
grid_cpu_collint.h
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <limits.h>
10#include <math.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14
15#if defined(__AVX2__) && defined(__FMA__)
16#include <immintrin.h>
17#endif
18
19#include "../common/grid_common.h"
20#include "../common/grid_library.h"
21#include "../common/grid_sphere_cache.h"
22
23#define GRID_MAX_LP_OPTIMIZED 9
24
25#if (GRID_DO_COLLOCATE)
26#define GRID_CONST_WHEN_COLLOCATE const
27#define GRID_CONST_WHEN_INTEGRATE
28#else
29#define GRID_CONST_WHEN_COLLOCATE
30#define GRID_CONST_WHEN_INTEGRATE const
31#endif
32
33/*******************************************************************************
34 * \brief Simple loop body for ortho_cx_to_grid using plain C.
35 * \author Ole Schuett
36 ******************************************************************************/
37static inline void __attribute__((always_inline))
38ortho_cx_to_grid_scalar(const int lp, const int cmax, const int i,
39 const double pol[3][lp + 1][2 * cmax + 1],
45
46#if (GRID_DO_COLLOCATE)
47 // collocate
48 double reg[4] = {0.0, 0.0, 0.0, 0.0};
49#pragma omp simd reduction(+ : reg)
50 for (int lxp = 0; lxp <= lp; lxp++) {
51 const double p = pol[0][lxp][i + cmax];
52 reg[0] += cx[lxp * 4 + 0] * p;
53 reg[1] += cx[lxp * 4 + 1] * p;
54 reg[2] += cx[lxp * 4 + 2] * p;
55 reg[3] += cx[lxp * 4 + 3] * p;
56 }
57 *grid_0 += reg[0];
58 *grid_1 += reg[1];
59 *grid_2 += reg[2];
60 *grid_3 += reg[3];
61
62#else
63 // integrate
64 const double reg[4] = {*grid_0, *grid_1, *grid_2, *grid_3};
65#pragma omp simd
66 for (int lxp = 0; lxp <= lp; lxp++) {
67 const double p = pol[0][lxp][i + cmax];
68 cx[lxp * 4 + 0] += reg[0] * p;
69 cx[lxp * 4 + 1] += reg[1] * p;
70 cx[lxp * 4 + 2] += reg[2] * p;
71 cx[lxp * 4 + 3] += reg[3] * p;
72 }
73#endif
74}
75
76/*******************************************************************************
77 * \brief Optimized loop body for ortho_cx_to_grid using AVX2 Intel Intrinsics.
78 * This routine always processes four consecutive grid elements at once.
79 * \author Ole Schuett
80 ******************************************************************************/
81#if defined(__AVX2__) && defined(__FMA__)
82static inline void __attribute__((always_inline))
83ortho_cx_to_grid_avx2(const int lp, const int cmax, const int i,
84 const double pol[3][lp + 1][2 * cmax + 1],
90
91 const int icmax = i + cmax;
92
93#if (GRID_DO_COLLOCATE)
94 // collocate
95 // First iteration for lxp == 0 does not need add instructions.
96 __m256d p_vec = _mm256_loadu_pd(&pol[0][0][icmax]);
97 __m256d r_vec_0 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[0]));
98 __m256d r_vec_1 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[1]));
99 __m256d r_vec_2 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[2]));
100 __m256d r_vec_3 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[3]));
101
102 // Remaining iterations for lxp > 0 use fused multiply adds.
104 for (int lxp = 1; lxp <= lp; lxp++) {
105 const double *cx_base = &cx[lxp * 4];
106 p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
107 r_vec_0 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[0]), r_vec_0);
108 r_vec_1 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[1]), r_vec_1);
109 r_vec_2 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[2]), r_vec_2);
110 r_vec_3 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[3]), r_vec_3);
111 }
112
113 // Add vectors to grid one at a time, because they can aliase when cube wraps.
114 _mm256_storeu_pd(grid_0, _mm256_add_pd(_mm256_loadu_pd(grid_0), r_vec_0));
115 _mm256_storeu_pd(grid_1, _mm256_add_pd(_mm256_loadu_pd(grid_1), r_vec_1));
116 _mm256_storeu_pd(grid_2, _mm256_add_pd(_mm256_loadu_pd(grid_2), r_vec_2));
117 _mm256_storeu_pd(grid_3, _mm256_add_pd(_mm256_loadu_pd(grid_3), r_vec_3));
118
119#else
120 // integrate
121 __m256d grid_vec_0 = _mm256_loadu_pd(grid_0);
122 __m256d grid_vec_1 = _mm256_loadu_pd(grid_1);
123 __m256d grid_vec_2 = _mm256_loadu_pd(grid_2);
124 __m256d grid_vec_3 = _mm256_loadu_pd(grid_3);
125
127 for (int lxp = 0; lxp <= lp; lxp++) {
128 __m256d p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
129
130 // Do 4 dot products at once. https://stackoverflow.com/a/10454420
131 __m256d xy0 = _mm256_mul_pd(p_vec, grid_vec_0);
132 __m256d xy1 = _mm256_mul_pd(p_vec, grid_vec_1);
133 __m256d xy2 = _mm256_mul_pd(p_vec, grid_vec_2);
134 __m256d xy3 = _mm256_mul_pd(p_vec, grid_vec_3);
135
136 // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
137 __m256d temp01 = _mm256_hadd_pd(xy0, xy1);
138
139 // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
140 __m256d temp23 = _mm256_hadd_pd(xy2, xy3);
141
142 // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
143 __m256d swapped = _mm256_permute2f128_pd(temp01, temp23, 0x21);
144
145 // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
146 __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);
147
148 __m256d r_vec = _mm256_add_pd(swapped, blended);
149
150 // cx += r_vec
151 double *cx_base = &cx[lxp * 4];
152 _mm256_storeu_pd(cx_base, _mm256_add_pd(r_vec, _mm256_loadu_pd(cx_base)));
153 }
154#endif
155}
156#endif // __AVX2__ && __FMA__
157
158/*******************************************************************************
159 * \brief Collocates coefficients C_x onto the grid for orthorhombic case.
160 * \author Ole Schuett
161 ******************************************************************************/
162static inline void __attribute__((always_inline))
163ortho_cx_to_grid(const int lp, const int kg1, const int kg2, const int jg1,
164 const int jg2, const int cmax,
165 const double pol[3][lp + 1][2 * cmax + 1],
166 const int map[3][2 * cmax + 1],
167 const int sections[3][2 * cmax + 1], const int npts_local[3],
170
171 // Lower and upper sphere bounds relative to center, ie. in cube coordinates.
172 const int lb = *((*sphere_bounds_iter)++);
173 const int ub = 1 - lb;
174
175 // AVX instructions can only load/store from evenly spaced memory locations.
176 // Since the sphere bounds can wrap around due to the grid's periodicity,
177 // the inner loop runs over sections with homogeneous cube to grid mapping.
178 for (int istart = lb; istart <= ub; istart++) {
179 const int istop = imin(ub, istart + sections[0][istart + cmax]);
180 const int cube2grid = map[0][istart + cmax] - istart;
181
182 const int stride = npts_local[1] * npts_local[0];
183 const int grid_index_0 = kg1 * stride + jg1 * npts_local[0];
184 const int grid_index_1 = kg2 * stride + jg1 * npts_local[0];
185 const int grid_index_2 = kg1 * stride + jg2 * npts_local[0];
186 const int grid_index_3 = kg2 * stride + jg2 * npts_local[0];
187 GRID_CONST_WHEN_INTEGRATE double *grid_base_0 = &grid[grid_index_0];
188 GRID_CONST_WHEN_INTEGRATE double *grid_base_1 = &grid[grid_index_1];
189 GRID_CONST_WHEN_INTEGRATE double *grid_base_2 = &grid[grid_index_2];
190 GRID_CONST_WHEN_INTEGRATE double *grid_base_3 = &grid[grid_index_3];
191
192 // Use AVX2 to process grid points in chunks of four, ie. 256 bit vectors.
193#if defined(__AVX2__) && defined(__FMA__)
194 const int istop_vec = istart + 4 * ((istop - istart + 1) / 4) - 1;
195 for (int i = istart; i <= istop_vec; i += 4) {
196 const int ig = i + cube2grid;
197 ortho_cx_to_grid_avx2(lp, cmax, i, pol, cx, &grid_base_0[ig],
198 &grid_base_1[ig], &grid_base_2[ig],
199 &grid_base_3[ig]);
200 }
201 istart = istop_vec + 1;
202#endif
203
204 // Process up to 3 remaining points - or everything if AVX2 isn't available.
205 for (int i = istart; i <= istop; i++) {
206 const int ig = i + cube2grid;
207 ortho_cx_to_grid_scalar(lp, cmax, i, pol, cx, &grid_base_0[ig],
208 &grid_base_1[ig], &grid_base_2[ig],
209 &grid_base_3[ig]);
210 }
211 istart = istop;
212 }
213}
214
215/*******************************************************************************
216 * \brief Transforms coefficients C_xy into C_x by fixing grid index j.
217 * \author Ole Schuett
218 ******************************************************************************/
219static inline void __attribute__((always_inline))
220ortho_cxy_to_cx(const int lp, const int j1, const int j2, const int cmax,
221 const double pol[3][lp + 1][2 * cmax + 1],
224
225 for (int lyp = 0; lyp <= lp; lyp++) {
226 for (int lxp = 0; lxp <= lp - lyp; lxp++) {
227 const double p1 = pol[1][lyp][j1 + cmax];
228 const double p2 = pol[1][lyp][j2 + cmax];
229 const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
230
231#if (GRID_DO_COLLOCATE)
232 // collocate
233 cx[lxp * 4 + 0] += cxy[cxy_index + 0] * p1;
234 cx[lxp * 4 + 1] += cxy[cxy_index + 1] * p1;
235 cx[lxp * 4 + 2] += cxy[cxy_index + 0] * p2;
236 cx[lxp * 4 + 3] += cxy[cxy_index + 1] * p2;
237#else
238 // integrate
239 cxy[cxy_index + 0] += cx[lxp * 4 + 0] * p1;
240 cxy[cxy_index + 1] += cx[lxp * 4 + 1] * p1;
241 cxy[cxy_index + 0] += cx[lxp * 4 + 2] * p2;
242 cxy[cxy_index + 1] += cx[lxp * 4 + 3] * p2;
243#endif
244 }
245 }
246}
247
248/*******************************************************************************
249 * \brief Loop body of ortho_cxy_to_grid to be inlined for low values of lp.
250 * \author Ole Schuett
251 ******************************************************************************/
252static inline void __attribute__((always_inline)) ortho_cxy_to_grid_low(
253 const int lp, const int j1, const int j2, const int kg1, const int kg2,
254 const int jg1, const int jg2, const int cmax,
255 const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
256 const int sections[3][2 * cmax + 1], const int npts_local[3],
257 int **sphere_bounds_iter, double *cx, GRID_CONST_WHEN_COLLOCATE double *cxy,
259
260#if (GRID_DO_COLLOCATE)
261 // collocate
262 ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
265#else
266 // integrate
269 ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
270#endif
271}
272
273/*******************************************************************************
274 * \brief Collocates coefficients C_xy onto the grid for orthorhombic case.
275 * \author Ole Schuett
276 ******************************************************************************/
277static inline void ortho_cxy_to_grid(
278 const int lp, const int kg1, const int kg2, const int cmax,
279 const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
280 const int sections[3][2 * cmax + 1], const int npts_local[3],
283
284 // The cube contains an even number of grid points in each direction and
285 // collocation is always performed on a pair of two opposing grid points.
286 // Hence, the points with index 0 and 1 are both assigned distance zero via
287 // the formular distance=(2*index-1)/2.
288
289 const int jstart = *((*sphere_bounds_iter)++);
290 const size_t cx_size = (lp + 1) * 4;
291 double cx[cx_size];
292 for (int j1 = jstart; j1 <= 0; j1++) {
293 const int j2 = 1 - j1;
294 const int jg1 = map[1][j1 + cmax];
295 const int jg2 = map[1][j2 + cmax];
296
297 memset(cx, 0, cx_size * sizeof(double));
298
299 // Generate separate branches for low values of lp gives up to 30% speedup.
300 if (lp <= GRID_MAX_LP_OPTIMIZED) {
302 for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
303 if (lp == ilp) {
304 ortho_cxy_to_grid_low(ilp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
306 cxy, grid);
307 }
308 }
309 } else {
310 ortho_cxy_to_grid_low(lp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
312 grid);
313 }
314 }
315}
316
317/*******************************************************************************
318 * \brief Transforms coefficients C_xyz into C_xz by fixing grid index k.
319 * \author Ole Schuett
320 ******************************************************************************/
321static inline void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2,
322 const int cmax,
323 const double pol[3][lp + 1][2 * cmax + 1],
324 GRID_CONST_WHEN_COLLOCATE double *cxyz,
326
327 for (int lzp = 0; lzp <= lp; lzp++) {
328 for (int lyp = 0; lyp <= lp - lzp; lyp++) {
329 for (int lxp = 0; lxp <= lp - lzp - lyp; lxp++) {
330 const double p1 = pol[2][lzp][k1 + cmax];
331 const double p2 = pol[2][lzp][k2 + cmax];
332 const int cxyz_index =
333 lzp * (lp + 1) * (lp + 1) + lyp * (lp + 1) + lxp; // [lzp, lyp, lxp]
334 const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
335
336#if (GRID_DO_COLLOCATE)
337 // collocate
338 cxy[cxy_index + 0] += cxyz[cxyz_index] * p1;
339 cxy[cxy_index + 1] += cxyz[cxyz_index] * p2;
340#else
341 // integrate
342 cxyz[cxyz_index] += cxy[cxy_index + 0] * p1;
343 cxyz[cxyz_index] += cxy[cxy_index + 1] * p2;
344#endif
345 }
346 }
347 }
348}
349
350/*******************************************************************************
351 * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
352 * \author Ole Schuett
353 ******************************************************************************/
354static inline void
355ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3],
356 const double dh_inv[3][3], const double rp[3],
357 const int npts_global[3], const int npts_local[3],
358 const int shift_local[3], const double radius,
359 GRID_CONST_WHEN_COLLOCATE double *cxyz,
361
362 // *** position of the gaussian product
363 //
364 // this is the actual definition of the position on the grid
365 // i.e. a point rp(:) gets here grid coordinates
366 // MODULO(rp(:)/dr(:),npts_global(:))+1
367 // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
368 // and (1,1,1) on grid here in C.
369
370 // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
371 int cubecenter[3];
372 for (int i = 0; i < 3; i++) {
373 double dh_inv_rp = 0.0;
374 for (int j = 0; j < 3; j++) {
375 dh_inv_rp += dh_inv[j][i] * rp[j];
376 }
377 cubecenter[i] = (int)floor(dh_inv_rp);
378 }
379
380 double roffset[3];
381 for (int i = 0; i < 3; i++) {
382 roffset[i] = rp[i] - ((double)cubecenter[i]) * dh[i][i];
383 }
384
385 // Lookup loop bounds for spherical cutoff.
386 int *sphere_bounds;
387 double disr_radius;
388 grid_sphere_cache_lookup(radius, dh, dh_inv, &sphere_bounds, &disr_radius);
389 int **sphere_bounds_iter = &sphere_bounds;
390
391 // Cube bounds.
392 int lb_cube[3], ub_cube[3];
393 for (int i = 0; i < 3; i++) {
394 lb_cube[i] = (int)ceil(-1e-8 - disr_radius * dh_inv[i][i]);
395 ub_cube[i] = 1 - lb_cube[i];
396 // If grid is not period check that cube fits without wrapping.
397 if (npts_global[i] != npts_local[i]) {
398 const int offset =
399 modulo(cubecenter[i] + lb_cube[i] - shift_local[i], npts_global[i]) -
400 lb_cube[i];
401 assert(offset + ub_cube[i] < npts_local[i]);
402 assert(offset + lb_cube[i] >= 0);
403 }
404 }
405
406 // cmax = MAXVAL(ub_cube)
407 const int cmax = imax(imax(ub_cube[0], ub_cube[1]), ub_cube[2]);
408
409 // Precompute (x-xp)**lp*exp(..) for each direction.
410 double pol_mutable[3][lp + 1][2 * cmax + 1];
411 for (int idir = 0; idir < 3; idir++) {
412 const double dr = dh[idir][idir];
413 const double ro = roffset[idir];
414 // Reuse the result from the previous gridpoint to avoid to many exps:
415 // exp( -a*(x+d)**2) = exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
416 // exp(-2*a*(x+d)*d) = exp(-2*a*x*d)*exp(-2*a*d**2)
417 const double t_exp_1 = exp(-zetp * pow(dr, 2));
418 const double t_exp_2 = pow(t_exp_1, 2);
419 double t_exp_min_1 = exp(-zetp * pow(+dr - ro, 2));
420 double t_exp_min_2 = exp(-2 * zetp * (+dr - ro) * (-dr));
421 for (int ig = 0; ig >= lb_cube[idir]; ig--) {
422 const double rpg = ig * dr - ro;
423 t_exp_min_1 *= t_exp_min_2 * t_exp_1;
424 t_exp_min_2 *= t_exp_2;
425 double pg = t_exp_min_1;
426 for (int icoef = 0; icoef <= lp; icoef++) {
427 pol_mutable[idir][icoef][ig + cmax] = pg; // exp(-zetp*rpg**2)
428 pg *= rpg;
429 }
430 }
431 double t_exp_plus_1 = exp(-zetp * pow(-ro, 2));
432 double t_exp_plus_2 = exp(-2 * zetp * (-ro) * (+dr));
433 for (int ig = 0; ig >= lb_cube[idir]; ig--) {
434 const double rpg = (1 - ig) * dr - ro;
435 t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
436 t_exp_plus_2 *= t_exp_2;
437 double pg = t_exp_plus_1;
438 for (int icoef = 0; icoef <= lp; icoef++) {
439 pol_mutable[idir][icoef][1 - ig + cmax] = pg; // exp(-zetp*rpg**2)
440 pg *= rpg;
441 }
442 }
443 }
444 const double(*pol)[lp + 1][2 * cmax + 1] =
445 (const double(*)[lp + 1][2 * cmax + 1]) pol_mutable;
446
447 // Precompute mapping from cube to grid indices for each direction
448 int map_mutable[3][2 * cmax + 1];
449 for (int i = 0; i < 3; i++) {
450 for (int k = -cmax; k <= +cmax; k++) {
451 map_mutable[i][k + cmax] =
452 modulo(cubecenter[i] + k - shift_local[i], npts_global[i]);
453 }
454 }
455 const int(*map)[2 * cmax + 1] = (const int(*)[2 * cmax + 1]) map_mutable;
456
457 // Precompute length of sections with homogeneous cube to grid mapping.
458 int sections_mutable[3][2 * cmax + 1];
459 for (int i = 0; i < 3; i++) {
460 for (int kg = 2 * cmax; kg >= 0; kg--) {
461 if (kg == 2 * cmax || map[i][kg] != map[i][kg + 1] - 1) {
462 sections_mutable[i][kg] = 0;
463 } else {
464 sections_mutable[i][kg] = sections_mutable[i][kg + 1] + 1;
465 }
466 }
467 }
468 const int(*sections)[2 * cmax + 1] =
469 (const int(*)[2 * cmax + 1]) sections_mutable;
470
471 // Loop over k dimension of the cube.
472 const int kstart = *((*sphere_bounds_iter)++);
473 const size_t cxy_size = (lp + 1) * (lp + 1) * 2;
474 double cxy[cxy_size];
475 for (int k1 = kstart; k1 <= 0; k1++) {
476 const int k2 = 1 - k1;
477 const int kg1 = map[2][k1 + cmax];
478 const int kg2 = map[2][k2 + cmax];
479
480 memset(cxy, 0, cxy_size * sizeof(double));
481
482#if (GRID_DO_COLLOCATE)
483 // collocate
484 ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
487#else
488 // integrate
491 ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
492#endif
493 }
494}
495
496/*******************************************************************************
497 * \brief Collocates coefficients C_i onto the grid for general case.
498 * \author Ole Schuett
499 ******************************************************************************/
500static inline void __attribute__((always_inline)) general_ci_to_grid(
501 const int lp, const int jg, const int kg, const int ismin, const int ismax,
502 const int npts_local[3], const int index_min[3], const int index_max[3],
503 const int map_i[], const int sections_i[], const double gp[3], const int k,
504 const int j, const double exp_ij[], const double exp_jk[],
505 const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci,
507
508 const int base = kg * npts_local[1] * npts_local[0] + jg * npts_local[0];
509
510 // AVX instructions can only load/store from evenly spaced memory locations.
511 // Since the cube can wrap around due to the grid's periodicity,
512 // the inner loop runs over sections with homogeneous cube to grid mapping.
513 for (int istart = ismin; istart <= ismax; istart++) {
514 const int istop = imin(ismax, istart + sections_i[istart - index_min[0]]);
515 if (map_i[istart - index_min[0]] < 0) {
516 istart = istop; // skip over out-of-bounds indicies
517 continue;
518 }
519
520 const int cube2grid = map_i[istart - index_min[0]] - istart;
521 for (int i = istart; i <= istop; i++) {
522 const int ig = i + cube2grid;
523 const double di = i - gp[0];
524
525 const int stride_i = index_max[0] - index_min[0] + 1;
526 const int stride_j = index_max[1] - index_min[1] + 1;
527 const int stride_k = index_max[2] - index_min[2] + 1;
528 const int idx_ij = (j - index_min[1]) * stride_i + i - index_min[0];
529 const int idx_jk = (k - index_min[2]) * stride_j + j - index_min[1];
530 const int idx_ki = (i - index_min[0]) * stride_k + k - index_min[2];
531
532 // Mathieu's trick: Calculate 3D Gaussian from three precomputed 2D tables
533 //
534 // r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
535 // = a + b + c
536 //
537 // r**2 = (a + b + c)**2 = a**2 + b**2 + c**2 + 2ab + 2bc + 2ca
538 //
539 // exp(-r**2) = exp(-a(a+2b)) * exp(-b*(b+2c)) * exp(-c*(c+2a))
540 //
541 const double gaussian = exp_ij[idx_ij] * exp_jk[idx_jk] * exp_ki[idx_ki];
542
543 const int grid_index = base + ig; // [kg, jg, ig]
544 double dip = gaussian;
545
546#if (GRID_DO_COLLOCATE)
547 // collocate
548 double reg = 0.0;
549 for (int il = 0; il <= lp; il++) {
550 reg += ci[il] * dip;
551 dip *= di;
552 }
553 grid[grid_index] += reg;
554#else
555 // integrate
556 const double reg = grid[grid_index];
557 for (int il = 0; il <= lp; il++) {
558 ci[il] += reg * dip;
559 dip *= di;
560 }
561#endif
562 }
563 istart = istop;
564 }
565}
566
567/*******************************************************************************
568 * \brief Transforms coefficients C_ij into C_i by fixing grid index j.
569 * \author Ole Schuett
570 ******************************************************************************/
571static inline void __attribute__((always_inline))
572general_cij_to_ci(const int lp, const double dj,
573 GRID_CONST_WHEN_COLLOCATE double *cij,
574 GRID_CONST_WHEN_INTEGRATE double *ci) {
575 double djp = 1.0;
576 for (int jl = 0; jl <= lp; jl++) {
577 for (int il = 0; il <= lp - jl; il++) {
578 const int cij_index = jl * (lp + 1) + il; // [jl, il]
579#if (GRID_DO_COLLOCATE)
580 ci[il] += cij[cij_index] * djp; // collocate
581#else
582 cij[cij_index] += ci[il] * djp; // integrate
583#endif
584 }
585 djp *= dj;
586 }
587}
588
589/*******************************************************************************
590 * \brief Loop body of general_cij_to_grid to be inlined for low values of lp.
591 * \author Ole Schuett
592 ******************************************************************************/
593static inline void __attribute__((always_inline)) general_cij_to_grid_low(
594 const int lp, const int jg, const int kg, const int ismin, const int ismax,
595 const int npts_local[3], const int index_min[3], const int index_max[3],
596 const int map_i[], const int sections_i[], const double gp[3], const int k,
597 const int j, const double exp_ij[], const double exp_jk[],
598 const double exp_ki[], const double dj, double *ci,
599 GRID_CONST_WHEN_COLLOCATE double *cij,
601
602#if (GRID_DO_COLLOCATE)
603 // collocate
604 general_cij_to_ci(lp, dj, cij, ci);
605 general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
606 map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
607 grid);
608#else
609 // integrate
610 general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
611 map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
612 grid);
613 general_cij_to_ci(lp, dj, cij, ci);
614#endif
615}
616
617/*******************************************************************************
618 * \brief Collocates coefficients C_ij onto the grid for general case.
619 * \author Ole Schuett
620 ******************************************************************************/
621static inline void general_cij_to_grid(
622 const int lp, const int k, const int kg, const int npts_local[3],
623 const int index_min[3], const int index_max[3], const int map_i[],
624 const int map_j[], const int sections_i[], const int sections_j[],
625 const double dh[3][3], const double gp[3], const double radius,
626 const double exp_ij[], const double exp_jk[], const double exp_ki[],
627 GRID_CONST_WHEN_COLLOCATE double *cij,
629
630 for (int j = index_min[1]; j <= index_max[1]; j++) {
631 const int jg = map_j[j - index_min[1]];
632 if (jg < 0) {
633 j += sections_j[j - index_min[1]]; // skip over out-of-bounds indicies
634 continue;
635 }
636
637 //--------------------------------------------------------------------
638 // Find bounds for the inner loop based on a quadratic equation in i.
639 //
640 // The real-space vector from the center of the gaussian to the
641 // grid point i,j,k is given by:
642 // r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
643 //
644 // Separating the term that depends on i:
645 // r = i*dh[0,:] - gp[0]*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
646 // = i*dh[0,:] + v
647 //
648 // The squared distance works out to:
649 // r**2 = dh[0,:]**2 * i**2 + 2 * v * dh[0,:] * i + v**2
650 // = a * i**2 + b * i + c
651 //
652 // Solving r**2==radius**2 for i yields:
653 // d = b**2 - 4 * a * (c - radius**2)
654 // i = (-b \pm sqrt(d)) / (2*a)
655 //
656 double a = 0.0, b = 0.0, c = 0.0;
657 for (int i = 0; i < 3; i++) {
658 const double v = (0 - gp[0]) * dh[0][i] + (j - gp[1]) * dh[1][i] +
659 (k - gp[2]) * dh[2][i];
660 a += dh[0][i] * dh[0][i];
661 b += 2.0 * v * dh[0][i];
662 c += v * v;
663 }
664 const double d = b * b - 4.0 * a * (c - radius * radius);
665
666 if (0.0 < d) {
667 const double sqrt_d = sqrt(d);
668 const double inv_2a = 1.0 / (2.0 * a);
669 const int ismin = (int)ceil((-b - sqrt_d) * inv_2a);
670 const int ismax = (int)floor((-b + sqrt_d) * inv_2a);
671 const double dj = j - gp[1];
672
673 double ci[lp + 1];
674 memset(ci, 0, sizeof(ci));
675
676 // Generate separate branches for low values of lp.
677 if (lp <= GRID_MAX_LP_OPTIMIZED) {
679 for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
680 if (lp == ilp) {
681 general_cij_to_grid_low(ilp, jg, kg, ismin, ismax, npts_local,
682 index_min, index_max, map_i, sections_i, gp,
683 k, j, exp_ij, exp_jk, exp_ki, dj, ci, cij,
684 grid);
685 }
686 }
687 } else {
688 general_cij_to_grid_low(lp, jg, kg, ismin, ismax, npts_local, index_min,
689 index_max, map_i, sections_i, gp, k, j, exp_ij,
690 exp_jk, exp_ki, dj, ci, cij, grid);
691 }
692 }
693 }
694}
695
696/*******************************************************************************
697 * \brief Transforms coefficients C_ijk into C_ij by fixing grid index k.
698 * \author Ole Schuett
699 ******************************************************************************/
700static inline void general_cijk_to_cij(const int lp, const double dk,
701 GRID_CONST_WHEN_COLLOCATE double *cijk,
702 GRID_CONST_WHEN_INTEGRATE double *cij) {
703 double dkp = 1.0;
704 for (int kl = 0; kl <= lp; kl++) {
705 for (int jl = 0; jl <= lp - kl; jl++) {
706 for (int il = 0; il <= lp - kl - jl; il++) {
707 const int cij_index = jl * (lp + 1) + il; // [jl, il]
708 const int cijk_index =
709 kl * (lp + 1) * (lp + 1) + jl * (lp + 1) + il; // [kl, jl, il]
710#if (GRID_DO_COLLOCATE)
711 cij[cij_index] += cijk[cijk_index] * dkp; // collocate
712#else
713 cijk[cijk_index] += cij[cij_index] * dkp; // integrate
714#endif
715 }
716 }
717 dkp *= dk;
718 }
719}
720
721/*******************************************************************************
722 * \brief Precompute mapping of grid indices and its homogeneous sections.
723 * \author Ole Schuett
724 ******************************************************************************/
725static inline void
726general_precompute_mapping(const int index_min, const int index_max,
727 const int shift_local, const int npts_global,
728 const int bounds[2], int map[], int sections[]) {
729
730 // Precompute mapping from continous grid indices to pbc wraped.
731 for (int k = index_min; k <= index_max; k++) {
732 const int kg = modulo(k - shift_local, npts_global);
733 if (bounds[0] <= kg && kg <= bounds[1]) {
734 map[k - index_min] = kg;
735 } else {
736 map[k - index_min] = INT_MIN; // out of bounds - not mapped
737 }
738 }
739
740 // Precompute length of sections with homogeneous cube to grid mapping.
741 const int range = index_max - index_min + 1;
742 for (int kg = range - 1; kg >= 0; kg--) {
743 if (kg == range - 1 || map[kg] != map[kg + 1] - 1) {
744 sections[kg] = 0;
745 } else {
746 sections[kg] = sections[kg + 1] + 1;
747 }
748 }
749}
750
751/*******************************************************************************
752 * \brief Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
753 * \author Ole Schuett
754 ******************************************************************************/
755static inline void
756general_fill_exp_table(const int idir, const int jdir, const int index_min[3],
757 const int index_max[3], const double zetp,
758 const double dh[3][3], const double gp[3],
759 double exp_table[]) {
760
761 const int stride_i = index_max[idir] - index_min[idir] + 1;
762 const double h_ii = dh[idir][0] * dh[idir][0] + dh[idir][1] * dh[idir][1] +
763 dh[idir][2] * dh[idir][2];
764 const double h_ij = dh[idir][0] * dh[jdir][0] + dh[idir][1] * dh[jdir][1] +
765 dh[idir][2] * dh[jdir][2];
766
767 for (int i = index_min[idir]; i <= index_max[idir]; i++) {
768 const double di = i - gp[idir];
769 const double rii = di * di * h_ii;
770 const double rij_unit = di * h_ij;
771 const double exp_ij_unit = exp(-zetp * 2.0 * rij_unit);
772
773 // compute exponentials symmetrically around cube center
774 const int j_center = (int)gp[jdir];
775 const double dj_center = j_center - gp[jdir];
776 const double rij_center = dj_center * rij_unit;
777 const double exp_ij_center = exp(-zetp * (rii + 2.0 * rij_center));
778
779 // above center
780 double exp_ij = exp_ij_center;
781 for (int j = j_center; j <= index_max[jdir]; j++) {
782 const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
783 exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
784 exp_ij *= exp_ij_unit;
785 }
786
787 // below center
788 const double exp_ij_unit_inv = 1.0 / exp_ij_unit;
789 exp_ij = exp_ij_center * exp_ij_unit_inv;
790 for (int j = j_center - 1; j >= index_min[jdir]; j--) {
791 const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
792 exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
793 exp_ij *= exp_ij_unit_inv;
794 }
795 }
796}
797
798/*******************************************************************************
799 * \brief Collocates coefficients C_ijk onto the grid for general case.
800 * \author Ole Schuett
801 ******************************************************************************/
802static inline void
803general_cijk_to_grid(const int border_mask, const int lp, const double zetp,
804 const double dh[3][3], const double dh_inv[3][3],
805 const double rp[3], const int npts_global[3],
806 const int npts_local[3], const int shift_local[3],
807 const int border_width[3], const double radius,
808 GRID_CONST_WHEN_COLLOCATE double *cijk,
810
811 // Default for border_mask == 0.
812 int bounds_i[2] = {0, npts_local[0] - 1};
813 int bounds_j[2] = {0, npts_local[1] - 1};
814 int bounds_k[2] = {0, npts_local[2] - 1};
815
816 // See also rs_find_node() in task_list_methods.F.
817 // If the bit is set then we need to exclude the border in that direction.
818 if (border_mask & (1 << 0))
819 bounds_i[0] += border_width[0];
820 if (border_mask & (1 << 1))
821 bounds_i[1] -= border_width[0];
822 if (border_mask & (1 << 2))
823 bounds_j[0] += border_width[1];
824 if (border_mask & (1 << 3))
825 bounds_j[1] -= border_width[1];
826 if (border_mask & (1 << 4))
827 bounds_k[0] += border_width[2];
828 if (border_mask & (1 << 5))
829 bounds_k[1] -= border_width[2];
830
831 // center in grid coords
832 // gp = MATMUL(dh_inv, rp)
833 double gp[3] = {0.0, 0.0, 0.0};
834 for (int i = 0; i < 3; i++) {
835 for (int j = 0; j < 3; j++) {
836 gp[i] += dh_inv[j][i] * rp[j];
837 }
838 }
839
840 // Get the min max indices that contain at least the cube that contains a
841 // sphere around rp of radius radius if the cell is very non-orthogonal this
842 // implies that many useless points are included this estimate can be improved
843 // (i.e. not box but sphere should be used)
844 int index_min[3] = {INT_MAX, INT_MAX, INT_MAX};
845 int index_max[3] = {INT_MIN, INT_MIN, INT_MIN};
846 for (int i = -1; i <= 1; i++) {
847 for (int j = -1; j <= 1; j++) {
848 for (int k = -1; k <= 1; k++) {
849 const double x = rp[0] + i * radius;
850 const double y = rp[1] + j * radius;
851 const double z = rp[2] + k * radius;
852 for (int idir = 0; idir < 3; idir++) {
853 const double resc =
854 dh_inv[0][idir] * x + dh_inv[1][idir] * y + dh_inv[2][idir] * z;
855 index_min[idir] = imin(index_min[idir], (int)floor(resc));
856 index_max[idir] = imax(index_max[idir], (int)ceil(resc));
857 }
858 }
859 }
860 }
861
862 // Precompute mappings
863 const int range_i = index_max[0] - index_min[0] + 1;
864 int map_i[range_i], sections_i[range_i];
865 general_precompute_mapping(index_min[0], index_max[0], shift_local[0],
866 npts_global[0], bounds_i, map_i, sections_i);
867 const int range_j = index_max[1] - index_min[1] + 1;
868 int map_j[range_j], sections_j[range_j];
869 general_precompute_mapping(index_min[1], index_max[1], shift_local[1],
870 npts_global[1], bounds_j, map_j, sections_j);
871 const int range_k = index_max[2] - index_min[2] + 1;
872 int map_k[range_k], sections_k[range_k];
873 general_precompute_mapping(index_min[2], index_max[2], shift_local[2],
874 npts_global[2], bounds_k, map_k, sections_k);
875
876 // Precompute exponentials
877 double exp_ij[range_i * range_j];
878 general_fill_exp_table(0, 1, index_min, index_max, zetp, dh, gp, exp_ij);
879 double exp_jk[range_j * range_k];
880 general_fill_exp_table(1, 2, index_min, index_max, zetp, dh, gp, exp_jk);
881 double exp_ki[range_k * range_i];
882 general_fill_exp_table(2, 0, index_min, index_max, zetp, dh, gp, exp_ki);
883
884 // go over the grid, but cycle if the point is not within the radius
885 const int cij_size = (lp + 1) * (lp + 1);
886 double cij[cij_size];
887 for (int k = index_min[2]; k <= index_max[2]; k++) {
888 const int kg = map_k[k - index_min[2]];
889 if (kg < 0) {
890 k += sections_k[k - index_min[2]]; // skip over out-of-bounds indicies
891 continue;
892 }
893
894 // zero coef_xyt
895 memset(cij, 0, cij_size * sizeof(double));
896
897#if (GRID_DO_COLLOCATE)
898 // collocate
899 general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
900 general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
901 map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
902 exp_jk, exp_ki, cij, grid);
903#else
904 // integrate
905 general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
906 map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
907 exp_jk, exp_ki, cij, grid);
908 general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
909#endif
910 }
911}
912
913/*******************************************************************************
914 * \brief Transforms coefficients C_xyz into C_ijk.
915 * \author Ole Schuett
916 ******************************************************************************/
917static inline void
918general_cxyz_to_cijk(const int lp, const double dh[3][3],
919 GRID_CONST_WHEN_COLLOCATE double *cxyz,
920 GRID_CONST_WHEN_INTEGRATE double *cijk) {
921
922 // transform P_{lxp,lyp,lzp} into a P_{lip,ljp,lkp} such that
923 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-x_p)**lxp (y-y_p)**lyp (z-z_p)**lzp =
924 // sum_{lip,ljp,lkp} P_{lip,ljp,lkp} (i-i_p)**lip (j-j_p)**ljp (k-k_p)**lkp
925
926 // transform using multinomials
927 double hmatgridp[lp + 1][3][3];
928 for (int i = 0; i < 3; i++) {
929 for (int j = 0; j < 3; j++) {
930 hmatgridp[0][j][i] = 1.0;
931 for (int k = 1; k <= lp; k++) {
932 hmatgridp[k][j][i] = hmatgridp[k - 1][j][i] * dh[j][i];
933 }
934 }
935 }
936
937 const int lpx = lp;
938 for (int klx = 0; klx <= lpx; klx++) {
939 for (int jlx = 0; jlx <= lpx - klx; jlx++) {
940 for (int ilx = 0; ilx <= lpx - klx - jlx; ilx++) {
941 const int lx = ilx + jlx + klx;
942 const int lpy = lp - lx;
943 for (int kly = 0; kly <= lpy; kly++) {
944 for (int jly = 0; jly <= lpy - kly; jly++) {
945 for (int ily = 0; ily <= lpy - kly - jly; ily++) {
946 const int ly = ily + jly + kly;
947 const int lpz = lp - lx - ly;
948 for (int klz = 0; klz <= lpz; klz++) {
949 for (int jlz = 0; jlz <= lpz - klz; jlz++) {
950 for (int ilz = 0; ilz <= lpz - klz - jlz; ilz++) {
951 const int lz = ilz + jlz + klz;
952 const int il = ilx + ily + ilz;
953 const int jl = jlx + jly + jlz;
954 const int kl = klx + kly + klz;
955 const int lp1 = lp + 1;
956 const int cijk_index =
957 kl * lp1 * lp1 + jl * lp1 + il; // [kl,jl,il]
958 const int cxyz_index =
959 lz * lp1 * lp1 + ly * lp1 + lx; // [lz,ly,lx]
960 const double p =
961 hmatgridp[ilx][0][0] * hmatgridp[jlx][1][0] *
962 hmatgridp[klx][2][0] * hmatgridp[ily][0][1] *
963 hmatgridp[jly][1][1] * hmatgridp[kly][2][1] *
964 hmatgridp[ilz][0][2] * hmatgridp[jlz][1][2] *
965 hmatgridp[klz][2][2] * fac(lx) * fac(ly) * fac(lz) /
966 (fac(ilx) * fac(ily) * fac(ilz) * fac(jlx) * fac(jly) *
967 fac(jlz) * fac(klx) * fac(kly) * fac(klz));
968#if (GRID_DO_COLLOCATE)
969 cijk[cijk_index] += cxyz[cxyz_index] * p; // collocate
970#else
971 cxyz[cxyz_index] += cijk[cijk_index] * p; // integrate
972#endif
973 }
974 }
975 }
976 }
977 }
978 }
979 }
980 }
981 }
982}
983
984/*******************************************************************************
985 * \brief Collocates coefficients C_xyz onto the grid for general case.
986 * \author Ole Schuett
987 ******************************************************************************/
988static inline void
989general_cxyz_to_grid(const int border_mask, const int lp, const double zetp,
990 const double dh[3][3], const double dh_inv[3][3],
991 const double rp[3], const int npts_global[3],
992 const int npts_local[3], const int shift_local[3],
993 const int border_width[3], const double radius,
994 GRID_CONST_WHEN_COLLOCATE double *cxyz,
996
997 const size_t cijk_size = (lp + 1) * (lp + 1) * (lp + 1);
998 double cijk[cijk_size];
999 memset(cijk, 0, cijk_size * sizeof(double));
1000
1001#if (GRID_DO_COLLOCATE)
1002 // collocate
1003 general_cxyz_to_cijk(lp, dh, cxyz, cijk);
1004 general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1005 npts_local, shift_local, border_width, radius, cijk,
1006 grid);
1007#else
1008 // integrate
1009 general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1010 npts_local, shift_local, border_width, radius, cijk,
1011 grid);
1012 general_cxyz_to_cijk(lp, dh, cxyz, cijk);
1013#endif
1014}
1015
1016/*******************************************************************************
1017 * \brief Collocates coefficients C_xyz onto the grid.
1018 * \author Ole Schuett
1019 ******************************************************************************/
1020static inline void
1021cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
1022 const double zetp, const double dh[3][3],
1023 const double dh_inv[3][3], const double rp[3],
1024 const int npts_global[3], const int npts_local[3],
1025 const int shift_local[3], const int border_width[3],
1026 const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
1028
1029 enum grid_library_kernel k;
1030 if (orthorhombic && border_mask == 0) {
1032 ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
1033 shift_local, radius, cxyz, grid);
1034 } else {
1036 general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1037 npts_local, shift_local, border_width, radius, cxyz,
1038 grid);
1039 }
1041}
1042
1043/*******************************************************************************
1044 * \brief Transforms coefficients C_ab into C_xyz.
1045 * \author Ole Schuett
1046 ******************************************************************************/
1047static inline void cab_to_cxyz(const int la_max, const int la_min,
1048 const int lb_max, const int lb_min,
1049 const double prefactor, const double ra[3],
1050 const double rb[3], const double rp[3],
1051 GRID_CONST_WHEN_COLLOCATE double *cab,
1052 GRID_CONST_WHEN_INTEGRATE double *cxyz) {
1053
1054 // Computes the polynomial expansion coefficients:
1055 // (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
1056 const int lp = la_max + lb_max;
1057 double alpha[3][lb_max + 1][la_max + 1][lp + 1];
1058 memset(alpha, 0, 3 * (lb_max + 1) * (la_max + 1) * (lp + 1) * sizeof(double));
1059
1060 for (int i = 0; i < 3; i++) {
1061 const double drpa = rp[i] - ra[i];
1062 const double drpb = rp[i] - rb[i];
1063 for (int lxa = 0; lxa <= la_max; lxa++) {
1064 for (int lxb = 0; lxb <= lb_max; lxb++) {
1065 double binomial_k_lxa = 1.0;
1066 double a = 1.0;
1067 for (int k = 0; k <= lxa; k++) {
1068 double binomial_l_lxb = 1.0;
1069 double b = 1.0;
1070 for (int l = 0; l <= lxb; l++) {
1071 alpha[i][lxb][lxa][lxa - l + lxb - k] +=
1072 binomial_k_lxa * binomial_l_lxb * a * b;
1073 binomial_l_lxb *= ((double)(lxb - l)) / ((double)(l + 1));
1074 b *= drpb;
1075 }
1076 binomial_k_lxa *= ((double)(lxa - k)) / ((double)(k + 1));
1077 a *= drpa;
1078 }
1079 }
1080 }
1081 }
1082
1083 // *** initialise the coefficient matrix, we transform the sum
1084 //
1085 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
1086 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
1087 // (z-a_z)**lza
1088 //
1089 // into
1090 //
1091 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
1092 //
1093 // where p is center of the product gaussian, and lp = la_max + lb_max
1094 // (current implementation is l**7)
1095 //
1096
1097 for (int lzb = 0; lzb <= lb_max; lzb++) {
1098 for (int lza = 0; lza <= la_max; lza++) {
1099 for (int lyb = 0; lyb <= lb_max - lzb; lyb++) {
1100 for (int lya = 0; lya <= la_max - lza; lya++) {
1101 const int lxb_min = imax(lb_min - lzb - lyb, 0);
1102 const int lxa_min = imax(la_min - lza - lya, 0);
1103 for (int lxb = lxb_min; lxb <= lb_max - lzb - lyb; lxb++) {
1104 for (int lxa = lxa_min; lxa <= la_max - lza - lya; lxa++) {
1105 const int ico = coset(lxa, lya, lza);
1106 const int jco = coset(lxb, lyb, lzb);
1107 const int cab_index = jco * ncoset(la_max) + ico; // [jco, ico]
1108 for (int lzp = 0; lzp <= lza + lzb; lzp++) {
1109 for (int lyp = 0; lyp <= lp - lza - lzb; lyp++) {
1110 for (int lxp = 0; lxp <= lp - lza - lzb - lyp; lxp++) {
1111 const double p = alpha[0][lxb][lxa][lxp] *
1112 alpha[1][lyb][lya][lyp] *
1113 alpha[2][lzb][lza][lzp] * prefactor;
1114 const int lp1 = lp + 1;
1115 const int cxyz_index =
1116 lzp * lp1 * lp1 + lyp * lp1 + lxp; // [lzp, lyp, lxp]
1117#if (GRID_DO_COLLOCATE)
1118 cxyz[cxyz_index] += cab[cab_index] * p; // collocate
1119#else
1120 cab[cab_index] += cxyz[cxyz_index] * p; // integrate
1121#endif
1122 }
1123 }
1124 }
1125 }
1126 }
1127 }
1128 }
1129 }
1130 }
1131}
1132
1133/*******************************************************************************
1134 * \brief Collocates coefficients C_ab onto the grid.
1135 * \author Ole Schuett
1136 ******************************************************************************/
1137static inline void
1138cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max,
1139 const int la_min, const int lb_max, const int lb_min,
1140 const double zeta, const double zetb, const double rscale,
1141 const double dh[3][3], const double dh_inv[3][3],
1142 const double ra[3], const double rab[3], const int npts_global[3],
1143 const int npts_local[3], const int shift_local[3],
1144 const int border_width[3], const double radius,
1145 GRID_CONST_WHEN_COLLOCATE double *cab,
1147
1148 // Check if radius is too small to be mapped onto grid of given resolution.
1149 double dh_max = 0.0;
1150 for (int i = 0; i < 3; i++) {
1151 for (int j = 0; j < 3; j++) {
1152 dh_max = fmax(dh_max, fabs(dh[i][j]));
1153 }
1154 }
1155 if (2.0 * radius < dh_max) {
1156 return;
1157 }
1158
1159 const double zetp = zeta + zetb;
1160 const double f = zetb / zetp;
1161 const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
1162 const double prefactor = rscale * exp(-zeta * f * rab2);
1163 double rp[3], rb[3];
1164 for (int i = 0; i < 3; i++) {
1165 rp[i] = ra[i] + f * rab[i];
1166 rb[i] = ra[i] + rab[i];
1167 }
1168
1169 const int lp = la_max + lb_max;
1170 const size_t cxyz_size = (lp + 1) * (lp + 1) * (lp + 1);
1171 double cxyz[cxyz_size];
1172 memset(cxyz, 0, cxyz_size * sizeof(double));
1173
1174#if (GRID_DO_COLLOCATE)
1175 // collocate
1176 cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
1177 cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1178 npts_local, shift_local, border_width, radius, cxyz, grid);
1179#else
1180 // integrate
1181 cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1182 npts_local, shift_local, border_width, radius, cxyz, grid);
1183 cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
1184#endif
1185}
1186
1187// EOF
static int imax(int x, int y)
Returns the larger of two given integer (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
Definition dbm_miniapp.c:38
#define GRID_PRAGMA_UNROLL_UP_TO(N)
Definition grid_common.h:35
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
#define GRID_PRAGMA_UNROLL(N)
Definition grid_common.h:34
@ GRID_BACKEND_CPU
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
#define GRID_CONST_WHEN_COLLOCATE
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_2
static void const int j1
static void const int const int const int jg1
static void const int const int j2
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_1
static void const int const int const int const int const int const double const int const int const int int ** sphere_bounds_iter
static void const int const int kg2
static void const int const int const int const int const int const double const int const int sections[3][2 *cmax+1]
static void const int const int i
static void const int const int const int const double GRID_CONST_WHEN_COLLOCATE double * cxy
static void __attribute__((always_inline)) ortho_cx_to_grid_scalar(const int lp
Simple loop body for ortho_cx_to_grid using plain C.
static void const int cmax
static void const int kg1
static void const int const int const double pol[3][lp+1][2 *cmax+1]
const int ub
static void const int const int const int const int const int const double const int const int const int npts_local[3]
#define GRID_MAX_LP_OPTIMIZED
static void const int const int const int const int jg2
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid_0
for(int lxp=0;lxp<=lp;lxp++)
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double GRID_CONST_WHEN_INTEGRATE double * grid_3
static void const int const int const int const int const int const double const int map[3][2 *cmax+1]
#define GRID_CONST_WHEN_INTEGRATE
static void const int const int const double GRID_CONST_WHEN_COLLOCATE double * cx
#define GRID_DO_COLLOCATE
void exp_ij(const double alpha, const int offset_i, const int imin, const int imax, const int offset_j, const int jmin, const int jmax, tensor *exp_ij_)
void grid_library_counter_add(const int lp, const enum grid_backend backend, const enum grid_library_kernel kernel, const int increment)
Adds given increment to counter specified by lp, backend, and kernel.
grid_library_kernel
Various kernels provided by the grid library.
@ GRID_INTEGRATE_GENERAL
@ GRID_COLLOCATE_ORTHO
@ GRID_COLLOCATE_GENERAL
@ GRID_INTEGRATE_ORTHO
static void cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid.
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
static void ortho_cxy_to_grid(const int lp, const int k1, const int k2, const int cmax, const double pol[3][lp+1][2 *cmax+1], const int map[3][2 *cmax+1], const double dh[3][3], const double dh_inv[3][3], const double disr_radius, const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cxy, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xy onto the grid for orthorhombic case.
static void general_cij_to_ci(const int lp, const double dj, GRID_CONST_WHEN_COLLOCATE double *cij, GRID_CONST_WHEN_INTEGRATE double *ci)
Transforms coefficients C_ij into C_i by fixing grid index j.
static void general_cijk_to_cij(const int lp, const double dk, GRID_CONST_WHEN_COLLOCATE double *cijk, GRID_CONST_WHEN_INTEGRATE double *cij)
Transforms coefficients C_ijk into C_ij by fixing grid index k.
static void general_precompute_mapping(const int index_min, const int index_max, const int shift_local, const int npts_global, const int bounds[2], int map[])
Precompute mapping of grid indices for general case.
static void ortho_cxy_to_cx(const int lp, const int j1, const int j2, const int cmax, const double pol[3][lp+1][2 *cmax+1], GRID_CONST_WHEN_COLLOCATE double *cxy, GRID_CONST_WHEN_INTEGRATE double *cx)
Transforms coefficients C_xy into C_x by fixing grid index j.
static void general_cxyz_to_cijk(const int lp, const double dh[3][3], GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *cijk)
Transforms coefficients C_xyz into C_ijk.
static void general_cij_to_grid(const int lp, const int k, const int kg, const int npts_local[3], const int index_min[3], const int index_max[3], const int map_i[], const int map_j[], const double dh[3][3], const double gp[3], const double radius, const double exp_ij[], const double exp_jk[], const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *cij, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ij onto the grid for general case.
static void general_cijk_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cijk, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ijk onto the grid for general case.
static void cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_ab onto the grid.
static void general_ci_to_grid(const int lp, const int jg, const int kg, const int ismin, const int ismax, const int npts_local[3], const int index_min[3], const int index_max[3], const int map_i[], const double gp[3], const int k, const int j, const double exp_ij[], const double exp_jk[], const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_i onto the grid for general case.
static void ortho_cx_to_grid(const int lp, const int k1, const int k2, const int j1, const int j2, const int cmax, const double pol[3][lp+1][2 *cmax+1], const int map[3][2 *cmax+1], const double dh[3][3], const double dh_inv[3][3], const double kremain, const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cx, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_x onto the grid for orthorhombic case.
static void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2, const int cmax, const double pol[3][lp+1][2 *cmax+1], GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *cxy)
Transforms coefficients C_xyz into C_xz by fixing grid index k.
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
static void general_fill_exp_table(const int idir, const int jdir, const int index_min[3], const int index_max[3], const double zetp, const double dh[3][3], const double gp[3], double exp_table[])
Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
void grid_sphere_cache_lookup(const double radius, const double dh[3][3], const double dh_inv[3][3], int **sphere_bounds, double *discr_radius)
Lookup the sphere bound from cache and compute them as needed. See grid_sphere_cache....
real(dp), dimension(3) c
real(dp), dimension(3) a
real(dp), dimension(3) d
real(dp), dimension(3) b
real(dp), dimension(3) p
real(kind=dp), dimension(0:maxfac), parameter, public fac
integer, dimension(:), allocatable, public ncoset
integer, dimension(:, :, :), allocatable, public coset
integer, parameter, public gaussian