grid_dgemm_non_orthorombic_corrections.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "grid_dgemm_non_orthorombic_corrections.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "../common/grid_common.h"
#include "grid_dgemm_utils.h"

double exp_recursive(const double c_exp, const double c_exp_minus_1,
                     const int index) {
  if (index == -1)
    return c_exp_minus_1;

  if (index == 1)
    return c_exp;

  double res = 1.0;

  if (index < 0) {
    for (int i = 0; i < -index; i++) {
      res *= c_exp_minus_1;
    }
    return res;
  }

  if (index > 0) {
    for (int i = 0; i < index; i++) {
      res *= c_exp;
    }
    return res;
  }

  return 1.0;
}
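
/* exp_recursive returns c_exp^index, i.e. exp(alpha * index) when
 * c_exp = exp(alpha) and c_exp_minus_1 = exp(-alpha), using only
 * multiplications. A small usage sketch:
 *
 *   const double e = exp(0.25);
 *   double v = exp_recursive(e, 1.0 / e, -3); // ~ exp(-0.75), up to rounding
 */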

void exp_i(const double alpha, const int imin, const int imax,
           double *restrict const res) {
  const double c_exp_co = exp(alpha);
  /* const double c_exp_minus_1 = 1/ c_exp; */
  res[0] = exp(imin * alpha);
  for (int i = 1; i < (imax - imin); i++) {
    res[i] = res[i - 1] *
             c_exp_co; // exp_recursive(c_exp_co, 1.0 / c_exp_co, i + imin);
  }
}
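
/* On return, res[k] = exp(alpha * (imin + k)) for 0 <= k < imax - imin: one
 * true exp() call for the first entry, then the multiplicative recursion
 * res[k] = res[k - 1] * exp(alpha). */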

void exp_ij(const double alpha, const int offset_i, const int imin,
            const int imax, const int offset_j, const int jmin, const int jmax,
            tensor *exp_ij_) {
  double c_exp = exp(alpha * imin);
  const double c_exp_co = exp(alpha);

  for (int i = 0; i < (imax - imin); i++) {
    double *restrict dst = &idx2(exp_ij_[0], i + offset_i, offset_j);
    double ctmp = exp_recursive(c_exp, 1.0 / c_exp, jmin);

    for (int j = 0; j < (jmax - jmin); j++) {
      dst[j] *= ctmp;
      ctmp *= c_exp;
    }
    c_exp *= c_exp_co;
  }
}
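
/* exp_ij scales the existing entries in place:
 *   exp_ij_(offset_i + i, offset_j + j) *= exp(alpha * (imin + i) * (jmin + j))
 * for 0 <= i < imax - imin and 0 <= j < jmax - jmin, again trading
 * exponentials for multiplications. */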

void calculate_non_orthorombic_corrections_tensor(
    const double mu_mean, const double *r_ab, const double basis[3][3],
    const int *const xmin, const int *const xmax, bool plane[3],
    tensor *const Exp) {
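  /* On exit, for every non-orthogonal pair of directions (d1, d2) = n[dir],
   * the slice Exp[dir] holds
   *   Exp[dir][i][j] = exp(c[dir] * (xmin[d1] + i - r_ab[d1])
   *                                * (xmin[d2] + j - r_ab[d2])),
   * assembled below from a constant factor and two single-index recursions
   * via a rank-1 update (cblas_dger), with the cross term folded in by
   * exp_ij. */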
  // zx, zy, yx
  const int n[3][2] = {{0, 2}, {0, 1}, {1, 2}};

  // need to review this
  const double c[3] = {
      /* alpha gamma */
      -2.0 * mu_mean *
          (basis[0][0] * basis[2][0] + basis[0][1] * basis[2][1] +
           basis[0][2] * basis[2][2]),
      /* beta gamma */
      -2.0 * mu_mean *
          (basis[1][0] * basis[2][0] + basis[1][1] * basis[2][1] +
           basis[1][2] * basis[2][2]),
      /* alpha beta */
      -2.0 * mu_mean *
          (basis[0][0] * basis[1][0] + basis[0][1] * basis[1][1] +
           basis[0][2] * basis[1][2])};

  /* a naive implementation of the computation of exp(-2 (v_i . v_j) (i
   * - r_i) (j - r_j)) requires n * m exponentials, but we can do much
   * better with only 7 exponentials.
   *
   * first expand it: we get the factors exp(2 (v_i . v_j) i j),
   * exp(2 (v_i . v_j) i r_j), exp(2 (v_i . v_j) j r_i) and
   * exp(2 (v_i . v_j) r_i r_j) (up to signs). we use the fact that
   * exp(a + b) = exp(a) exp(b): a sum inside an exponential becomes a
   * product of exponentials.
   *
   * this means that exp(a i) with i integer can be computed recursively
   * with only one exponential.
   */

  /* we have an orthorhombic case */
  if (plane[0] && plane[1] && plane[2])
    return;

  tensor exp_tmp;
  double *x1, *x2;
  /* printf("%d %d %d\n", plane[0], plane[1], plane[2]); */
  const int max_elem =
      imax(imax(xmax[0] - xmin[0], xmax[1] - xmin[1]), xmax[2] - xmin[2]) + 1;
  initialize_tensor_3(Exp, 3, max_elem, max_elem);
  realloc_tensor(Exp);

  x1 = grid_allocate_scratch(sizeof(double) * max_elem);
  x2 = grid_allocate_scratch(sizeof(double) * max_elem);
  initialize_tensor_2(&exp_tmp, Exp->size[1], Exp->size[2]);

  memset(&idx3(Exp[0], 0, 0, 0), 0, sizeof(double) * Exp->alloc_size_);
  for (int dir = 0; dir < 3; dir++) {
    int d1 = n[dir][0];
    int d2 = n[dir][1];

    if (!plane[dir]) {
      const double c_exp_const = exp(c[dir] * r_ab[d1] * r_ab[d2]);

      exp_i(-r_ab[d2] * c[dir], xmin[d1], xmax[d1] + 1, x1);
      exp_i(-r_ab[d1] * c[dir], xmin[d2], xmax[d2] + 1, x2);

      exp_tmp.data = &idx3(Exp[0], dir, 0, 0);
      cblas_dger(CblasRowMajor, xmax[d1] - xmin[d1] + 1,
                 xmax[d2] - xmin[d2] + 1, c_exp_const, x1, 1, x2, 1,
                 &idx2(exp_tmp, 0, 0), exp_tmp.ld_);
      exp_ij(c[dir], 0, xmin[d1], xmax[d1] + 1, 0, xmin[d2], xmax[d2] + 1,
             &exp_tmp);
    }
  }
  grid_free_scratch(x1);
  grid_free_scratch(x2);
}

void calculate_non_orthorombic_corrections_tensor_blocked(
    const double mu_mean, const double *r_ab, const double basis[3][3],
    const int *const lower_corner, const int *const upper_corner,
    const int *const block_size, const int *const offset,
    const int *const xmin, const int *const xmax, bool *plane,
    tensor *const Exp) {
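  /* Blocked variant of the function above: the correction factors are
   * mathematically identical, but Exp is laid out as
   * Exp[dir][block_y][block_x][...] so that each block of the grid can read
   * its corrections as one contiguous slice. */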
  // zx, zy, yx
  const int n[3][2] = {{0, 2}, {0, 1}, {1, 2}};

  // need to review this
  const double c[3] = {
      /* alpha gamma */
      -2.0 * mu_mean *
          (basis[0][0] * basis[2][0] + basis[0][1] * basis[2][1] +
           basis[0][2] * basis[2][2]),
      /* beta gamma */
      -2.0 * mu_mean *
          (basis[1][0] * basis[2][0] + basis[1][1] * basis[2][1] +
           basis[1][2] * basis[2][2]),
      /* alpha beta */
      -2.0 * mu_mean *
          (basis[0][0] * basis[1][0] + basis[0][1] * basis[1][1] +
           basis[0][2] * basis[1][2])};

  /* a naive implementation of the computation of exp(-2 (v_i . v_j) (i
   * - r_i) (j - r_j)) requires n * m exponentials, but we can do much
   * better with only 7 exponentials.
   *
   * first expand it: we get the factors exp(2 (v_i . v_j) i j),
   * exp(2 (v_i . v_j) i r_j), exp(2 (v_i . v_j) j r_i) and
   * exp(2 (v_i . v_j) r_i r_j) (up to signs). we use the fact that
   * exp(a + b) = exp(a) exp(b): a sum inside an exponential becomes a
   * product of exponentials.
   *
   * this means that exp(a i) with i integer can be computed recursively
   * with only one exponential.
   */

  /* we have an orthorhombic case */
  if (plane[0] && plane[1] && plane[2])
    return;

  tensor exp_blocked;
  double *x1, *x2;
  /* printf("%d %d %d\n", plane[0], plane[1], plane[2]); */
  initialize_tensor_2(&exp_blocked, imax(block_size[0], block_size[1]),
                      imax(block_size[1], block_size[2]));

  const int cube_size[3] = {(upper_corner[0] - lower_corner[0]) * block_size[0],
                            (upper_corner[1] - lower_corner[1]) * block_size[1],
                            (upper_corner[2] - lower_corner[2]) *
                                block_size[2]};

  const int max_elem = imax(imax(cube_size[0], cube_size[1]), cube_size[2]);
  x1 = grid_allocate_scratch(sizeof(double) * max_elem);
  x2 = grid_allocate_scratch(sizeof(double) * max_elem);

  initialize_tensor_4(Exp, 3,
                      imax(upper_corner[0] - lower_corner[0],
                           upper_corner[1] - lower_corner[1]),
                      imax(upper_corner[2] - lower_corner[2],
                           upper_corner[1] - lower_corner[1]),
                      exp_blocked.alloc_size_);

  realloc_tensor(Exp);
  /* memset(Exp->data, 0, sizeof(double) * Exp->alloc_size_); */

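  /* x1 / x2 hold the single-index recursions for the whole extent of the
   * grid, shifted by offset[] and zero-padded outside [xmin, xmax]; each
   * (y, x) block is then assembled from its rank-1 structure (scal * src)
   * and the cross term is folded in per block by exp_ij. */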
  for (int dir = 0; dir < 3; dir++) {
    int d1 = n[dir][0];
    int d2 = n[dir][1];

    if (!plane[dir]) {
      const double c_exp_const = exp(c[dir] * r_ab[d1] * r_ab[d2]);
      memset(x1, 0, sizeof(double) * max_elem);
      memset(x2, 0, sizeof(double) * max_elem);
      /* memset(exp_tmp.data, 0, sizeof(double) * exp_tmp.alloc_size_); */
      exp_i(-r_ab[d2] * c[dir], xmin[d1], xmax[d1] + 1, x1 + offset[d1]);
      exp_i(-r_ab[d1] * c[dir], xmin[d2], xmax[d2] + 1, x2 + offset[d2]);

      for (int y = 0; y < (upper_corner[d1] - lower_corner[d1]); y++) {
        const int y_1 = y * block_size[d1];
        for (int x = 0; x < (upper_corner[d2] - lower_corner[d2]); x++) {
          const int x_1 = x * block_size[d2];
          exp_blocked.data = &idx4(Exp[0], dir, y, x, 0);

          for (int y2 = 0; y2 < block_size[d1]; y2++) {
            double *restrict dst = &idx2(exp_blocked, y2, 0);
            const double scal = x1[y_1 + y2] * c_exp_const;
            double *restrict src = &x2[x_1];
            //#pragma omp simd linear(dst, src) simdlen(8)
            GRID_PRAGMA_SIMD((dst, src), 8)
            for (int x3 = 0; x3 < block_size[d2]; x3++) {
              dst[x3] = scal * src[x3];
            }
          }

          exp_ij(c[dir], 0, xmin[d1] - offset[d1] + y_1,
                 xmin[d1] - offset[d1] + y_1 + block_size[d1], 0,
                 xmin[d2] - offset[d2] + x_1,
                 xmin[d2] - offset[d2] + x_1 + block_size[d2], &exp_blocked);
        }
      }
    }
  }

  grid_free_scratch(x1);
  grid_free_scratch(x2);
  /* free(exp_tmp.data); */
}
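
/* Note: unlike the unblocked version, Exp is not zeroed here (the memset is
 * commented out), so the slices belonging to directions with plane[dir] set
 * are left uninitialized; callers are expected to skip them based on the
 * same plane[] flags. */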

void apply_non_orthorombic_corrections(const bool plane[restrict 3],
                                       const tensor *const Exp,
                                       tensor *const cube) {
  // We should never call the non-orthorhombic corrections if everything is
  // orthorhombic.
  if (plane[0] && plane[1] && plane[2])
    return;

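  /* cube(z, y, x) is scaled by up to three pairwise factors: Exp[0](z, x),
   * Exp[1](z, y) and Exp[2](y, x). A factor is skipped when the corresponding
   * pair of directions is orthogonal (its plane[] flag is set); the branches
   * below specialize the loops for each combination. */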
  /* k and i are orthogonal, k and j as well */
  if (plane[0] && plane[1]) {
    for (int z = 0; z < cube->size[0]; z++) {
      for (int y = 0; y < cube->size[1]; y++) {
        double *restrict yx = &idx3(Exp[0], 2, y, 0);
        double *restrict dst = &idx3(cube[0], z, y, 0);
        GRID_PRAGMA_SIMD((dst, yx), 8)
        for (int x = 0; x < cube->size[2]; x++) {
          dst[x] *= yx[x];
        }
      }
    }
    return;
  }

  /* k and i are orthogonal, i and j as well */
  if (plane[0] && plane[2]) {
    for (int z = 0; z < cube->size[0]; z++) {
      for (int y = 0; y < cube->size[1]; y++) {
        const double zy = idx3(Exp[0], 1, z, y);
        double *restrict dst = &idx3(cube[0], z, y, 0);

        //#pragma omp simd linear(dst) simdlen(8)
        GRID_PRAGMA_SIMD((dst), 8)
        for (int x = 0; x < cube->size[2]; x++) {
          dst[x] *= zy;
        }
      }
    }
    return;
  }

  /* j and k are orthogonal, i and j as well */
  if (plane[1] && plane[2]) {
    for (int z = 0; z < cube->size[0]; z++) {
      double *restrict zx = &idx3(Exp[0], 0, z, 0);
      for (int y = 0; y < cube->size[1]; y++) {
        double *restrict dst = &idx3(cube[0], z, y, 0);
        //#pragma omp simd linear(dst, zx) simdlen(8)
        GRID_PRAGMA_SIMD((dst, zx), 8)
        for (int x = 0; x < cube->size[2]; x++) {
          dst[x] *= zx[x];
        }
      }
    }
    return;
  }

  if (plane[0]) {
    // z is perpendicular to x, but y is not perpendicular to either
    for (int z = 0; z < cube->size[0]; z++) {
      for (int y = 0; y < cube->size[1]; y++) {
        const double zy = idx3(Exp[0], 1, z, y);
        double *restrict yx = &idx3(Exp[0], 2, y, 0);
        double *restrict dst = &idx3(cube[0], z, y, 0);

        //#pragma omp simd linear(dst, yx) simdlen(8)
        GRID_PRAGMA_SIMD((dst, yx), 8)
        for (int x = 0; x < cube->size[2]; x++) {
          dst[x] *= zy * yx[x];
        }
      }
    }
    return;
  }

  if (plane[1]) {
    // z is perpendicular to y, but neither z and x nor y and x are orthogonal
    for (int z = 0; z < cube->size[0]; z++) {
      double *restrict zx = &idx3(Exp[0], 0, z, 0);
      for (int y = 0; y < cube->size[1]; y++) {
        double *restrict yx = &idx3(Exp[0], 2, y, 0);
        double *restrict dst = &idx3(cube[0], z, y, 0);
        //#pragma omp simd linear(dst, yx) simdlen(8)
        GRID_PRAGMA_SIMD((dst, yx), 8)
        for (int x = 0; x < cube->size[2]; x++) {
          dst[x] *= zx[x] * yx[x];
        }
      }
    }
    return;
  }

  if (plane[2]) {
    // x is perpendicular to y, but neither x and z nor y and z are orthogonal
    for (int z = 0; z < cube->size[0]; z++) {
      double *restrict zx = &idx3(Exp[0], 0, z, 0);
      for (int y = 0; y < cube->size[1]; y++) {
        const double zy = idx3(Exp[0], 1, z, y);
        double *restrict dst = &idx3(cube[0], z, y, 0);

        //#pragma omp simd linear(dst) simdlen(8)
        GRID_PRAGMA_SIMD((dst), 8)
        for (int x = 0; x < cube->size[2]; x++) {
          dst[x] *= zx[x] * zy;
        }
      }
    }
    return;
  }

  /* generic case */

  for (int z = 0; z < cube->size[0]; z++) {
    double *restrict zx = &idx3(Exp[0], 0, z, 0);
    for (int y = 0; y < cube->size[1]; y++) {
      const double zy = idx3(Exp[0], 1, z, y);
      const double *restrict yx = &idx3(Exp[0], 2, y, 0);
      double *restrict dst = &idx3(cube[0], z, y, 0);

      //#pragma omp simd linear(dst, zx, yx) simdlen(8)
      GRID_PRAGMA_SIMD((dst, zx), 8)
      for (int x = 0; x < cube->size[2]; x++) {
        dst[x] *= zx[x] * zy * yx[x];
      }
    }
  }
  return;
}
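
/* A minimal call-sequence sketch (variable names illustrative; Exp, cube,
 * plane, etc. are assumed to be set up by the caller):
 *
 *   calculate_non_orthorombic_corrections_tensor(mu_mean, r_ab, basis, xmin,
 *                                                xmax, plane, &Exp);
 *   apply_non_orthorombic_corrections(plane, &Exp, &cube);
 *
 * i.e. compute the pairwise correction factors once, then rescale every
 * element of the cube in place. */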

void apply_non_orthorombic_corrections_xy_blocked(
    const struct tensor_ *const Exp, struct tensor_ *const m) {
  for (int gamma = 0; gamma < m->size[0]; gamma++) {
    for (int y1 = 0; y1 < m->size[1]; y1++) {
      double *restrict dst = &idx3(m[0], gamma, y1, 0);
      double *restrict src = &idx2(Exp[0], y1, 0);

      //#pragma omp simd linear(dst, src) simdlen(8)
      GRID_PRAGMA_SIMD((dst, src), 8)
      for (int x1 = 0; x1 < m->size[2]; x1++) {
        dst[x1] *= src[x1];
      }
    }
  }
}

void apply_non_orthorombic_corrections_xz_blocked(
    const struct tensor_ *const Exp, struct tensor_ *const m) {
  for (int z1 = 0; z1 < m->size[0]; z1++) {
    double *restrict src = &idx2(Exp[0], z1, 0);
    for (int y1 = 0; y1 < m->size[1]; y1++) {
      double *restrict dst = &idx3(m[0], z1, y1, 0);
      //#pragma omp simd linear(dst, src) simdlen(8)
      GRID_PRAGMA_SIMD((dst, src), 8)
      for (int x1 = 0; x1 < m->size[2]; x1++) {
        dst[x1] *= src[x1];
      }
    }
  }
}

void apply_non_orthorombic_corrections_yz_blocked(
    const struct tensor_ *const Exp, struct tensor_ *const m) {
  for (int z1 = 0; z1 < m->size[0]; z1++) {
    for (int y1 = 0; y1 < m->size[1]; y1++) {
      const double src = idx2(Exp[0], z1, y1);
      double *restrict dst = &idx3(m[0], z1, y1, 0);
      //#pragma omp simd linear(dst) simdlen(8)
      GRID_PRAGMA_SIMD((dst), 8)
      for (int x1 = 0; x1 < m->size[2]; x1++) {
        dst[x1] *= src;
      }
    }
  }
}
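
/* The next helper fuses the xz and yz corrections into a single pass over m,
 * saving one full read-modify-write sweep compared to calling the xz and yz
 * variants back to back. */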

void apply_non_orthorombic_corrections_xz_yz_blocked(
    const struct tensor_ *const Exp_xz, const struct tensor_ *const Exp_yz,
    struct tensor_ *const m) {
  for (int z1 = 0; z1 < m->size[0]; z1++) {
    const double *restrict src_xz = &idx2(Exp_xz[0], z1, 0);
    for (int y1 = 0; y1 < m->size[1]; y1++) {
      const double src = idx2(Exp_yz[0], z1, y1);
      double *restrict dst = &idx3(m[0], z1, y1, 0);
      //#pragma omp simd linear(dst) simdlen(8)
      GRID_PRAGMA_SIMD((dst), 8)
      for (int x1 = 0; x1 < m->size[2]; x1++) {
        dst[x1] *= src * src_xz[x1];
      }
    }
  }
}