(git:374b731)
Loading...
Searching...
No Matches
grid_dgemm_non_orthorombic_corrections.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
9
10#include <math.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14
15#include "../common/grid_common.h"
16#include "grid_dgemm_utils.h"
17
18double exp_recursive(const double c_exp, const double c_exp_minus_1,
19 const int index) {
20 if (index == -1)
21 return c_exp_minus_1;
22
23 if (index == 1)
24 return c_exp;
25
26 double res = 1.0;
27
28 if (index < 0) {
29 for (int i = 0; i < -index; i++) {
30 res *= c_exp_minus_1;
31 }
32 return res;
33 }
34
35 if (index > 0) {
36 for (int i = 0; i < index; i++) {
37 res *= c_exp;
38 }
39 return res;
40 }
41
42 return 1.0;
43}
44
45void exp_i(const double alpha, const int imin, const int imax,
46 double *restrict const res) {
47 const double c_exp_co = exp(alpha);
48 /* const double c_exp_minus_1 = 1/ c_exp; */
49 res[0] = exp(imin * alpha);
50 for (int i = 1; i < (imax - imin); i++) {
51 res[i] = res[i - 1] *
52 c_exp_co; // exp_recursive(c_exp_co, 1.0 / c_exp_co, i + imin);
53 }
54}
55
56void exp_ij(const double alpha, const int offset_i, const int imin,
57 const int imax, const int offset_j, const int jmin, const int jmax,
58 tensor *exp_ij_) {
59 double c_exp = exp(alpha * imin);
60 const double c_exp_co = exp(alpha);
61
62 for (int i = 0; i < (imax - imin); i++) {
63 double *restrict dst = &idx2(exp_ij_[0], i + offset_i, offset_j);
64 double ctmp = exp_recursive(c_exp, 1.0 / c_exp, jmin);
65
66 for (int j = 0; j < (jmax - jmin); j++) {
67 dst[j] *= ctmp;
68 ctmp *= c_exp;
69 }
70 c_exp *= c_exp_co;
71 }
72}
73
75 const double mu_mean, const double *r_ab, const double basis[3][3],
76 const int *const xmin, const int *const xmax, bool plane[3],
77 tensor *const Exp) {
78 // zx, zy, yx
79 const int n[3][2] = {{0, 2}, {0, 1}, {1, 2}};
80
81 // need to review this
82 const double c[3] = {
83 /* alpha gamma */
84 -2.0 * mu_mean *
85 (basis[0][0] * basis[2][0] + basis[0][1] * basis[2][1] +
86 basis[0][2] * basis[2][2]),
87 /* beta gamma */
88 -2.0 * mu_mean *
89 (basis[1][0] * basis[2][0] + basis[1][1] * basis[2][1] +
90 basis[1][2] * basis[2][2]),
91 /* alpha beta */
92 -2.0 * mu_mean *
93 (basis[0][0] * basis[1][0] + basis[0][1] * basis[1][1] +
94 basis[0][2] * basis[1][2])};
95
96 /* a naive implementation of the computation of exp(-2 (v_i . v_j) (i
97 * - r_i) (j _ r_j)) requires n m exponentials but we can do much much
98 * better with only 7 exponentials
99 *
100 * first expand it. we get exp(2 (v_i . v_j) i j) exp(2 (v_i . v_j) i r_j)
101 * exp(2 (v_i . v_j) j r_i) exp(2 (v_i . v_j) r_i r_j). we can use the fact
102 * that the sum of two terms in an exponential is equal to the product of
103 * the two exponentials.
104 *
105 * this means that exp (a i) with i integer can be computed recursively with
106 * one exponential only
107 */
108
109 /* we have a orthorombic case */
110 if (plane[0] && plane[1] && plane[2])
111 return;
112
113 tensor exp_tmp;
114 double *x1, *x2;
115 /* printf("%d %d %d\n", plane[0], plane[1], plane[2]); */
116 const int max_elem =
117 imax(imax(xmax[0] - xmin[0], xmax[1] - xmin[1]), xmax[2] - xmin[2]) + 1;
118 initialize_tensor_3(Exp, 3, max_elem, max_elem);
119 realloc_tensor(Exp);
120
121 x1 = grid_allocate_scratch(sizeof(double) * max_elem);
122 x2 = grid_allocate_scratch(sizeof(double) * max_elem);
123 initialize_tensor_2(&exp_tmp, Exp->size[1], Exp->size[2]);
124
125 memset(&idx3(Exp[0], 0, 0, 0), 0, sizeof(double) * Exp->alloc_size_);
126 for (int dir = 0; dir < 3; dir++) {
127 int d1 = n[dir][0];
128 int d2 = n[dir][1];
129
130 if (!plane[dir]) {
131 const double c_exp_const = exp(c[dir] * r_ab[d1] * r_ab[d2]);
132
133 exp_i(-r_ab[d2] * c[dir], xmin[d1], xmax[d1] + 1, x1);
134 exp_i(-r_ab[d1] * c[dir], xmin[d2], xmax[d2] + 1, x2);
135
136 exp_tmp.data = &idx3(Exp[0], dir, 0, 0);
137 cblas_dger(CblasRowMajor, xmax[d1] - xmin[d1] + 1,
138 xmax[d2] - xmin[d2] + 1, c_exp_const, x1, 1, x2, 1,
139 &idx2(exp_tmp, 0, 0), exp_tmp.ld_);
140 exp_ij(c[dir], 0, xmin[d1], xmax[d1] + 1, 0, xmin[d2], xmax[d2] + 1,
141 &exp_tmp);
142 }
143 }
146}
147
149 const double mu_mean, const double *r_ab, const double basis[3][3],
150 const int *const lower_corner, const int *const upper_corner,
151 const int *const block_size, const int *const offset, const int *const xmin,
152 const int *const xmax, bool *plane, tensor *const Exp) {
153 // zx, zy, yx
154 const int n[3][2] = {{0, 2}, {0, 1}, {1, 2}};
155
156 // need to review this
157 const double c[3] = {
158 /* alpha gamma */
159 -2.0 * mu_mean *
160 (basis[0][0] * basis[2][0] + basis[0][1] * basis[2][1] +
161 basis[0][2] * basis[2][2]),
162 /* beta gamma */
163 -2.0 * mu_mean *
164 (basis[1][0] * basis[2][0] + basis[1][1] * basis[2][1] +
165 basis[1][2] * basis[2][2]),
166 /* alpha beta */
167 -2.0 * mu_mean *
168 (basis[0][0] * basis[1][0] + basis[0][1] * basis[1][1] +
169 basis[0][2] * basis[1][2])};
170
171 /* a naive implementation of the computation of exp(-2 (v_i . v_j) (i
172 * - r_i) (j _ r_j)) requires n m exponentials but we can do much much
173 * better with only 7 exponentials
174 *
175 * first expand it. we get exp(2 (v_i . v_j) i j) exp(2 (v_i . v_j) i r_j)
176 * exp(2 (v_i . v_j) j r_i) exp(2 (v_i . v_j) r_i r_j). we can use the fact
177 * that the sum of two terms in an exponential is equal to the product of
178 * the two exponentials.
179 *
180 * this means that exp (a i) with i integer can be computed recursively with
181 * one exponential only
182 */
183
184 /* we have a orthorombic case */
185 if (plane[0] && plane[1] && plane[2])
186 return;
187
188 tensor exp_blocked;
189 double *x1, *x2;
190 /* printf("%d %d %d\n", plane[0], plane[1], plane[2]); */
191 initialize_tensor_2(&exp_blocked, imax(block_size[0], block_size[1]),
192 imax(block_size[1], block_size[2]));
193
194 const int cube_size[3] = {(upper_corner[0] - lower_corner[0]) * block_size[0],
195 (upper_corner[1] - lower_corner[1]) * block_size[1],
196 (upper_corner[2] - lower_corner[2]) *
197 block_size[2]};
198
199 const int max_elem = imax(imax(cube_size[0], cube_size[1]), cube_size[2]);
200 x1 = grid_allocate_scratch(sizeof(double) * max_elem);
201 x2 = grid_allocate_scratch(sizeof(double) * max_elem);
202
203 initialize_tensor_4(Exp, 3,
204 imax(upper_corner[0] - lower_corner[0],
205 upper_corner[1] - lower_corner[1]),
206 imax(upper_corner[2] - lower_corner[2],
207 upper_corner[1] - lower_corner[1]),
208 exp_blocked.alloc_size_);
209
210 realloc_tensor(Exp);
211 /* memset(Exp->data, 0, sizeof(double) * Exp->alloc_size_); */
212
213 for (int dir = 0; dir < 3; dir++) {
214 int d1 = n[dir][0];
215 int d2 = n[dir][1];
216
217 if (!plane[dir]) {
218 const double c_exp_const = exp(c[dir] * r_ab[d1] * r_ab[d2]);
219 memset(x1, 0, sizeof(double) * max_elem);
220 memset(x2, 0, sizeof(double) * max_elem);
221 /* memset(exp_tmp.data, 0, sizeof(double) * exp_tmp.alloc_size_); */
222 exp_i(-r_ab[d2] * c[dir], xmin[d1], xmax[d1] + 1, x1 + offset[d1]);
223 exp_i(-r_ab[d1] * c[dir], xmin[d2], xmax[d2] + 1, x2 + offset[d2]);
224
225 for (int y = 0; y < (upper_corner[d1] - lower_corner[d1]); y++) {
226 const int y_1 = y * block_size[d1];
227 for (int x = 0; x < (upper_corner[d2] - lower_corner[d2]); x++) {
228 const int x_1 = x * block_size[d2];
229 exp_blocked.data = &idx4(Exp[0], dir, y, x, 0);
230
231 for (int y2 = 0; y2 < block_size[d1]; y2++) {
232 double *restrict dst = &idx2(exp_blocked, y2, 0);
233 const double scal = x1[y_1 + y2] * c_exp_const;
234 double *restrict src = &x2[x_1];
235 // #pragma omp simd linear(dst, src) simdlen(8)
236 GRID_PRAGMA_SIMD((dst, src), 8)
237 for (int x3 = 0; x3 < block_size[d2]; x3++) {
238 dst[x3] = scal * src[x3];
239 }
240 }
241
242 exp_ij(c[dir], 0, xmin[d1] - offset[d1] + y_1,
243 xmin[d1] - offset[d1] + y_1 + block_size[d1], 0,
244 xmin[d2] - offset[d2] + x_1,
245 xmin[d2] - offset[d2] + x_1 + block_size[d2], &exp_blocked);
246 }
247 }
248 }
249 }
250
253 /* free(exp_tmp.data); */
254}
255
256void apply_non_orthorombic_corrections(const bool plane[restrict 3],
257 const tensor *const Exp,
258 tensor *const cube) {
259 // Well we should never call non orthorombic corrections if everything is
260 // orthorombic
261 if (plane[0] && plane[1] && plane[2])
262 return;
263
264 /*k and i are orthogonal, k and j as well */
265 if (plane[0] && plane[1]) {
266 for (int z = 0; z < cube->size[0]; z++) {
267 for (int y = 0; y < cube->size[1]; y++) {
268 double *restrict yx = &idx3(Exp[0], 2, y, 0);
269 double *restrict dst = &idx3(cube[0], z, y, 0);
270 GRID_PRAGMA_SIMD((dst, yx), 8)
271 for (int x = 0; x < cube->size[2]; x++) {
272 dst[x] *= yx[x];
273 }
274 }
275 }
276 return;
277 }
278
279 /* k and i are orhogonal, i and j as well */
280 if (plane[0] && plane[2]) {
281 for (int z = 0; z < cube->size[0]; z++) {
282 for (int y = 0; y < cube->size[1]; y++) {
283 const double zy = idx3(Exp[0], 1, z, y);
284 double *restrict dst = &idx3(cube[0], z, y, 0);
285
286 // #pragma omp simd linear(dst) simdlen(8)
287 GRID_PRAGMA_SIMD((dst), 8)
288 for (int x = 0; x < cube->size[2]; x++) {
289 dst[x] *= zy;
290 }
291 }
292 }
293 return;
294 }
295
296 /* j, k are orthognal, i and j are orthognal */
297 if (plane[1] && plane[2]) {
298 for (int z = 0; z < cube->size[0]; z++) {
299 double *restrict zx = &idx3(Exp[0], 0, z, 0);
300 for (int y = 0; y < cube->size[1]; y++) {
301 double *restrict dst = &idx3(cube[0], z, y, 0);
302 // #pragma omp simd linear(dst, zx) simdlen(8)
303 GRID_PRAGMA_SIMD((dst, zx), 8)
304 for (int x = 0; x < cube->size[2]; x++) {
305 dst[x] *= zx[x];
306 }
307 }
308 }
309 return;
310 }
311
312 if (plane[0]) {
313 // z perpendicular to x. but y non perpendicular to any
314 for (int z = 0; z < cube->size[0]; z++) {
315 for (int y = 0; y < cube->size[1]; y++) {
316 const double zy = idx3(Exp[0], 1, z, y);
317 double *restrict yx = &idx3(Exp[0], 2, y, 0);
318 double *restrict dst = &idx3(cube[0], z, y, 0);
319
320 // #pragma omp simd linear(dst, yx) simdlen(8)
321 GRID_PRAGMA_SIMD((dst, yx), 8)
322 for (int x = 0; x < cube->size[2]; x++) {
323 dst[x] *= zy * yx[x];
324 }
325 }
326 }
327 return;
328 }
329
330 if (plane[1]) {
331 // z perpendicular to y, but x and z are not and y and x neither
332 for (int z = 0; z < cube->size[0]; z++) {
333 double *restrict zx = &idx3(Exp[0], 0, z, 0);
334 for (int y = 0; y < cube->size[1]; y++) {
335 double *restrict yx = &idx3(Exp[0], 2, y, 0);
336 double *restrict dst = &idx3(cube[0], z, y, 0);
337 // #pragma omp simd linear(dst, yx) simdlen(8)
338 GRID_PRAGMA_SIMD((dst, yx), 8)
339 for (int x = 0; x < cube->size[2]; x++) {
340 dst[x] *= zx[x] * yx[x];
341 }
342 }
343 }
344 return;
345 }
346
347 if (plane[2]) {
348 // x perpendicular to y, but x and z are not and y and z neither
349 for (int z = 0; z < cube->size[0]; z++) {
350 double *restrict zx = &idx3(Exp[0], 0, z, 0);
351 for (int y = 0; y < cube->size[1]; y++) {
352 const double zy = idx3(Exp[0], 1, z, y);
353 double *restrict dst = &idx3(cube[0], z, y, 0);
354
355 // #pragma omp simd linear(dst) simdlen(8)
356 GRID_PRAGMA_SIMD((dst), 8)
357 for (int x = 0; x < cube->size[2]; x++) {
358 dst[x] *= zx[x] * zy;
359 }
360 }
361 }
362 return;
363 }
364
365 /* generic case */
366
367 for (int z = 0; z < cube->size[0]; z++) {
368 double *restrict zx = &idx3(Exp[0], 0, z, 0);
369 for (int y = 0; y < cube->size[1]; y++) {
370 const double zy = idx3(Exp[0], 1, z, y);
371 const double *restrict yx = &idx3(Exp[0], 2, y, 0);
372 double *restrict dst = &idx3(cube[0], z, y, 0);
373
374 // #pragma omp simd linear(dst, zx, yx) simdlen(8)
375 GRID_PRAGMA_SIMD((dst, zx), 8)
376 for (int x = 0; x < cube->size[2]; x++) {
377 dst[x] *= zx[x] * zy * yx[x];
378 }
379 }
380 }
381 return;
382}
383
385 const struct tensor_ *const Exp, struct tensor_ *const m) {
386 for (int gamma = 0; gamma < m->size[0]; gamma++) {
387 for (int y1 = 0; y1 < m->size[1]; y1++) {
388 double *restrict dst = &idx3(m[0], gamma, y1, 0);
389 double *restrict src = &idx2(Exp[0], y1, 0);
390
391 // #pragma omp simd linear(dst, src) simdlen(8)
392 GRID_PRAGMA_SIMD((dst, src), 8)
393 for (int x1 = 0; x1 < m->size[2]; x1++) {
394 dst[x1] *= src[x1];
395 }
396 }
397 }
398}
399
401 const struct tensor_ *const Exp, struct tensor_ *const m) {
402 for (int z1 = 0; z1 < m->size[0]; z1++) {
403 double *restrict src = &idx2(Exp[0], z1, 0);
404 for (int y1 = 0; y1 < m->size[1]; y1++) {
405 double *restrict dst = &idx3(m[0], z1, y1, 0);
406 // #pragma omp simd linear(dst, src) simdlen(8)
407 GRID_PRAGMA_SIMD((dst, src), 8)
408 for (int x1 = 0; x1 < m->size[2]; x1++) {
409 dst[x1] *= src[x1];
410 }
411 }
412 }
413}
414
416 const struct tensor_ *const Exp, struct tensor_ *const m) {
417 for (int z1 = 0; z1 < m->size[0]; z1++) {
418 for (int y1 = 0; y1 < m->size[1]; y1++) {
419 const double src = idx2(Exp[0], z1, y1);
420 double *restrict dst = &idx3(m[0], z1, y1, 0);
421 // #pragma omp simd linear(dst) simdlen(8)
422 GRID_PRAGMA_SIMD((dst), 8)
423 for (int x1 = 0; x1 < m->size[2]; x1++) {
424 dst[x1] *= src;
425 }
426 }
427 }
428}
429
431 const struct tensor_ *const Exp_xz, const struct tensor_ *const Exp_yz,
432 struct tensor_ *const m) {
433 for (int z1 = 0; z1 < m->size[0]; z1++) {
434 const double *restrict src_xz = &idx2(Exp_xz[0], z1, 0);
435 for (int y1 = 0; y1 < m->size[1]; y1++) {
436 const double src = idx2(Exp_yz[0], z1, y1);
437 double *restrict dst = &idx3(m[0], z1, y1, 0);
438 // #pragma omp simd linear(dst) simdlen(8)
439 GRID_PRAGMA_SIMD((dst), 8)
440 for (int x1 = 0; x1 < m->size[2]; x1++) {
441 dst[x1] *= src * src_xz[x1];
442 }
443 }
444 }
445}
static int imax(int x, int y)
Returns the larger of two given integer (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
Definition dbm_miniapp.c:38
#define GRID_PRAGMA_SIMD(OBJS, N)
Definition grid_common.h:20
static void const int const int i
void apply_non_orthorombic_corrections(const bool plane[restrict 3], const tensor *const Exp, tensor *const cube)
void exp_ij(const double alpha, const int offset_i, const int imin, const int imax, const int offset_j, const int jmin, const int jmax, tensor *exp_ij_)
void exp_i(const double alpha, const int imin, const int imax, double *restrict const res)
void calculate_non_orthorombic_corrections_tensor(const double mu_mean, const double *r_ab, const double basis[3][3], const int *const xmin, const int *const xmax, bool plane[3], tensor *const Exp)
void apply_non_orthorombic_corrections_xz_yz_blocked(const struct tensor_ *const Exp_xz, const struct tensor_ *const Exp_yz, struct tensor_ *const m)
void apply_non_orthorombic_corrections_yz_blocked(const struct tensor_ *const Exp, struct tensor_ *const m)
double exp_recursive(const double c_exp, const double c_exp_minus_1, const int index)
void calculate_non_orthorombic_corrections_tensor_blocked(const double mu_mean, const double *r_ab, const double basis[3][3], const int *const lower_corner, const int *const upper_corner, const int *const block_size, const int *const offset, const int *const xmin, const int *const xmax, bool *plane, tensor *const Exp)
void apply_non_orthorombic_corrections_xz_blocked(const struct tensor_ *const Exp, struct tensor_ *const m)
void apply_non_orthorombic_corrections_xy_blocked(const struct tensor_ *const Exp, struct tensor_ *const m)
size_t realloc_tensor(tensor *t)
static void initialize_tensor_4(struct tensor_ *a, int n1, int n2, int n3, int n4)
#define idx2(a, i, j)
#define idx4(a, i, j, k, l)
static void initialize_tensor_3(struct tensor_ *a, int n1, int n2, int n3)
#define idx3(a, i, j, k)
static void initialize_tensor_2(struct tensor_ *a, int n1, int n2)
void cblas_dger(const CBLAS_LAYOUT Layout, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
static void * grid_allocate_scratch(size_t size)
@ CblasRowMajor
static void grid_free_scratch(void *ptr)
Calculation of the incomplete Gamma function F_n(t) for multi-center integrals over Cartesian Gaussia...
Definition gamma.F:15