grid_dgemm_collocate.c
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <limits.h>
10#include <math.h>
11#include <omp.h>
12#include <stdbool.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16
17#ifdef __LIBXSMM
18#include <libxsmm.h>
19#endif
20
21#include "../common/grid_basis_set.h"
22#include "../common/grid_common.h"
23#include "../common/grid_constants.h"
32
33void collocate_l0(double *scratch, const double alpha, const bool orthogonal,
34 const struct tensor_ *exp_xy,
35 const struct tensor_ *p_alpha_beta_reduced_,
36 struct tensor_ *cube);
37
38void rotate_to_cartesian_harmonics(const grid_basis_set *ibasis,
39 const grid_basis_set *jbasis,
40 const int iatom, const int jatom,
41 const int iset, const int jset,
42 double *const block, tensor *work,
43 tensor *pab) {
44 dgemm_params m1, m2;
45 memset(&m1, 0, sizeof(dgemm_params));
46 memset(&m2, 0, sizeof(dgemm_params));
47
48 // Define some more convenient aliases.
49 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
50 const int nsgf_setb = jbasis->nsgf_set[jset];
51 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
52 const int nsgfb = jbasis->nsgf;
53 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
54 const int sgfb = jbasis->first_sgf[jset] - 1;
55 const int maxcoa = ibasis->maxco;
56 const int maxcob = jbasis->maxco;
57 const int ncoseta = ncoset(ibasis->lmax[iset]);
58 const int ncosetb = ncoset(jbasis->lmax[jset]);
59 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of cartesian set
60 const int ncob = jbasis->npgf[jset] * ncosetb;
61
62 initialize_tensor_2(work, nsgf_setb, ncoa);
63 realloc_tensor(work);
64
65 initialize_tensor_2(pab, ncob, ncoa);
66 realloc_tensor(pab);
67
68 // the rotations happen here.
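// In matrix form, the two dgemms below compute (using the aliases above)
// pab = sphi_b^T * block_sub * sphi_a, where block_sub is the
// (nsgf_setb x nsgf_seta) sub-block starting at (sgfb, sgfa); when
// iatom > jatom the stored sub-block is transposed, hence op1 = 'T'.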
69 if (iatom <= jatom) {
70 m1.op1 = 'N';
71 m1.op2 = 'N';
72 m1.m = work->size[0];
73 m1.n = work->size[1];
74 m1.k = nsgf_seta;
75 m1.alpha = 1.0;
76 m1.beta = 0.0;
77 m1.a = block + sgfb * nsgfa + sgfa;
78 m1.lda = nsgfa;
79 m1.b = &ibasis->sphi[sgfa * maxcoa];
80 m1.ldb = maxcoa;
81 m1.c = work->data;
82 m1.ldc = work->ld_;
83 } else {
84 m1.op1 = 'T';
85 m1.op2 = 'N';
86 m1.m = work->size[0];
87 m1.n = work->size[1];
88 m1.k = nsgf_seta;
89 m1.alpha = 1.0;
90 m1.beta = 0.0;
91 m1.a = block + sgfa * nsgfb + sgfb;
92 m1.lda = nsgfb;
93 m1.b = &ibasis->sphi[sgfa * maxcoa];
94 m1.ldb = maxcoa;
95 m1.c = work->data;
96 m1.ldc = work->ld_;
97 }
98 m1.use_libxsmm = true;
99 dgemm_simplified(&m1);
100
101 m2.op1 = 'T';
102 m2.op2 = 'N';
103 m2.m = pab->size[0];
104 m2.n = pab->size[1];
105 m2.k = work->size[0];
106 m2.alpha = 1.0;
107 m2.beta = 0.0;
108 m2.a = &jbasis->sphi[sgfb * maxcob];
109 m2.lda = maxcob;
110 m2.b = work->data;
111 m2.ldb = work->ld_;
112 m2.c = pab->data;
113 m2.ldc = pab->ld_;
114 m2.use_libxsmm = true;
115 dgemm_simplified(&m2);
116}
117
118void tensor_reduction_for_collocate_integrate(
119 double *scratch, const double alpha, const bool *const orthogonal,
120 const struct tensor_ *Exp, const struct tensor_ *co,
121 const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube);
122
123/* compute the functions (x - x_i)^l exp(-eta (x - x_i)^2) for l = 0..lp using
124 * a recursive relation to avoid computing the exponential on each grid point.
125 * I think it is not really necessary anymore since it is *not* the dominating
126 * contribution to the cost of collocate and integrate. */
127
128void grid_fill_pol_dgemm(const bool transpose, const double dr,
129 const double roffset, const int pol_offset,
130 const int xmin, const int xmax, const int lp,
131 const int cmax, const double zetp, double *pol_) {
132 tensor pol;
133 const double t_exp_1 = exp(-zetp * dr * dr);
134 const double t_exp_2 = t_exp_1 * t_exp_1;
135
136 double t_exp_min_1 = exp(-zetp * (dr - roffset) * (dr - roffset));
137 double t_exp_min_2 = exp(2.0 * zetp * (dr - roffset) * dr);
138
139 double t_exp_plus_1 = exp(-zetp * roffset * roffset);
140 double t_exp_plus_2 = exp(2.0 * zetp * roffset * dr);
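/* Recursion maintained by the loops below (a = zetp, r = roffset, grid
 * spacing dr). Stepping one grid point towards -x:
 *   exp(-a*((x - dr) - r)^2) = exp(-a*(x - r)^2) * exp(2*a*(x - r)*dr) * exp(-a*dr^2)
 *   exp(2*a*((x - dr) - r)*dr) = exp(2*a*(x - r)*dr) * exp(-2*a*dr^2)
 * t_exp_min_1 carries the first factor and t_exp_min_2 the second, so each
 * step costs two multiplications instead of an exp() call; the forward sweep
 * (t_exp_plus_*) uses the same identities with the sign of the cross term
 * flipped. */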
141
142 if (transpose) {
143 initialize_tensor_2(&pol, cmax, lp + 1);
144 pol.data = pol_;
145 /* This is the original Ole code. The integration routine needs the
146 * polynomials transposed, and this branch already produces them that way. */
147 for (int ig = 0; ig >= xmin; ig--) {
148 const double rpg = ig * dr - roffset;
149 t_exp_min_1 *= t_exp_min_2 * t_exp_1;
150 t_exp_min_2 *= t_exp_2;
151 double pg = t_exp_min_1;
152 for (int icoef = 0; icoef <= lp; icoef++) {
153 idx2(pol, pol_offset + ig - xmin, icoef) = pg;
154 pg *= rpg;
155 }
156 }
157
158 double t_exp_plus_1 = exp(-zetp * roffset * roffset);
159 double t_exp_plus_2 = exp(2 * zetp * roffset * dr);
160 for (int ig = 1; ig <= xmax; ig++) {
161 const double rpg = ig * dr - roffset;
162 t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
163 t_exp_plus_2 *= t_exp_2;
164 double pg = t_exp_plus_1;
165 for (int icoef = 0; icoef <= lp; icoef++) {
166 idx2(pol, pol_offset + ig - xmin, icoef) = pg;
167 pg *= rpg;
168 }
169 }
170
171 } else {
172 initialize_tensor_2(&pol, lp + 1, cmax);
173 pol.data = pol_;
174 /* memset(pol.data, 0, sizeof(double) * pol.alloc_size_); */
175 /*
176 * compute the values of all (x-xp)**lp*exp(..)
177 *
178 * still requires the old trick:
179 * new trick to avoid too many exps (reuse the result from the previous
180 * gridpoint): exp( -a*(x+d)**2)=exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
181 * exp(-2*a*(x+d)*d)=exp(-2*a*x*d)*exp(-2*a*d**2)
182 */
183
184 /* compute the exponential recursively and store the polynomial prefactors
185 * as well */
186 for (int ig = 0; ig >= xmin; ig--) {
187 const double rpg = ig * dr - roffset;
188 t_exp_min_1 *= t_exp_min_2 * t_exp_1;
189 t_exp_min_2 *= t_exp_2;
190 double pg = t_exp_min_1;
191 idx2(pol, 0, pol_offset + ig - xmin) = pg;
192 if (lp > 0)
193 idx2(pol, 1, pol_offset + ig - xmin) = rpg;
194 }
195
196 for (int ig = 1; ig <= xmax; ig++) {
197 const double rpg = ig * dr - roffset;
198 t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
199 t_exp_plus_2 *= t_exp_2;
200 double pg = t_exp_plus_1;
201 idx2(pol, 0, pol_offset + ig - xmin) = pg;
202 if (lp > 0)
203 idx2(pol, 1, pol_offset + ig - xmin) = rpg;
204 }
205
206 /* compute the remaining powers using previously computed stuff */
207 if (lp >= 2) {
208 double *__restrict__ poly = &idx2(pol, 1, 0);
209 double *__restrict__ src1 = &idx2(pol, 0, 0);
210 double *__restrict__ dst = &idx2(pol, 2, 0);
211 // #pragma omp simd linear(dst, src1, poly) simdlen(8)
212 GRID_PRAGMA_SIMD((dst, src1, poly), 8)
213 for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++)
214 dst[ig] = src1[ig] * poly[ig] * poly[ig];
215 }
216
217 for (int icoef = 3; icoef <= lp; icoef++) {
218 double *__restrict__ poly = &idx2(pol, 1, 0);
219 double *__restrict__ src1 = &idx2(pol, icoef - 1, 0);
220 double *__restrict__ dst = &idx2(pol, icoef, 0);
221 GRID_PRAGMA_SIMD((dst, src1, poly), 8)
222 for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
223 dst[ig] = src1[ig] * poly[ig];
224 }
225 }
226
227 //
228 if (lp > 0) {
229 // The src variable cannot be declared const because that would violate
230 // the OpenMP standard.
231 double *__restrict__ dst = &idx2(pol, 1, 0);
232 double *__restrict__ src = &idx2(pol, 0, 0);
233 GRID_PRAGMA_SIMD((dst, src), 8)
234 for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
235 dst[ig] *= src[ig];
236 }
237 }
238 }
239}
240
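/* Add handler->cube onto handler->grid, restricted to the grid points lying
 * inside the sphere of radius disr_radius around the cube center. The cell is
 * assumed orthorhombic, so the radius test factorizes per direction (kremain,
 * jremain below). */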
241void apply_sphere_cutoff_ortho(struct collocation_integration_ *const handler,
242 const double disr_radius, const int cmax,
243 const int *const lb_cube,
244 const int *const cube_center) {
245 // a mapping so that the ig corresponds to the right grid point
246 int **map = handler->map;
247 map[1] = map[0] + 2 * cmax + 1;
248 map[2] = map[1] + 2 * cmax + 1;
249 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
250
251 for (int i = 0; i < 3; i++) {
252 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
253 map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
254 handler->grid.lower_corner[i],
255 handler->grid.full_size[i]);
256 }
257 }
258
259 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
260 .xmax = handler->grid.window_size[0]};
261 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
262 .xmax = handler->grid.window_size[1]};
263 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
264 .xmax = handler->grid.window_size[2]};
265
266 for (int kg = 0; kg < handler->cube.size[0]; kg++) {
267 const int k = map[0][kg];
268 const int kd =
269 (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
270 const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
271 const double kremain = disr_radius * disr_radius - kr * kr;
272
273 if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
274
275 const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
276 for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
277 const int j = map[1][jg - lb_cube[1]];
278 const double jr = ((2 * jg - 1) >> 1) *
279 handler->dh[1][1]; // distance from center in a.u.
280 const double jremain = kremain - jr * jr;
281
282 if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
283 const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
284 const int xmax = 1 - xmin;
285
286 // printf("xmin %d, xmax %d\n", xmin, xmax);
287 for (int x = xmin - lb_cube[2];
288 x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
289 const int x1 = map[2][x];
290
291 if (!is_point_in_interval(x1, xwindow))
292 continue;
293
294 int lower_corner[3] = {k, j, x1};
295 int upper_corner[3] = {k + 1, j + 1, x1 + 1};
296
297 compute_interval(map[2], handler->grid.full_size[2],
298 handler->grid.size[2], xmax - lb_cube[2], x1, &x,
299 lower_corner + 2, upper_corner + 2, xwindow);
300
301 if (upper_corner[2] - lower_corner[2]) {
302 const int position1[3] = {kg, jg - lb_cube[1], x};
303
304 double *restrict dst = &idx3(handler->grid, lower_corner[0],
305 lower_corner[1], lower_corner[2]);
306 double *restrict src = &idx3(handler->cube, position1[0],
307 position1[1], position1[2]);
308
309 const int sizex = upper_corner[2] - lower_corner[2];
310 GRID_PRAGMA_SIMD((dst, src), 8)
311 for (int x = 0; x < sizex; x++) {
312 dst[x] += src[x];
313 }
314 }
315
316 if (handler->grid.size[2] == handler->grid.full_size[2])
317 update_loop_index(handler->grid.full_size[2], x1, &x);
318 else
319 x += upper_corner[2] - lower_corner[2] - 1;
320 }
321 }
322 }
323 }
324 }
325}
326
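/* Same operation as apply_sphere_cutoff_ortho, but for a general
 * (non-orthorhombic) cell: for each (z, y) line the admissible x range inside
 * the sphere is obtained by solving the quadratic a x^2 + b x + c = 0, with a,
 * b, c built from handler->dh below. */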
327void apply_spherical_cutoff_generic(
328 struct collocation_integration_ *const handler, const double disr_radius,
329 const int cmax, const int *const lb_cube, const int *const ub_cube,
330 const double *const roffset, const int *const cube_center) {
331
332 const double a = handler->dh[0][0] * handler->dh[0][0] +
333 handler->dh[0][1] * handler->dh[0][1] +
334 handler->dh[0][2] * handler->dh[0][2];
335 const double a_inv = 1.0 / a;
336 // a mapping so that the ig corresponds to the right grid point
337 int **map = handler->map;
338 map[1] = map[0] + 2 * cmax + 1;
339 map[2] = map[1] + 2 * cmax + 1;
340 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
341
342 for (int i = 0; i < 3; i++) {
343 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
344 map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
345 handler->grid.lower_corner[i],
346 handler->grid.full_size[i]);
347 }
348 }
349
350 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
351 .xmax = handler->grid.window_size[0]};
352 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
353 .xmax = handler->grid.window_size[1]};
354 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
355 .xmax = handler->grid.window_size[2]};
356
357 for (int k = 0; k < handler->cube.size[0]; k++) {
358 const int iz = map[0][k];
359
360 if (!is_point_in_interval(iz, zwindow))
361 continue;
362
363 const double tz = (k + lb_cube[0] - roffset[0]);
364 const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
365 tz * handler->dh[2][2]};
366
367 for (int j = 0; j < handler->cube.size[1]; j++) {
368 const int iy = map[1][j];
369
370 if (!is_point_in_interval(iy, ywindow))
371 continue;
372
373 const double ty = (j - roffset[1] + lb_cube[1]);
374 const double y[3] = {z[0] + ty * handler->dh[1][0],
375 z[1] + ty * handler->dh[1][1],
376 z[2] + ty * handler->dh[1][2]};
377
378 /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 -
379 * 4 (x1^2 + y1^2 + z1^2)
380 * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
381
382 const double b =
383 -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
384 handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
385 handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
386
387 const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
388 (roffset[2] * handler->dh[0][0] - y[0]) +
389 (roffset[2] * handler->dh[0][1] - y[1]) *
390 (roffset[2] * handler->dh[0][1] - y[1]) +
391 (roffset[2] * handler->dh[0][2] - y[2]) *
392 (roffset[2] * handler->dh[0][2] - y[2]) -
393 disr_radius * disr_radius;
394
395 double delta = b * b - 4.0 * a * c;
396
397 if (delta < 0.0)
398 continue;
399
400 delta = sqrt(delta);
401
402 const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
403 const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
404
405 int lower_corner[3] = {iz, iy, xmin};
406 int upper_corner[3] = {iz + 1, iy + 1, xmin};
407
408 for (int x = xmin - lb_cube[2];
409 x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
410 const int x1 = map[2][x];
411
412 if (!is_point_in_interval(x1, xwindow))
413 continue;
414
415 compute_interval(map[2], handler->grid.full_size[2],
416 handler->grid.size[2], xmax - lb_cube[2], x1, &x,
417 lower_corner + 2, upper_corner + 2, xwindow);
418
419 if (upper_corner[2] - lower_corner[2]) {
420 const int position1[3] = {k, j, x};
421
422 /* the function will internally take care of the local vs global
423 * grid */
424
425 double *__restrict__ dst = &idx3(handler->grid, lower_corner[0],
426 lower_corner[1], lower_corner[2]);
427 double *__restrict__ src =
428 &idx3(handler->cube, position1[0], position1[1], position1[2]);
429
430 const int sizex = upper_corner[2] - lower_corner[2];
431 GRID_PRAGMA_SIMD((dst, src), 8)
432 for (int x = 0; x < sizex; x++) {
433 dst[x] += src[x];
434 }
435
436 if (handler->grid.size[0] == handler->grid.full_size[0])
437 update_loop_index(handler->grid.full_size[2], x1, &x);
438 else
439 x += upper_corner[2] - lower_corner[2] - 1;
440 }
441 }
442 }
443 }
444}
445
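/* collocate_l0 handles the single-coefficient case: it overwrites cube with
 * cube(k, j, i) = alpha * pz(k) * py(j) * px(i), optionally scaled by
 * exp_xy(j, i) when the xy plane is not orthogonal; the two dger calls below
 * build the outer products. */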
446void collocate_l0(double *scratch, const double alpha, const bool orthogonal_xy,
447 const struct tensor_ *exp_xy,
448 const struct tensor_ *p_alpha_beta_reduced_,
449 struct tensor_ *cube) {
450 const double *__restrict pz =
451 &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k index */
452 const double *__restrict py =
453 &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j index */
454 const double *__restrict px =
455 &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i index */
456
457 memset(&idx3(cube[0], 0, 0, 0), 0, sizeof(double) * cube->alloc_size_);
458 memset(scratch, 0, sizeof(double) * cube->size[1] * cube->ld_);
459
460 cblas_dger(CblasRowMajor, cube->size[1], cube->size[2], alpha, py, 1, px, 1,
461 scratch, cube->ld_);
462
463 if (exp_xy && !orthogonal_xy) {
464 for (int y = 0; y < cube->size[1]; y++) {
465 const double *__restrict src = &idx2(exp_xy[0], y, 0);
466 double *__restrict dst = &scratch[y * cube->ld_];
467 // #pragma omp simd linear(dst, src) simdlen(8)
468 GRID_PRAGMA_SIMD((dst, src), 8)
469 for (int x = 0; x < cube->size[2]; x++) {
470 dst[x] *= src[x];
471 }
472 }
473 }
474
475 cblas_dger(CblasRowMajor, cube->size[0], cube->size[1] * cube->ld_, 1.0, pz,
476 1, scratch, 1, &idx3(cube[0], 0, 0, 0), cube->size[1] * cube->ld_);
477}
478
479/* compute the following operation (or a variant of it); it is a tensor contraction
480
481 V_{kji} = \sum_{\alpha\beta\gamma}
482 C_{\alpha\gamma\beta} T_{2,\alpha,i} T_{1,\beta,j} T_{0,\gamma,k}
483
484*/
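/* Roadmap of the three dgemms set up below (same indices as in the formula
 * above; px, py, pz are the i-, j- and k-direction slices of
 * p_alpha_beta_reduced_):
 *   T(alpha, gamma, j) = sum_beta  C(alpha, gamma, beta) * py(beta, j)    (m1)
 *   W(gamma, j, i)     = sum_alpha T(alpha, gamma, j)    * px(alpha, i)   (m2)
 *   cube(k, j, i)      = sum_gamma pz(gamma, k)          * W(gamma, j, i) (m3) */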
485void tensor_reduction_for_collocate_integrate(
486 double *scratch, const double alpha, const bool *const orthogonal,
487 const struct tensor_ *Exp, const struct tensor_ *co,
488 const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube) {
489 if (co->size[0] > 1) {
490 dgemm_params m1, m2, m3;
491
492 memset(&m1, 0, sizeof(dgemm_params));
493 memset(&m2, 0, sizeof(dgemm_params));
494 memset(&m3, 0, sizeof(dgemm_params));
495 tensor T;
496 tensor W;
497
498 double *__restrict const pz =
499 &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k index */
500 double *__restrict const py =
501 &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j index */
502 double *__restrict const px =
503 &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i index */
504
505 initialize_tensor_3(&T, co->size[0] /* alpha */, co->size[1] /* gamma */,
506 cube->size[1] /* j */);
507 initialize_tensor_3(&W, co->size[1] /* gamma */, cube->size[1] /* j */,
508 cube->size[2] /* i */);
509
510 T.data = scratch;
511 W.data = scratch + T.alloc_size_;
512 /* WARNING we are in row major layout. cblas allows it and it is more
513 * natural to read left to right than top to bottom
514 *
515 * we do first T_{\alpha,\gamma,j} = \sum_beta C_{alpha\gamma\beta}
516 * Y_{\beta, j}
517 *
518 * keep in mind that Y_{\beta, j} = p_alpha_beta_reduced(1, \beta, j)
519 * and the order of indices is also important: the last index is the
520 * fastest one. It can be done with one dgemm.
521 */
522
523 m1.op1 = 'N';
524 m1.op2 = 'N';
525 m1.alpha = alpha;
526 m1.beta = 0.0;
527 m1.m = co->size[0] * co->size[1]; /* alpha gamma */
528 m1.n = cube->size[1]; /* j */
529 m1.k = co->size[2]; /* beta */
530 m1.a = co->data; // Coef_{alpha,gamma,beta} Coef_xzy
531 m1.lda = co->ld_;
532 m1.b = py; // Y_{beta, j} = p_alpha_beta_reduced(1, beta, j)
533 m1.ldb = p_alpha_beta_reduced_->ld_;
534 m1.c = T.data; // T_{\alpha, \gamma, j} = T(alpha, gamma, j)
535 m1.ldc = T.ld_;
536 m1.use_libxsmm = true;
537 /*
538 * the next step is a reduction along the alpha index.
539 *
540 * We compute then
541 *
542 * W_{gamma, j, i} = sum_{\alpha} T_{\gamma, j, alpha} X_{\alpha, i}
543 *
544 * which means we need to transpose T_{\alpha, \gamma, j} to get
545 * T_{\gamma, j, \alpha}. Fortunately we can do it while doing the
546 * matrix - matrix multiplication
547 */
548
549 m2.op1 = 'T';
550 m2.op2 = 'N';
551 m2.alpha = 1.0;
552 m2.beta = 0.0;
553 m2.m = cube->size[1] * co->size[1]; // (\gamma j) direction
554 m2.n = cube->size[2]; // i
555 m2.k = co->size[0]; // alpha
556 m2.a = T.data; // T_{\alpha, \gamma, j}
557 m2.lda = T.ld_ * co->size[1];
558 m2.b = px; // X_{alpha, i} = p_alpha_beta_reduced(0, alpha, i)
559 m2.ldb = p_alpha_beta_reduced_->ld_;
560 m2.c = W.data; // W_{\gamma, j, i}
561 m2.ldc = W.ld_;
562 m2.use_libxsmm = true;
563 /* the final step is again a reduction along the gamma index. It can
564 * again be done with one dgemm. The operation is simply
565 *
566 * Cube_{k, j, i} = \sum_{\gamma} Z_{k, \gamma} W_{\gamma, j, i}
567 *
568 * which means we need to transpose Z_{\gamma, k}.
569 */
570
571 m3.op1 = 'T';
572 m3.op2 = 'N';
573 m3.alpha = alpha;
574 m3.beta = 0.0;
575 m3.m = cube->size[0]; // Z_{k \gamma}
576 m3.n = cube->size[1] * cube->size[2]; // (ji) direction
577 m3.k = co->size[1]; // \gamma
578 m3.a = pz; // Z_{gamma, k} = p_alpha_beta_reduced(0, gamma, k)
579 m3.lda = p_alpha_beta_reduced_->ld_;
580 m3.b = &idx3(W, 0, 0, 0); // W_{\gamma, j, i}
581 m3.ldb = W.size[1] * W.ld_;
582 m3.c = &idx3(cube[0], 0, 0, 0); // cube_{kji}
583 m3.ldc = cube->ld_ * cube->size[1];
584 m3.use_libxsmm = true;
585 dgemm_simplified(&m1);
586 dgemm_simplified(&m2);
587
588 // apply the non-orthorhombic corrections in the xy plane
589 if (Exp && !orthogonal[2]) {
590 tensor exp_xy;
591 initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
592 exp_xy.data = &idx3(Exp[0], 2, 0, 0);
593 apply_non_orthorombic_corrections_xy_blocked(&exp_xy, &W);
594 }
595
596 dgemm_simplified(&m3);
597 } else {
598 if (Exp && !orthogonal[2]) {
599 tensor exp_xy;
600 initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
601
602 exp_xy.data = &idx3(Exp[0], 2, 0, 0);
603 collocate_l0(scratch, co->data[0] * alpha, orthogonal[2], &exp_xy,
604 p_alpha_beta_reduced_, cube);
605 } else {
606 collocate_l0(scratch, co->data[0] * alpha, true, NULL,
607 p_alpha_beta_reduced_, cube);
608 }
609 }
610
611 if (Exp == NULL)
612 return;
613
614 if (Exp && (!orthogonal[0] && !orthogonal[1])) {
615 tensor exp_yz;
616 tensor exp_xz;
617 initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
618 initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
619 exp_xz.data = &idx3(Exp[0], 0, 0, 0);
620 exp_yz.data = &idx3(Exp[0], 1, 0, 0);
621 apply_non_orthorombic_corrections_xz_yz_blocked(&exp_xz, &exp_yz, cube);
622 return;
623 }
624
625 if (Exp && (!orthogonal[0] || !orthogonal[1])) {
626 if (!orthogonal[0]) {
627 tensor exp_xz;
628 initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
629 exp_xz.data = &idx3(Exp[0], 0, 0, 0);
630 apply_non_orthorombic_corrections_xz_blocked(&exp_xz, cube);
631 }
632
633 if (!orthogonal[1]) {
634 tensor exp_yz;
635 initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
636 exp_yz.data = &idx3(Exp[0], 1, 0, 0);
637 apply_non_orthorombic_corrections_yz_blocked(&exp_yz, cube);
638 }
639 }
640
641 return;
642}
643
644/* It is a sub-optimal version of the mapping in case of a cubic cutoff. But it
645 * is very general and does not depend on the orthorhombic nature of the grid. For
646 * orthorhombic cases, it is faster to apply PBC directly on the polynomials. */
647
648void apply_mapping_cubic(struct collocation_integration_ *handler,
649 const int cmax, const int *const lower_boundaries_cube,
650 const int *const cube_center) {
651
652 // a mapping so that the ig corresponds to the right grid point
653 int **map = handler->map;
654 map[1] = map[0] + 2 * cmax + 1;
655 map[2] = map[1] + 2 * cmax + 1;
656
657 for (int i = 0; i < 3; i++) {
658 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
659 map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
660 handler->grid.lower_corner[i],
661 handler->grid.full_size[i]);
662 }
663 }
664
665 int lower_corner[3];
666 int upper_corner[3];
667
668 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
669 .xmax = handler->grid.window_size[0]};
670 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
671 .xmax = handler->grid.window_size[1]};
672 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
673 .xmax = handler->grid.window_size[2]};
674
675 /* this code makes a decomposition of the cube such that we can add blocks of
676 * data in a vectorized way. */
677
678 /* the decomposition depends unfortunately on the way the grid is split over
679 * mpi ranks. If each mpi rank has the full grid then it is simple. A 1D
680 * example of the decomposition will explain it better. We have an interval
681 * [x1, x1 + cube_size - 1] (and a second index x [0, cube_size -1]) and a
682 * grid that goes from [0.. grid_size - 1].
683 *
684 * We start from x1 and compute the largest interval [x1.. x1 + diff] that fits
685 * into [0.. grid_size - 1]. Computing the difference diff is simply
686 * min(grid_size - x1, cube_size - x). Then we add the result in a vectorized
687 * fashion. We iterate the process by reducing the interval [x1, x1 +
688 * cube_size - 1] until it is empty. */
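/* A concrete 1D instance of the scheme above: grid_size = 10, x1 = 7,
 * cube_size = 6. The first chunk has diff = min(10 - 7, 6 - 0) = 3 and updates
 * grid points [7..9] from cube indices [0..2]; x1 then wraps to 0 and the
 * second chunk has diff = min(10, 3) = 3, updating grid points [0..2] from
 * cube indices [3..5]. */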
689
690 for (int z = 0; (z < handler->cube.size[0]); z++) {
691 const int z1 = map[0][z];
692
693 if (!is_point_in_interval(z1, zwindow))
694 continue;
695
696 compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
697 handler->cube.size[0], z1, &z, lower_corner, upper_corner,
698 zwindow);
699
700 /* // We have a full plane. */
701 if (upper_corner[0] - lower_corner[0]) {
702 for (int y = 0; y < handler->cube.size[1]; y++) {
703 const int y1 = map[1][y];
704
705 // this check is completely irrelevant when running without MPI.
706 if (!is_point_in_interval(y1, ywindow))
707 continue;
708
709 compute_interval(map[1], handler->grid.full_size[1],
710 handler->grid.size[1], handler->cube.size[1], y1, &y,
711 lower_corner + 1, upper_corner + 1, ywindow);
712
713 if (upper_corner[1] - lower_corner[1]) {
714 for (int x = 0; x < handler->cube.size[2]; x++) {
715 const int x1 = map[2][x];
716
717 if (!is_point_in_interval(x1, xwindow))
718 continue;
719
720 compute_interval(map[2], handler->grid.full_size[2],
721 handler->grid.size[2], handler->cube.size[2], x1,
722 &x, lower_corner + 2, upper_corner + 2, xwindow);
723
724 if (upper_corner[2] - lower_corner[2]) {
725 const int position1[3] = {z, y, x};
726 /* the function will internally take care of the local vs global
727 * grid */
728 add_sub_grid(
729 lower_corner, // lower corner of the portion of cube (in the
730 // full grid)
731 upper_corner, // upper corner of the portion of cube (in the
732 // full grid)
733 position1, // starting position subblock inside the cube
734 &handler->cube, // the cube to extract data from
735 &handler->grid); // the grid to add data from
736 if (handler->grid.size[2] == handler->grid.full_size[2])
737 update_loop_index(handler->grid.full_size[2], x1, &x);
738 else
739 x += upper_corner[2] - lower_corner[2] - 1;
740 }
741 }
742 if (handler->grid.size[1] == handler->grid.full_size[1])
743 update_loop_index(handler->grid.full_size[1], y1, &y);
744 else
745 y += upper_corner[1] - lower_corner[1] - 1;
746 }
747 }
748 if (handler->grid.size[0] == handler->grid.full_size[0])
749 update_loop_index(handler->grid.full_size[0], z1, &z);
750 else
751 z += upper_corner[0] - lower_corner[0] - 1;
752 }
753 }
754}
755
756// *****************************************************************************
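/* grid_collocate collocates one primitive Gaussian product: it computes the
 * cube properties, fills the polynomial tables, applies the non-orthorombic
 * corrections when the cell requires them, contracts the coefficients into
 * handler->cube and finally maps the cube onto the (possibly distributed)
 * grid, with or without a spherical cutoff. */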
757void grid_collocate(collocation_integration *const handler,
758 const bool use_ortho, const double zetp, const double rp[3],
759 const double radius) {
760 // *** position of the gaussian product
761 //
762 // this is the actual definition of the position on the grid
763 // i.e. a point rp(:) gets here grid coordinates
764 // MODULO(rp(:)/dr(:),npts(:))+1
765 // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid ((1,1,1) on
766 // grid)
767
768 // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
769 int cubecenter[3];
770 int cube_size[3];
771 int lb_cube[3], ub_cube[3];
772 int pol_offset[3] = {0, 0, 0};
773 double roffset[3];
774 double disr_radius;
775 /* cube : grid containing pointlike product between polynomials
776 *
777 * pol : grid containing the polynomials in all three directions
778 *
779 * pol_folded : grid containing the polynomials after folding for periodic
780 * boundary conditions
781 */
782
783 /* setting up the cube parameters */
784 const int cmax = compute_cube_properties(
785 use_ortho, radius, (const double(*)[3])handler->dh,
786 (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
787 cubecenter, lb_cube, ub_cube, cube_size);
788
789 /* initialize the multidimensional array containing the polynomials */
790 initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
791 realloc_tensor(&handler->pol);
792 memset(handler->pol.data, 0, sizeof(double) * handler->pol.alloc_size_);
793
794 /* compute the polynomials */
795
796 // WARNING : do not reverse the order in pol otherwise you will have to
797 // reverse the order in collocate_dgemm as well.
798
799 if (use_ortho) {
800 grid_fill_pol_dgemm(false, handler->dh[0][0], roffset[2], pol_offset[2],
801 lb_cube[2], ub_cube[2], handler->coef.size[2] - 1, cmax,
802 zetp, &idx3(handler->pol, 2, 0, 0)); /* i index */
803 grid_fill_pol_dgemm(false, handler->dh[1][1], roffset[1], pol_offset[1],
804 lb_cube[1], ub_cube[1], handler->coef.size[1] - 1, cmax,
805 zetp, &idx3(handler->pol, 1, 0, 0)); /* j index */
806 grid_fill_pol_dgemm(false, handler->dh[2][2], roffset[0], pol_offset[0],
807 lb_cube[0], ub_cube[0], handler->coef.size[0] - 1, cmax,
808 zetp, &idx3(handler->pol, 0, 0, 0)); /* k index */
809 } else {
810 grid_fill_pol_dgemm(false, 1.0, roffset[0], pol_offset[2], lb_cube[0],
811 ub_cube[0], handler->coef.size[0] - 1, cmax,
812 zetp * handler->dx[0],
813 &idx3(handler->pol, 0, 0, 0)); /* k index */
814 grid_fill_pol_dgemm(false, 1.0, roffset[1], pol_offset[1], lb_cube[1],
815 ub_cube[1], handler->coef.size[1] - 1, cmax,
816 zetp * handler->dx[1],
817 &idx3(handler->pol, 1, 0, 0)); /* j index */
818 grid_fill_pol_dgemm(false, 1.0, roffset[2], pol_offset[0], lb_cube[2],
819 ub_cube[2], handler->coef.size[2] - 1, cmax,
820 zetp * handler->dx[2],
821 &idx3(handler->pol, 2, 0, 0)); /* i index */
822
823 calculate_non_orthorombic_corrections_tensor(
824 zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
825 handler->orthogonal, &handler->Exp);
826
827 /* Use a slightly modified version of Ole code */
828 grid_transform_coef_xzy_to_ikj((const double(*)[3])handler->dh,
829 &handler->coef);
830 }
831
832 /* allocate memory for the polynomial and the cube */
833
834 initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
835
836 realloc_tensor(&handler->cube);
837 initialize_W_and_T(handler, &handler->cube, &handler->coef);
838
839 tensor_reduction_for_collocate_integrate(
840 handler->scratch, // pointer to scratch memory
841 1.0, handler->orthogonal, &handler->Exp, &handler->coef, &handler->pol,
842 &handler->cube);
843
844 if (handler->apply_cutoff) {
845 if (use_ortho) {
846 apply_sphere_cutoff_ortho(handler, disr_radius, cmax, lb_cube,
847 cubecenter);
848 } else {
849 apply_spherical_cutoff_generic(handler, radius, cmax, lb_cube, ub_cube,
850 roffset, cubecenter);
851 }
852 return;
853 }
854 apply_mapping_cubic(handler, cmax, lb_cube, cubecenter);
855}
856
857//******************************************************************************
858void grid_dgemm_collocate_pgf_product(
859 const bool use_ortho, const int border_mask, const enum grid_func func,
860 const int la_max, const int la_min, const int lb_max, const int lb_min,
861 const double zeta, const double zetb, const double rscale,
862 const double dh[3][3], const double dh_inv[3][3], const double ra[3],
863 const double rab[3], const int grid_global_size[3],
864 const int grid_local_size[3], const int shift_local[3],
865 const int border_width[3], const double radius, const int o1, const int o2,
866 const int n1, const int n2, const double pab_[n2][n1],
867 double *const grid_) {
868 collocation_integration *handler = collocate_create_handle();
869
870 tensor pab;
871 initialize_tensor_2(&pab, n2, n1);
872 alloc_tensor(&pab);
873 memcpy(pab.data, pab_, sizeof(double) * n1 * n2);
874 // Uncomment this to dump all tasks to file.
875 // #define __GRID_DUMP_TASKS
876 int offset[2] = {o1, o2};
877
878 int lmax[2] = {la_max, lb_max};
879 int lmin[2] = {la_min, lb_min};
880
881 const double zetp = zeta + zetb;
882 const double f = zetb / zetp;
883 const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
884 const double prefactor = rscale * exp(-zeta * f * rab2);
885 const double zeta_pair[2] = {zeta, zetb};
886 initialize_basis_vectors(handler, dh, dh_inv);
887 verify_orthogonality((const double(*)[3])dh, handler->orthogonal);
888
889 initialize_tensor_3(&(handler->grid), grid_local_size[2], grid_local_size[1],
890 grid_local_size[0]);
891
892 handler->grid.ld_ = grid_local_size[0];
893 handler->grid.data = grid_;
894
895 setup_global_grid_size(&handler->grid, (const int *)grid_global_size);
896
897 initialize_tensor_3(&handler->grid, grid_local_size[2], grid_local_size[1],
898 grid_local_size[0]);
899 handler->grid.ld_ = grid_local_size[0];
900
901 setup_grid_window(&handler->grid, shift_local, border_width, border_mask);
902
903 double rp[3], rb[3];
904 for (int i = 0; i < 3; i++) {
905 rp[i] = ra[i] + f * rab[i];
906 rb[i] = ra[i] + rab[i];
907 }
908
909 int lmin_diff[2], lmax_diff[2];
910 grid_prepare_get_ldiffs_dgemm(func, lmin_diff, lmax_diff);
911
912 int lmin_prep[2];
913 int lmax_prep[2];
914
915 lmin_prep[0] = imax(lmin[0] + lmin_diff[0], 0);
916 lmin_prep[1] = imax(lmin[1] + lmin_diff[1], 0);
917
918 lmax_prep[0] = lmax[0] + lmax_diff[0];
919 lmax_prep[1] = lmax[1] + lmax_diff[1];
920
921 const int n1_prep = ncoset(lmax_prep[0]);
922 const int n2_prep = ncoset(lmax_prep[1]);
923
924 /* I really do not like this. This will disappear */
925 tensor pab_prep;
926 initialize_tensor_2(&pab_prep, n2_prep, n1_prep);
927 alloc_tensor(&pab_prep);
928 memset(pab_prep.data, 0, pab_prep.alloc_size_ * sizeof(double));
929
930 grid_prepare_pab_dgemm(func, offset, lmax, lmin, &zeta_pair[0], &pab,
931 &pab_prep);
932
933 // *** initialise the coefficient matrix, we transform the sum
934 //
935 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
936 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-b_y)**lyb
937 // (z-b_z)**lzb
938 //
939 // into
940 //
941 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
942 //
943 // where p is center of the product gaussian, and lp = la_max + lb_max
944 // (current implementation is l**7)
945 //
946
947 /* precautionary tail since I probably initialize data to NULL when I
948 * initialize a new tensor. I want to keep the memory (I put a ridiculous
949 * amount already) */
950
951 initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
952 lmax_prep[0] + lmax_prep[1] + 1);
953 realloc_tensor(&(handler->alpha));
954
955 const int lp = lmax_prep[0] + lmax_prep[1];
956
957 initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
958 realloc_tensor(&(handler->coef));
959
960 // initially cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
961 // need them to be stored as coef[x][z][y].
962
963 grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &(handler->alpha));
964
965 //
966 // compute P_{lxp,lyp,lzp} given P_{lxa,lya,lza,lxb,lyb,lzb} and
967 // alpha(ls,lxa,lxb,1). We use a three step procedure. We don't store zeros, so
968 // counting is done using lxyz,lxy in order to have contiguous memory access
969 // in collocate_fast.F
970 //
971
972 // coef[x][z][y]
973 grid_compute_coefficients_dgemm(lmin_prep, lmax_prep, lp, prefactor,
974 &(handler->alpha), &pab_prep,
975 &(handler->coef));
976
977 grid_collocate(handler, use_ortho, zetp, rp, radius);
978
979 collocate_destroy_handle(handler);
980 free(pab_prep.data);
981}
982
983void extract_blocks(grid_context *const ctx, const _task *const task,
984 const offload_buffer *pab_blocks, tensor *const work,
985 tensor *const pab) {
986 const int iatom = task->iatom;
987 const int jatom = task->jatom;
988 const int iset = task->iset;
989 const int jset = task->jset;
990 const int ikind = ctx->atom_kinds[iatom];
991 const int jkind = ctx->atom_kinds[jatom];
992 const grid_basis_set *ibasis = ctx->basis_sets[ikind];
993 const grid_basis_set *jbasis = ctx->basis_sets[jkind];
994
995 const int block_num = task->block_num;
996
997 // Locate current matrix block within the buffer. This block
998 // contains the weights of the gaussian pairs in the spherical
999 // harmonic basis, but we do computation in the cartesian
1000 // harmonic basis so we have to rotate the coefficients. It is nothing
1001 // else than a basis change and it is done with two dgemms.
1002
1003 const int block_offset = ctx->block_offsets[block_num]; // zero based
1004 double *const block = &pab_blocks->host_buffer[block_offset];
1005
1006 rotate_to_cartesian_harmonics(ibasis, jbasis, iatom, jatom, iset, jset, block,
1007 work, pab);
1008}
1009
1010void compute_coefficients(grid_context *const ctx,
1011 struct collocation_integration_ *handler,
1012 const _task *const previous_task,
1013 const _task *const task,
1014 const offload_buffer *pab_blocks, tensor *const pab,
1015 tensor *const work, tensor *const pab_prep) {
1016 // Load subblock from buffer and decontract into the Cartesian subblock pab.
1017 // The previous pab can be reused when only ipgf or jpgf has changed.
1018 if (task->update_block_ || (previous_task == NULL)) {
1019 extract_blocks(ctx, task, pab_blocks, work, pab);
1020 }
1021
1022 int lmin_prep[2];
1023 int lmax_prep[2];
1024
1025 lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
1026 lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);
1027
1028 lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
1029 lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];
1030
1031 const int n1_prep = ncoset(lmax_prep[0]);
1032 const int n2_prep = ncoset(lmax_prep[1]);
1033
1034 /* we do not reallocate memory. We initialized the structure with the
1035 * maximum lmax of the whole list already.
1036 */
1037 initialize_tensor_2(pab_prep, n2_prep, n1_prep);
1038 realloc_tensor(pab_prep);
1039
1040 grid_prepare_pab_dgemm(handler->func, task->offset, task->lmin, task->lmax,
1041 &task->zeta[0], pab, pab_prep);
1042
1043 // *** initialise the coefficient matrix, we transform the sum
1044 //
1045 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
1046 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb
1047 // (y-b_y)**lyb (z-b_z)**lzb
1048 //
1049 // into
1050 //
1051 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp
1052 // (z-p_z)**lzp
1053 //
1054 // where p is center of the product gaussian, and lp = la_max + lb_max
1055 // (current implementation is l**7)
1056 //
1057
1058 /* precautionary tail since I probably initialize data to NULL when I
1059 * initialize a new tensor. I want to keep the memory (I put a ridiculous
1060 * amount already) */
1061
1062 initialize_tensor_4(&handler->alpha, 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
1063 lmax_prep[0] + lmax_prep[1] + 1);
1064 realloc_tensor(&handler->alpha);
1065
1066 const int lp = lmax_prep[0] + lmax_prep[1];
1067
1068 initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
1069 realloc_tensor(&handler->coef);
1070
1071 // these two functions can be done with dgemm again....
1072
1073 grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax_prep,
1074 &handler->alpha);
1075
1076 // compute the coefficients after applying the function of interest
1077 // coef[x][z][y]
1078 grid_compute_coefficients_dgemm(
1079 lmin_prep, lmax_prep, lp,
1080 task->prefactor * ((task->iatom == task->jatom) ? 1.0 : 2.0),
1081 &handler->alpha, pab_prep, &handler->coef);
1082}
1083
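/* Collocate every task assigned to one grid level. Each OpenMP thread except
 * thread 0 accumulates into a private copy of the grid (taken from
 * ctx->scratch); the private copies are reduced into the shared grid at the
 * end of the parallel region. */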
1084void collocate_one_grid_level_dgemm(grid_context *const ctx,
1085 const int *const border_width,
1086 const int *const shift_local,
1087 const enum grid_func func, const int level,
1088 const offload_buffer *pab_blocks) {
1089 tensor *const grid = &ctx->grid[level];
1090 // Using default(shared) because with GCC 9 the behavior around const changed:
1091 // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
1092#pragma omp parallel default(shared)
1093 {
1094 const int num_threads = omp_get_num_threads();
1095 const int thread_id = omp_get_thread_num();
1096
1097 tensor work, pab, pab_prep;
1098
1099 struct collocation_integration_ *handler = ctx->handler[thread_id];
1100
1101 handler->func = func;
1102 grid_prepare_get_ldiffs_dgemm(handler->func, handler->lmin_diff,
1103 handler->lmax_diff);
1104
1105 handler->apply_cutoff = ctx->apply_cutoff;
1106
1107 // Allocate pab matrix for re-use across tasks.
1108 initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
1109 alloc_tensor(&pab);
1110
1111 initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
1112 alloc_tensor(&work);
1113
1114 initialize_tensor_2(&pab_prep, ctx->maxco, ctx->maxco);
1115 alloc_tensor(&pab_prep);
1116
1117 initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
1118 (const double(*)[3])grid->dh_inv);
1119
1120 /* setup the grid parameters, window parameters (if the grid is split), etc
1121 */
1122
1123 tensor_copy(&handler->grid, grid);
1124
1125 for (int d = 0; d < 3; d++)
1126 handler->orthogonal[d] = handler->grid.orthogonal[d];
1127
1128 if ((thread_id == 0) || (num_threads == 1)) {
1129 // thread id 0 directly stores the results in the final storage space
1130 handler->grid.data = ctx->grid[level].data;
1131 }
1132
1133 if ((num_threads > 1) && (thread_id > 0)) {
1134 handler->grid.data = ((double *)ctx->scratch) +
1135 (thread_id - 1) * handler->grid.alloc_size_;
1136 memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
1137 }
1138
1139 /* it is only useful when we split the list over multiple threads. The first
1140 * iteration should load the block whatever the value of task->update_block_
1141 * is */
1142 const _task *prev_task = NULL;
1143#pragma omp for schedule(static)
1144 for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
1145 // Define some convenient aliases.
1146 const _task *task = &ctx->tasks[level][itask];
1147
1148 if (task->level != level) {
1149 printf("level %d, %d\n", task->level, level);
1150 abort();
1151 }
1152 /* the grid is divided over several ranks or is not periodic */
1153 if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
1154 (handler->grid.size[1] != handler->grid.full_size[1]) ||
1155 (handler->grid.size[2] != handler->grid.full_size[2])) {
1156 /* unfortunately the window where the gaussian should be added depends
1157 * on the bonds. So I have to adjust the window all the time. */
1158
1159 setup_grid_window(&handler->grid, shift_local, border_width,
1160 task->border_mask);
1161 }
1162
1163 /* this is a three steps procedure. pab_blocks contains the coefficients
1164 * of the operator in the spherical harmonic basis while we do computation
1165 * in the cartesian harmonic basis.
1166 *
1167 * step 1 : rotate the coefficients from the harmonic to the cartesian
1168 * basis
1169
1170 * step 2 : extract the subblock and apply additional transformations
1171 * corresponding to the spatial derivatives of the operator (the last is not
1172 * always done)
1173
1174 * step 3 : change from (x - x_1)^\alpha (x - x_2)^\beta to (x -
1175 * x_{12})^k. It is a transformation which involves binomial
1176 * coefficients.
1177 *
1178 * \f[ (x - x_1)^\alpha (x - x_2)^\beta = \sum_{k_1=0}^{\alpha} \sum_{k_2=0}^{\beta}
1179 * \text{Binomial}(\alpha, k_1) \text{Binomial}(\beta, k_2) (x - x_{12})^{k_1 + k_2}
1180 * (x_{12} - x_1)^{\alpha - k_1} (x_{12} - x_2)^{\beta - k_2} \f]
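 * (e.g. for \alpha = \beta = 1, writing u = x - x_{12}:
 * (x - x_1)(x - x_2) = u^2 + u [(x_{12} - x_1) + (x_{12} - x_2)]
 * + (x_{12} - x_1)(x_{12} - x_2), i.e. the four (k_1, k_2) terms of the sum
 * written out.)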
1182 *
1183 * step 1 is done only when necessary, the two remaining steps are done
1184 * for each bond.
1185 */
1186
1187 compute_coefficients(ctx, handler, prev_task, task, pab_blocks, &pab,
1188 &work, &pab_prep);
1189
1190 grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp,
1191 task->radius);
1192 prev_task = task;
1193 }
1194
1195 // Merge thread local grids into shared grid. Could be improved though....
1196
1197 // thread 0 does nothing since the data are already placed in the final
1198 // destination
1199 if (num_threads > 1) {
1200 if ((grid->alloc_size_ / (num_threads - 1)) >= 2) {
1201 const int block_size = grid->alloc_size_ / (num_threads - 1) +
1202 (grid->alloc_size_ % (num_threads - 1));
1203
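/* staggered reduction: in round bk, worker thread t adds block
 * (bk + t - 1) % num_threads of its private copy into the shared grid, so no
 * two threads write to the same block between two barriers. */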
1204 for (int bk = 0; bk < num_threads; bk++) {
1205 if (thread_id > 0) {
1206 int bk_id = (bk + thread_id - 1) % num_threads;
1207 int begin = bk_id * block_size;
1208 int end = imin((bk_id + 1) * block_size, grid->alloc_size_);
1209 cblas_daxpy(end - begin, 1.0, handler->grid.data + begin, 1,
1210 grid->data + begin, 1);
1211 }
1212#pragma omp barrier
1213 }
1214 }
1215 } else {
1216#pragma omp critical
1217 if (thread_id > 0)
1218 cblas_daxpy(handler->grid.alloc_size_, 1.0,
1219 &idx3(handler->grid, 0, 0, 0), 1, &idx3(grid[0], 0, 0, 0),
1220 1);
1221 }
1222 handler->grid.data = NULL;
1223 free(pab.data);
1224 free(pab_prep.data);
1225 free(work.data);
1226 }
1227}
1228
1229/*******************************************************************************
1230 * \brief Collocate all tasks of a given list onto given grids.
1231 * See grid_task_list.h for details.
1232 ******************************************************************************/
1233void grid_dgemm_collocate_task_list(grid_dgemm_task_list *const ptr,
1234 const enum grid_func func,
1235 const int nlevels,
1236 const offload_buffer *pab_blocks,
1237 offload_buffer *grids[nlevels]) {
1238
1239 grid_context *const ctx = (grid_context *)ptr;
1240
1241 assert(ctx->checksum == ctx_checksum);
1242
1243 const int max_threads = omp_get_max_threads();
1244
1245 assert(ctx->nlevels == nlevels);
1246
1247 // #pragma omp parallel for
1248 for (int level = 0; level < ctx->nlevels; level++) {
1249 const _layout *layout = &ctx->layouts[level];
1250 set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
1251 layout->npts_global, layout->npts_local,
1252 layout->shift_local, layout->border_width, layout->dh,
1253 layout->dh_inv, grids[level]);
1254 memset(ctx->grid[level].data, 0,
1255 sizeof(double) * ctx->grid[level].alloc_size_);
1256 }
1257
1258 if (ctx->scratch == NULL) {
1259 int max_size = ctx->grid[0].alloc_size_;
1260
1261 /* compute the size of the largest grid. It is used afterwards to allocate
1262 * scratch memory for the grid on each omp thread */
1263 for (int x = 1; x < nlevels; x++) {
1264 max_size = imax(ctx->grid[x].alloc_size_, max_size);
1265 }
1266
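/* round max_size up to the next multiple of 4096 grid elements (doubles) */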
1267 max_size = ((max_size / 4096) + (max_size % 4096 != 0)) * 4096;
1268
1269 /* scratch is a void pointer !!!!! */
1270 ctx->scratch =
1271 grid_allocate_scratch(max_size * max_threads * sizeof(double));
1272 }
1273
1274 for (int level = 0; level < ctx->nlevels; level++) {
1275 const _layout *layout = &ctx->layouts[level];
1276 collocate_one_grid_level_dgemm(ctx, layout->border_width,
1277 layout->shift_local, func, level,
1278 pab_blocks);
1279 }
1280
1281 grid_free_scratch(ctx->scratch);
1282 ctx->scratch = NULL;
1283}