(git:c28f603)
Loading...
Searching...
No Matches
grid_dgemm_collocate.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <limits.h>
10#include <math.h>
11#include <omp.h>
12#include <stdbool.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16
17#include "../common/grid_basis_set.h"
18#include "../common/grid_common.h"
19#include "../common/grid_constants.h"
28
29void collocate_l0(double *scratch, const double alpha, const bool orthogonal,
30 const struct tensor_ *exp_xy,
31 const struct tensor_ *p_alpha_beta_reduced_,
32 struct tensor_ *cube);
33
35 const grid_basis_set *jbasis,
36 const int iatom, const int jatom,
37 const int iset, const int jset,
38 double *const block, tensor *work,
39 tensor *pab) {
40 dgemm_params m1, m2;
41 memset(&m1, 0, sizeof(dgemm_params));
42 memset(&m2, 0, sizeof(dgemm_params));
43
44 // Define some more convenient aliases.
45 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
46 const int nsgf_setb = jbasis->nsgf_set[jset];
47 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
48 const int nsgfb = jbasis->nsgf;
49 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
50 const int sgfb = jbasis->first_sgf[jset] - 1;
51 const int maxcoa = ibasis->maxco;
52 const int maxcob = jbasis->maxco;
53 const int ncoseta = ncoset(ibasis->lmax[iset]);
54 const int ncosetb = ncoset(jbasis->lmax[jset]);
55 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
56 const int ncob = jbasis->npgf[jset] * ncosetb;
57
58 initialize_tensor_2(work, nsgf_setb, ncoa);
59 realloc_tensor(work);
60
61 initialize_tensor_2(pab, ncob, ncoa);
62 realloc_tensor(pab);
63
64 // the rotations happen here.
65 if (iatom <= jatom) {
66 m1.op1 = 'N';
67 m1.op2 = 'N';
68 m1.m = work->size[0];
69 m1.n = work->size[1];
70 m1.k = nsgf_seta;
71 m1.alpha = 1.0;
72 m1.beta = 0.0;
73 m1.a = block + sgfb * nsgfa + sgfa;
74 m1.lda = nsgfa;
75 m1.b = &ibasis->sphi[sgfa * maxcoa];
76 m1.ldb = maxcoa;
77 m1.c = work->data;
78 m1.ldc = work->ld_;
79 } else {
80 m1.op1 = 'T';
81 m1.op2 = 'N';
82 m1.m = work->size[0];
83 m1.n = work->size[1];
84 m1.k = nsgf_seta;
85 m1.alpha = 1.0;
86 m1.beta = 0.0;
87 m1.a = block + sgfa * nsgfb + sgfb;
88 m1.lda = nsgfb;
89 m1.b = &ibasis->sphi[sgfa * maxcoa];
90 m1.ldb = maxcoa;
91 m1.c = work->data;
92 m1.ldc = work->ld_;
93 }
94
96
97 m2.op1 = 'T';
98 m2.op2 = 'N';
99 m2.m = pab->size[0];
100 m2.n = pab->size[1];
101 m2.k = work->size[0];
102 m2.alpha = 1.0;
103 m2.beta = 0.0;
104 m2.a = &jbasis->sphi[sgfb * maxcob];
105 m2.lda = maxcob;
106 m2.b = work->data;
107 m2.ldb = work->ld_;
108 m2.c = pab->data;
109 m2.ldc = pab->ld_;
110
111 dgemm_simplified(&m2);
112}
113
115 double *scratch, const double alpha, const bool *const orthogonal,
116 const struct tensor_ *Exp, const struct tensor_ *co,
117 const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube);
118
119/* compute the functions (x - x_i)^l exp (-eta (x - x_i)^2) for l = 0..lp using
120 * a recursive relation to avoid computing the exponential on each grid point. I
121 * think it is not really necessary anymore since it is *not* the dominating
122 * contribution to computation of collocate and integrate */
123
124void grid_fill_pol_dgemm(const bool transpose, const double dr,
125 const double roffset, const int pol_offset,
126 const int xmin, const int xmax, const int lp,
127 const int cmax, const double zetp, double *pol_) {
128 tensor pol;
129 const double t_exp_1 = exp(-zetp * dr * dr);
130 const double t_exp_2 = t_exp_1 * t_exp_1;
131
132 double t_exp_min_1 = exp(-zetp * (dr - roffset) * (dr - roffset));
133 double t_exp_min_2 = exp(2.0 * zetp * (dr - roffset) * dr);
134
135 double t_exp_plus_1 = exp(-zetp * roffset * roffset);
136 double t_exp_plus_2 = exp(2.0 * zetp * roffset * dr);
137
138 if (transpose) {
139 initialize_tensor_2(&pol, cmax, lp + 1);
140 pol.data = pol_;
141 /* It is original Ole code. I need to transpose the polynomials for the
142 * integration routine and Ole code already does it. */
143 for (int ig = 0; ig >= xmin; ig--) {
144 const double rpg = ig * dr - roffset;
145 t_exp_min_1 *= t_exp_min_2 * t_exp_1;
146 t_exp_min_2 *= t_exp_2;
147 double pg = t_exp_min_1;
148 for (int icoef = 0; icoef <= lp; icoef++) {
149 idx2(pol, pol_offset + ig - xmin, icoef) = pg;
150 pg *= rpg;
151 }
152 }
153
154 double t_exp_plus_1 = exp(-zetp * roffset * roffset);
155 double t_exp_plus_2 = exp(2 * zetp * roffset * dr);
156 for (int ig = 1; ig <= xmax; ig++) {
157 const double rpg = ig * dr - roffset;
158 t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
159 t_exp_plus_2 *= t_exp_2;
160 double pg = t_exp_plus_1;
161 for (int icoef = 0; icoef <= lp; icoef++) {
162 idx2(pol, pol_offset + ig - xmin, icoef) = pg;
163 pg *= rpg;
164 }
165 }
166
167 } else {
168 initialize_tensor_2(&pol, lp + 1, cmax);
169 pol.data = pol_;
170 /* memset(pol.data, 0, sizeof(double) * pol.alloc_size_); */
171 /*
172 * compute the values of all (x-xp)**lp*exp(..)
173 *
174 * still requires the old trick:
175 * new trick to avoid to many exps (reuse the result from the previous
176 * gridpoint): exp( -a*(x+d)**2)=exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
177 * exp(-2*a*(x+d)*d)=exp(-2*a*x*d)*exp(-2*a*d**2)
178 */
179
180 /* compute the exponential recursively and store the polynomial prefactors
181 * as well */
182 for (int ig = 0; ig >= xmin; ig--) {
183 const double rpg = ig * dr - roffset;
184 t_exp_min_1 *= t_exp_min_2 * t_exp_1;
185 t_exp_min_2 *= t_exp_2;
186 double pg = t_exp_min_1;
187 idx2(pol, 0, pol_offset + ig - xmin) = pg;
188 if (lp > 0)
189 idx2(pol, 1, pol_offset + ig - xmin) = rpg;
190 }
191
192 for (int ig = 1; ig <= xmax; ig++) {
193 const double rpg = ig * dr - roffset;
194 t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
195 t_exp_plus_2 *= t_exp_2;
196 double pg = t_exp_plus_1;
197 idx2(pol, 0, pol_offset + ig - xmin) = pg;
198 if (lp > 0)
199 idx2(pol, 1, pol_offset + ig - xmin) = rpg;
200 }
201
202 /* compute the remaining powers using previously computed stuff */
203 if (lp >= 2) {
204 double *__restrict__ poly = &idx2(pol, 1, 0);
205 double *__restrict__ src1 = &idx2(pol, 0, 0);
206 double *__restrict__ dst = &idx2(pol, 2, 0);
207 // #pragma omp simd linear(dst, src1, poly) simdlen(8)
208 GRID_PRAGMA_SIMD((dst, src1, poly), 8)
209 for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++)
210 dst[ig] = src1[ig] * poly[ig] * poly[ig];
211 }
212
213 for (int icoef = 3; icoef <= lp; icoef++) {
214 double *__restrict__ poly = &idx2(pol, 1, 0);
215 double *__restrict__ src1 = &idx2(pol, icoef - 1, 0);
216 double *__restrict__ dst = &idx2(pol, icoef, 0);
217 GRID_PRAGMA_SIMD((dst, src1, poly), 8)
218 for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
219 dst[ig] = src1[ig] * poly[ig];
220 }
221 }
222
223 //
224 if (lp > 0) {
225 // I can not declare src__ variable constant because it breaks openmp
226 // standard.
227 double *__restrict__ dst = &idx2(pol, 1, 0);
228 double *__restrict__ src = &idx2(pol, 0, 0);
229 GRID_PRAGMA_SIMD((dst, src), 8)
230 for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
231 dst[ig] *= src[ig];
232 }
233 }
234 }
235}
236
238 const double disr_radius, const int cmax,
239 const int *const lb_cube,
240 const int *const cube_center) {
241 // a mapping so that the ig corresponds to the right grid point
242 int **map = handler->map;
243 map[1] = map[0] + 2 * cmax + 1;
244 map[2] = map[1] + 2 * cmax + 1;
245 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
246
247 for (int i = 0; i < 3; i++) {
248 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
249 map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
250 handler->grid.lower_corner[i],
251 handler->grid.full_size[i]);
252 }
253 }
254
255 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
256 .xmax = handler->grid.window_size[0]};
257 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
258 .xmax = handler->grid.window_size[1]};
259 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
260 .xmax = handler->grid.window_size[2]};
261
262 for (int kg = 0; kg < handler->cube.size[0]; kg++) {
263 const int k = map[0][kg];
264 const int kd =
265 (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
266 const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
267 const double kremain = disr_radius * disr_radius - kr * kr;
268
269 if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
270
271 const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
272 for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
273 const int j = map[1][jg - lb_cube[1]];
274 const double jr = ((2 * jg - 1) >> 1) *
275 handler->dh[1][1]; // distance from center in a.u.
276 const double jremain = kremain - jr * jr;
277
278 if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
279 const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
280 const int xmax = 1 - xmin;
281
282 // printf("xmin %d, xmax %d\n", xmin, xmax);
283 for (int x = xmin - lb_cube[2];
284 x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
285 const int x1 = map[2][x];
286
287 if (!is_point_in_interval(x1, xwindow))
288 continue;
289
290 int lower_corner[3] = {k, j, x1};
291 int upper_corner[3] = {k + 1, j + 1, x1 + 1};
292
293 compute_interval(map[2], handler->grid.full_size[2],
294 handler->grid.size[2], xmax - lb_cube[2], x1, &x,
295 lower_corner + 2, upper_corner + 2, xwindow);
296
297 if (upper_corner[2] - lower_corner[2]) {
298 const int position1[3] = {kg, jg - lb_cube[1], x};
299
300 double *restrict dst = &idx3(handler->grid, lower_corner[0],
301 lower_corner[1], lower_corner[2]);
302 double *restrict src = &idx3(handler->cube, position1[0],
303 position1[1], position1[2]);
304
305 const int sizex = upper_corner[2] - lower_corner[2];
306 GRID_PRAGMA_SIMD((dst, src), 8)
307 for (int x = 0; x < sizex; x++) {
308 dst[x] += src[x];
309 }
310 }
311
312 if (handler->grid.size[2] == handler->grid.full_size[2])
313 update_loop_index(handler->grid.full_size[2], x1, &x);
314 else
315 x += upper_corner[2] - lower_corner[2] - 1;
316 }
317 }
318 }
319 }
320 }
321}
322
324 struct collocation_integration_ *const handler, const double disr_radius,
325 const int cmax, const int *const lb_cube, const int *const ub_cube,
326 const double *const roffset, const int *const cube_center) {
327
328 const double a = handler->dh[0][0] * handler->dh[0][0] +
329 handler->dh[0][1] * handler->dh[0][1] +
330 handler->dh[0][2] * handler->dh[0][2];
331 const double a_inv = 1.0 / a;
332 // a mapping so that the ig corresponds to the right grid point
333 int **map = handler->map;
334 map[1] = map[0] + 2 * cmax + 1;
335 map[2] = map[1] + 2 * cmax + 1;
336 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
337
338 for (int i = 0; i < 3; i++) {
339 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
340 map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
341 handler->grid.lower_corner[i],
342 handler->grid.full_size[i]);
343 }
344 }
345
346 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
347 .xmax = handler->grid.window_size[0]};
348 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
349 .xmax = handler->grid.window_size[1]};
350 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
351 .xmax = handler->grid.window_size[2]};
352
353 for (int k = 0; k < handler->cube.size[0]; k++) {
354 const int iz = map[0][k];
355
356 if (!is_point_in_interval(iz, zwindow))
357 continue;
358
359 const double tz = (k + lb_cube[0] - roffset[0]);
360 const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
361 tz * handler->dh[2][2]};
362
363 for (int j = 0; j < handler->cube.size[1]; j++) {
364 const int iy = map[1][j];
365
366 if (!is_point_in_interval(iy, ywindow))
367 continue;
368
369 const double ty = (j - roffset[1] + lb_cube[1]);
370 const double y[3] = {z[0] + ty * handler->dh[1][0],
371 z[1] + ty * handler->dh[1][1],
372 z[2] + ty * handler->dh[1][2]};
373
374 /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
375 /* 4 (x1^2 + y1^2 + z1^2)
376 * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
377
378 const double b =
379 -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
380 handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
381 handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
382
383 const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
384 (roffset[2] * handler->dh[0][0] - y[0]) +
385 (roffset[2] * handler->dh[0][1] - y[1]) *
386 (roffset[2] * handler->dh[0][1] - y[1]) +
387 (roffset[2] * handler->dh[0][2] - y[2]) *
388 (roffset[2] * handler->dh[0][2] - y[2]) -
389 disr_radius * disr_radius;
390
391 double delta = b * b - 4.0 * a * c;
392
393 if (delta < 0.0)
394 continue;
395
396 delta = sqrt(delta);
397
398 const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
399 const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
400
401 int lower_corner[3] = {iz, iy, xmin};
402 int upper_corner[3] = {iz + 1, iy + 1, xmin};
403
404 for (int x = xmin - lb_cube[2];
405 x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
406 const int x1 = map[2][x];
407
408 if (!is_point_in_interval(x1, xwindow))
409 continue;
410
411 compute_interval(map[2], handler->grid.full_size[2],
412 handler->grid.size[2], xmax - lb_cube[2], x1, &x,
413 lower_corner + 2, upper_corner + 2, xwindow);
414
415 if (upper_corner[2] - lower_corner[2]) {
416 const int position1[3] = {k, j, x};
417
418 /* the function will internally take care of the local vs global
419 * grid */
420
421 double *__restrict__ dst = &idx3(handler->grid, lower_corner[0],
422 lower_corner[1], lower_corner[2]);
423 double *__restrict__ src =
424 &idx3(handler->cube, position1[0], position1[1], position1[2]);
425
426 const int sizex = upper_corner[2] - lower_corner[2];
427 GRID_PRAGMA_SIMD((dst, src), 8)
428 for (int x = 0; x < sizex; x++) {
429 dst[x] += src[x];
430 }
431
432 if (handler->grid.size[0] == handler->grid.full_size[0])
433 update_loop_index(handler->grid.full_size[2], x1, &x);
434 else
435 x += upper_corner[2] - lower_corner[2] - 1;
436 }
437 }
438 }
439 }
440}
441
442void collocate_l0(double *scratch, const double alpha, const bool orthogonal_xy,
443 const struct tensor_ *exp_xy,
444 const struct tensor_ *p_alpha_beta_reduced_,
445 struct tensor_ *cube) {
446 const double *__restrict pz =
447 &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k indice */
448 const double *__restrict py =
449 &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j indice */
450 const double *__restrict px =
451 &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i indice */
452
453 memset(&idx3(cube[0], 0, 0, 0), 0, sizeof(double) * cube->alloc_size_);
454 memset(scratch, 0, sizeof(double) * cube->size[1] * cube->ld_);
455
456 cblas_dger(CblasRowMajor, cube->size[1], cube->size[2], alpha, py, 1, px, 1,
457 scratch, cube->ld_);
458
459 if (exp_xy && !orthogonal_xy) {
460 for (int y = 0; y < cube->size[1]; y++) {
461 const double *__restrict src = &idx2(exp_xy[0], y, 0);
462 double *__restrict dst = &scratch[y * cube->ld_];
463 // #pragma omp simd linear(dst, src) simdlen(8)
464 GRID_PRAGMA_SIMD((dst, src), 8)
465 for (int x = 0; x < cube->size[2]; x++) {
466 dst[x] *= src[x];
467 }
468 }
469 }
470
471 cblas_dger(CblasRowMajor, cube->size[0], cube->size[1] * cube->ld_, 1.0, pz,
472 1, scratch, 1, &idx3(cube[0], 0, 0, 0), cube->size[1] * cube->ld_);
473}
474
475/* compute the following operation (variant) it is a tensor contraction
476
477 V_{kji} = \sum_{\alpha\beta\gamma}
478 C_{\alpha\gamma\beta} T_{2,\alpha,i} T_{1,\beta,j} T_{0,\gamma,k}
479
480*/
482 double *scratch, const double alpha, const bool *const orthogonal,
483 const struct tensor_ *Exp, const struct tensor_ *co,
484 const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube) {
485 if (co->size[0] > 1) {
486 dgemm_params m1, m2, m3;
487
488 memset(&m1, 0, sizeof(dgemm_params));
489 memset(&m2, 0, sizeof(dgemm_params));
490 memset(&m3, 0, sizeof(dgemm_params));
491 tensor T;
492 tensor W;
493
494 double *__restrict const pz =
495 &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k indice */
496 double *__restrict const py =
497 &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j indice */
498 double *__restrict const px =
499 &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i indice */
500
501 initialize_tensor_3(&T, co->size[0] /* alpha */, co->size[1] /* gamma */,
502 cube->size[1] /* j */);
503 initialize_tensor_3(&W, co->size[1] /* gamma */, cube->size[1] /* j */,
504 cube->size[2] /* i */);
505
506 T.data = scratch;
507 W.data = scratch + T.alloc_size_;
508 /* WARNING we are in row major layout. cblas allows it and it is more
509 * natural to read left to right than top to bottom
510 *
511 * we do first T_{\alpha,\gamma,j} = \sum_beta C_{alpha\gamma\beta}
512 * Y_{\beta, j}
513 *
514 * keep in mind that Y_{\beta, j} = p_alpha_beta_reduced(1, \beta, j)
515 * and the order of indices is also important. the last indice is the
516 * fastest one. it can be done with one dgemm.
517 */
518
519 m1.op1 = 'N';
520 m1.op2 = 'N';
521 m1.alpha = alpha;
522 m1.beta = 0.0;
523 m1.m = co->size[0] * co->size[1]; /* alpha gamma */
524 m1.n = cube->size[1]; /* j */
525 m1.k = co->size[2]; /* beta */
526 m1.a = co->data; // Coef_{alpha,gamma,beta} Coef_xzy
527 m1.lda = co->ld_;
528 m1.b = py; // Y_{beta, j} = p_alpha_beta_reduced(1, beta, j)
529 m1.ldb = p_alpha_beta_reduced_->ld_;
530 m1.c = T.data; // T_{\alpha, \gamma, j} = T(alpha, gamma, j)
531 m1.ldc = T.ld_;
532
533 /*
534 * the next step is a reduction along the alpha index.
535 *
536 * We compute then
537 *
538 * W_{gamma, j, i} = sum_{\alpha} T_{\gamma, j, alpha} X_{\alpha, i}
539 *
540 * which means we need to transpose T_{\alpha, \gamma, j} to get
541 * T_{\gamma, j, \alpha}. Fortunately we can do it while doing the
542 * matrix - matrix multiplication
543 */
544
545 m2.op1 = 'T';
546 m2.op2 = 'N';
547 m2.alpha = 1.0;
548 m2.beta = 0.0;
549 m2.m = cube->size[1] * co->size[1]; // (\gamma j) direction
550 m2.n = cube->size[2]; // i
551 m2.k = co->size[0]; // alpha
552 m2.a = T.data; // T_{\alpha, \gamma, j}
553 m2.lda = T.ld_ * co->size[1];
554 m2.b = px; // X_{alpha, i} = p_alpha_beta_reduced(0, alpha, i)
555 m2.ldb = p_alpha_beta_reduced_->ld_;
556 m2.c = W.data; // W_{\gamma, j, i}
557 m2.ldc = W.ld_;
558
559 /* the final step is again a reduction along the gamma indice. It can
560 * again be done with one dgemm. The operation is simply
561 *
562 * Cube_{k, j, i} = \sum_{alpha} Z_{k, \gamma} W_{\gamma, j, i}
563 *
564 * which means we need to transpose Z_{\gamma, k}.
565 */
566
567 m3.op1 = 'T';
568 m3.op2 = 'N';
569 m3.alpha = alpha;
570 m3.beta = 0.0;
571 m3.m = cube->size[0]; // Z_{k \gamma}
572 m3.n = cube->size[1] * cube->size[2]; // (ji) direction
573 m3.k = co->size[1]; // \gamma
574 m3.a = pz; // p_alpha_beta_reduced(0, gamma, i)
575 m3.lda = p_alpha_beta_reduced_->ld_;
576 m3.b = &idx3(W, 0, 0, 0); // W_{\gamma, j, i}
577 m3.ldb = W.size[1] * W.ld_;
578 m3.c = &idx3(cube[0], 0, 0, 0); // cube_{kji}
579 m3.ldc = cube->ld_ * cube->size[1];
580
581 dgemm_simplified(&m1);
582 dgemm_simplified(&m2);
583
584 // apply the non orthorombic corrections in the xy plane
585 if (Exp && !orthogonal[2]) {
586 tensor exp_xy;
587 initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
588 exp_xy.data = &idx3(Exp[0], 2, 0, 0);
590 }
591
592 dgemm_simplified(&m3);
593 } else {
594 if (Exp && !orthogonal[2]) {
595 tensor exp_xy;
596 initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
597
598 exp_xy.data = &idx3(Exp[0], 2, 0, 0);
599 collocate_l0(scratch, co->data[0] * alpha, orthogonal[2], &exp_xy,
600 p_alpha_beta_reduced_, cube);
601 } else {
602 collocate_l0(scratch, co->data[0] * alpha, true, NULL,
603 p_alpha_beta_reduced_, cube);
604 }
605 }
606
607 if (Exp == NULL)
608 return;
609
610 if (Exp && (!orthogonal[0] && !orthogonal[1])) {
611 tensor exp_yz;
612 tensor exp_xz;
613 initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
614 initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
615 exp_xz.data = &idx3(Exp[0], 0, 0, 0);
616 exp_yz.data = &idx3(Exp[0], 1, 0, 0);
618 return;
619 }
620
621 if (Exp && (!orthogonal[0] || !orthogonal[1])) {
622 if (!orthogonal[0]) {
623 tensor exp_xz;
624 initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
625 exp_xz.data = &idx3(Exp[0], 0, 0, 0);
627 }
628
629 if (!orthogonal[1]) {
630 tensor exp_yz;
631 initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
632 exp_yz.data = &idx3(Exp[0], 1, 0, 0);
634 }
635 }
636
637 return;
638}
639
640/* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
641 * very general and does not depend on the orthorombic nature of the grid. for
642 * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
643
645 const int cmax, const int *const lower_boundaries_cube,
646 const int *const cube_center) {
647
648 // a mapping so that the ig corresponds to the right grid point
649 int **map = handler->map;
650 map[1] = map[0] + 2 * cmax + 1;
651 map[2] = map[1] + 2 * cmax + 1;
652
653 for (int i = 0; i < 3; i++) {
654 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
655 map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
656 handler->grid.lower_corner[i],
657 handler->grid.full_size[i]);
658 }
659 }
660
661 int lower_corner[3];
662 int upper_corner[3];
663
664 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
665 .xmax = handler->grid.window_size[0]};
666 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
667 .xmax = handler->grid.window_size[1]};
668 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
669 .xmax = handler->grid.window_size[2]};
670
671 /* this code makes a decomposition of the cube such that we can add block of
672 * datas in a vectorized way. */
673
674 /* the decomposition depends unfortunately on the way the grid is split over
675 * mpi ranks. If each mpi rank has the full grid then it is simple. A 1D
676 * example of the decomposition will explain it better. We have an interval
677 * [x1, x1 + cube_size - 1] (and a second index x [0, cube_size -1]) and a
678 * grid that goes from [0.. grid_size - 1].
679 *
680 * We start from x1 and compute the largest interval [x1.. x1 + diff] that fit
681 * to [0.. grid_size - 1]. Computing the difference diff is simply
682 * min(grid_size - x1, cube_size - x). then we add the result in a vectorized
683 * fashion. we itterate the processus by reducing the interval [x1, x1 +
684 * cube_size - 1] until it is empty. */
685
686 for (int z = 0; (z < handler->cube.size[0]); z++) {
687 const int z1 = map[0][z];
688
689 if (!is_point_in_interval(z1, zwindow))
690 continue;
691
692 compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
693 handler->cube.size[0], z1, &z, lower_corner, upper_corner,
694 zwindow);
695
696 /* // We have a full plane. */
697 if (upper_corner[0] - lower_corner[0]) {
698 for (int y = 0; y < handler->cube.size[1]; y++) {
699 const int y1 = map[1][y];
700
701 // this check is completely irrelevant when running without MPI.
702 if (!is_point_in_interval(y1, ywindow))
703 continue;
704
705 compute_interval(map[1], handler->grid.full_size[1],
706 handler->grid.size[1], handler->cube.size[1], y1, &y,
707 lower_corner + 1, upper_corner + 1, ywindow);
708
709 if (upper_corner[1] - lower_corner[1]) {
710 for (int x = 0; x < handler->cube.size[2]; x++) {
711 const int x1 = map[2][x];
712
713 if (!is_point_in_interval(x1, xwindow))
714 continue;
715
716 compute_interval(map[2], handler->grid.full_size[2],
717 handler->grid.size[2], handler->cube.size[2], x1,
718 &x, lower_corner + 2, upper_corner + 2, xwindow);
719
720 if (upper_corner[2] - lower_corner[2]) {
721 const int position1[3] = {z, y, x};
722 /* the function will internally take care of the local vx global
723 * grid */
725 lower_corner, // lower corner of the portion of cube (in the
726 // full grid)
727 upper_corner, // upper corner of the portion of cube (in the
728 // full grid)
729 position1, // starting position subblock inside the cube
730 &handler->cube, // the cube to extract data from
731 &handler->grid); // the grid to add data from
732 if (handler->grid.size[2] == handler->grid.full_size[2])
733 update_loop_index(handler->grid.full_size[2], x1, &x);
734 else
735 x += upper_corner[2] - lower_corner[2] - 1;
736 }
737 }
738 if (handler->grid.size[1] == handler->grid.full_size[1])
739 update_loop_index(handler->grid.full_size[1], y1, &y);
740 else
741 y += upper_corner[1] - lower_corner[1] - 1;
742 }
743 }
744 if (handler->grid.size[0] == handler->grid.full_size[0])
745 update_loop_index(handler->grid.full_size[0], z1, &z);
746 else
747 z += upper_corner[0] - lower_corner[0] - 1;
748 }
749 }
750}
751
752// *****************************************************************************
754 const bool use_ortho, const double zetp, const double rp[3],
755 const double radius) {
756 // *** position of the gaussian product
757 //
758 // this is the actual definition of the position on the grid
759 // i.e. a point rp(:) gets here grid coordinates
760 // MODULO(rp(:)/dr(:),npts(:))+1
761 // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid ((1,1,1) on
762 // grid)
763
764 // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
765 int cubecenter[3];
766 int cube_size[3];
767 int lb_cube[3], ub_cube[3];
768 int pol_offset[3] = {0, 0, 0};
769 double roffset[3];
770 double disr_radius;
771 /* cube : grid containing pointlike product between polynomials
772 *
773 * pol : grid containing the polynomials in all three directions
774 *
775 * pol_folded : grid containing the polynomials after folding for periodic
776 * boundaries conditions
777 */
778
779 /* seting up the cube parameters */
781 use_ortho, radius, (const double(*)[3])handler->dh,
782 (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
783 cubecenter, lb_cube, ub_cube, cube_size);
784
785 /* initialize the multidimensional array containing the polynomials */
786 initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
787 realloc_tensor(&handler->pol);
788 memset(handler->pol.data, 0, sizeof(double) * handler->pol.alloc_size_);
789
790 /* compute the polynomials */
791
792 // WARNING : do not reverse the order in pol otherwise you will have to
793 // reverse the order in collocate_dgemm as well.
794
795 if (use_ortho) {
796 grid_fill_pol_dgemm(false, handler->dh[0][0], roffset[2], pol_offset[2],
797 lb_cube[2], ub_cube[2], handler->coef.size[2] - 1, cmax,
798 zetp, &idx3(handler->pol, 2, 0, 0)); /* i indice */
799 grid_fill_pol_dgemm(false, handler->dh[1][1], roffset[1], pol_offset[1],
800 lb_cube[1], ub_cube[1], handler->coef.size[1] - 1, cmax,
801 zetp, &idx3(handler->pol, 1, 0, 0)); /* j indice */
802 grid_fill_pol_dgemm(false, handler->dh[2][2], roffset[0], pol_offset[0],
803 lb_cube[0], ub_cube[0], handler->coef.size[0] - 1, cmax,
804 zetp, &idx3(handler->pol, 0, 0, 0)); /* k indice */
805 } else {
806 grid_fill_pol_dgemm(false, 1.0, roffset[0], pol_offset[2], lb_cube[0],
807 ub_cube[0], handler->coef.size[0] - 1, cmax,
808 zetp * handler->dx[0],
809 &idx3(handler->pol, 0, 0, 0)); /* k indice */
810 grid_fill_pol_dgemm(false, 1.0, roffset[1], pol_offset[1], lb_cube[1],
811 ub_cube[1], handler->coef.size[1] - 1, cmax,
812 zetp * handler->dx[1],
813 &idx3(handler->pol, 1, 0, 0)); /* j indice */
814 grid_fill_pol_dgemm(false, 1.0, roffset[2], pol_offset[0], lb_cube[2],
815 ub_cube[2], handler->coef.size[2] - 1, cmax,
816 zetp * handler->dx[2],
817 &idx3(handler->pol, 2, 0, 0)); /* i indice */
818
820 zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
821 handler->orthogonal, &handler->Exp);
822
823 /* Use a slightly modified version of Ole code */
824 grid_transform_coef_xzy_to_ikj((const double(*)[3])handler->dh,
825 &handler->coef);
826 }
827
828 /* allocate memory for the polynomial and the cube */
829
830 initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
831
832 realloc_tensor(&handler->cube);
833 initialize_W_and_T(handler, &handler->cube, &handler->coef);
834
836 handler->scratch, // pointer to scratch memory
837 1.0, handler->orthogonal, &handler->Exp, &handler->coef, &handler->pol,
838 &handler->cube);
839
840 if (handler->apply_cutoff) {
841 if (use_ortho) {
842 apply_sphere_cutoff_ortho(handler, disr_radius, cmax, lb_cube,
843 cubecenter);
844 } else {
845 apply_spherical_cutoff_generic(handler, radius, cmax, lb_cube, ub_cube,
846 roffset, cubecenter);
847 }
848 return;
849 }
850 apply_mapping_cubic(handler, cmax, lb_cube, cubecenter);
851}
852
853//******************************************************************************
855 const bool use_ortho, const int border_mask, const enum grid_func func,
856 const int la_max, const int la_min, const int lb_max, const int lb_min,
857 const double zeta, const double zetb, const double rscale,
858 const double dh[3][3], const double dh_inv[3][3], const double ra[3],
859 const double rab[3], const int grid_global_size[3],
860 const int grid_local_size[3], const int shift_local[3],
861 const int border_width[3], const double radius, const int o1, const int o2,
862 const int n1, const int n2, const double pab_[n2][n1],
863 double *const grid_) {
865
866 tensor pab;
867 initialize_tensor_2(&pab, n2, n1);
868 alloc_tensor(&pab);
869 memcpy(pab.data, pab_, sizeof(double) * n1 * n2);
870 // Uncomment this to dump all tasks to file.
871 // #define __GRID_DUMP_TASKS
872 int offset[2] = {o1, o2};
873
874 int lmax[2] = {la_max, lb_max};
875 int lmin[2] = {la_min, lb_min};
876
877 const double zetp = zeta + zetb;
878 const double f = zetb / zetp;
879 const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
880 const double prefactor = rscale * exp(-zeta * f * rab2);
881 const double zeta_pair[2] = {zeta, zetb};
882 initialize_basis_vectors(handler, dh, dh_inv);
883 verify_orthogonality((const double(*)[3])dh, handler->orthogonal);
884
885 initialize_tensor_3(&(handler->grid), grid_local_size[2], grid_local_size[1],
886 grid_local_size[0]);
887
888 handler->grid.ld_ = grid_local_size[0];
889 handler->grid.data = grid_;
890
891 setup_global_grid_size(&handler->grid, (const int *)grid_global_size);
892
893 initialize_tensor_3(&handler->grid, grid_local_size[2], grid_local_size[1],
894 grid_local_size[0]);
895 handler->grid.ld_ = grid_local_size[0];
896
897 setup_grid_window(&handler->grid, shift_local, border_width, border_mask);
898
899 double rp[3], rb[3];
900 for (int i = 0; i < 3; i++) {
901 rp[i] = ra[i] + f * rab[i];
902 rb[i] = ra[i] + rab[i];
903 }
904
905 int lmin_diff[2], lmax_diff[2];
906 grid_prepare_get_ldiffs_dgemm(func, lmin_diff, lmax_diff);
907
908 int lmin_prep[2];
909 int lmax_prep[2];
910
911 lmin_prep[0] = imax(lmin[0] + lmin_diff[0], 0);
912 lmin_prep[1] = imax(lmin[1] + lmin_diff[1], 0);
913
914 lmax_prep[0] = lmax[0] + lmax_diff[0];
915 lmax_prep[1] = lmax[1] + lmax_diff[1];
916
917 const int n1_prep = ncoset(lmax_prep[0]);
918 const int n2_prep = ncoset(lmax_prep[1]);
919
920 /* I really do not like this. This will disappear */
921 tensor pab_prep;
922 initialize_tensor_2(&pab_prep, n2_prep, n1_prep);
923 alloc_tensor(&pab_prep);
924 memset(pab_prep.data, 0, pab_prep.alloc_size_ * sizeof(double));
925
926 grid_prepare_pab_dgemm(func, offset, lmax, lmin, &zeta_pair[0], &pab,
927 &pab_prep);
928
929 // *** initialise the coefficient matrix, we transform the sum
930 //
931 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
932 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
933 // (z-a_z)**lza
934 //
935 // into
936 //
937 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
938 //
939 // where p is center of the product gaussian, and lp = la_max + lb_max
940 // (current implementation is l**7)
941 //
942
943 /* precautionary tail since I probably intitialize data to NULL when I
944 * initialize a new tensor. I want to keep the memory (I put a ridiculous
945 * amount already) */
946
947 initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
948 lmax_prep[0] + lmax_prep[1] + 1);
949 realloc_tensor(&(handler->alpha));
950
951 const int lp = lmax_prep[0] + lmax_prep[1];
952
953 initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
954 realloc_tensor(&(handler->coef));
955
956 // initialy cp2k stores coef_xyz as coef[z][y][x]. this is fine but I
957 // need them to be stored as
958
959 grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &(handler->alpha));
960
961 //
962 // compute P_{lxp,lyp,lzp} given P_{lxa,lya,lza,lxb,lyb,lzb} and
963 // alpha(ls,lxa,lxb,1) use a three step procedure we don't store zeros, so
964 // counting is done using lxyz,lxy in order to have contiguous memory access
965 // in collocate_fast.F
966 //
967
968 // coef[x][z][y]
969 grid_compute_coefficients_dgemm(lmin_prep, lmax_prep, lp, prefactor,
970 &(handler->alpha), &pab_prep,
971 &(handler->coef));
972
973 grid_collocate(handler, use_ortho, zetp, rp, radius);
974
976 free(pab_prep.data);
977}
978
979void extract_blocks(grid_context *const ctx, const _task *const task,
980 const offload_buffer *pab_blocks, tensor *const work,
981 tensor *const pab) {
982 const int iatom = task->iatom;
983 const int jatom = task->jatom;
984 const int iset = task->iset;
985 const int jset = task->jset;
986 const int ikind = ctx->atom_kinds[iatom];
987 const int jkind = ctx->atom_kinds[jatom];
988 const grid_basis_set *ibasis = ctx->basis_sets[ikind];
989 const grid_basis_set *jbasis = ctx->basis_sets[jkind];
990
991 const int block_num = task->block_num;
992
993 // Locate current matrix block within the buffer. This block
994 // contains the weights of the gaussian pairs in the spherical
995 // harmonic basis, but we do computation in the cartesian
996 // harmonic basis so we have to rotate the coefficients. It is nothing
997 // else than a basis change and it done with two dgemm.
998
999 const int block_offset = ctx->block_offsets[block_num]; // zero based
1000 double *const block = &pab_blocks->host_buffer[block_offset];
1001
1002 rotate_to_cartesian_harmonics(ibasis, jbasis, iatom, jatom, iset, jset, block,
1003 work, pab);
1004}
1005
1007 struct collocation_integration_ *handler,
1008 const _task *const previous_task,
1009 const _task *const task,
1010 const offload_buffer *pab_blocks, tensor *const pab,
1011 tensor *const work, tensor *const pab_prep) {
1012 // Load subblock from buffer and decontract into Cartesian sublock pab.
1013 // The previous pab can be reused when only ipgf or jpgf has changed.
1014 if (task->update_block_ || (previous_task == NULL)) {
1015 extract_blocks(ctx, task, pab_blocks, work, pab);
1016 }
1017
1018 int lmin_prep[2];
1019 int lmax_prep[2];
1020
1021 lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
1022 lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);
1023
1024 lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
1025 lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];
1026
1027 const int n1_prep = ncoset(lmax_prep[0]);
1028 const int n2_prep = ncoset(lmax_prep[1]);
1029
1030 /* we do not reallocate memory. We initialized the structure with the
1031 * maximum lmax of the all list already.
1032 */
1033 initialize_tensor_2(pab_prep, n2_prep, n1_prep);
1034 realloc_tensor(pab_prep);
1035
1036 grid_prepare_pab_dgemm(handler->func, task->offset, task->lmin, task->lmax,
1037 &task->zeta[0], pab, pab_prep);
1038
1039 // *** initialise the coefficient matrix, we transform the sum
1040 //
1041 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
1042 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb
1043 // (y-a_y)**lya (z-a_z)**lza
1044 //
1045 // into
1046 //
1047 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp
1048 // (z-p_z)**lzp
1049 //
1050 // where p is center of the product gaussian, and lp = la_max + lb_max
1051 // (current implementation is l**7)
1052 //
1053
1054 /* precautionary tail since I probably intitialize data to NULL when I
1055 * initialize a new tensor. I want to keep the memory (I put a ridiculous
1056 * amount already) */
1057
1058 initialize_tensor_4(&handler->alpha, 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
1059 lmax_prep[0] + lmax_prep[1] + 1);
1060 realloc_tensor(&handler->alpha);
1061
1062 const int lp = lmax_prep[0] + lmax_prep[1];
1063
1064 initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
1065 realloc_tensor(&handler->coef);
1066
1067 // these two functions can be done with dgemm again....
1068
1069 grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax_prep,
1070 &handler->alpha);
1071
1072 // compute the coefficients after applying the function of interest
1073 // coef[x][z][y]
1075 lmin_prep, lmax_prep, lp,
1076 task->prefactor * ((task->iatom == task->jatom) ? 1.0 : 2.0),
1077 &handler->alpha, pab_prep, &handler->coef);
1078}
1079
1081 const int *const border_width,
1082 const int *const shift_local,
1083 const enum grid_func func, const int level,
1084 const offload_buffer *pab_blocks) {
1085 tensor *const grid = &ctx->grid[level];
1086 // Using default(shared) because with GCC 9 the behavior around const changed:
1087 // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
1088#pragma omp parallel default(shared)
1089 {
1090 const int num_threads = omp_get_num_threads();
1091 const int thread_id = omp_get_thread_num();
1092
1093 tensor work, pab, pab_prep;
1094
1095 struct collocation_integration_ *handler = ctx->handler[thread_id];
1096
1097 handler->func = func;
1098 grid_prepare_get_ldiffs_dgemm(handler->func, handler->lmin_diff,
1099 handler->lmax_diff);
1100
1101 handler->apply_cutoff = ctx->apply_cutoff;
1102
1103 // Allocate pab matrix for re-use across tasks.
1104 initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
1105 alloc_tensor(&pab);
1106
1107 initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
1108 alloc_tensor(&work);
1109
1110 initialize_tensor_2(&pab_prep, ctx->maxco, ctx->maxco);
1111 alloc_tensor(&pab_prep);
1112
1113 initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
1114 (const double(*)[3])grid->dh_inv);
1115
1116 /* setup the grid parameters, window parameters (if the grid is split), etc
1117 */
1118
1119 tensor_copy(&handler->grid, grid);
1120
1121 for (int d = 0; d < 3; d++)
1122 handler->orthogonal[d] = handler->grid.orthogonal[d];
1123
1124 if ((thread_id == 0) || (num_threads == 1)) {
1125 // thread id 0 directly store the results in the final storage space
1126 handler->grid.data = ctx->grid[level].data;
1127 }
1128
1129 if ((num_threads > 1) && (thread_id > 0)) {
1130 handler->grid.data = ((double *)ctx->scratch) +
1131 (thread_id - 1) * handler->grid.alloc_size_;
1132 memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
1133 }
1134
1135 /* it is only useful when we split the list over multiple threads. The first
1136 * iteration should load the block whatever status the task->block_update_
1137 * has */
1138 const _task *prev_task = NULL;
1139#pragma omp for schedule(static)
1140 for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
1141 // Define some convenient aliases.
1142 const _task *task = &ctx->tasks[level][itask];
1143
1144 if (task->level != level) {
1145 printf("level %d, %d\n", task->level, level);
1146 abort();
1147 }
1148 /* the grid is divided over several ranks or not periodic */
1149 if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
1150 (handler->grid.size[1] != handler->grid.full_size[1]) ||
1151 (handler->grid.size[2] != handler->grid.full_size[2])) {
1152 /* unfortunately the window where the gaussian should be added depends
1153 * on the bonds. So I have to adjust the window all the time. */
1154
1155 setup_grid_window(&handler->grid, shift_local, border_width,
1156 task->border_mask);
1157 }
1158
1159 /* this is a three steps procedure. pab_blocks contains the coefficients
1160 * of the operator in the spherical harmonic basis while we do computation
1161 * in the cartesian harmonic basis.
1162 *
1163 * step 1 : rotate the coefficients from the harmonic to the cartesian
1164 * basis
1165
1166 * step 2 : extract the subblock and apply additional transformations
1167 * corresponding the spatial derivatives of the operator (the last is not
1168 * always done)
1169
1170 * step 3 : change from (x - x_1)^\alpha (x - x_2)^\beta to (x -
1171 * x_{12})^k. It is a transformation which involves binomial
1172 * coefficients.
1173 *
1174 * \f[ (x - x_1) ^\alpha (x - x_2) ^ beta = \sum_{k_{1} k_{2}} ^
1175 * {\alpha\beta} \text{Binomial}(\alpha,k_1)
1176 * \text{Binomial}(\beta,k_2) (x - x_{12})^{k_1 + k_2} (x_12 - x_1)
1177 * ^{\alpha - k_1} (x_12 - x_2) ^{\beta - k_2} ]
1178 *
1179 * step 1 is done only when necessary, the two remaining steps are done
1180 * for each bond.
1181 */
1182
1183 compute_coefficients(ctx, handler, prev_task, task, pab_blocks, &pab,
1184 &work, &pab_prep);
1185
1186 grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp,
1187 task->radius);
1188 prev_task = task;
1189 }
1190
1191 // Merge thread local grids into shared grid. Could be improved though....
1192
1193 // thread 0 does nothing since the data are already placed in the final
1194 // destination
1195 if (num_threads > 1) {
1196 if ((grid->alloc_size_ / (num_threads - 1)) >= 2) {
1197 const int block_size = grid->alloc_size_ / (num_threads - 1) +
1198 (grid->alloc_size_ % (num_threads - 1));
1199
1200 for (int bk = 0; bk < num_threads; bk++) {
1201 if (thread_id > 0) {
1202 int bk_id = (bk + thread_id - 1) % num_threads;
1203 int begin = bk_id * block_size;
1204 int end = imin((bk_id + 1) * block_size, grid->alloc_size_);
1205 cblas_daxpy(end - begin, 1.0, handler->grid.data + begin, 1,
1206 grid->data + begin, 1);
1207 }
1208#pragma omp barrier
1209 }
1210 }
1211 } else {
1212#pragma omp critical
1213 if (thread_id > 0)
1214 cblas_daxpy(handler->grid.alloc_size_, 1.0,
1215 &idx3(handler->grid, 0, 0, 0), 1, &idx3(grid[0], 0, 0, 0),
1216 1);
1217 }
1218 handler->grid.data = NULL;
1219 free(pab.data);
1220 free(pab_prep.data);
1221 free(work.data);
1222 }
1223}
1224
1225/*******************************************************************************
1226 * \brief Collocate all tasks of a given list onto given grids.
1227 * See grid_task_list.h for details.
1228 ******************************************************************************/
1230 const enum grid_func func,
1231 const int nlevels,
1232 const offload_buffer *pab_blocks,
1233 offload_buffer *grids[nlevels]) {
1234
1235 grid_context *const ctx = (grid_context *)ptr;
1236
1237 assert(ctx->checksum == ctx_checksum);
1238
1239 const int max_threads = omp_get_max_threads();
1240
1241 assert(ctx->nlevels == nlevels);
1242
1243 // #pragma omp parallel for
1244 for (int level = 0; level < ctx->nlevels; level++) {
1245 const _layout *layout = &ctx->layouts[level];
1246 set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
1247 layout->npts_global, layout->npts_local,
1248 layout->shift_local, layout->border_width, layout->dh,
1249 layout->dh_inv, grids[level]);
1250 memset(ctx->grid[level].data, 0,
1251 sizeof(double) * ctx->grid[level].alloc_size_);
1252 }
1253
1254 if (ctx->scratch == NULL) {
1255 int max_size = ctx->grid[0].alloc_size_;
1256
1257 /* compute the size of the largest grid. It is used afterwards to allocate
1258 * scratch memory for the grid on each omp thread */
1259 for (int x = 1; x < nlevels; x++) {
1260 max_size = imax(ctx->grid[x].alloc_size_, max_size);
1261 }
1262
1263 max_size = ((max_size / 4096) + (max_size % 4096 != 0)) * 4096;
1264
1265 /* scratch is a void pointer !!!!! */
1266 ctx->scratch =
1267 grid_allocate_scratch(max_size * max_threads * sizeof(double));
1268 }
1269
1270 for (int level = 0; level < ctx->nlevels; level++) {
1271 const _layout *layout = &ctx->layouts[level];
1273 layout->shift_local, func, level,
1274 pab_blocks);
1275 }
1276
1278 ctx->scratch = NULL;
1279}
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int max_threads
Definition dbm_library.c:25
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
Definition dbm_miniapp.c:36
#define GRID_PRAGMA_SIMD(OBJS, N)
Definition grid_common.h:23
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
grid_func
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int cmax
static void const int const int const double pol[3][lp+1][2 *cmax+1]
static void const int const int const int const int const int const double const int map[3][2 *cmax+1]
void grid_transform_coef_xzy_to_ikj(const double dh[3][3], const tensor *coef_xyz)
void grid_compute_coefficients_dgemm(const int *lmin, const int *lmax, const int lp, const double prefactor, const tensor *const alpha, const tensor *const pab, tensor *coef_xyz)
void grid_prepare_alpha_dgemm(const double ra[3], const double rb[3], const double rp[3], const int *lmax, tensor *alpha)
void grid_fill_pol_dgemm(const bool transpose, const double dr, const double roffset, const int pol_offset, const int xmin, const int xmax, const int lp, const int cmax, const double zetp, double *pol_)
void grid_dgemm_collocate_pgf_product(const bool use_ortho, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int grid_global_size[3], const int grid_local_size[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab_[n2][n1], double *const grid_)
void grid_dgemm_collocate_task_list(grid_dgemm_task_list *const ptr, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of a given list onto given grids. See grid_task_list.h for details.
void apply_mapping_cubic(struct collocation_integration_ *handler, const int cmax, const int *const lower_boundaries_cube, const int *const cube_center)
void extract_blocks(grid_context *const ctx, const _task *const task, const offload_buffer *pab_blocks, tensor *const work, tensor *const pab)
void rotate_to_cartesian_harmonics(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iatom, const int jatom, const int iset, const int jset, double *const block, tensor *work, tensor *pab)
void apply_spherical_cutoff_generic(struct collocation_integration_ *const handler, const double disr_radius, const int cmax, const int *const lb_cube, const int *const ub_cube, const double *const roffset, const int *const cube_center)
void tensor_reduction_for_collocate_integrate(double *scratch, const double alpha, const bool *const orthogonal, const struct tensor_ *Exp, const struct tensor_ *co, const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube)
void collocate_l0(double *scratch, const double alpha, const bool orthogonal, const struct tensor_ *exp_xy, const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube)
void compute_coefficients(grid_context *const ctx, struct collocation_integration_ *handler, const _task *const previous_task, const _task *const task, const offload_buffer *pab_blocks, tensor *const pab, tensor *const work, tensor *const pab_prep)
void collocate_one_grid_level_dgemm(grid_context *const ctx, const int *const border_width, const int *const shift_local, const enum grid_func func, const int level, const offload_buffer *pab_blocks)
void apply_sphere_cutoff_ortho(struct collocation_integration_ *const handler, const double disr_radius, const int cmax, const int *const lb_cube, const int *const cube_center)
void grid_collocate(collocation_integration *const handler, const bool use_ortho, const double zetp, const double rp[3], const double radius)
void initialize_basis_vectors(collocation_integration *const handler, const double dh[3][3], const double dh_inv[3][3])
void collocate_destroy_handle(void *gaussian_handle)
void initialize_W_and_T(collocation_integration *const handler, const tensor *cube, const tensor *coef)
struct collocation_integration_ * collocate_create_handle(void)
void set_grid_parameters(tensor *grid, const bool orthorhombic, const int grid_full_size[3], const int grid_local_size[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], offload_buffer *grid_)
void calculate_non_orthorombic_corrections_tensor(const double mu_mean, const double *r_ab, const double basis[3][3], const int *const xmin, const int *const xmax, bool plane[3], tensor *const Exp)
void apply_non_orthorombic_corrections_xz_yz_blocked(const struct tensor_ *const Exp_xz, const struct tensor_ *const Exp_yz, struct tensor_ *const m)
void apply_non_orthorombic_corrections_yz_blocked(const struct tensor_ *const Exp, struct tensor_ *const m)
void apply_non_orthorombic_corrections_xz_blocked(const struct tensor_ *const Exp, struct tensor_ *const m)
void apply_non_orthorombic_corrections_xy_blocked(const struct tensor_ *const Exp, struct tensor_ *const m)
void grid_prepare_pab_dgemm(const enum grid_func func, const int *const offset, const int *const lmin, const int *const lmax, const double *const zeta, tensor *const pab, tensor *const pab_prep)
void grid_prepare_get_ldiffs_dgemm(const enum grid_func func, int *const lmin_diff, int *const lmax_diff)
static bool is_point_in_interval(const int value, Interval x)
static void update_loop_index(const int global_grid_size, int x1, int *const x)
void grid_dgemm_task_list
opaque pointer hidding the internal representation of the structure. It is not needed to know what ex...
void tensor_copy(tensor *const b, const tensor *const a)
size_t realloc_tensor(tensor *t)
void alloc_tensor(tensor *t)
static void setup_grid_window(tensor *const grid, const int *const shift_local, const int *const border_width, const int border_mask)
static void initialize_tensor_4(struct tensor_ *a, int n1, int n2, int n3, int n4)
#define idx2(a, i, j)
static void setup_global_grid_size(tensor *const grid, const int *const full_size)
static void initialize_tensor_3(struct tensor_ *a, int n1, int n2, int n3)
#define idx3(a, i, j, k)
static void initialize_tensor_2(struct tensor_ *a, int n1, int n2)
int compute_cube_properties(const bool ortho, const double radius, const double dh[3][3], const double dh_inv[3][3], const double *rp, double *disr_radius, double *roffset, int *cubecenter, int *lb_cube, int *ub_cube, int *cube_size)
void compute_interval(const int *const map, const int full_size, const int size, const int cube_size, const int x1, int *x, int *const lower_corner, int *const upper_corner, const Interval window)
void cblas_dger(const CBLAS_LAYOUT Layout, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY)
void verify_orthogonality(const double dh[3][3], bool orthogonal[3])
void dgemm_simplified(dgemm_params *const m)
void add_sub_grid(const int *lower_corner, const int *upper_corner, const int *position, const tensor *subgrid, tensor *grid)
static void * grid_allocate_scratch(size_t size)
@ CblasRowMajor
static void grid_free_scratch(void *ptr)
double dh_inv[3][3]
Internal representation of a basis set.
grid_basis_set ** basis_sets
struct collocation_integration_ ** handler
Internal representation of a buffer.
double * host_buffer