(git:9aade48)
Loading...
Searching...
No Matches
grid_dgemm_integrate.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <limits.h>
10#include <math.h>
11#include <stdbool.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15#include <unistd.h>
16
17#include <omp.h>
18
19#include "../common/grid_common.h"
26#include "grid_dgemm_utils.h"
27
29 struct collocation_integration_ *const handler, const double disr_radius,
30 const int cmax, const int *const lb_cube, const int *const cube_center) {
31 // a mapping so that the ig corresponds to the right grid point
32 int **map = handler->map;
33 map[1] = map[0] + 2 * cmax + 1;
34 map[2] = map[1] + 2 * cmax + 1;
35 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
36
37 for (int i = 0; i < 3; i++) {
38 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
39 map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
40 handler->grid.lower_corner[i],
41 handler->grid.full_size[i]);
42 }
43 }
44
45 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
46 .xmax = handler->grid.window_size[0]};
47 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
48 .xmax = handler->grid.window_size[1]};
49 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
50 .xmax = handler->grid.window_size[2]};
51
52 for (int kg = 0; kg < handler->cube.size[0]; kg++) {
53 const int k = map[0][kg];
54 const int kd =
55 (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
56 const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
57 const double kremain = disr_radius * disr_radius - kr * kr;
58
59 if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
60
61 const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
62 for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
63 const int j = map[1][jg - lb_cube[1]];
64 const double jr = ((2 * jg - 1) >> 1) *
65 handler->dh[1][1]; // distance from center in a.u.
66 const double jremain = kremain - jr * jr;
67
68 if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
69 const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
70 const int xmax = 1 - xmin;
71
72 // printf("xmin %d, xmax %d\n", xmin, xmax);
73 for (int x = xmin - lb_cube[2];
74 x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
75 const int x1 = map[2][x];
76
77 if (!is_point_in_interval(x1, xwindow))
78 continue;
79
80 int lower_corner[3] = {k, j, x1};
81 int upper_corner[3] = {k + 1, j + 1, x1 + 1};
82
83 compute_interval(map[2], handler->grid.full_size[2],
84 handler->grid.size[2], handler->cube.size[2], x1,
85 &x, lower_corner + 2, upper_corner + 2, xwindow);
86
87 if (upper_corner[2] - lower_corner[2]) {
88 const int position1[3] = {kg, jg - lb_cube[1], x};
89
90 /* the function will internally take care of the local vs global
91 * grid */
92
93 double *restrict dst = &idx3(handler->cube, position1[0],
94 position1[1], position1[2]);
95
96 double *restrict src = &idx3(handler->grid, lower_corner[0],
97 lower_corner[1], lower_corner[2]);
98
99 const int sizex = upper_corner[2] - lower_corner[2];
100 GRID_PRAGMA_SIMD((dst, src), 8)
101 for (int x = 0; x < sizex; x++) {
102 dst[x] = src[x];
103 }
104 }
105
106 if (handler->grid.size[2] == handler->grid.full_size[2])
107 update_loop_index(handler->grid.full_size[2], x1, &x);
108 else
109 x += upper_corner[2] - lower_corner[2] - 1;
110 }
111 }
112 }
113 }
114 }
115}
116
118 struct collocation_integration_ *const handler, const double disr_radius,
119 const int cmax, const int *const lb_cube, const int *const ub_cube,
120 const double *const roffset, const int *const cube_center) {
121
122 const double a = handler->dh[0][0] * handler->dh[0][0] +
123 handler->dh[0][1] * handler->dh[0][1] +
124 handler->dh[0][2] * handler->dh[0][2];
125 const double a_inv = 1.0 / a;
126 // a mapping so that the ig corresponds to the right grid point
127 int **map = handler->map;
128 map[1] = map[0] + 2 * cmax + 1;
129 map[2] = map[1] + 2 * cmax + 1;
130 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
131
132 for (int i = 0; i < 3; i++) {
133 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
134 map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
135 handler->grid.lower_corner[i],
136 handler->grid.full_size[i]);
137 }
138 }
139
140 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
141 .xmax = handler->grid.window_size[0]};
142 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
143 .xmax = handler->grid.window_size[1]};
144 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
145 .xmax = handler->grid.window_size[2]};
146
147 for (int k = 0; k < handler->cube.size[0]; k++) {
148 const int iz = map[0][k];
149
150 if (!is_point_in_interval(iz, zwindow))
151 continue;
152
153 const double tz = (k + lb_cube[0] - roffset[0]);
154 const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
155 tz * handler->dh[2][2]};
156
157 for (int j = 0; j < handler->cube.size[1]; j++) {
158 const int iy = map[1][j];
159
160 if (!is_point_in_interval(iy, ywindow))
161 continue;
162
163 const double ty = (j - roffset[1] + lb_cube[1]);
164 const double y[3] = {z[0] + ty * handler->dh[1][0],
165 z[1] + ty * handler->dh[1][1],
166 z[2] + ty * handler->dh[1][2]};
167
168 /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
169 /* 4 (x1^2 + y1^2 + z1^2)
170 * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
171
172 const double b =
173 -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
174 handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
175 handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
176
177 const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
178 (roffset[2] * handler->dh[0][0] - y[0]) +
179 (roffset[2] * handler->dh[0][1] - y[1]) *
180 (roffset[2] * handler->dh[0][1] - y[1]) +
181 (roffset[2] * handler->dh[0][2] - y[2]) *
182 (roffset[2] * handler->dh[0][2] - y[2]) -
183 disr_radius * disr_radius;
184
185 double delta = b * b - 4.0 * a * c;
186
187 if (delta < 0.0)
188 continue;
189
190 delta = sqrt(delta);
191
192 const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
193 const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
194
195 int lower_corner[3] = {iz, iy, xmin};
196 int upper_corner[3] = {iz + 1, iy + 1, xmin};
197
198 for (int x = xmin - lb_cube[2];
199 x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
200 const int x1 = map[2][x];
201
202 if (!is_point_in_interval(x1, xwindow))
203 continue;
204
205 compute_interval(map[2], handler->grid.full_size[2],
206 handler->grid.size[2], handler->cube.size[2], x1, &x,
207 lower_corner + 2, upper_corner + 2, xwindow);
208
209 if (upper_corner[2] - lower_corner[2]) {
210 const int position1[3] = {k, j, x};
211
212 /* the function will internally take care of the local vs global
213 * grid */
214
215 double *__restrict__ src = &idx3(handler->grid, lower_corner[0],
216 lower_corner[1], lower_corner[2]);
217 double *__restrict__ dst =
218 &idx3(handler->cube, position1[0], position1[1], position1[2]);
219
220 const int sizex = upper_corner[2] - lower_corner[2];
221
222 // #pragma omp simd linear(dst, src) simdlen(8)
223 GRID_PRAGMA_SIMD((dst, src), 8)
224 for (int x = 0; x < sizex; x++) {
225 dst[x] = src[x];
226 }
227
228 if (handler->grid.size[2] == handler->grid.full_size[2])
229 update_loop_index(handler->grid.full_size[2], x1, &x);
230 else
231 x += upper_corner[2] - lower_corner[2] - 1;
232 }
233 }
234 }
235 }
236}
237
239 const _task *prev_task,
240 const _task *task, tensor *const hab,
241 tensor *work, // some scratch matrix
242 double *blocks) {
243 if (prev_task != NULL) {
244 /* prev_task is NULL when we deal with the first iteration. In that case
245 * we just need to initialize the hab matrix to the proper size. Note
246 * that resizing does not really occurs since memory allocation is done
247 * for the maximum size the matrix can have. */
248 const int iatom = prev_task->iatom;
249 const int jatom = prev_task->jatom;
250 const int iset = prev_task->iset;
251 const int jset = prev_task->jset;
252 const int ikind = ctx->atom_kinds[iatom];
253 const int jkind = ctx->atom_kinds[jatom];
254 const grid_basis_set *ibasis = ctx->basis_sets[ikind];
255 const grid_basis_set *jbasis = ctx->basis_sets[jkind];
256
257 const int block_num = prev_task->block_num;
258 double *const block = &blocks[ctx->block_offsets[block_num]];
259
260 const int ncoseta = ncoset(ibasis->lmax[iset]);
261 const int ncosetb = ncoset(jbasis->lmax[jset]);
262
263 const int ncoa = ibasis->npgf[iset] * ncoseta;
264 const int ncob = jbasis->npgf[jset] * ncosetb;
265
266 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set */
267 const int nsgf_setb = jbasis->nsgf_set[jset];
268 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
269 const int nsgfb = jbasis->nsgf;
270 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
271 const int sgfb = jbasis->first_sgf[jset] - 1;
272 const int maxcoa = ibasis->maxco;
273 const int maxcob = jbasis->maxco;
274
275 initialize_tensor_2(work, nsgf_setb, ncoa);
276 realloc_tensor(work);
277
278 // Warning these matrices are row major....
279
280 dgemm_params m1, m2;
281 memset(&m1, 0, sizeof(dgemm_params));
282 memset(&m2, 0, sizeof(dgemm_params));
283
284 m1.op1 = 'N';
285 m1.op2 = 'N';
286 m1.m = nsgf_setb;
287 m1.n = ncoa;
288 m1.k = ncob;
289 m1.alpha = 1.0;
290 m1.beta = 0.0;
291 m1.a = &jbasis->sphi[sgfb * maxcob];
292 m1.lda = maxcob;
293 m1.b = hab->data;
294 m1.ldb = ncoa;
295 m1.c = work->data;
296 m1.ldc = work->ld_;
297
298 // phi[b][ncob]
299 // work[b][ncoa] = phi[b][ncob] * hab[ncob][ncoa]
300
301 if (iatom <= jatom) {
302 // I need to have the final result in the form
303
304 // block[b][a] = work[b][ncoa] transpose(phi[a][ncoa])
305 m2.op1 = 'N';
306 m2.op2 = 'T';
307 m2.m = nsgf_setb;
308 m2.n = nsgf_seta;
309 m2.k = ncoa;
310 m2.a = work->data;
311 m2.lda = work->ld_;
312 m2.b = &ibasis->sphi[sgfa * maxcoa];
313 m2.ldb = maxcoa;
314 m2.c = block + sgfb * nsgfa + sgfa;
315 m2.ldc = nsgfa;
316 m2.alpha = 1.0;
317 m2.beta = 1.0;
318 } else {
319 // block[a][b] = phi[a][ncoa] Transpose(work[b][ncoa])
320 m2.op1 = 'N';
321 m2.op2 = 'T';
322 m2.m = nsgf_seta;
323 m2.n = nsgf_setb;
324 m2.k = ncoa;
325 m2.a = &ibasis->sphi[sgfa * maxcoa];
326 m2.lda = maxcoa;
327 m2.b = work->data;
328 m2.ldb = work->ld_;
329 m2.c = block + sgfa * nsgfb + sgfb;
330 m2.ldc = nsgfb;
331 m2.alpha = 1.0;
332 m2.beta = 1.0;
333 }
334
335 /* these dgemm are *row* major */
336 dgemm_simplified(&m1);
337 dgemm_simplified(&m2);
338 }
339
340 if (task != NULL) {
341 const int iatom = task->iatom;
342 const int jatom = task->jatom;
343 const int ikind = ctx->atom_kinds[iatom];
344 const int jkind = ctx->atom_kinds[jatom];
345 const int iset = task->iset;
346 const int jset = task->jset;
347 const grid_basis_set *ibasis = ctx->basis_sets[ikind];
348 const grid_basis_set *jbasis = ctx->basis_sets[jkind];
349 const int ncoseta = ncoset(ibasis->lmax[iset]);
350 const int ncosetb = ncoset(jbasis->lmax[jset]);
351
352 const int ncoa = ibasis->npgf[iset] * ncoseta;
353 const int ncob = jbasis->npgf[jset] * ncosetb;
354
355 initialize_tensor_2(hab, ncob, ncoa);
356 realloc_tensor(hab);
357 }
358}
359
360void update_force_pair(orbital a, orbital b, const double pab,
361 const double ftz[2], const double *const rab,
362 const tensor *const vab, tensor *force) {
363 const double axpm0 = idx2(vab[0], idx(b), idx(a));
364 for (int i = 0; i < 3; i++) {
365 const double aip1 = idx2(vab[0], idx(b), idx(up(i, a)));
366 const double aim1 = idx2(vab[0], idx(b), idx(down(i, a)));
367 const double bim1 = idx2(vab[0], idx(down(i, b)), idx(a));
368 idx2(force[0], 0, i) += pab * (ftz[0] * aip1 - a.l[i] * aim1);
369 idx2(force[0], 1, i) +=
370 pab * (ftz[1] * (aip1 - rab[i] * axpm0) - b.l[i] * bim1);
371 }
372}
373
374void update_virial_pair(orbital a, orbital b, const double pab,
375 const double ftz[2], const double *const rab,
376 const tensor *const vab, tensor *virial) {
377 for (int i = 0; i < 3; i++) {
378 for (int j = 0; j < 3; j++) {
379 idx3(virial[0], 0, i, j) +=
380 pab * ftz[0] * idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
381 pab * a.l[j] * idx2(vab[0], idx(b), idx(up(i, down(j, a))));
382 }
383 }
384
385 for (int i = 0; i < 3; i++) {
386 for (int j = 0; j < 3; j++) {
387 idx3(virial[0], 1, i, j) +=
388 pab * ftz[1] *
389 (idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
390 idx2(vab[0], idx(b), idx(up(i, a))) * rab[j] -
391 idx2(vab[0], idx(b), idx(up(j, a))) * rab[i] +
392 idx2(vab[0], idx(b), idx(a)) * rab[j] * rab[i]) -
393 pab * b.l[j] * idx2(vab[0], idx(up(i, down(j, b))), idx(a));
394 }
395 }
396}
397
398void update_all(const bool compute_forces, const bool compute_virial,
399 const orbital a, const orbital b, const double f,
400 const double *const ftz, const double *rab, const tensor *vab,
401 const double pab, double *hab, tensor *forces,
402 tensor *virials) {
403
404 *hab += f * idx2(vab[0], idx(b), idx(a));
405
406 if (compute_forces) {
407 update_force_pair(a, b, f * pab, ftz, rab, vab, forces);
408 }
409
410 if (compute_virial) {
411 update_virial_pair(a, b, f * pab, ftz, rab, vab, virials);
412 }
413}
414
415static void update_tau(const bool compute_forces, const bool compute_virial,
416 const orbital a, const orbital b, const double ftz[2],
417 const double *const rab, const tensor *const vab,
418 const double pab, double *const hab, tensor *forces,
419 tensor *virials) {
420
421 for (int i = 0; i < 3; i++) {
422 update_all(compute_forces, compute_virial, down(i, a), down(i, b),
423 0.5 * a.l[i] * b.l[i], ftz, rab, vab, pab, hab, forces, virials);
424 update_all(compute_forces, compute_virial, up(i, a), down(i, b),
425 -0.5 * ftz[0] * b.l[i], ftz, rab, vab, pab, hab, forces,
426 virials);
427 update_all(compute_forces, compute_virial, down(i, a), up(i, b),
428 -0.5 * a.l[i] * ftz[1], ftz, rab, vab, pab, hab, forces,
429 virials);
430 update_all(compute_forces, compute_virial, up(i, a), up(i, b),
431 0.5 * ftz[0] * ftz[1], ftz, rab, vab, pab, hab, forces, virials);
432 }
433}
434
435static void
436update_hab_forces_and_stress(const _task *task, const tensor *const vab,
437 const tensor *const pab, const bool compute_tau,
438 const bool compute_forces,
439 const bool compute_virial, tensor *const forces,
440 tensor *const virial, tensor *const hab) {
441 double zeta[2] = {task->zeta[0] * 2.0, task->zeta[1] * 2.0};
442 for (int lb = task->lmin[1]; lb <= task->lmax[1]; lb++) {
443 for (int la = task->lmin[0]; la <= task->lmax[0]; la++) {
444 for (int bx = 0; bx <= lb; bx++) {
445 for (int by = 0; by <= lb - bx; by++) {
446 const int bz = lb - bx - by;
447 const orbital b = {{bx, by, bz}};
448 const int idx_b = task->offset[1] + idx(b);
449 for (int ax = 0; ax <= la; ax++) {
450 for (int ay = 0; ay <= la - ax; ay++) {
451 const int az = la - ax - ay;
452 const orbital a = {{ax, ay, az}};
453 const int idx_a = task->offset[0] + idx(a);
454 double *habval = &idx2(hab[0], idx_b, idx_a);
455 const double prefactor = idx2(pab[0], idx_b, idx_a);
456
457 // now compute the forces
458 if (compute_tau) {
459 update_tau(compute_forces, compute_virial, a, b, zeta,
460 task->rab, vab, prefactor, habval, forces, virial);
461 } else {
462 update_all(compute_forces, compute_virial, a, b, 1.0, zeta,
463 task->rab, vab, prefactor, habval, forces, virial);
464 }
465 }
466 }
467 }
468 }
469 }
470 }
471}
472
473/* It is a sub-optimal version of the mapping in case of a cubic cutoff. But is
474 * very general and does not depend on the orthorombic nature of the grid. for
475 * orthorombic cases, it is faster to apply PCB directly on the polynomials. */
476
477void extract_cube(struct collocation_integration_ *handler, const int cmax,
478 const int *lower_boundaries_cube, const int *cube_center) {
479
480 // a mapping so that the ig corresponds to the right grid point
481 int **map = handler->map;
482 map[1] = map[0] + 2 * cmax + 1;
483 map[2] = map[1] + 2 * cmax + 1;
484 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
485 for (int i = 0; i < 3; i++) {
486 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
487 map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
488 handler->grid.lower_corner[i],
489 handler->grid.full_size[i]);
490 }
491 }
492
493 int lower_corner[3];
494 int upper_corner[3];
495
496 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
497 .xmax = handler->grid.window_size[0]};
498 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
499 .xmax = handler->grid.window_size[1]};
500 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
501 .xmax = handler->grid.window_size[2]};
502
503 for (int z = 0; (z < handler->cube.size[0]); z++) {
504 const int z1 = map[0][z];
505
506 if (!is_point_in_interval(z1, zwindow))
507 continue;
508
509 compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
510 handler->cube.size[0], z1, &z, lower_corner, upper_corner,
511 zwindow);
512
513 /* // We have a full plane. */
514 if (upper_corner[0] - lower_corner[0]) {
515 for (int y = 0; y < handler->cube.size[1]; y++) {
516 const int y1 = map[1][y];
517
518 // this check is completely irrelevant when running without MPI.
519 if (!is_point_in_interval(y1, ywindow))
520 continue;
521
522 compute_interval(map[1], handler->grid.full_size[1],
523 handler->grid.size[1], handler->cube.size[1], y1, &y,
524 lower_corner + 1, upper_corner + 1, ywindow);
525
526 if (upper_corner[1] - lower_corner[1]) {
527 for (int x = 0; x < handler->cube.size[2]; x++) {
528 const int x1 = map[2][x];
529
530 if (!is_point_in_interval(x1, xwindow))
531 continue;
532
533 compute_interval(map[2], handler->grid.full_size[2],
534 handler->grid.size[2], handler->cube.size[2], x1,
535 &x, lower_corner + 2, upper_corner + 2, xwindow);
536
537 if (upper_corner[2] - lower_corner[2]) { // should be non zero
538 const int position1[3] = {z, y, x};
539
540 /* the function will internally take care of the local vx global
541 * grid */
543 lower_corner, // lower corner of the portion of cube (in the
544 // full grid)
545 upper_corner, // upper corner of the portion of cube (in the
546 // full grid)
547 position1, // starting position subblock inside the cube
548 &handler->grid,
549 &handler->cube); // the grid to add data from
550
551 if (handler->grid.size[2] == handler->grid.full_size[2])
552 update_loop_index(handler->grid.full_size[2], x1, &x);
553 else
554 x += upper_corner[2] - lower_corner[2] - 1;
555 }
556 }
557 if (handler->grid.size[1] == handler->grid.full_size[1])
558 update_loop_index(handler->grid.full_size[1], y1, &y);
559 else
560 y += upper_corner[1] - lower_corner[1] - 1;
561 }
562 }
563 if (handler->grid.size[0] == handler->grid.full_size[0])
564 update_loop_index(handler->grid.full_size[0], z1, &z);
565 else
566 z += upper_corner[0] - lower_corner[0] - 1;
567 }
568 }
569}
570
572 const bool use_ortho, const double zetp, const double rp[3],
573 const double radius) {
574 if (handler == NULL)
575 abort();
576
577 const int lp = handler->coef.size[0] - 1;
578
579 int cubecenter[3];
580 int cube_size[3];
581 int lb_cube[3], ub_cube[3];
582 double roffset[3];
583 double disr_radius;
584
585 /* cube : grid comtaining pointlike product between polynomials
586 *
587 * pol : grid containing the polynomials in all three directions
588 */
589
590 /* seting up the cube parameters */
592 use_ortho, radius, (const double(*)[3])handler->dh,
593 (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
594 cubecenter, lb_cube, ub_cube, cube_size);
595
596 /* initialize the multidimensional array containing the polynomials */
597 if (lp != 0) {
598 initialize_tensor_3(&handler->pol, 3, cmax, handler->coef.size[0]);
599 } else {
600 initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
601 }
602 handler->pol_alloc_size = realloc_tensor(&handler->pol);
603
604 /* allocate memory for the polynomial and the cube */
605
606 initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
607
608 handler->cube_alloc_size = realloc_tensor(&handler->cube);
609
610 initialize_W_and_T(handler, &handler->coef, &handler->cube);
611
612 /* compute the polynomials */
613
614 // WARNING : do not reverse the order in pol otherwise you will have to
615 // reverse the order in collocate_dgemm as well.
616
617 /* the tensor contraction is done for a given order so I have to be careful
618 * how the tensors X, Y, Z are stored. In collocate, they are stored
619 * normally 0 (Z), (1) Y, (2) X in the table pol. but in integrate (which
620 * uses the same tensor reduction), I have a special treatment for l = 0. In
621 * that case the order *should* be the same than for collocate. For l > 0,
622 * we need a different storage order which is (X) 2, (Y) 0, and (Z) 1.
623 *
624 * the reason for this is that the cube is stored as cube[z][y][x] so the
625 * first direction taken for the dgemm is along x.
626 */
627
628 int perm[3] = {2, 0, 1};
629
630 if (lp == 0) {
631 /* I need to restore the same order than for collocate */
632 perm[0] = 0;
633 perm[1] = 1;
634 perm[2] = 2;
635 }
636
637 bool use_ortho_forced = handler->orthogonal[0] && handler->orthogonal[1] &&
638 handler->orthogonal[2];
639 if (use_ortho) {
640 grid_fill_pol_dgemm((lp != 0), handler->dh[0][0], roffset[2], 0, lb_cube[2],
641 ub_cube[2], lp, cmax, zetp,
642 &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
643 grid_fill_pol_dgemm((lp != 0), handler->dh[1][1], roffset[1], 0, lb_cube[1],
644 ub_cube[1], lp, cmax, zetp,
645 &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
646 grid_fill_pol_dgemm((lp != 0), handler->dh[2][2], roffset[0], 0, lb_cube[0],
647 ub_cube[0], lp, cmax, zetp,
648 &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
649 } else {
650 double dx[3];
651
652 dx[2] = handler->dh[0][0] * handler->dh[0][0] +
653 handler->dh[0][1] * handler->dh[0][1] +
654 handler->dh[0][2] * handler->dh[0][2];
655
656 dx[1] = handler->dh[1][0] * handler->dh[1][0] +
657 handler->dh[1][1] * handler->dh[1][1] +
658 handler->dh[1][2] * handler->dh[1][2];
659
660 dx[0] = handler->dh[2][0] * handler->dh[2][0] +
661 handler->dh[2][1] * handler->dh[2][1] +
662 handler->dh[2][2] * handler->dh[2][2];
663
664 grid_fill_pol_dgemm((lp != 0), 1.0, roffset[2], 0, lb_cube[2], ub_cube[2],
665 lp, cmax, zetp * dx[2],
666 &idx3(handler->pol, perm[2], 0, 0)); /* i indice */
667 grid_fill_pol_dgemm((lp != 0), 1.0, roffset[1], 0, lb_cube[1], ub_cube[1],
668 lp, cmax, zetp * dx[1],
669 &idx3(handler->pol, perm[1], 0, 0)); /* j indice */
670 grid_fill_pol_dgemm((lp != 0), 1.0, roffset[0], 0, lb_cube[0], ub_cube[0],
671 lp, cmax, zetp * dx[0],
672 &idx3(handler->pol, perm[0], 0, 0)); /* k indice */
673
674 /* the three remaining tensors are initialized in the function */
676 zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
677 handler->orthogonal, &handler->Exp);
678 }
679
680 if (handler->apply_cutoff) {
681 memset(handler->cube.data, 0, sizeof(double) * handler->cube.alloc_size_);
682 if (!use_ortho && !use_ortho_forced) {
684 handler, disr_radius, cmax, lb_cube, ub_cube, roffset, cubecenter);
685 } else {
687 lb_cube, cubecenter);
688 }
689 } else {
690 extract_cube(handler, cmax, lb_cube, cubecenter);
691 }
692
693 if (!use_ortho && !use_ortho_forced)
695 &handler->cube);
696
697 if (lp != 0) {
699 // pointer to scratch memory
700 1.0, handler->orthogonal, NULL,
701 &handler->cube, &handler->pol,
702 &handler->coef);
703 } else {
704 /* it is very specific to integrate because we might end up with a
705 * single element after the tensor product/contraction. In that case, I
706 * compute the cube and then do a scalar product between the two. */
707
708 /* we could also do this with 2 matrix-vector multiplications and a scalar
709 * product
710 *
711 * H_{jk} = C_{ijk} . P_i (along x) C_{ijk} is *stored C[k][j][i]* !!!!!!
712 * L_{k} = H_{jk} . P_j (along y)
713 * v_{ab} = L_k . P_k (along z)
714 */
715
716 tensor cube_tmp;
717 initialize_tensor_2(&cube_tmp, cube_size[0], cube_size[1]);
718 alloc_tensor(&cube_tmp);
719
720 /* first along x */
722 handler->cube.size[0] * handler->cube.size[1],
723 handler->cube.size[2], 1.0, handler->cube.data,
724 handler->cube.ld_, &idx3(handler->pol, 2, 0, 0), 1, 0.0,
725 cube_tmp.data, 1);
726
727 /* second along j */
729 handler->cube.size[1], 1.0, cube_tmp.data, cube_tmp.ld_,
730 &idx3(handler->pol, 1, 0, 0), 1, 0.0, handler->scratch, 1);
731
732 /* finally along k, it is a scalar product.... */
733 handler->coef.data[0] = cblas_ddot(handler->cube.size[0], handler->scratch,
734 1, &idx3(handler->pol, 0, 0, 0), 1);
735
736 free(cube_tmp.data);
737 }
738
739 /* go from ijk -> xyz */
740 if (!use_ortho)
741 grid_transform_coef_jik_to_yxz((const double(*)[3])handler->dh,
742 &handler->coef);
743}
744
746 grid_context *const ctx, const int level, const bool calculate_tau,
747 const bool calculate_forces, const bool calculate_virial,
748 const int *const shift_local, const int *const border_width,
749 const offload_buffer *const pab_blocks, offload_buffer *const hab_blocks,
750 tensor *forces_, tensor *virial_) {
751 tensor *const grid = &ctx->grid[level];
752
753 // Using default(shared) because with GCC 9 the behavior around const changed:
754 // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
755#pragma omp parallel default(shared)
756 {
757 const int num_threads = omp_get_num_threads();
758 const int thread_id = omp_get_thread_num();
759
760 double *hab_block_local = NULL;
761
762 if (num_threads == 1) {
763 hab_block_local = hab_blocks->host_buffer;
764 } else {
765 hab_block_local = ((double *)ctx->scratch) +
766 thread_id * (hab_blocks->size / sizeof(double));
767 memset(hab_block_local, 0, hab_blocks->size);
768 }
769
770 tensor work, pab, hab, vab, forces_local_, virial_local_,
771 forces_local_pair_, virial_local_pair_;
772 memset(&work, 0, sizeof(tensor));
773 memset(&pab, 0, sizeof(tensor));
774 memset(&hab, 0, sizeof(tensor));
775 memset(&vab, 0, sizeof(tensor));
776 memset(&forces_local_, 0, sizeof(tensor));
777 memset(&virial_local_, 0, sizeof(tensor));
778 memset(&forces_local_pair_, 0, sizeof(tensor));
779 memset(&virial_local_pair_, 0, sizeof(tensor));
780
781 struct collocation_integration_ *handler = ctx->handler[thread_id];
782 handler->apply_cutoff = ctx->apply_cutoff;
783 handler->lmax_diff[0] = 0;
784 handler->lmax_diff[1] = 0;
785 handler->lmin_diff[0] = 0;
786 handler->lmin_diff[1] = 0;
787
788 if (calculate_tau || calculate_forces || calculate_virial) {
789 handler->lmax_diff[0] = 1;
790 handler->lmax_diff[1] = 0;
791 handler->lmin_diff[0] = -1;
792 handler->lmin_diff[1] = -1;
793 }
794
795 if (calculate_virial) {
796 handler->lmax_diff[0]++;
797 handler->lmax_diff[1]++;
798 }
799
800 if (calculate_tau) {
801 handler->lmax_diff[0]++;
802 handler->lmax_diff[1]++;
803 handler->lmin_diff[0]--;
804 handler->lmin_diff[1]--;
805 }
806
807 // Allocate pab matrix for re-use across tasks.
808 initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
809 alloc_tensor(&pab);
810
811 initialize_tensor_2(&vab, ctx->maxco, ctx->maxco);
812 alloc_tensor(&vab);
813
814 initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
815 alloc_tensor(&work);
816
817 initialize_tensor_2(&hab, ctx->maxco, ctx->maxco);
818 alloc_tensor(&hab);
819
820 if (calculate_forces) {
821 initialize_tensor_2(&forces_local_, forces_->size[0], forces_->size[1]);
822 alloc_tensor(&forces_local_);
823 initialize_tensor_2(&virial_local_, 3, 3);
824 alloc_tensor(&virial_local_);
825 memset(forces_local_.data, 0, sizeof(double) * forces_local_.alloc_size_);
826 memset(virial_local_.data, 0, sizeof(double) * virial_local_.alloc_size_);
827 initialize_tensor_2(&forces_local_pair_, 2, 3);
828 alloc_tensor(&forces_local_pair_);
829 initialize_tensor_3(&virial_local_pair_, 2, 3, 3);
830 alloc_tensor(&virial_local_pair_);
831 }
832
833 initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
834 (const double(*)[3])grid->dh_inv);
835
836 tensor_copy(&handler->grid, grid);
837 handler->grid.data = grid->data;
838 for (int d = 0; d < 3; d++)
839 handler->orthogonal[d] = grid->orthogonal[d];
840
841 /* it is only useful when we split the list over multiple threads. The
842 * first iteration should load the block whatever status the
843 * task->block_update_ variable has */
844 const _task *prev_task = NULL;
845#pragma omp for schedule(static)
846 for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
847 // Define some convenient aliases.
848 const _task *task = &ctx->tasks[level][itask];
849
850 if (task->level != level) {
851 printf("level %d, %d\n", task->level, level);
852 abort();
853 }
854
855 if (task->update_block_ || (prev_task == NULL)) {
856 /* need to load pab if forces are needed */
857 if (calculate_forces) {
858 extract_blocks(ctx, task, pab_blocks, &work, &pab);
859 }
860 /* store the coefficients of the operator after rotation to
861 * the spherical harmonic basis */
862
863 rotate_and_store_coefficients(ctx, prev_task, task, &hab, &work,
864 hab_block_local);
865
866 /* There is a difference here between collocate and integrate.
867 * For collocate, I do not need to know the task where blocks
868 * have been updated the last time. For integrate this
869 * information is crucial to update the coefficients of the
870 * potential */
871 prev_task = task;
872 memset(hab.data, 0, sizeof(double) * hab.alloc_size_);
873 }
874
875 /* the grid is divided over several ranks or not periodic */
876 if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
877 (handler->grid.size[1] != handler->grid.full_size[1]) ||
878 (handler->grid.size[2] != handler->grid.full_size[2])) {
879 /* unfortunately the window where the gaussian should be added depends
880 * on the bonds. So I have to adjust the window all the time. */
881
882 setup_grid_window(&handler->grid, shift_local, border_width,
883 task->border_mask);
884 }
885
886 int lmax[2] = {task->lmax[0] + handler->lmax_diff[0],
887 task->lmax[1] + handler->lmax_diff[1]};
888 int lmin[2] = {task->lmin[0] + handler->lmin_diff[0],
889 task->lmin[1] + handler->lmin_diff[1]};
890
891 lmin[0] = imax(lmin[0], 0);
892 lmin[1] = imax(lmin[1], 0);
893
894 initialize_tensor_4(&(handler->alpha), 3, lmax[1] + 1, lmax[0] + 1,
895 lmax[0] + lmax[1] + 1);
896 realloc_tensor(&(handler->alpha));
897
898 const int lp = lmax[0] + lmax[1];
899
900 initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
901 realloc_tensor(&(handler->coef));
902 grid_integrate(handler, ctx->orthorhombic, task->zetp, task->rp,
903 task->radius);
904 /*
905 handler->coef contains coefficients in the (x - x_12) basis. now
906 we need to rotate them in the (x - x_1) (x - x_2) basis
907 */
908
909 /* compute the transformation matrix */
910 grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax,
911 &(handler->alpha));
912
913 initialize_tensor_2(&vab, ncoset(lmax[1]), ncoset(lmax[0]));
914 realloc_tensor(&vab);
915
916 grid_compute_vab(lmin, lmax, lp, task->prefactor,
917 &handler->alpha, // contains the matrix for the rotation
918 &handler->coef, // contains the coefficients of
919 // the potential in the
920 // (x_x_12) basis
921 &vab); // contains the coefficients of the potential
922 // in the (x - x_1)(x - x_2) basis
923
924 if (calculate_forces) {
925 memset(forces_local_pair_.data, 0,
926 sizeof(double) * forces_local_pair_.alloc_size_);
927 memset(virial_local_pair_.data, 0,
928 sizeof(double) * virial_local_pair_.alloc_size_);
929 }
930
932 task, &vab, &pab, calculate_tau, calculate_forces, calculate_virial,
933 &forces_local_pair_, /* matrix
934 * containing the
935 * contribution of
936 * the gaussian
937 * pair for each
938 * atom */
939 &virial_local_pair_, /* same but for the virial term (stress tensor)
940 */
941 &hab);
942
943 if (calculate_forces) {
944 const double scaling = (task->iatom == task->jatom) ? 1.0 : 2.0;
945 idx2(forces_local_, task->iatom, 0) +=
946 scaling * idx2(forces_local_pair_, 0, 0);
947 idx2(forces_local_, task->iatom, 1) +=
948 scaling * idx2(forces_local_pair_, 0, 1);
949 idx2(forces_local_, task->iatom, 2) +=
950 scaling * idx2(forces_local_pair_, 0, 2);
951
952 idx2(forces_local_, task->jatom, 0) +=
953 scaling * idx2(forces_local_pair_, 1, 0);
954 idx2(forces_local_, task->jatom, 1) +=
955 scaling * idx2(forces_local_pair_, 1, 1);
956 idx2(forces_local_, task->jatom, 2) +=
957 scaling * idx2(forces_local_pair_, 1, 2);
958 if (calculate_virial) {
959 for (int i = 0; i < 3; i++) {
960 for (int j = 0; j < 3; j++) {
961 idx2(virial_local_, i, j) +=
962 scaling * (idx3(virial_local_pair_, 0, i, j) +
963 idx3(virial_local_pair_, 1, i, j));
964 }
965 }
966 }
967 }
968 }
969
970 rotate_and_store_coefficients(ctx, prev_task, NULL, &hab, &work,
971 hab_block_local);
972
973 // now reduction over the hab blocks
974 if (num_threads > 1) {
975 // does not store the number of elements but the amount of memory
976 // occupied. That's a strange choice.
977 const int hab_size = hab_blocks->size / sizeof(double);
978 if ((hab_size / num_threads) >= 2) {
979 const int block_size =
980 hab_size / num_threads + (hab_size % num_threads);
981
982 for (int bk = 0; bk < num_threads; bk++) {
983 int bk_id = (bk + thread_id) % num_threads;
984 size_t begin = bk_id * block_size;
985 size_t end = imin((bk_id + 1) * block_size, hab_size);
986 cblas_daxpy(end - begin, 1.0, hab_block_local + begin, 1,
987 hab_blocks->host_buffer + begin, 1);
988#pragma omp barrier
989 }
990 } else {
991 const int hab_size = hab_blocks->size / sizeof(double);
992#pragma omp critical
993 cblas_daxpy(hab_size, 1.0, hab_block_local, 1, hab_blocks->host_buffer,
994 1);
995 }
996 }
997
998 if (calculate_forces) {
999 if (num_threads > 1) {
1000 if ((forces_->alloc_size_ / num_threads) >= 2) {
1001 const int block_size = forces_->alloc_size_ / num_threads +
1002 (forces_->alloc_size_ % num_threads);
1003
1004 for (int bk = 0; bk < num_threads; bk++) {
1005 int bk_id = (bk + thread_id) % num_threads;
1006 int begin = bk_id * block_size;
1007 int end = imin((bk_id + 1) * block_size, forces_->alloc_size_);
1008 cblas_daxpy(end - begin, 1.0, forces_local_.data + begin, 1,
1009 forces_->data + begin, 1);
1010#pragma omp barrier
1011 }
1012 } else {
1013#pragma omp critical
1014 cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
1015 forces_->data, 1);
1016 }
1017 } else {
1018 // we are running with OMP_NUM_THREADS=1
1019 cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
1020 forces_->data, 1);
1021 }
1022 }
1023
1024 if (calculate_virial) {
1025#pragma omp critical
1026 for (int i = 0; i < 3; i++) {
1027 for (int j = 0; j < 3; j++) {
1028 idx2(virial_[0], i, j) += idx2(virial_local_, i, j);
1029 }
1030 }
1031 }
1032
1033 handler->grid.data = NULL;
1034 free(vab.data);
1035 free(work.data);
1036 free(pab.data);
1037 free(hab.data);
1038
1039 if (calculate_forces) {
1040 free(forces_local_.data);
1041 free(virial_local_.data);
1042 free(virial_local_pair_.data);
1043 free(forces_local_pair_.data);
1044 }
1045 }
1046}
1047
1048/*******************************************************************************
1049 * \brief Integrate all tasks of in given list from given grids using matrix -
1050 * matrix multiplication
1051 ******************************************************************************/
1053 void *ptr, const bool compute_tau, const int natoms, const int nlevels,
1054 const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels],
1055 offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
1056
1057 grid_context *const ctx = (grid_context *)ptr;
1058
1059 assert(ctx->checksum == ctx_checksum);
1060 assert(ctx->nlevels == nlevels);
1061 assert(ctx->natoms == natoms);
1062
1063 // Zero result arrays.
1064 memset(hab_blocks->host_buffer, 0, hab_blocks->size);
1065
1066 const int max_threads = omp_get_max_threads();
1067
1068 if (ctx->scratch == NULL)
1069 ctx->scratch = malloc(hab_blocks->size * max_threads);
1070 assert(ctx->scratch != NULL);
1071
1072 // #pragma omp parallel for
1073 for (int level = 0; level < ctx->nlevels; level++) {
1074 const _layout *layout = &ctx->layouts[level];
1075 set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
1076 layout->npts_global, layout->npts_local,
1077 layout->shift_local, layout->border_width, layout->dh,
1078 layout->dh_inv, grids[level]);
1079 ctx->grid[level].data = grids[level]->host_buffer;
1080 }
1081
1082 bool calculate_virial = (virial != NULL);
1083 bool calculate_forces = (forces != NULL);
1084
1085 tensor forces_, virial_;
1086 if (calculate_forces) {
1087 initialize_tensor_2(&forces_, natoms, 3);
1088 alloc_tensor(&forces_);
1089 initialize_tensor_2(&virial_, 3, 3);
1090 alloc_tensor(&virial_);
1091 memset(forces_.data, 0, sizeof(double) * forces_.alloc_size_);
1092 memset(virial_.data, 0, sizeof(double) * virial_.alloc_size_);
1093 }
1094
1095 for (int level = 0; level < ctx->nlevels; level++) {
1096 const _layout *layout = &ctx->layouts[level];
1097 integrate_one_grid_level_dgemm(ctx, level, compute_tau, calculate_forces,
1098 calculate_virial, layout->shift_local,
1099 layout->border_width, pab_blocks, hab_blocks,
1100 &forces_, &virial_);
1101 ctx->grid[level].data = NULL;
1102 }
1103 if (calculate_forces) {
1104 if (calculate_virial) {
1105 virial[0][0] = idx2(virial_, 0, 0);
1106 virial[0][1] = idx2(virial_, 0, 1);
1107 virial[0][2] = idx2(virial_, 0, 2);
1108 virial[1][0] = idx2(virial_, 1, 0);
1109 virial[1][1] = idx2(virial_, 1, 1);
1110 virial[1][2] = idx2(virial_, 1, 2);
1111 virial[2][0] = idx2(virial_, 2, 0);
1112 virial[2][1] = idx2(virial_, 2, 1);
1113 virial[2][2] = idx2(virial_, 2, 2);
1114 }
1115
1116 memcpy(forces[0], forces_.data, sizeof(double) * forces_.alloc_size_);
1117 free(forces_.data);
1118 free(virial_.data);
1119 }
1120
1121 free(ctx->scratch);
1122 ctx->scratch = NULL;
1123}
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int max_threads
Definition dbm_library.c:25
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
Definition dbm_miniapp.c:36
#define GRID_PRAGMA_SIMD(OBJS, N)
Definition grid_common.h:23
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE orbital up(const int i, const orbital a)
Increase i'th component of given orbital angular momentum.
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static GRID_HOST_DEVICE orbital down(const int i, const orbital a)
Decrease i'th component of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int cmax
static void const int const int const int const int const int const double const int map[3][2 *cmax+1]
void grid_compute_vab(const int *const lmin, const int *const lmax, const int lp, const double prefactor, const tensor *const alpha, const tensor *const coef_xyz, tensor *vab)
void grid_transform_coef_jik_to_yxz(const double dh[3][3], const tensor *coef_xyz)
void grid_prepare_alpha_dgemm(const double ra[3], const double rb[3], const double rp[3], const int *lmax, tensor *alpha)
void grid_fill_pol_dgemm(const bool transpose, const double dr, const double roffset, const int pol_offset, const int xmin, const int xmax, const int lp, const int cmax, const double zetp, double *pol_)
void extract_blocks(grid_context *const ctx, const _task *const task, const offload_buffer *pab_blocks, tensor *const work, tensor *const pab)
void tensor_reduction_for_collocate_integrate(double *scratch, const double alpha, const bool *const orthogonal, const struct tensor_ *Exp, const struct tensor_ *co, const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube)
void initialize_basis_vectors(collocation_integration *const handler, const double dh[3][3], const double dh_inv[3][3])
void initialize_W_and_T(collocation_integration *const handler, const tensor *cube, const tensor *coef)
void set_grid_parameters(tensor *grid, const bool orthorhombic, const int grid_full_size[3], const int grid_local_size[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], offload_buffer *grid_)
static void rotate_and_store_coefficients(grid_context *const ctx, const _task *prev_task, const _task *task, tensor *const hab, tensor *work, double *blocks)
static void update_tau(const bool compute_forces, const bool compute_virial, const orbital a, const orbital b, const double ftz[2], const double *const rab, const tensor *const vab, const double pab, double *const hab, tensor *forces, tensor *virials)
void integrate_one_grid_level_dgemm(grid_context *const ctx, const int level, const bool calculate_tau, const bool calculate_forces, const bool calculate_virial, const int *const shift_local, const int *const border_width, const offload_buffer *const pab_blocks, offload_buffer *const hab_blocks, tensor *forces_, tensor *virial_)
void extract_cube_within_spherical_cutoff_generic(struct collocation_integration_ *const handler, const double disr_radius, const int cmax, const int *const lb_cube, const int *const ub_cube, const double *const roffset, const int *const cube_center)
void update_force_pair(orbital a, orbital b, const double pab, const double ftz[2], const double *const rab, const tensor *const vab, tensor *force)
void update_virial_pair(orbital a, orbital b, const double pab, const double ftz[2], const double *const rab, const tensor *const vab, tensor *virial)
void grid_dgemm_integrate_task_list(void *ptr, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids using matrix - matrix multiplication.
void extract_cube_within_spherical_cutoff_ortho(struct collocation_integration_ *const handler, const double disr_radius, const int cmax, const int *const lb_cube, const int *const cube_center)
static void update_hab_forces_and_stress(const _task *task, const tensor *const vab, const tensor *const pab, const bool compute_tau, const bool compute_forces, const bool compute_virial, tensor *const forces, tensor *const virial, tensor *const hab)
void grid_integrate(collocation_integration *const handler, const bool use_ortho, const double zetp, const double rp[3], const double radius)
void extract_cube(struct collocation_integration_ *handler, const int cmax, const int *lower_boundaries_cube, const int *cube_center)
void update_all(const bool compute_forces, const bool compute_virial, const orbital a, const orbital b, const double f, const double *const ftz, const double *rab, const tensor *vab, const double pab, double *hab, tensor *forces, tensor *virials)
void apply_non_orthorombic_corrections(const bool plane[restrict 3], const tensor *const Exp, tensor *const cube)
void calculate_non_orthorombic_corrections_tensor(const double mu_mean, const double *r_ab, const double basis[3][3], const int *const xmin, const int *const xmax, bool plane[3], tensor *const Exp)
static bool is_point_in_interval(const int value, Interval x)
static void update_loop_index(const int global_grid_size, int x1, int *const x)
void tensor_copy(tensor *const b, const tensor *const a)
size_t realloc_tensor(tensor *t)
void alloc_tensor(tensor *t)
static void setup_grid_window(tensor *const grid, const int *const shift_local, const int *const border_width, const int border_mask)
static void initialize_tensor_4(struct tensor_ *a, int n1, int n2, int n3, int n4)
#define idx2(a, i, j)
static void initialize_tensor_3(struct tensor_ *a, int n1, int n2, int n3)
#define idx3(a, i, j, k)
static void initialize_tensor_2(struct tensor_ *a, int n1, int n2)
int compute_cube_properties(const bool ortho, const double radius, const double dh[3][3], const double dh_inv[3][3], const double *rp, double *disr_radius, double *roffset, int *cubecenter, int *lb_cube, int *ub_cube, int *cube_size)
double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY)
void extract_sub_grid(const int *lower_corner, const int *upper_corner, const int *position, const tensor *const grid, tensor *const subgrid)
void compute_interval(const int *const map, const int full_size, const int size, const int cube_size, const int x1, int *x, int *const lower_corner, int *const upper_corner, const Interval window)
void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY)
void cblas_dgemv(const CBLAS_LAYOUT order, const CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
void dgemm_simplified(dgemm_params *const m)
@ CblasNoTrans
@ CblasRowMajor
double dh_inv[3][3]
Internal representation of a basis set.
grid_basis_set ** basis_sets
struct collocation_integration_ ** handler
Internal representation of a buffer.
double * host_buffer
Orbital angular momentum.