grid_dgemm_integrate.c
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <limits.h>
10#include <math.h>
11#include <stdbool.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15#include <unistd.h>
16
17#include <omp.h>
18
19#ifdef __LIBXSMM
20#include <libxsmm.h>
21#endif
22
23#include "../common/grid_common.h"
30#include "grid_dgemm_utils.h"
31
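/* Copy the grid points that lie within the spherical cutoff (orthorhombic
 * case) from the local grid into the handler's cube, respecting the grid
 * window of the local rank. */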
32void extract_cube_within_spherical_cutoff_ortho(
33 struct collocation_integration_ *const handler, const double disr_radius,
34 const int cmax, const int *const lb_cube, const int *const cube_center) {
35 // a mapping so that the ig corresponds to the right grid point
36 int **map = handler->map;
37 map[1] = map[0] + 2 * cmax + 1;
38 map[2] = map[1] + 2 * cmax + 1;
39 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
40
41 for (int i = 0; i < 3; i++) {
42 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
43 map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
44 handler->grid.lower_corner[i],
45 handler->grid.full_size[i]);
46 }
47 }
48
49 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
50 .xmax = handler->grid.window_size[0]};
51 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
52 .xmax = handler->grid.window_size[1]};
53 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
54 .xmax = handler->grid.window_size[2]};
55
56 for (int kg = 0; kg < handler->cube.size[0]; kg++) {
57 const int k = map[0][kg];
58 const int kd =
59 (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
60 const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
61 const double kremain = disr_radius * disr_radius - kr * kr;
62
63 if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
64
65 const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
66 for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
67 const int j = map[1][jg - lb_cube[1]];
68 const double jr = ((2 * jg - 1) >> 1) *
69 handler->dh[1][1]; // distance from center in a.u.
70 const double jremain = kremain - jr * jr;
71
72 if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
73 const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
74 const int xmax = 1 - xmin;
75
76 // printf("xmin %d, xmax %d\n", xmin, xmax);
77 for (int x = xmin - lb_cube[2];
78 x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
79 const int x1 = map[2][x];
80
81 if (!is_point_in_interval(x1, xwindow))
82 continue;
83
84 int lower_corner[3] = {k, j, x1};
85 int upper_corner[3] = {k + 1, j + 1, x1 + 1};
86
87 compute_interval(map[2], handler->grid.full_size[2],
88 handler->grid.size[2], handler->cube.size[2], x1,
89 &x, lower_corner + 2, upper_corner + 2, xwindow);
90
91 if (upper_corner[2] - lower_corner[2]) {
92 const int position1[3] = {kg, jg - lb_cube[1], x};
93
94 /* the function will internally take care of the local vs global
95 * grid */
96
97 double *restrict dst = &idx3(handler->cube, position1[0],
98 position1[1], position1[2]);
99
100 double *restrict src = &idx3(handler->grid, lower_corner[0],
101 lower_corner[1], lower_corner[2]);
102
103 const int sizex = upper_corner[2] - lower_corner[2];
104 GRID_PRAGMA_SIMD((dst, src), 8)
105 for (int x = 0; x < sizex; x++) {
106 dst[x] = src[x];
107 }
108 }
109
110 if (handler->grid.size[2] == handler->grid.full_size[2])
111 update_loop_index(handler->grid.full_size[2], x1, &x);
112 else
113 x += upper_corner[2] - lower_corner[2] - 1;
114 }
115 }
116 }
117 }
118 }
119}
120
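/* Generic (non-orthorhombic) variant of the routine above: the admissible
 * x-range inside the sphere is obtained by solving a quadratic equation along
 * the first cell vector instead of using only the diagonal elements of dh. */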
121void extract_cube_within_spherical_cutoff_generic(
122 struct collocation_integration_ *const handler, const double disr_radius,
123 const int cmax, const int *const lb_cube, const int *const ub_cube,
124 const double *const roffset, const int *const cube_center) {
125
126 const double a = handler->dh[0][0] * handler->dh[0][0] +
127 handler->dh[0][1] * handler->dh[0][1] +
128 handler->dh[0][2] * handler->dh[0][2];
129 const double a_inv = 1.0 / a;
130 // a mapping so that the ig corresponds to the right grid point
131 int **map = handler->map;
132 map[1] = map[0] + 2 * cmax + 1;
133 map[2] = map[1] + 2 * cmax + 1;
134 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
135
136 for (int i = 0; i < 3; i++) {
137 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
138 map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
139 handler->grid.lower_corner[i],
140 handler->grid.full_size[i]);
141 }
142 }
143
144 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
145 .xmax = handler->grid.window_size[0]};
146 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
147 .xmax = handler->grid.window_size[1]};
148 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
149 .xmax = handler->grid.window_size[2]};
150
151 for (int k = 0; k < handler->cube.size[0]; k++) {
152 const int iz = map[0][k];
153
154 if (!is_point_in_interval(iz, zwindow))
155 continue;
156
157 const double tz = (k + lb_cube[0] - roffset[0]);
158 const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
159 tz * handler->dh[2][2]};
160
161 for (int j = 0; j < handler->cube.size[1]; j++) {
162 const int iy = map[1][j];
163
164 if (!is_point_in_interval(iy, ywindow))
165 continue;
166
167 const double ty = (j - roffset[1] + lb_cube[1]);
168 const double y[3] = {z[0] + ty * handler->dh[1][0],
169 z[1] + ty * handler->dh[1][1],
170 z[2] + ty * handler->dh[1][2]};
171
172 /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 - */
173 /* 4 (x1^2 + y1^2 + z1^2)
174 * (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
175
176 const double b =
177 -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
178 handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
179 handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
180
181 const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
182 (roffset[2] * handler->dh[0][0] - y[0]) +
183 (roffset[2] * handler->dh[0][1] - y[1]) *
184 (roffset[2] * handler->dh[0][1] - y[1]) +
185 (roffset[2] * handler->dh[0][2] - y[2]) *
186 (roffset[2] * handler->dh[0][2] - y[2]) -
187 disr_radius * disr_radius;
188
189 double delta = b * b - 4.0 * a * c;
190
191 if (delta < 0.0)
192 continue;
193
194 delta = sqrt(delta);
195
196 const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
197 const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
198
199 int lower_corner[3] = {iz, iy, xmin};
200 int upper_corner[3] = {iz + 1, iy + 1, xmin};
201
202 for (int x = xmin - lb_cube[2];
203 x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
204 const int x1 = map[2][x];
205
206 if (!is_point_in_interval(x1, xwindow))
207 continue;
208
209 compute_interval(map[2], handler->grid.full_size[2],
210 handler->grid.size[2], handler->cube.size[2], x1, &x,
211 lower_corner + 2, upper_corner + 2, xwindow);
212
213 if (upper_corner[2] - lower_corner[2]) {
214 const int position1[3] = {k, j, x};
215
216 /* the function will internally take care of the local vs global
217 * grid */
218
219 double *__restrict__ src = &idx3(handler->grid, lower_corner[0],
220 lower_corner[1], lower_corner[2]);
221 double *__restrict__ dst =
222 &idx3(handler->cube, position1[0], position1[1], position1[2]);
223
224 const int sizex = upper_corner[2] - lower_corner[2];
225
226 // #pragma omp simd linear(dst, src) simdlen(8)
227 GRID_PRAGMA_SIMD((dst, src), 8)
228 for (int x = 0; x < sizex; x++) {
229 dst[x] = src[x];
230 }
231
232 if (handler->grid.size[2] == handler->grid.full_size[2])
233 update_loop_index(handler->grid.full_size[2], x1, &x);
234 else
235 x += upper_corner[2] - lower_corner[2] - 1;
236 }
237 }
238 }
239 }
240}
241
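/* Rotate the Cartesian hab coefficients of the previous task to the spherical
 * (sgf) basis with two dgemms and accumulate the result into the corresponding
 * block, then resize hab for the next task. */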
242static void rotate_and_store_coefficients(grid_context *const ctx,
243 const _task *prev_task,
244 const _task *task, tensor *const hab,
245 tensor *work, // some scratch matrix
246 double *blocks) {
247 if (prev_task != NULL) {
248 /* prev_task is NULL on the first iteration. In that case we only need
249 * to initialize the hab matrix to the proper size. Note that resizing
250 * does not really occur since memory is allocated for the maximum size
251 * the matrix can have. */
252 const int iatom = prev_task->iatom;
253 const int jatom = prev_task->jatom;
254 const int iset = prev_task->iset;
255 const int jset = prev_task->jset;
256 const int ikind = ctx->atom_kinds[iatom];
257 const int jkind = ctx->atom_kinds[jatom];
258 const grid_basis_set *ibasis = ctx->basis_sets[ikind];
259 const grid_basis_set *jbasis = ctx->basis_sets[jkind];
260
261 const int block_num = prev_task->block_num;
262 double *const block = &blocks[ctx->block_offsets[block_num]];
263
264 const int ncoseta = ncoset(ibasis->lmax[iset]);
265 const int ncosetb = ncoset(jbasis->lmax[jset]);
266
267 const int ncoa = ibasis->npgf[iset] * ncoseta;
268 const int ncob = jbasis->npgf[jset] * ncosetb;
269
270 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
271 const int nsgf_setb = jbasis->nsgf_set[jset];
272 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
273 const int nsgfb = jbasis->nsgf;
274 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
275 const int sgfb = jbasis->first_sgf[jset] - 1;
276 const int maxcoa = ibasis->maxco;
277 const int maxcob = jbasis->maxco;
278
279 initialize_tensor_2(work, nsgf_setb, ncoa);
280 realloc_tensor(work);
281
282 // Warning: these matrices are row major.
283
284 dgemm_params m1, m2;
285 memset(&m1, 0, sizeof(dgemm_params));
286 memset(&m2, 0, sizeof(dgemm_params));
287
288 m1.op1 = 'N';
289 m1.op2 = 'N';
290 m1.m = nsgf_setb;
291 m1.n = ncoa;
292 m1.k = ncob;
293 m1.alpha = 1.0;
294 m1.beta = 0.0;
295 m1.a = &jbasis->sphi[sgfb * maxcob];
296 m1.lda = maxcob;
297 m1.b = hab->data;
298 m1.ldb = ncoa;
299 m1.c = work->data;
300 m1.ldc = work->ld_;
301
302 // phi[b][ncob]
303 // work[b][ncoa] = phi[b][ncob] * hab[ncob][ncoa]
304
305 if (iatom <= jatom) {
306 // I need to have the final result in the form
307
308 // block[b][a] = work[b][ncoa] transpose(phi[a][ncoa])
309 m2.op1 = 'N';
310 m2.op2 = 'T';
311 m2.m = nsgf_setb;
312 m2.n = nsgf_seta;
313 m2.k = ncoa;
314 m2.a = work->data;
315 m2.lda = work->ld_;
316 m2.b = &ibasis->sphi[sgfa * maxcoa];
317 m2.ldb = maxcoa;
318 m2.c = block + sgfb * nsgfa + sgfa;
319 m2.ldc = nsgfa;
320 m2.alpha = 1.0;
321 m2.beta = 1.0;
322 } else {
323 // block[a][b] = phi[a][ncoa] Transpose(work[b][ncoa])
324 m2.op1 = 'N';
325 m2.op2 = 'T';
326 m2.m = nsgf_seta;
327 m2.n = nsgf_setb;
328 m2.k = ncoa;
329 m2.a = &ibasis->sphi[sgfa * maxcoa];
330 m2.lda = maxcoa;
331 m2.b = work->data;
332 m2.ldb = work->ld_;
333 m2.c = block + sgfa * nsgfb + sgfb;
334 m2.ldc = nsgfb;
335 m2.alpha = 1.0;
336 m2.beta = 1.0;
337 }
338
339 m1.use_libxsmm = true;
340 m2.use_libxsmm = true;
341
342 /* these dgemm are *row* major */
343 dgemm_simplified(&m1);
344 dgemm_simplified(&m2);
345 }
346
347 if (task != NULL) {
348 const int iatom = task->iatom;
349 const int jatom = task->jatom;
350 const int ikind = ctx->atom_kinds[iatom];
351 const int jkind = ctx->atom_kinds[jatom];
352 const int iset = task->iset;
353 const int jset = task->jset;
354 const grid_basis_set *ibasis = ctx->basis_sets[ikind];
355 const grid_basis_set *jbasis = ctx->basis_sets[jkind];
356 const int ncoseta = ncoset(ibasis->lmax[iset]);
357 const int ncosetb = ncoset(jbasis->lmax[jset]);
358
359 const int ncoa = ibasis->npgf[iset] * ncoseta;
360 const int ncob = jbasis->npgf[jset] * ncosetb;
361
362 initialize_tensor_2(hab, ncob, ncoa);
363 realloc_tensor(hab);
364 }
365}
366
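/* Accumulate the force contribution of one pair of Cartesian orbitals (a, b),
 * using the derivative relation between Gaussians of angular momentum l - 1
 * and l + 1 (ftz contains 2 * zeta for both centers). */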
367void update_force_pair(orbital a, orbital b, const double pab,
368 const double ftz[2], const double *const rab,
369 const tensor *const vab, tensor *force) {
370 const double axpm0 = idx2(vab[0], idx(b), idx(a));
371 for (int i = 0; i < 3; i++) {
372 const double aip1 = idx2(vab[0], idx(b), idx(up(i, a)));
373 const double aim1 = idx2(vab[0], idx(b), idx(down(i, a)));
374 const double bim1 = idx2(vab[0], idx(down(i, b)), idx(a));
375 idx2(force[0], 0, i) += pab * (ftz[0] * aip1 - a.l[i] * aim1);
376 idx2(force[0], 1, i) +=
377 pab * (ftz[1] * (aip1 - rab[i] * axpm0) - b.l[i] * bim1);
378 }
379}
380
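/* Same as update_force_pair but accumulates the virial (stress tensor)
 * contribution of the pair for both centers. */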
381void update_virial_pair(orbital a, orbital b, const double pab,
382 const double ftz[2], const double *const rab,
383 const tensor *const vab, tensor *virial) {
384 for (int i = 0; i < 3; i++) {
385 for (int j = 0; j < 3; j++) {
386 idx3(virial[0], 0, i, j) +=
387 pab * ftz[0] * idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
388 pab * a.l[j] * idx2(vab[0], idx(b), idx(up(i, down(j, a))));
389 }
390 }
391
392 for (int i = 0; i < 3; i++) {
393 for (int j = 0; j < 3; j++) {
394 idx3(virial[0], 1, i, j) +=
395 pab * ftz[1] *
396 (idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
397 idx2(vab[0], idx(b), idx(up(i, a))) * rab[j] -
398 idx2(vab[0], idx(b), idx(up(j, a))) * rab[i] +
399 idx2(vab[0], idx(b), idx(a)) * rab[j] * rab[i]) -
400 pab * b.l[j] * idx2(vab[0], idx(up(i, down(j, b))), idx(a));
401 }
402 }
403}
404
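/* Update a single hab matrix element and, when requested, the force and
 * virial contributions of the corresponding orbital pair. */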
405void update_all(const bool compute_forces, const bool compute_virial,
406 const orbital a, const orbital b, const double f,
407 const double *const ftz, const double *rab, const tensor *vab,
408 const double pab, double *hab, tensor *forces,
409 tensor *virials) {
410
411 *hab += f * idx2(vab[0], idx(b), idx(a));
412
413 if (compute_forces) {
414 update_force_pair(a, b, f * pab, ftz, rab, vab, forces);
415 }
416
417 if (compute_virial) {
418 update_virial_pair(a, b, f * pab, ftz, rab, vab, virials);
419 }
420}
421
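/* Kinetic energy density (tau) case: expand 0.5 * grad(a) . grad(b) into four
 * terms with shifted angular momenta and pass each of them to update_all. */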
422static void update_tau(const bool compute_forces, const bool compute_virial,
423 const orbital a, const orbital b, const double ftz[2],
424 const double *const rab, const tensor *const vab,
425 const double pab, double *const hab, tensor *forces,
426 tensor *virials) {
427
428 for (int i = 0; i < 3; i++) {
429 update_all(compute_forces, compute_virial, down(i, a), down(i, b),
430 0.5 * a.l[i] * b.l[i], ftz, rab, vab, pab, hab, forces, virials);
431 update_all(compute_forces, compute_virial, up(i, a), down(i, b),
432 -0.5 * ftz[0] * b.l[i], ftz, rab, vab, pab, hab, forces,
433 virials);
434 update_all(compute_forces, compute_virial, down(i, a), up(i, b),
435 -0.5 * a.l[i] * ftz[1], ftz, rab, vab, pab, hab, forces,
436 virials);
437 update_all(compute_forces, compute_virial, up(i, a), up(i, b),
438 0.5 * ftz[0] * ftz[1], ftz, rab, vab, pab, hab, forces, virials);
439 }
440}
441
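/* Loop over all Cartesian components of the two shells of a task and
 * accumulate hab, forces and virial from the vab coefficients. */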
442static void
443update_hab_forces_and_stress(const _task *task, const tensor *const vab,
444 const tensor *const pab, const bool compute_tau,
445 const bool compute_forces,
446 const bool compute_virial, tensor *const forces,
447 tensor *const virial, tensor *const hab) {
448 double zeta[2] = {task->zeta[0] * 2.0, task->zeta[1] * 2.0};
449 for (int lb = task->lmin[1]; lb <= task->lmax[1]; lb++) {
450 for (int la = task->lmin[0]; la <= task->lmax[0]; la++) {
451 for (int bx = 0; bx <= lb; bx++) {
452 for (int by = 0; by <= lb - bx; by++) {
453 const int bz = lb - bx - by;
454 const orbital b = {{bx, by, bz}};
455 const int idx_b = task->offset[1] + idx(b);
456 for (int ax = 0; ax <= la; ax++) {
457 for (int ay = 0; ay <= la - ax; ay++) {
458 const int az = la - ax - ay;
459 const orbital a = {{ax, ay, az}};
460 const int idx_a = task->offset[0] + idx(a);
461 double *habval = &idx2(hab[0], idx_b, idx_a);
462 const double prefactor = idx2(pab[0], idx_b, idx_a);
463
464 // now compute the forces
465 if (compute_tau) {
466 update_tau(compute_forces, compute_virial, a, b, zeta,
467 task->rab, vab, prefactor, habval, forces, virial);
468 } else {
469 update_all(compute_forces, compute_virial, a, b, 1.0, zeta,
470 task->rab, vab, prefactor, habval, forces, virial);
471 }
472 }
473 }
474 }
475 }
476 }
477 }
478}
479
480/* This is a sub-optimal version of the mapping in the case of a cubic cutoff,
481 * but it is very general and does not depend on the orthorhombic nature of the
482 * grid. For orthorhombic cases, it is faster to apply PBC directly on the polynomials. */
483
484void extract_cube(struct collocation_integration_ *handler, const int cmax,
485 const int *lower_boundaries_cube, const int *cube_center) {
486
487 // a mapping so that the ig corresponds to the right grid point
488 int **map = handler->map;
489 map[1] = map[0] + 2 * cmax + 1;
490 map[2] = map[1] + 2 * cmax + 1;
491 // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
492 for (int i = 0; i < 3; i++) {
493 for (int ig = 0; ig < handler->cube.size[i]; ig++) {
494 map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
495 handler->grid.lower_corner[i],
496 handler->grid.full_size[i]);
497 }
498 }
499
500 int lower_corner[3];
501 int upper_corner[3];
502
503 const Interval zwindow = {.xmin = handler->grid.window_shift[0],
504 .xmax = handler->grid.window_size[0]};
505 const Interval ywindow = {.xmin = handler->grid.window_shift[1],
506 .xmax = handler->grid.window_size[1]};
507 const Interval xwindow = {.xmin = handler->grid.window_shift[2],
508 .xmax = handler->grid.window_size[2]};
509
510 for (int z = 0; (z < handler->cube.size[0]); z++) {
511 const int z1 = map[0][z];
512
513 if (!is_point_in_interval(z1, zwindow))
514 continue;
515
516 compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
517 handler->cube.size[0], z1, &z, lower_corner, upper_corner,
518 zwindow);
519
520 /* // We have a full plane. */
521 if (upper_corner[0] - lower_corner[0]) {
522 for (int y = 0; y < handler->cube.size[1]; y++) {
523 const int y1 = map[1][y];
524
525 // this check is completely irrelevant when running without MPI.
526 if (!is_point_in_interval(y1, ywindow))
527 continue;
528
529 compute_interval(map[1], handler->grid.full_size[1],
530 handler->grid.size[1], handler->cube.size[1], y1, &y,
531 lower_corner + 1, upper_corner + 1, ywindow);
532
533 if (upper_corner[1] - lower_corner[1]) {
534 for (int x = 0; x < handler->cube.size[2]; x++) {
535 const int x1 = map[2][x];
536
537 if (!is_point_in_interval(x1, xwindow))
538 continue;
539
540 compute_interval(map[2], handler->grid.full_size[2],
541 handler->grid.size[2], handler->cube.size[2], x1,
542 &x, lower_corner + 2, upper_corner + 2, xwindow);
543
544 if (upper_corner[2] - lower_corner[2]) { // should be non zero
545 const int position1[3] = {z, y, x};
546
547 /* the function will internally take care of the local vs global
548 * grid */
549 extract_sub_grid(
550 lower_corner, // lower corner of the portion of cube (in the
551 // full grid)
552 upper_corner, // upper corner of the portion of cube (in the
553 // full grid)
554 position1, // starting position subblock inside the cube
555 &handler->grid,
556 &handler->cube); // the cube receiving the extracted data
557
558 if (handler->grid.size[2] == handler->grid.full_size[2])
559 update_loop_index(handler->grid.full_size[2], x1, &x);
560 else
561 x += upper_corner[2] - lower_corner[2] - 1;
562 }
563 }
564 if (handler->grid.size[1] == handler->grid.full_size[1])
565 update_loop_index(handler->grid.full_size[1], y1, &y);
566 else
567 y += upper_corner[1] - lower_corner[1] - 1;
568 }
569 }
570 if (handler->grid.size[0] == handler->grid.full_size[0])
571 update_loop_index(handler->grid.full_size[0], z1, &z);
572 else
573 z += upper_corner[0] - lower_corner[0] - 1;
574 }
575 }
576}
577
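/* Integrate a single Gaussian product: extract the relevant grid points into
 * the cube and contract the cube with the one-dimensional polynomials to
 * obtain the coefficients stored in handler->coef. */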
578void grid_integrate(collocation_integration *const handler,
579 const bool use_ortho, const double zetp, const double rp[3],
580 const double radius) {
581 if (handler == NULL)
582 abort();
583
584 const int lp = handler->coef.size[0] - 1;
585
586 int cubecenter[3];
587 int cube_size[3];
588 int lb_cube[3], ub_cube[3];
589 double roffset[3];
590 double disr_radius;
591
592 /* cube : grid containing the point-wise product of the polynomials
593 *
594 * pol : grid containing the polynomials in all three directions
595 */
596
597 /* setting up the cube parameters */
598 int cmax = compute_cube_properties(
599 use_ortho, radius, (const double(*)[3])handler->dh,
600 (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
601 cubecenter, lb_cube, ub_cube, cube_size);
602
603 /* initialize the multidimensional array containing the polynomials */
604 if (lp != 0) {
605 initialize_tensor_3(&handler->pol, 3, cmax, handler->coef.size[0]);
606 } else {
607 initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
608 }
609 handler->pol_alloc_size = realloc_tensor(&handler->pol);
610
611 /* allocate memory for the polynomial and the cube */
612
613 initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
614
615 handler->cube_alloc_size = realloc_tensor(&handler->cube);
616
617 initialize_W_and_T(handler, &handler->coef, &handler->cube);
618
619 /* compute the polynomials */
620
621 // WARNING: do not reverse the order in pol, otherwise you will have to
622 // reverse the order in collocate_dgemm as well.
623
624 /* the tensor contraction is done for a given order, so I have to be careful
625 * how the tensors X, Y, Z are stored. In collocate, they are stored
626 * normally as 0 (Z), 1 (Y), 2 (X) in the table pol, but in integrate (which
627 * uses the same tensor reduction), I have a special treatment for l = 0. In
628 * that case the order *should* be the same as for collocate. For l > 0,
629 * we need a different storage order, which is (X) 2, (Y) 0, and (Z) 1.
630 *
631 * The reason for this is that the cube is stored as cube[z][y][x], so the
632 * first direction taken for the dgemm is along x.
633 */
634
635 int perm[3] = {2, 0, 1};
636
637 if (lp == 0) {
638 /* I need to restore the same order as for collocate */
639 perm[0] = 0;
640 perm[1] = 1;
641 perm[2] = 2;
642 }
643
644 bool use_ortho_forced = handler->orthogonal[0] && handler->orthogonal[1] &&
645 handler->orthogonal[2];
646 if (use_ortho) {
647 grid_fill_pol_dgemm((lp != 0), handler->dh[0][0], roffset[2], 0, lb_cube[2],
648 ub_cube[2], lp, cmax, zetp,
649 &idx3(handler->pol, perm[2], 0, 0)); /* i index */
650 grid_fill_pol_dgemm((lp != 0), handler->dh[1][1], roffset[1], 0, lb_cube[1],
651 ub_cube[1], lp, cmax, zetp,
652 &idx3(handler->pol, perm[1], 0, 0)); /* j index */
653 grid_fill_pol_dgemm((lp != 0), handler->dh[2][2], roffset[0], 0, lb_cube[0],
654 ub_cube[0], lp, cmax, zetp,
655 &idx3(handler->pol, perm[0], 0, 0)); /* k index */
656 } else {
657 double dx[3];
658
659 dx[2] = handler->dh[0][0] * handler->dh[0][0] +
660 handler->dh[0][1] * handler->dh[0][1] +
661 handler->dh[0][2] * handler->dh[0][2];
662
663 dx[1] = handler->dh[1][0] * handler->dh[1][0] +
664 handler->dh[1][1] * handler->dh[1][1] +
665 handler->dh[1][2] * handler->dh[1][2];
666
667 dx[0] = handler->dh[2][0] * handler->dh[2][0] +
668 handler->dh[2][1] * handler->dh[2][1] +
669 handler->dh[2][2] * handler->dh[2][2];
670
671 grid_fill_pol_dgemm((lp != 0), 1.0, roffset[2], 0, lb_cube[2], ub_cube[2],
672 lp, cmax, zetp * dx[2],
673 &idx3(handler->pol, perm[2], 0, 0)); /* i index */
674 grid_fill_pol_dgemm((lp != 0), 1.0, roffset[1], 0, lb_cube[1], ub_cube[1],
675 lp, cmax, zetp * dx[1],
676 &idx3(handler->pol, perm[1], 0, 0)); /* j index */
677 grid_fill_pol_dgemm((lp != 0), 1.0, roffset[0], 0, lb_cube[0], ub_cube[0],
678 lp, cmax, zetp * dx[0],
679 &idx3(handler->pol, perm[0], 0, 0)); /* k index */
680
681 /* the three remaining tensors are initialized in the function */
682 calculate_non_orthorombic_corrections_tensor(
683 zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
684 handler->orthogonal, &handler->Exp);
685 }
686
687 if (handler->apply_cutoff) {
688 memset(handler->cube.data, 0, sizeof(double) * handler->cube.alloc_size_);
689 if (!use_ortho && !use_ortho_forced) {
690 extract_cube_within_spherical_cutoff_generic(
691 handler, disr_radius, cmax, lb_cube, ub_cube, roffset, cubecenter);
692 } else {
693 extract_cube_within_spherical_cutoff_ortho(handler, disr_radius, cmax,
694 lb_cube, cubecenter);
695 }
696 } else {
697 extract_cube(handler, cmax, lb_cube, cubecenter);
698 }
699
700 if (!use_ortho && !use_ortho_forced)
701 apply_non_orthorombic_corrections(handler->orthogonal, &handler->Exp,
702 &handler->cube);
703
704 if (lp != 0) {
705 tensor_reduction_for_collocate_integrate(handler->scratch,
706 // pointer to scratch memory
707 1.0, handler->orthogonal, NULL,
708 &handler->cube, &handler->pol,
709 &handler->coef);
710 } else {
711 /* it is very specific to integrate because we might end up with a
712 * single element after the tensor product/contraction. In that case, I
713 * compute the cube and then do a scalar product between the two. */
714
715 /* we could also do this with 2 matrix-vector multiplications and a scalar
716 * product
717 *
718 * H_{jk} = C_{ijk} . P_i (along x) C_{ijk} is *stored C[k][j][i]* !!!!!!
719 * L_{k} = H_{jk} . P_j (along y)
720 * v_{ab} = L_k . P_k (along z)
721 */
722
723 tensor cube_tmp;
724 initialize_tensor_2(&cube_tmp, cube_size[0], cube_size[1]);
725 alloc_tensor(&cube_tmp);
726
727 /* first along x */
728 cblas_dgemv(CblasRowMajor, CblasNoTrans,
729 handler->cube.size[0] * handler->cube.size[1],
730 handler->cube.size[2], 1.0, handler->cube.data,
731 handler->cube.ld_, &idx3(handler->pol, 2, 0, 0), 1, 0.0,
732 cube_tmp.data, 1);
733
734 /* second along j */
735 cblas_dgemv(CblasRowMajor, CblasNoTrans, handler->cube.size[0],
736 handler->cube.size[1], 1.0, cube_tmp.data, cube_tmp.ld_,
737 &idx3(handler->pol, 1, 0, 0), 1, 0.0, handler->scratch, 1);
738
739 /* finally along k, it is a scalar product.... */
740 handler->coef.data[0] = cblas_ddot(handler->cube.size[0], handler->scratch,
741 1, &idx3(handler->pol, 0, 0, 0), 1);
742
743 free(cube_tmp.data);
744 }
745
746 /* go from ijk -> xyz */
747 if (!use_ortho)
748 grid_transform_coef_jik_to_yxz((const double(*)[3])handler->dh,
749 &handler->coef);
750}
751
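/*******************************************************************************
 * \brief Integrate all tasks assigned to one grid level. When running with
 *        more than one thread, each thread accumulates into a private hab
 *        buffer (and private force / virial tensors), which are reduced into
 *        the shared results at the end.
 ******************************************************************************/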
752void integrate_one_grid_level_dgemm(
753 grid_context *const ctx, const int level, const bool calculate_tau,
754 const bool calculate_forces, const bool calculate_virial,
755 const int *const shift_local, const int *const border_width,
756 const offload_buffer *const pab_blocks, offload_buffer *const hab_blocks,
757 tensor *forces_, tensor *virial_) {
758 tensor *const grid = &ctx->grid[level];
759
760 // Using default(shared) because with GCC 9 the behavior around const changed:
761 // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
762#pragma omp parallel default(shared)
763 {
764 const int num_threads = omp_get_num_threads();
765 const int thread_id = omp_get_thread_num();
766
767 double *hab_block_local = NULL;
768
769 if (num_threads == 1) {
770 hab_block_local = hab_blocks->host_buffer;
771 } else {
772 hab_block_local = ((double *)ctx->scratch) +
773 thread_id * (hab_blocks->size / sizeof(double));
774 memset(hab_block_local, 0, hab_blocks->size);
775 }
776
777 tensor work, pab, hab, vab, forces_local_, virial_local_,
778 forces_local_pair_, virial_local_pair_;
779 memset(&work, 0, sizeof(tensor));
780 memset(&pab, 0, sizeof(tensor));
781 memset(&hab, 0, sizeof(tensor));
782 memset(&vab, 0, sizeof(tensor));
783 memset(&forces_local_, 0, sizeof(tensor));
784 memset(&virial_local_, 0, sizeof(tensor));
785 memset(&forces_local_pair_, 0, sizeof(tensor));
786 memset(&virial_local_pair_, 0, sizeof(tensor));
787
788 struct collocation_integration_ *handler = ctx->handler[thread_id];
789 handler->apply_cutoff = ctx->apply_cutoff;
790 handler->lmax_diff[0] = 0;
791 handler->lmax_diff[1] = 0;
792 handler->lmin_diff[0] = 0;
793 handler->lmin_diff[1] = 0;
794
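// Derivatives (forces, virial) and tau need vab coefficients for higher and
// lower angular momenta, hence the shifts of lmin / lmax below.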
795 if (calculate_tau || calculate_forces || calculate_virial) {
796 handler->lmax_diff[0] = 1;
797 handler->lmax_diff[1] = 0;
798 handler->lmin_diff[0] = -1;
799 handler->lmin_diff[1] = -1;
800 }
801
802 if (calculate_virial) {
803 handler->lmax_diff[0]++;
804 handler->lmax_diff[1]++;
805 }
806
807 if (calculate_tau) {
808 handler->lmax_diff[0]++;
809 handler->lmax_diff[1]++;
810 handler->lmin_diff[0]--;
811 handler->lmin_diff[1]--;
812 }
813
814 // Allocate pab matrix for re-use across tasks.
815 initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
816 alloc_tensor(&pab);
817
818 initialize_tensor_2(&vab, ctx->maxco, ctx->maxco);
819 alloc_tensor(&vab);
820
821 initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
822 alloc_tensor(&work);
823
824 initialize_tensor_2(&hab, ctx->maxco, ctx->maxco);
825 alloc_tensor(&hab);
826
827 if (calculate_forces) {
828 initialize_tensor_2(&forces_local_, forces_->size[0], forces_->size[1]);
829 alloc_tensor(&forces_local_);
830 initialize_tensor_2(&virial_local_, 3, 3);
831 alloc_tensor(&virial_local_);
832 memset(forces_local_.data, 0, sizeof(double) * forces_local_.alloc_size_);
833 memset(virial_local_.data, 0, sizeof(double) * virial_local_.alloc_size_);
834 initialize_tensor_2(&forces_local_pair_, 2, 3);
835 alloc_tensor(&forces_local_pair_);
836 initialize_tensor_3(&virial_local_pair_, 2, 3, 3);
837 alloc_tensor(&virial_local_pair_);
838 }
839
840 initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
841 (const double(*)[3])grid->dh_inv);
842
843 tensor_copy(&handler->grid, grid);
844 handler->grid.data = grid->data;
845 for (int d = 0; d < 3; d++)
846 handler->orthogonal[d] = grid->orthogonal[d];
847
848 /* This is only useful when we split the list over multiple threads. The
849 * first iteration should load the block regardless of the status of the
850 * task->update_block_ variable. */
851 const _task *prev_task = NULL;
852#pragma omp for schedule(static)
853 for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
854 // Define some convenient aliases.
855 const _task *task = &ctx->tasks[level][itask];
856
857 if (task->level != level) {
858 printf("level %d, %d\n", task->level, level);
859 abort();
860 }
861
862 if (task->update_block_ || (prev_task == NULL)) {
863 /* need to load pab if forces are needed */
864 if (calculate_forces) {
865 extract_blocks(ctx, task, pab_blocks, &work, &pab);
866 }
867 /* store the coefficients of the operator after rotation to
868 * the spherical harmonic basis */
869
870 rotate_and_store_coefficients(ctx, prev_task, task, &hab, &work,
871 hab_block_local);
872
873 /* There is a difference here between collocate and integrate.
874 * For collocate, I do not need to know the last task for
875 * which the blocks were updated. For integrate, this
876 * information is crucial for updating the coefficients of
877 * the potential. */
878 prev_task = task;
879 memset(hab.data, 0, sizeof(double) * hab.alloc_size_);
880 }
881
882 /* the grid is distributed over several ranks or is not periodic */
883 if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
884 (handler->grid.size[1] != handler->grid.full_size[1]) ||
885 (handler->grid.size[2] != handler->grid.full_size[2])) {
886 /* unfortunately the window where the Gaussian should be added depends
887 * on the bonds, so I have to adjust the window all the time. */
888
889 setup_grid_window(&handler->grid, shift_local, border_width,
890 task->border_mask);
891 }
892
893 int lmax[2] = {task->lmax[0] + handler->lmax_diff[0],
894 task->lmax[1] + handler->lmax_diff[1]};
895 int lmin[2] = {task->lmin[0] + handler->lmin_diff[0],
896 task->lmin[1] + handler->lmin_diff[1]};
897
898 lmin[0] = imax(lmin[0], 0);
899 lmin[1] = imax(lmin[1], 0);
900
901 initialize_tensor_4(&(handler->alpha), 3, lmax[1] + 1, lmax[0] + 1,
902 lmax[0] + lmax[1] + 1);
903 realloc_tensor(&(handler->alpha));
904
905 const int lp = lmax[0] + lmax[1];
906
907 initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
908 realloc_tensor(&(handler->coef));
909 grid_integrate(handler, ctx->orthorhombic, task->zetp, task->rp,
910 task->radius);
911 /*
912 handler->coef contains coefficients in the (x - x_12) basis. Now
913 we need to rotate them into the (x - x_1) (x - x_2) basis.
914 */
915
916 /* compute the transformation matrix */
917 grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax,
918 &(handler->alpha));
919
920 initialize_tensor_2(&vab, ncoset(lmax[1]), ncoset(lmax[0]));
921 realloc_tensor(&vab);
922
923 grid_compute_vab(lmin, lmax, lp, task->prefactor,
924 &handler->alpha, // contains the matrix for the rotation
925 &handler->coef, // contains the coefficients of
926 // the potential in the
927 // (x - x_12) basis
928 &vab); // contains the coefficients of the potential
929 // in the (x - x_1)(x - x_2) basis
930
931 if (calculate_forces) {
932 memset(forces_local_pair_.data, 0,
933 sizeof(double) * forces_local_pair_.alloc_size_);
934 memset(virial_local_pair_.data, 0,
935 sizeof(double) * virial_local_pair_.alloc_size_);
936 }
937
938 update_hab_forces_and_stress(
939 task, &vab, &pab, calculate_tau, calculate_forces, calculate_virial,
940 &forces_local_pair_, /* matrix
941 * containing the
942 * contribution of
943 * the gaussian
944 * pair for each
945 * atom */
946 &virial_local_pair_, /* same but for the virial term (stress tensor)
947 */
948 &hab);
949
950 if (calculate_forces) {
951 const double scaling = (task->iatom == task->jatom) ? 1.0 : 2.0;
952 idx2(forces_local_, task->iatom, 0) +=
953 scaling * idx2(forces_local_pair_, 0, 0);
954 idx2(forces_local_, task->iatom, 1) +=
955 scaling * idx2(forces_local_pair_, 0, 1);
956 idx2(forces_local_, task->iatom, 2) +=
957 scaling * idx2(forces_local_pair_, 0, 2);
958
959 idx2(forces_local_, task->jatom, 0) +=
960 scaling * idx2(forces_local_pair_, 1, 0);
961 idx2(forces_local_, task->jatom, 1) +=
962 scaling * idx2(forces_local_pair_, 1, 1);
963 idx2(forces_local_, task->jatom, 2) +=
964 scaling * idx2(forces_local_pair_, 1, 2);
965 if (calculate_virial) {
966 for (int i = 0; i < 3; i++) {
967 for (int j = 0; j < 3; j++) {
968 idx2(virial_local_, i, j) +=
969 scaling * (idx3(virial_local_pair_, 0, i, j) +
970 idx3(virial_local_pair_, 1, i, j));
971 }
972 }
973 }
974 }
975 }
976
977 rotate_and_store_coefficients(ctx, prev_task, NULL, &hab, &work,
978 hab_block_local);
979
980 // now reduction over the hab blocks
981 if (num_threads > 1) {
982 // hab_blocks->size does not store the number of elements but the amount
983 // of memory occupied. That's a strange choice.
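// Each thread adds a different chunk of its private buffer in every round
// (block-cyclic over threads), with a barrier between rounds to avoid
// concurrent writes to the same region of the shared buffer.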
984 const int hab_size = hab_blocks->size / sizeof(double);
985 if ((hab_size / num_threads) >= 2) {
986 const int block_size =
987 hab_size / num_threads + (hab_size % num_threads);
988
989 for (int bk = 0; bk < num_threads; bk++) {
990 int bk_id = (bk + thread_id) % num_threads;
991 size_t begin = bk_id * block_size;
992 size_t end = imin((bk_id + 1) * block_size, hab_size);
993 cblas_daxpy(end - begin, 1.0, hab_block_local + begin, 1,
994 hab_blocks->host_buffer + begin, 1);
995#pragma omp barrier
996 }
997 } else {
998 const int hab_size = hab_blocks->size / sizeof(double);
999#pragma omp critical
1000 cblas_daxpy(hab_size, 1.0, hab_block_local, 1, hab_blocks->host_buffer,
1001 1);
1002 }
1003 }
1004
1005 if (calculate_forces) {
1006 if (num_threads > 1) {
1007 if ((forces_->alloc_size_ / num_threads) >= 2) {
1008 const int block_size = forces_->alloc_size_ / num_threads +
1009 (forces_->alloc_size_ % num_threads);
1010
1011 for (int bk = 0; bk < num_threads; bk++) {
1012 int bk_id = (bk + thread_id) % num_threads;
1013 int begin = bk_id * block_size;
1014 int end = imin((bk_id + 1) * block_size, forces_->alloc_size_);
1015 cblas_daxpy(end - begin, 1.0, forces_local_.data + begin, 1,
1016 forces_->data + begin, 1);
1017#pragma omp barrier
1018 }
1019 } else {
1020#pragma omp critical
1021 cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
1022 forces_->data, 1);
1023 }
1024 } else {
1025 // we are running with OMP_NUM_THREADS=1
1026 cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
1027 forces_->data, 1);
1028 }
1029 }
1030
1031 if (calculate_virial) {
1032#pragma omp critical
1033 for (int i = 0; i < 3; i++) {
1034 for (int j = 0; j < 3; j++) {
1035 idx2(virial_[0], i, j) += idx2(virial_local_, i, j);
1036 }
1037 }
1038 }
1039
1040 handler->grid.data = NULL;
1041 free(vab.data);
1042 free(work.data);
1043 free(pab.data);
1044 free(hab.data);
1045
1046 if (calculate_forces) {
1047 free(forces_local_.data);
1048 free(virial_local_.data);
1049 free(virial_local_pair_.data);
1050 free(forces_local_pair_.data);
1051 }
1052 }
1053}
1054
1055/*******************************************************************************
1056 * \brief Integrate all tasks in the given list from the given grids using
1057 * matrix-matrix multiplication.
1058 ******************************************************************************/
1059void grid_dgemm_integrate_task_list(
1060 void *ptr, const bool compute_tau, const int natoms, const int nlevels,
1061 const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels],
1062 offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
1063
1064 grid_context *const ctx = (grid_context *)ptr;
1065
1066 assert(ctx->checksum == ctx_checksum);
1067 assert(ctx->nlevels == nlevels);
1068 assert(ctx->natoms == natoms);
1069
1070 // Zero result arrays.
1071 memset(hab_blocks->host_buffer, 0, hab_blocks->size);
1072
1073 const int max_threads = omp_get_max_threads();
1074
1075 if (ctx->scratch == NULL)
1076 ctx->scratch = malloc(hab_blocks->size * max_threads);
1077
1078 // #pragma omp parallel for
1079 for (int level = 0; level < ctx->nlevels; level++) {
1080 const _layout *layout = &ctx->layouts[level];
1081 set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
1082 layout->npts_global, layout->npts_local,
1083 layout->shift_local, layout->border_width, layout->dh,
1084 layout->dh_inv, grids[level]);
1085 ctx->grid[level].data = grids[level]->host_buffer;
1086 }
1087
1088 bool calculate_virial = (virial != NULL);
1089 bool calculate_forces = (forces != NULL);
1090
1091 tensor forces_, virial_;
1092 if (calculate_forces) {
1093 initialize_tensor_2(&forces_, natoms, 3);
1094 alloc_tensor(&forces_);
1095 initialize_tensor_2(&virial_, 3, 3);
1096 alloc_tensor(&virial_);
1097 memset(forces_.data, 0, sizeof(double) * forces_.alloc_size_);
1098 memset(virial_.data, 0, sizeof(double) * virial_.alloc_size_);
1099 }
1100
1101 for (int level = 0; level < ctx->nlevels; level++) {
1102 const _layout *layout = &ctx->layouts[level];
1103 integrate_one_grid_level_dgemm(ctx, level, compute_tau, calculate_forces,
1104 calculate_virial, layout->shift_local,
1105 layout->border_width, pab_blocks, hab_blocks,
1106 &forces_, &virial_);
1107 ctx->grid[level].data = NULL;
1108 }
1109 if (calculate_forces) {
1110 if (calculate_virial) {
1111 virial[0][0] = idx2(virial_, 0, 0);
1112 virial[0][1] = idx2(virial_, 0, 1);
1113 virial[0][2] = idx2(virial_, 0, 2);
1114 virial[1][0] = idx2(virial_, 1, 0);
1115 virial[1][1] = idx2(virial_, 1, 1);
1116 virial[1][2] = idx2(virial_, 1, 2);
1117 virial[2][0] = idx2(virial_, 2, 0);
1118 virial[2][1] = idx2(virial_, 2, 1);
1119 virial[2][2] = idx2(virial_, 2, 2);
1120 }
1121
1122 memcpy(forces[0], forces_.data, sizeof(double) * forces_.alloc_size_);
1123 free(forces_.data);
1124 free(virial_.data);
1125 }
1126
1127 free(ctx->scratch);
1128 ctx->scratch = NULL;
1129}