16#include "../common/grid_common.h"
28 if (task_a->
level != task_b->level) {
29 return task_a->
level - task_b->level;
30 }
else if (task_a->
block_num != task_b->block_num) {
31 return task_a->
block_num - task_b->block_num;
32 }
else if (task_a->
iset != task_b->iset) {
33 return task_a->
iset - task_b->iset;
35 return task_a->
jset - task_b->jset;
44 const bool orthorhombic,
const int ntasks,
const int nlevels,
45 const int natoms,
const int nkinds,
const int nblocks,
46 const int block_offsets[nblocks],
const double atom_positions[natoms][3],
47 const int atom_kinds[natoms],
const grid_basis_set *basis_sets[nkinds],
48 const int level_list[ntasks],
const int iatom_list[ntasks],
49 const int jatom_list[ntasks],
const int iset_list[ntasks],
50 const int jset_list[ntasks],
const int ipgf_list[ntasks],
51 const int jpgf_list[ntasks],
const int border_mask_list[ntasks],
52 const int block_num_list[ntasks],
const double radius_list[ntasks],
53 const double rab_list[ntasks][3],
const int npts_global[nlevels][3],
54 const int npts_local[nlevels][3],
const int shift_local[nlevels][3],
55 const int border_width[nlevels][3],
const double dh[nlevels][3][3],
58 if (*task_list_out != NULL) {
65 assert(task_list != NULL);
68 task_list->
ntasks = ntasks;
70 task_list->
natoms = natoms;
71 task_list->
nkinds = nkinds;
74 size_t size = nblocks *
sizeof(int);
81 size = 3 * natoms *
sizeof(double);
88 size = natoms *
sizeof(int);
90 assert(task_list->
atom_kinds != NULL || size == 0);
92 memcpy(task_list->
atom_kinds, atom_kinds, size);
97 assert(task_list->
basis_sets != NULL || size == 0);
99 memcpy(task_list->
basis_sets, basis_sets, size);
103 task_list->
tasks = malloc(size);
104 assert(task_list->
tasks != NULL || size == 0);
105#pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
106 for (
int i = 0;
i < ntasks;
i++) {
124 task_list->
layouts = malloc(size);
125 assert(task_list->
layouts != NULL || size == 0);
126 for (
int level = 0; level < nlevels; level++) {
127 for (
int i = 0;
i < 3;
i++) {
132 for (
int j = 0; j < 3; j++) {
133 task_list->
layouts[level].
dh[
i][j] = dh[level][
i][j];
143 size = nlevels * nblocks *
sizeof(int);
148 const int nlevel_blocks = nlevels * nblocks;
149#pragma omp parallel for schedule(static) if (nlevel_blocks > \
150 GRID_OMP_MIN_ITERATIONS)
151 for (
int i = 0;
i < nlevel_blocks;
i++) {
155 for (
int itask = 0; itask < ntasks; itask++) {
156 const int level = task_list->
tasks[itask].
level - 1;
158 if (itask == 0 || task_list->
tasks[itask - 1].
level - 1 != level ||
166 task_list->
maxco = 0;
167 for (
int i = 0;
i < nkinds;
i++) {
172 size = omp_get_max_threads() *
sizeof(
double *);
176 size = omp_get_max_threads() *
sizeof(size_t);
181 *task_list_out = task_list;
198 free(task_list->
tasks);
202 for (
int i = 0;
i < omp_get_max_threads();
i++) {
216void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
217 const int *k,
const double *alpha,
const double *a,
const int *lda,
218 const double *b,
const int *ldb,
const double *beta,
double *c,
225static void dgemm(
const char transa,
const char transb,
const int m,
226 const int n,
const int k,
const double alpha,
const double *a,
227 const int lda,
const double *b,
const int ldb,
228 const double beta,
double *c,
const int ldc) {
229 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
238 const int iset,
const int jset,
const bool transpose,
239 const double *block,
double *pab) {
242 const int ncoseta = ncoset(ibasis->
lmax[iset]);
243 const int ncosetb = ncoset(jbasis->
lmax[jset]);
244 const int ncoa = ibasis->
npgf[iset] * ncoseta;
245 const int ncob = jbasis->
npgf[jset] * ncosetb;
247 const int nsgf_seta = ibasis->
nsgf_set[iset];
248 const int nsgf_setb = jbasis->
nsgf_set[jset];
249 const int nsgfa = ibasis->
nsgf;
250 const int nsgfb = jbasis->
nsgf;
251 const int sgfa = ibasis->
first_sgf[iset] - 1;
252 const int sgfb = jbasis->
first_sgf[jset] - 1;
253 const int maxcoa = ibasis->
maxco;
254 const int maxcob = jbasis->
maxco;
256 double work[nsgf_setb * ncoa];
259 dgemm(
'N',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
260 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->
sphi[sgfa * maxcoa],
261 maxcoa, 0.0, work, ncoa);
264 dgemm(
'T',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
265 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->
sphi[sgfa * maxcoa],
266 maxcoa, 0.0, work, ncoa);
269 dgemm(
'T',
'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->
sphi[sgfb * maxcob],
270 maxcob, work, ncoa, 0.0, pab, ncoa);
279 const int *last_block_task,
const enum grid_func func,
280 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
281 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
291#pragma omp parallel default(shared)
293 const int ithread = omp_get_thread_num();
294 const int nthreads = omp_get_num_threads();
297 int old_offset = -1, old_iset = -1, old_jset = -1;
304 const size_t grid_size = npts_local_total *
sizeof(double);
315 double *
const my_grid = task_list->
threadlocals[ithread];
316 memset(my_grid, 0, grid_size);
319 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
320#pragma omp for schedule(dynamic, chunk_size)
321 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
322 const int first_task = first_block_task[block_num];
323 const int last_task = last_block_task[block_num];
325 for (
int itask = first_task; itask <= last_task; itask++) {
328 const int iatom = task->
iatom - 1;
329 const int jatom = task->
jatom - 1;
330 const int iset = task->
iset - 1;
331 const int jset = task->
jset - 1;
332 const int ipgf = task->
ipgf - 1;
333 const int jpgf = task->
jpgf - 1;
334 const int ikind = task_list->
atom_kinds[iatom] - 1;
335 const int jkind = task_list->
atom_kinds[jatom] - 1;
338 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
339 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
340 const int ncoseta = ncoset(ibasis->
lmax[iset]);
341 const int ncosetb = ncoset(jbasis->
lmax[jset]);
342 const int ncoa = ibasis->
npgf[iset] * ncoseta;
343 const int ncob = jbasis->
npgf[jset] * ncosetb;
344 const int block_num = task->
block_num - 1;
345 const int block_offset = task_list->
block_offsets[block_num];
346 const double *block = &pab_blocks[block_offset];
347 const bool transpose = (iatom <= jatom);
351 if (block_offset != old_offset || iset != old_iset ||
353 old_offset = block_offset;
356 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
369 (iatom == jatom) ? 1 : 2,
383 (
const double(*)[ncoa])pab,
394 const int nreduction_cycles = ceil(log(nthreads) / log(2));
395 for (
int icycle = 1; icycle <= nreduction_cycles; icycle++) {
398 const int group_size = 1 << icycle;
399 const int igroup = ithread / group_size;
400 const int dest_thread = igroup * group_size;
401 const int src_thread = dest_thread + group_size / 2;
403 const int actual_group_size =
imin(group_size, nthreads - dest_thread);
405 const int rank =
modulo(ithread, group_size);
406 const int lb = (npts_local_total * rank) / actual_group_size;
407 const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
408 if (src_thread < nthreads) {
410 for (
int i = lb;
i <
ub;
i++) {
419 const int lb = (npts_local_total * ithread) / nthreads;
420 const int ub = (npts_local_total * (ithread + 1)) / nthreads;
422 for (
int i = lb;
i <
ub;
i++) {
435 const enum grid_func func,
const int nlevels,
444 assert(task_list->
nlevels == nlevels);
446 for (
int level = 0; level < task_list->
nlevels; level++) {
452 task_list, first_block_task, last_block_task, func, layout->
npts_global,
464 const int jset,
const bool transpose,
465 const double *hab,
double *block) {
468 const int ncoseta = ncoset(ibasis->
lmax[iset]);
469 const int ncosetb = ncoset(jbasis->
lmax[jset]);
470 const int ncoa = ibasis->
npgf[iset] * ncoseta;
471 const int ncob = jbasis->
npgf[jset] * ncosetb;
473 const int nsgf_seta = ibasis->
nsgf_set[iset];
474 const int nsgf_setb = jbasis->
nsgf_set[jset];
475 const int nsgfa = ibasis->
nsgf;
476 const int nsgfb = jbasis->
nsgf;
477 const int sgfa = ibasis->
first_sgf[iset] - 1;
478 const int sgfb = jbasis->
first_sgf[jset] - 1;
479 const int maxcoa = ibasis->
maxco;
480 const int maxcob = jbasis->
maxco;
482 double work[nsgf_setb * ncoa];
485 dgemm(
'N',
'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->
sphi[sgfb * maxcob],
486 maxcob, hab, ncoa, 0.0, work, ncoa);
490 dgemm(
'N',
'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
491 &ibasis->
sphi[sgfa * maxcoa], maxcoa, 1.0,
492 &block[sgfb * nsgfa + sgfa], nsgfa);
495 dgemm(
'N',
'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
496 &ibasis->
sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
497 &block[sgfa * nsgfb + sgfb], nsgfb);
507 const int *last_block_task,
const bool compute_tau,
const int natoms,
508 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
509 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
511 offload_buffer *hab_blocks,
double forces[natoms][3],
double virial[3][3]) {
520#pragma omp parallel default(shared)
523 int old_offset = -1, old_iset = -1, old_jset = -1;
525 bool old_transpose =
false;
532 const int nthreads = omp_get_num_threads();
533 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
534#pragma omp for schedule(dynamic, chunk_size)
535 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
536 const int first_task = first_block_task[block_num];
537 const int last_task = last_block_task[block_num];
540 const int iatom = task_list->
tasks[first_task].
iatom - 1;
541 const int jatom = task_list->
tasks[first_task].
jatom - 1;
542 double my_forces[2][3] = {0};
543 double my_virials[2][3][3] = {0};
545 for (
int itask = first_task; itask <= last_task; itask++) {
548 assert(task->
block_num - 1 == block_num);
549 assert(task->
iatom - 1 == iatom && task->
jatom - 1 == jatom);
550 const int ikind = task_list->
atom_kinds[iatom] - 1;
551 const int jkind = task_list->
atom_kinds[jatom] - 1;
554 const int iset = task->
iset - 1;
555 const int jset = task->
jset - 1;
556 const int ipgf = task->
ipgf - 1;
557 const int jpgf = task->
jpgf - 1;
558 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
559 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
560 const int ncoseta = ncoset(ibasis->
lmax[iset]);
561 const int ncosetb = ncoset(jbasis->
lmax[jset]);
562 const int ncoa = ibasis->
npgf[iset] * ncoseta;
563 const int ncob = jbasis->
npgf[jset] * ncosetb;
564 const int block_offset = task_list->
block_offsets[block_num];
565 const bool transpose = (iatom <= jatom);
566 const bool pab_required = (forces != NULL || virial != NULL);
570 if (block_offset != old_offset || iset != old_iset ||
573 load_pab(ibasis, jbasis, iset, jset, transpose,
576 if (old_offset >= 0) {
577 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
580 memset(hab, 0, ncoa * ncob *
sizeof(
double));
581 old_offset = block_offset;
586 old_transpose = transpose;
613 (
double(*)[ncoa])hab,
614 (pab_required) ? (
const double(*)[ncoa])pab : NULL,
615 (forces != NULL) ? my_forces : NULL,
616 (virial != NULL) ? my_virials : NULL,
625 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
626 if (forces != NULL) {
627#pragma omp critical(forces)
628 for (
int i = 0;
i < 3;
i++) {
629 forces[iatom][
i] += scalef * my_forces[0][
i];
630 forces[jatom][
i] += scalef * my_forces[1][
i];
633 if (virial != NULL) {
634#pragma omp critical(virial)
635 for (
int i = 0;
i < 3;
i++) {
636 for (
int j = 0; j < 3; j++) {
637 virial[
i][j] += scalef * my_virials[0][
i][j];
638 virial[
i][j] += scalef * my_virials[1][
i][j];
646 if (old_offset >= 0) {
647 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
663 double forces[natoms][3],
double virial[3][3]) {
668 assert(task_list->
nlevels == nlevels);
669 assert(task_list->
natoms == natoms);
673 if (forces != NULL) {
674 memset(forces, 0, natoms * 3 *
sizeof(
double));
676 if (virial != NULL) {
677 memset(virial, 0, 9 *
sizeof(
double));
680 for (
int level = 0; level < task_list->
nlevels; level++) {
686 task_list, first_block_task, last_block_task, compute_tau, natoms,
689 grids[level], hab_blocks, forces, virial);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
#define GRID_PRAGMA_SIMD_LOOP
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_ref_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Collocates a single product of primitiv Gaussians. See grid_ref_collocate.h for details.
void grid_ref_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_ref_integrate.h for details.
void grid_ref_free_task_list(grid_ref_task_list *ptr)
Deallocates given task list, basis_sets have to be freed separately.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
static void integrate_one_grid_level(const grid_ref_task_list *ptr, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
void grid_ref_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out)
Allocates a task list for the reference backend. See grid_task_list.h for details.
void grid_ref_integrate_task_list(const grid_ref_task_list *ptr, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
void grid_ref_collocate_task_list(const grid_ref_task_list *ptr, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
static void collocate_one_grid_level(const grid_ref_task_list *ptr, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
grid_basis_set ** basis_sets
int * last_level_block_task
int * first_level_block_task
grid_ref_layout * layouts
size_t * threadlocal_sizes
Internal representation of a task.
Internal representation of a buffer.