16#include "../common/grid_common.h"
27 if (task_a->
level != task_b->level) {
28 return task_a->
level - task_b->level;
29 }
else if (task_a->
block_num != task_b->block_num) {
30 return task_a->
block_num - task_b->block_num;
31 }
else if (task_a->
iset != task_b->iset) {
32 return task_a->
iset - task_b->iset;
34 return task_a->
jset - task_b->jset;
43 const bool orthorhombic,
const int ntasks,
const int nlevels,
44 const int natoms,
const int nkinds,
const int nblocks,
45 const int block_offsets[nblocks],
const double atom_positions[natoms][3],
46 const int atom_kinds[natoms],
const grid_basis_set *basis_sets[nkinds],
47 const int level_list[ntasks],
const int iatom_list[ntasks],
48 const int jatom_list[ntasks],
const int iset_list[ntasks],
49 const int jset_list[ntasks],
const int ipgf_list[ntasks],
50 const int jpgf_list[ntasks],
const int border_mask_list[ntasks],
51 const int block_num_list[ntasks],
const double radius_list[ntasks],
52 const double rab_list[ntasks][3],
const int npts_global[nlevels][3],
53 const int npts_local[nlevels][3],
const int shift_local[nlevels][3],
54 const int border_width[nlevels][3],
const double dh[nlevels][3][3],
57 if (*task_list_out != NULL) {
63 assert(task_list != NULL);
66 task_list->
ntasks = ntasks;
68 task_list->
natoms = natoms;
69 task_list->
nkinds = nkinds;
72 size_t size = nblocks *
sizeof(int);
79 size = 3 * natoms *
sizeof(double);
86 size = natoms *
sizeof(int);
88 assert(task_list->
atom_kinds != NULL || size == 0);
90 memcpy(task_list->
atom_kinds, atom_kinds, size);
95 assert(task_list->
basis_sets != NULL || size == 0);
97 memcpy(task_list->
basis_sets, basis_sets, size);
101 task_list->
tasks = malloc(size);
102 assert(task_list->
tasks != NULL || size == 0);
103#pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
104 for (
int i = 0;
i < ntasks;
i++) {
122 task_list->
layouts = malloc(size);
123 assert(task_list->
layouts != NULL || size == 0);
124 for (
int level = 0; level < nlevels; level++) {
125 for (
int i = 0;
i < 3;
i++) {
130 for (
int j = 0; j < 3; j++) {
131 task_list->
layouts[level].
dh[
i][j] = dh[level][
i][j];
141 size = nlevels * nblocks *
sizeof(int);
146 const int nlevel_blocks = nlevels * nblocks;
147#pragma omp parallel for schedule(static) if (nlevel_blocks > \
148 GRID_OMP_MIN_ITERATIONS)
149 for (
int i = 0;
i < nlevel_blocks;
i++) {
153 for (
int itask = 0; itask < ntasks; itask++) {
154 const int level = task_list->
tasks[itask].
level - 1;
156 if (itask == 0 || task_list->
tasks[itask - 1].
level - 1 != level ||
164 task_list->
maxco = 0;
165 for (
int i = 0;
i < nkinds;
i++) {
170 size = omp_get_max_threads() *
sizeof(
double *);
174 size = omp_get_max_threads() *
sizeof(size_t);
179 *task_list_out = task_list;
191 free(task_list->
tasks);
195 for (
int i = 0;
i < omp_get_max_threads();
i++) {
209void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
210 const int *k,
const double *alpha,
const double *a,
const int *lda,
211 const double *b,
const int *ldb,
const double *beta,
double *c,
218static void dgemm(
const char transa,
const char transb,
const int m,
219 const int n,
const int k,
const double alpha,
const double *a,
220 const int lda,
const double *b,
const int ldb,
221 const double beta,
double *c,
const int ldc) {
222 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
231 const int iset,
const int jset,
const bool transpose,
232 const double *block,
double *pab) {
235 const int ncoseta = ncoset(ibasis->
lmax[iset]);
236 const int ncosetb = ncoset(jbasis->
lmax[jset]);
237 const int ncoa = ibasis->
npgf[iset] * ncoseta;
238 const int ncob = jbasis->
npgf[jset] * ncosetb;
240 const int nsgf_seta = ibasis->
nsgf_set[iset];
241 const int nsgf_setb = jbasis->
nsgf_set[jset];
242 const int nsgfa = ibasis->
nsgf;
243 const int nsgfb = jbasis->
nsgf;
244 const int sgfa = ibasis->
first_sgf[iset] - 1;
245 const int sgfb = jbasis->
first_sgf[jset] - 1;
246 const int maxcoa = ibasis->
maxco;
247 const int maxcob = jbasis->
maxco;
249 double work[nsgf_setb * ncoa];
252 dgemm(
'N',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
253 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->
sphi[sgfa * maxcoa],
254 maxcoa, 0.0, work, ncoa);
257 dgemm(
'T',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
258 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->
sphi[sgfa * maxcoa],
259 maxcoa, 0.0, work, ncoa);
262 dgemm(
'T',
'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->
sphi[sgfb * maxcob],
263 maxcob, work, ncoa, 0.0, pab, ncoa);
272 const int *last_block_task,
const enum grid_func func,
273 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
274 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
279#pragma omp parallel default(shared)
281 const int ithread = omp_get_thread_num();
282 const int nthreads = omp_get_num_threads();
285 int old_offset = -1, old_iset = -1, old_jset = -1;
292 const size_t grid_size = npts_local_total *
sizeof(double);
303 double *
const my_grid = task_list->
threadlocals[ithread];
304 memset(my_grid, 0, grid_size);
307 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
308#pragma omp for schedule(dynamic, chunk_size)
309 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
310 const int first_task = first_block_task[block_num];
311 const int last_task = last_block_task[block_num];
313 for (
int itask = first_task; itask <= last_task; itask++) {
316 const int iatom = task->
iatom - 1;
317 const int jatom = task->
jatom - 1;
318 const int iset = task->
iset - 1;
319 const int jset = task->
jset - 1;
320 const int ipgf = task->
ipgf - 1;
321 const int jpgf = task->
jpgf - 1;
322 const int ikind = task_list->
atom_kinds[iatom] - 1;
323 const int jkind = task_list->
atom_kinds[jatom] - 1;
326 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
327 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
328 const int ncoseta = ncoset(ibasis->
lmax[iset]);
329 const int ncosetb = ncoset(jbasis->
lmax[jset]);
330 const int ncoa = ibasis->
npgf[iset] * ncoseta;
331 const int ncob = jbasis->
npgf[jset] * ncosetb;
332 const int block_num = task->
block_num - 1;
333 const int block_offset = task_list->
block_offsets[block_num];
334 const double *block = &pab_blocks[block_offset];
335 const bool transpose = (iatom <= jatom);
339 if (block_offset != old_offset || iset != old_iset ||
341 old_offset = block_offset;
344 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
357 (iatom == jatom) ? 1 : 2,
371 (
const double(*)[ncoa])pab,
382 const int nreduction_cycles = ceil(log(nthreads) / log(2));
383 for (
int icycle = 1; icycle <= nreduction_cycles; icycle++) {
386 const int group_size = 1 << icycle;
387 const int igroup = ithread / group_size;
388 const int dest_thread = igroup * group_size;
389 const int src_thread = dest_thread + group_size / 2;
391 const int actual_group_size =
imin(group_size, nthreads - dest_thread);
393 const int rank =
modulo(ithread, group_size);
394 const int lb = (npts_local_total * rank) / actual_group_size;
395 const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
396 if (src_thread < nthreads) {
398 for (
int i = lb;
i <
ub;
i++) {
407 const int lb = (npts_local_total * ithread) / nthreads;
408 const int ub = (npts_local_total * (ithread + 1)) / nthreads;
410 for (
int i = lb;
i <
ub;
i++) {
423 const enum grid_func func,
const int nlevels,
427 assert(task_list->
nlevels == nlevels);
429 for (
int level = 0; level < task_list->
nlevels; level++) {
435 task_list, first_block_task, last_block_task, func, layout->
npts_global,
447 const int jset,
const bool transpose,
448 const double *hab,
double *block) {
451 const int ncoseta = ncoset(ibasis->
lmax[iset]);
452 const int ncosetb = ncoset(jbasis->
lmax[jset]);
453 const int ncoa = ibasis->
npgf[iset] * ncoseta;
454 const int ncob = jbasis->
npgf[jset] * ncosetb;
456 const int nsgf_seta = ibasis->
nsgf_set[iset];
457 const int nsgf_setb = jbasis->
nsgf_set[jset];
458 const int nsgfa = ibasis->
nsgf;
459 const int nsgfb = jbasis->
nsgf;
460 const int sgfa = ibasis->
first_sgf[iset] - 1;
461 const int sgfb = jbasis->
first_sgf[jset] - 1;
462 const int maxcoa = ibasis->
maxco;
463 const int maxcob = jbasis->
maxco;
465 double work[nsgf_setb * ncoa];
468 dgemm(
'N',
'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->
sphi[sgfb * maxcob],
469 maxcob, hab, ncoa, 0.0, work, ncoa);
473 dgemm(
'N',
'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
474 &ibasis->
sphi[sgfa * maxcoa], maxcoa, 1.0,
475 &block[sgfb * nsgfa + sgfa], nsgfa);
478 dgemm(
'N',
'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
479 &ibasis->
sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
480 &block[sgfa * nsgfb + sgfb], nsgfb);
490 const int *last_block_task,
const bool compute_tau,
const int natoms,
491 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
492 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
494 offload_buffer *hab_blocks,
double forces[natoms][3],
double virial[3][3]) {
498#pragma omp parallel default(shared)
501 int old_offset = -1, old_iset = -1, old_jset = -1;
503 bool old_transpose =
false;
510 const int nthreads = omp_get_num_threads();
511 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
512#pragma omp for schedule(dynamic, chunk_size)
513 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
514 const int first_task = first_block_task[block_num];
515 const int last_task = last_block_task[block_num];
518 const int iatom = task_list->
tasks[first_task].
iatom - 1;
519 const int jatom = task_list->
tasks[first_task].
jatom - 1;
520 double my_forces[2][3] = {0};
521 double my_virials[2][3][3] = {0};
523 for (
int itask = first_task; itask <= last_task; itask++) {
526 assert(task->
block_num - 1 == block_num);
527 assert(task->
iatom - 1 == iatom && task->
jatom - 1 == jatom);
528 const int ikind = task_list->
atom_kinds[iatom] - 1;
529 const int jkind = task_list->
atom_kinds[jatom] - 1;
532 const int iset = task->
iset - 1;
533 const int jset = task->
jset - 1;
534 const int ipgf = task->
ipgf - 1;
535 const int jpgf = task->
jpgf - 1;
536 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
537 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
538 const int ncoseta = ncoset(ibasis->
lmax[iset]);
539 const int ncosetb = ncoset(jbasis->
lmax[jset]);
540 const int ncoa = ibasis->
npgf[iset] * ncoseta;
541 const int ncob = jbasis->
npgf[jset] * ncosetb;
542 const int block_offset = task_list->
block_offsets[block_num];
543 const bool transpose = (iatom <= jatom);
544 const bool pab_required = (forces != NULL || virial != NULL);
548 if (block_offset != old_offset || iset != old_iset ||
551 load_pab(ibasis, jbasis, iset, jset, transpose,
554 if (old_offset >= 0) {
555 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
558 memset(hab, 0, ncoa * ncob *
sizeof(
double));
559 old_offset = block_offset;
564 old_transpose = transpose;
591 (
double(*)[ncoa])hab,
592 (pab_required) ? (
const double(*)[ncoa])pab : NULL,
593 (forces != NULL) ? my_forces : NULL,
594 (virial != NULL) ? my_virials : NULL,
603 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
604 if (forces != NULL) {
605#pragma omp critical(forces)
606 for (
int i = 0;
i < 3;
i++) {
607 forces[iatom][
i] += scalef * my_forces[0][
i];
608 forces[jatom][
i] += scalef * my_forces[1][
i];
611 if (virial != NULL) {
612#pragma omp critical(virial)
613 for (
int i = 0;
i < 3;
i++) {
614 for (
int j = 0; j < 3; j++) {
615 virial[
i][j] += scalef * my_virials[0][
i][j];
616 virial[
i][j] += scalef * my_virials[1][
i][j];
624 if (old_offset >= 0) {
625 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
639 const int natoms,
const int nlevels,
const offload_buffer *pab_blocks,
641 double forces[natoms][3],
double virial[3][3]) {
643 assert(task_list->
nlevels == nlevels);
644 assert(task_list->
natoms == natoms);
648 if (forces != NULL) {
649 memset(forces, 0, natoms * 3 *
sizeof(
double));
651 if (virial != NULL) {
652 memset(virial, 0, 9 *
sizeof(
double));
655 for (
int level = 0; level < task_list->
nlevels; level++) {
661 task_list, first_block_task, last_block_task, compute_tau, natoms,
664 grids[level], hab_blocks, forces, virial);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
#define GRID_PRAGMA_SIMD_LOOP
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_ref_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Collocates a single product of primitiv Gaussians. See grid_ref_collocate.h for details.
void grid_ref_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_ref_integrate.h for details.
void grid_ref_collocate_task_list(const grid_ref_task_list *task_list, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
static void collocate_one_grid_level(const grid_ref_task_list *task_list, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
void grid_ref_free_task_list(grid_ref_task_list *task_list)
Deallocates given task list, basis_sets have to be freed separately.
void grid_ref_integrate_task_list(const grid_ref_task_list *task_list, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
void grid_ref_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out)
Allocates a task list for the reference backend. See grid_task_list.h for details.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
static void integrate_one_grid_level(const grid_ref_task_list *task_list, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
int * last_level_block_task
grid_ref_layout * layouts
int * first_level_block_task
grid_basis_set ** basis_sets
size_t * threadlocal_sizes
Internal representation of a task.
Internal representation of a buffer.