16#include "../common/grid_common.h"
27 if (task_a->
level != task_b->level) {
28 return task_a->
level - task_b->level;
29 }
else if (task_a->
block_num != task_b->block_num) {
30 return task_a->
block_num - task_b->block_num;
31 }
else if (task_a->
iset != task_b->iset) {
32 return task_a->
iset - task_b->iset;
34 return task_a->
jset - task_b->jset;
43 const bool orthorhombic,
const int ntasks,
const int nlevels,
44 const int natoms,
const int nkinds,
const int nblocks,
45 const int block_offsets[nblocks],
const double atom_positions[natoms][3],
46 const int atom_kinds[natoms],
const grid_basis_set *basis_sets[nkinds],
47 const int level_list[ntasks],
const int iatom_list[ntasks],
48 const int jatom_list[ntasks],
const int iset_list[ntasks],
49 const int jset_list[ntasks],
const int ipgf_list[ntasks],
50 const int jpgf_list[ntasks],
const int border_mask_list[ntasks],
51 const int block_num_list[ntasks],
const double radius_list[ntasks],
52 const double rab_list[ntasks][3],
const int npts_global[nlevels][3],
53 const int npts_local[nlevels][3],
const int shift_local[nlevels][3],
54 const int border_width[nlevels][3],
const double dh[nlevels][3][3],
57 if (*task_list_out != NULL) {
63 assert(task_list != NULL);
66 task_list->
ntasks = ntasks;
68 task_list->
natoms = natoms;
69 task_list->
nkinds = nkinds;
72 size_t size = nblocks *
sizeof(int);
79 size = 3 * natoms *
sizeof(double);
86 size = natoms *
sizeof(int);
88 assert(task_list->
atom_kinds != NULL || size == 0);
90 memcpy(task_list->
atom_kinds, atom_kinds, size);
95 assert(task_list->
basis_sets != NULL || size == 0);
97 memcpy(task_list->
basis_sets, basis_sets, size);
101 task_list->
tasks = malloc(size);
102 assert(task_list->
tasks != NULL || size == 0);
103 for (
int i = 0;
i < ntasks;
i++) {
121 task_list->
layouts = malloc(size);
122 assert(task_list->
layouts != NULL || size == 0);
123 for (
int level = 0; level < nlevels; level++) {
124 for (
int i = 0;
i < 3;
i++) {
129 for (
int j = 0; j < 3; j++) {
130 task_list->
layouts[level].
dh[
i][j] = dh[level][
i][j];
140 size = nlevels * nblocks *
sizeof(int);
145 for (
int i = 0;
i < nlevels * nblocks;
i++) {
149 for (
int itask = 0; itask < ntasks; itask++) {
150 const int level = task_list->
tasks[itask].
level - 1;
152 if (itask == 0 || task_list->
tasks[itask - 1].
level - 1 != level ||
160 task_list->
maxco = 0;
161 for (
int i = 0;
i < nkinds;
i++) {
166 size = omp_get_max_threads() *
sizeof(
double *);
170 size = omp_get_max_threads() *
sizeof(size_t);
175 *task_list_out = task_list;
187 free(task_list->
tasks);
191 for (
int i = 0;
i < omp_get_max_threads();
i++) {
205void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
206 const int *k,
const double *alpha,
const double *a,
const int *lda,
207 const double *b,
const int *ldb,
const double *beta,
double *c,
214static void dgemm(
const char transa,
const char transb,
const int m,
215 const int n,
const int k,
const double alpha,
const double *a,
216 const int lda,
const double *b,
const int ldb,
217 const double beta,
double *c,
const int ldc) {
218 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
227 const int iset,
const int jset,
const bool transpose,
228 const double *block,
double *pab) {
231 const int ncoseta = ncoset(ibasis->
lmax[iset]);
232 const int ncosetb = ncoset(jbasis->
lmax[jset]);
233 const int ncoa = ibasis->
npgf[iset] * ncoseta;
234 const int ncob = jbasis->
npgf[jset] * ncosetb;
236 const int nsgf_seta = ibasis->
nsgf_set[iset];
237 const int nsgf_setb = jbasis->
nsgf_set[jset];
238 const int nsgfa = ibasis->
nsgf;
239 const int nsgfb = jbasis->
nsgf;
240 const int sgfa = ibasis->
first_sgf[iset] - 1;
241 const int sgfb = jbasis->
first_sgf[jset] - 1;
242 const int maxcoa = ibasis->
maxco;
243 const int maxcob = jbasis->
maxco;
245 double work[nsgf_setb * ncoa];
248 dgemm(
'N',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
249 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->
sphi[sgfa * maxcoa],
250 maxcoa, 0.0, work, ncoa);
253 dgemm(
'T',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
254 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->
sphi[sgfa * maxcoa],
255 maxcoa, 0.0, work, ncoa);
258 dgemm(
'T',
'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->
sphi[sgfb * maxcob],
259 maxcob, work, ncoa, 0.0, pab, ncoa);
268 const int *last_block_task,
const enum grid_func func,
269 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
270 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
275#pragma omp parallel default(shared)
277 const int ithread = omp_get_thread_num();
278 const int nthreads = omp_get_num_threads();
281 int old_offset = -1, old_iset = -1, old_jset = -1;
288 const size_t grid_size = npts_local_total *
sizeof(double);
299 double *
const my_grid = task_list->
threadlocals[ithread];
300 memset(my_grid, 0, grid_size);
303 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
304#pragma omp for schedule(dynamic, chunk_size)
305 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
306 const int first_task = first_block_task[block_num];
307 const int last_task = last_block_task[block_num];
309 for (
int itask = first_task; itask <= last_task; itask++) {
312 const int iatom = task->
iatom - 1;
313 const int jatom = task->
jatom - 1;
314 const int iset = task->
iset - 1;
315 const int jset = task->
jset - 1;
316 const int ipgf = task->
ipgf - 1;
317 const int jpgf = task->
jpgf - 1;
318 const int ikind = task_list->
atom_kinds[iatom] - 1;
319 const int jkind = task_list->
atom_kinds[jatom] - 1;
322 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
323 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
324 const int ncoseta = ncoset(ibasis->
lmax[iset]);
325 const int ncosetb = ncoset(jbasis->
lmax[jset]);
326 const int ncoa = ibasis->
npgf[iset] * ncoseta;
327 const int ncob = jbasis->
npgf[jset] * ncosetb;
328 const int block_num = task->
block_num - 1;
329 const int block_offset = task_list->
block_offsets[block_num];
330 const double *block = &pab_blocks[block_offset];
331 const bool transpose = (iatom <= jatom);
335 if (block_offset != old_offset || iset != old_iset ||
337 old_offset = block_offset;
340 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
353 (iatom == jatom) ? 1 : 2,
367 (
const double(*)[ncoa])pab,
378 const int nreduction_cycles = ceil(log(nthreads) / log(2));
379 for (
int icycle = 1; icycle <= nreduction_cycles; icycle++) {
382 const int group_size = 1 << icycle;
383 const int igroup = ithread / group_size;
384 const int dest_thread = igroup * group_size;
385 const int src_thread = dest_thread + group_size / 2;
387 const int actual_group_size =
imin(group_size, nthreads - dest_thread);
389 const int rank =
modulo(ithread, group_size);
390 const int lb = (npts_local_total * rank) / actual_group_size;
391 const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
392 if (src_thread < nthreads) {
393 for (
int i = lb;
i <
ub;
i++) {
402 const int lb = (npts_local_total * ithread) / nthreads;
403 const int ub = (npts_local_total * (ithread + 1)) / nthreads;
404 for (
int i = lb;
i <
ub;
i++) {
417 const enum grid_func func,
const int nlevels,
421 assert(task_list->
nlevels == nlevels);
423 for (
int level = 0; level < task_list->
nlevels; level++) {
429 task_list, first_block_task, last_block_task, func, layout->
npts_global,
441 const int jset,
const bool transpose,
442 const double *hab,
double *block) {
445 const int ncoseta = ncoset(ibasis->
lmax[iset]);
446 const int ncosetb = ncoset(jbasis->
lmax[jset]);
447 const int ncoa = ibasis->
npgf[iset] * ncoseta;
448 const int ncob = jbasis->
npgf[jset] * ncosetb;
450 const int nsgf_seta = ibasis->
nsgf_set[iset];
451 const int nsgf_setb = jbasis->
nsgf_set[jset];
452 const int nsgfa = ibasis->
nsgf;
453 const int nsgfb = jbasis->
nsgf;
454 const int sgfa = ibasis->
first_sgf[iset] - 1;
455 const int sgfb = jbasis->
first_sgf[jset] - 1;
456 const int maxcoa = ibasis->
maxco;
457 const int maxcob = jbasis->
maxco;
459 double work[nsgf_setb * ncoa];
462 dgemm(
'N',
'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->
sphi[sgfb * maxcob],
463 maxcob, hab, ncoa, 0.0, work, ncoa);
467 dgemm(
'N',
'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
468 &ibasis->
sphi[sgfa * maxcoa], maxcoa, 1.0,
469 &block[sgfb * nsgfa + sgfa], nsgfa);
472 dgemm(
'N',
'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
473 &ibasis->
sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
474 &block[sgfa * nsgfb + sgfb], nsgfb);
484 const int *last_block_task,
const bool compute_tau,
const int natoms,
485 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
486 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
488 offload_buffer *hab_blocks,
double forces[natoms][3],
double virial[3][3]) {
492#pragma omp parallel default(shared)
495 int old_offset = -1, old_iset = -1, old_jset = -1;
497 bool old_transpose =
false;
504 const int nthreads = omp_get_num_threads();
505 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
506#pragma omp for schedule(dynamic, chunk_size)
507 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
508 const int first_task = first_block_task[block_num];
509 const int last_task = last_block_task[block_num];
512 const int iatom = task_list->
tasks[first_task].
iatom - 1;
513 const int jatom = task_list->
tasks[first_task].
jatom - 1;
514 double my_forces[2][3] = {0};
515 double my_virials[2][3][3] = {0};
517 for (
int itask = first_task; itask <= last_task; itask++) {
520 assert(task->
block_num - 1 == block_num);
521 assert(task->
iatom - 1 == iatom && task->
jatom - 1 == jatom);
522 const int ikind = task_list->
atom_kinds[iatom] - 1;
523 const int jkind = task_list->
atom_kinds[jatom] - 1;
526 const int iset = task->
iset - 1;
527 const int jset = task->
jset - 1;
528 const int ipgf = task->
ipgf - 1;
529 const int jpgf = task->
jpgf - 1;
530 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
531 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
532 const int ncoseta = ncoset(ibasis->
lmax[iset]);
533 const int ncosetb = ncoset(jbasis->
lmax[jset]);
534 const int ncoa = ibasis->
npgf[iset] * ncoseta;
535 const int ncob = jbasis->
npgf[jset] * ncosetb;
536 const int block_offset = task_list->
block_offsets[block_num];
537 const bool transpose = (iatom <= jatom);
538 const bool pab_required = (forces != NULL || virial != NULL);
542 if (block_offset != old_offset || iset != old_iset ||
545 load_pab(ibasis, jbasis, iset, jset, transpose,
548 if (old_offset >= 0) {
549 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
552 memset(hab, 0, ncoa * ncob *
sizeof(
double));
553 old_offset = block_offset;
558 old_transpose = transpose;
585 (
double(*)[ncoa])hab,
586 (pab_required) ? (
const double(*)[ncoa])pab : NULL,
587 (forces != NULL) ? my_forces : NULL,
588 (virial != NULL) ? my_virials : NULL,
597 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
598 if (forces != NULL) {
599#pragma omp critical(forces)
600 for (
int i = 0;
i < 3;
i++) {
601 forces[iatom][
i] += scalef * my_forces[0][
i];
602 forces[jatom][
i] += scalef * my_forces[1][
i];
605 if (virial != NULL) {
606#pragma omp critical(virial)
607 for (
int i = 0;
i < 3;
i++) {
608 for (
int j = 0; j < 3; j++) {
609 virial[
i][j] += scalef * my_virials[0][
i][j];
610 virial[
i][j] += scalef * my_virials[1][
i][j];
618 if (old_offset >= 0) {
619 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
633 const int natoms,
const int nlevels,
const offload_buffer *pab_blocks,
635 double forces[natoms][3],
double virial[3][3]) {
637 assert(task_list->
nlevels == nlevels);
638 assert(task_list->
natoms == natoms);
642 if (forces != NULL) {
643 memset(forces, 0, natoms * 3 *
sizeof(
double));
645 if (virial != NULL) {
646 memset(virial, 0, 9 *
sizeof(
double));
649 for (
int level = 0; level < task_list->
nlevels; level++) {
655 task_list, first_block_task, last_block_task, compute_tau, natoms,
658 grids[level], hab_blocks, forces, virial);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_ref_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Collocates a single product of primitiv Gaussians. See grid_ref_collocate.h for details.
void grid_ref_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_ref_integrate.h for details.
void grid_ref_collocate_task_list(const grid_ref_task_list *task_list, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
static void collocate_one_grid_level(const grid_ref_task_list *task_list, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
void grid_ref_free_task_list(grid_ref_task_list *task_list)
Deallocates given task list, basis_sets have to be freed separately.
void grid_ref_integrate_task_list(const grid_ref_task_list *task_list, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
void grid_ref_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out)
Allocates a task list for the reference backend. See grid_task_list.h for details.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
static void integrate_one_grid_level(const grid_ref_task_list *task_list, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
int * last_level_block_task
grid_ref_layout * layouts
int * first_level_block_task
grid_basis_set ** basis_sets
size_t * threadlocal_sizes
Internal representation of a task.
Internal representation of a buffer.