16#include "../common/grid_common.h"
27 if (task_a->
level != task_b->level) {
28 return task_a->
level - task_b->level;
29 }
else if (task_a->
block_num != task_b->block_num) {
30 return task_a->
block_num - task_b->block_num;
31 }
else if (task_a->
iset != task_b->iset) {
32 return task_a->
iset - task_b->iset;
34 return task_a->
jset - task_b->jset;
43 const bool orthorhombic,
const int ntasks,
const int nlevels,
44 const int natoms,
const int nkinds,
const int nblocks,
45 const int block_offsets[nblocks],
const double atom_positions[natoms][3],
46 const int atom_kinds[natoms],
const grid_basis_set *basis_sets[nkinds],
47 const int level_list[ntasks],
const int iatom_list[ntasks],
48 const int jatom_list[ntasks],
const int iset_list[ntasks],
49 const int jset_list[ntasks],
const int ipgf_list[ntasks],
50 const int jpgf_list[ntasks],
const int border_mask_list[ntasks],
51 const int block_num_list[ntasks],
const double radius_list[ntasks],
52 const double rab_list[ntasks][3],
const int npts_global[nlevels][3],
53 const int npts_local[nlevels][3],
const int shift_local[nlevels][3],
54 const int border_width[nlevels][3],
const double dh[nlevels][3][3],
57 if (*task_list_out != NULL) {
63 assert(task_list != NULL);
66 task_list->
ntasks = ntasks;
68 task_list->
natoms = natoms;
69 task_list->
nkinds = nkinds;
72 size_t size = nblocks *
sizeof(int);
77 size = 3 * natoms *
sizeof(double);
82 size = natoms *
sizeof(int);
85 memcpy(task_list->
atom_kinds, atom_kinds, size);
90 memcpy(task_list->
basis_sets, basis_sets, size);
93 task_list->
tasks = malloc(size);
94 assert(task_list->
tasks != NULL);
95 for (
int i = 0;
i < ntasks;
i++) {
113 task_list->
layouts = malloc(size);
114 assert(task_list->
layouts != NULL);
115 for (
int level = 0; level < nlevels; level++) {
116 for (
int i = 0;
i < 3;
i++) {
121 for (
int j = 0; j < 3; j++) {
122 task_list->
layouts[level].
dh[
i][j] = dh[level][
i][j];
132 size = nlevels * nblocks *
sizeof(int);
137 for (
int i = 0;
i < nlevels * nblocks;
i++) {
141 for (
int itask = 0; itask < ntasks; itask++) {
142 const int level = task_list->
tasks[itask].
level - 1;
144 if (itask == 0 || task_list->
tasks[itask - 1].
level - 1 != level ||
152 task_list->
maxco = 0;
153 for (
int i = 0;
i < nkinds;
i++) {
158 size = omp_get_max_threads() *
sizeof(
double *);
162 size = omp_get_max_threads() *
sizeof(size_t);
167 *task_list_out = task_list;
179 free(task_list->
tasks);
183 for (
int i = 0;
i < omp_get_max_threads();
i++) {
197void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
198 const int *k,
const double *alpha,
const double *a,
const int *lda,
199 const double *b,
const int *ldb,
const double *beta,
double *c,
206static void dgemm(
const char transa,
const char transb,
const int m,
207 const int n,
const int k,
const double alpha,
const double *a,
208 const int lda,
const double *b,
const int ldb,
209 const double beta,
double *c,
const int ldc) {
210 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
219 const int iset,
const int jset,
const bool transpose,
220 const double *block,
double *pab) {
223 const int ncoseta = ncoset(ibasis->
lmax[iset]);
224 const int ncosetb = ncoset(jbasis->
lmax[jset]);
225 const int ncoa = ibasis->
npgf[iset] * ncoseta;
226 const int ncob = jbasis->
npgf[jset] * ncosetb;
228 const int nsgf_seta = ibasis->
nsgf_set[iset];
229 const int nsgf_setb = jbasis->
nsgf_set[jset];
230 const int nsgfa = ibasis->
nsgf;
231 const int nsgfb = jbasis->
nsgf;
232 const int sgfa = ibasis->
first_sgf[iset] - 1;
233 const int sgfb = jbasis->
first_sgf[jset] - 1;
234 const int maxcoa = ibasis->
maxco;
235 const int maxcob = jbasis->
maxco;
237 double work[nsgf_setb * ncoa];
240 dgemm(
'N',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
241 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->
sphi[sgfa * maxcoa],
242 maxcoa, 0.0, work, ncoa);
245 dgemm(
'T',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
246 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->
sphi[sgfa * maxcoa],
247 maxcoa, 0.0, work, ncoa);
250 dgemm(
'T',
'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->
sphi[sgfb * maxcob],
251 maxcob, work, ncoa, 0.0, pab, ncoa);
260 const int *last_block_task,
const enum grid_func func,
261 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
262 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
267#pragma omp parallel default(shared)
269 const int ithread = omp_get_thread_num();
270 const int nthreads = omp_get_num_threads();
273 int old_offset = -1, old_iset = -1, old_jset = -1;
276 double pab[task_list->
maxco * task_list->
maxco];
280 const size_t grid_size = npts_local_total *
sizeof(double);
291 double *
const my_grid = task_list->
threadlocals[ithread];
292 memset(my_grid, 0, grid_size);
295 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
296#pragma omp for schedule(dynamic, chunk_size)
297 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
298 const int first_task = first_block_task[block_num];
299 const int last_task = last_block_task[block_num];
301 for (
int itask = first_task; itask <= last_task; itask++) {
304 const int iatom = task->
iatom - 1;
305 const int jatom = task->
jatom - 1;
306 const int iset = task->
iset - 1;
307 const int jset = task->
jset - 1;
308 const int ipgf = task->
ipgf - 1;
309 const int jpgf = task->
jpgf - 1;
310 const int ikind = task_list->
atom_kinds[iatom] - 1;
311 const int jkind = task_list->
atom_kinds[jatom] - 1;
314 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
315 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
316 const int ncoseta = ncoset(ibasis->
lmax[iset]);
317 const int ncosetb = ncoset(jbasis->
lmax[jset]);
318 const int ncoa = ibasis->
npgf[iset] * ncoseta;
319 const int ncob = jbasis->
npgf[jset] * ncosetb;
320 const int block_num = task->
block_num - 1;
321 const int block_offset = task_list->
block_offsets[block_num];
322 const double *block = &pab_blocks[block_offset];
323 const bool transpose = (iatom <= jatom);
327 if (block_offset != old_offset || iset != old_iset ||
329 old_offset = block_offset;
332 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
345 (iatom == jatom) ? 1 : 2,
359 (
const double(*)[ncoa])pab,
370 const int nreduction_cycles = ceil(log(nthreads) / log(2));
371 for (
int icycle = 1; icycle <= nreduction_cycles; icycle++) {
374 const int group_size = 1 << icycle;
375 const int igroup = ithread / group_size;
376 const int dest_thread = igroup * group_size;
377 const int src_thread = dest_thread + group_size / 2;
379 const int actual_group_size =
imin(group_size, nthreads - dest_thread);
381 const int rank =
modulo(ithread, group_size);
382 const int lb = (npts_local_total * rank) / actual_group_size;
383 const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
384 if (src_thread < nthreads) {
385 for (
int i = lb;
i <
ub;
i++) {
394 const int lb = (npts_local_total * ithread) / nthreads;
395 const int ub = (npts_local_total * (ithread + 1)) / nthreads;
396 for (
int i = lb;
i <
ub;
i++) {
409 const enum grid_func func,
const int nlevels,
413 assert(task_list->
nlevels == nlevels);
415 for (
int level = 0; level < task_list->
nlevels; level++) {
421 task_list, first_block_task, last_block_task, func, layout->
npts_global,
433 const int jset,
const bool transpose,
434 const double *hab,
double *block) {
437 const int ncoseta = ncoset(ibasis->
lmax[iset]);
438 const int ncosetb = ncoset(jbasis->
lmax[jset]);
439 const int ncoa = ibasis->
npgf[iset] * ncoseta;
440 const int ncob = jbasis->
npgf[jset] * ncosetb;
442 const int nsgf_seta = ibasis->
nsgf_set[iset];
443 const int nsgf_setb = jbasis->
nsgf_set[jset];
444 const int nsgfa = ibasis->
nsgf;
445 const int nsgfb = jbasis->
nsgf;
446 const int sgfa = ibasis->
first_sgf[iset] - 1;
447 const int sgfb = jbasis->
first_sgf[jset] - 1;
448 const int maxcoa = ibasis->
maxco;
449 const int maxcob = jbasis->
maxco;
451 double work[nsgf_setb * ncoa];
454 dgemm(
'N',
'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->
sphi[sgfb * maxcob],
455 maxcob, hab, ncoa, 0.0, work, ncoa);
459 dgemm(
'N',
'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
460 &ibasis->
sphi[sgfa * maxcoa], maxcoa, 1.0,
461 &block[sgfb * nsgfa + sgfa], nsgfa);
464 dgemm(
'N',
'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
465 &ibasis->
sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
466 &block[sgfa * nsgfb + sgfb], nsgfb);
476 const int *last_block_task,
const bool compute_tau,
const int natoms,
477 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
478 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
480 offload_buffer *hab_blocks,
double forces[natoms][3],
double virial[3][3]) {
484#pragma omp parallel default(shared)
487 int old_offset = -1, old_iset = -1, old_jset = -1;
489 bool old_transpose =
false;
492 double pab[task_list->
maxco * task_list->
maxco];
493 double hab[task_list->
maxco * task_list->
maxco];
496 const int nthreads = omp_get_num_threads();
497 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
498#pragma omp for schedule(dynamic, chunk_size)
499 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
500 const int first_task = first_block_task[block_num];
501 const int last_task = last_block_task[block_num];
504 const int iatom = task_list->
tasks[first_task].
iatom - 1;
505 const int jatom = task_list->
tasks[first_task].
jatom - 1;
506 double my_forces[2][3] = {0};
507 double my_virials[2][3][3] = {0};
509 for (
int itask = first_task; itask <= last_task; itask++) {
512 assert(task->
block_num - 1 == block_num);
513 assert(task->
iatom - 1 == iatom && task->
jatom - 1 == jatom);
514 const int ikind = task_list->
atom_kinds[iatom] - 1;
515 const int jkind = task_list->
atom_kinds[jatom] - 1;
518 const int iset = task->
iset - 1;
519 const int jset = task->
jset - 1;
520 const int ipgf = task->
ipgf - 1;
521 const int jpgf = task->
jpgf - 1;
522 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
523 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
524 const int ncoseta = ncoset(ibasis->
lmax[iset]);
525 const int ncosetb = ncoset(jbasis->
lmax[jset]);
526 const int ncoa = ibasis->
npgf[iset] * ncoseta;
527 const int ncob = jbasis->
npgf[jset] * ncosetb;
528 const int block_offset = task_list->
block_offsets[block_num];
529 const bool transpose = (iatom <= jatom);
530 const bool pab_required = (forces != NULL || virial != NULL);
534 if (block_offset != old_offset || iset != old_iset ||
537 load_pab(ibasis, jbasis, iset, jset, transpose,
540 if (old_offset >= 0) {
541 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
544 memset(hab, 0, ncoa * ncob *
sizeof(
double));
545 old_offset = block_offset;
550 old_transpose = transpose;
577 (
double(*)[ncoa])hab,
578 (pab_required) ? (
const double(*)[ncoa])pab : NULL,
579 (forces != NULL) ? my_forces : NULL,
580 (virial != NULL) ? my_virials : NULL,
589 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
590 if (forces != NULL) {
591#pragma omp critical(forces)
592 for (
int i = 0;
i < 3;
i++) {
593 forces[iatom][
i] += scalef * my_forces[0][
i];
594 forces[jatom][
i] += scalef * my_forces[1][
i];
597 if (virial != NULL) {
598#pragma omp critical(virial)
599 for (
int i = 0;
i < 3;
i++) {
600 for (
int j = 0; j < 3; j++) {
601 virial[
i][j] += scalef * my_virials[0][
i][j];
602 virial[
i][j] += scalef * my_virials[1][
i][j];
610 if (old_offset >= 0) {
611 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
625 const int natoms,
const int nlevels,
const offload_buffer *pab_blocks,
627 double forces[natoms][3],
double virial[3][3]) {
629 assert(task_list->
nlevels == nlevels);
630 assert(task_list->
natoms == natoms);
634 if (forces != NULL) {
635 memset(forces, 0, natoms * 3 *
sizeof(
double));
637 if (virial != NULL) {
638 memset(virial, 0, 9 *
sizeof(
double));
641 for (
int level = 0; level < task_list->
nlevels; level++) {
647 task_list, first_block_task, last_block_task, compute_tau, natoms,
650 grids[level], hab_blocks, forces, virial);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_ref_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Collocates a single product of primitiv Gaussians. See grid_ref_collocate.h for details.
void grid_ref_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_ref_integrate.h for details.
void grid_ref_collocate_task_list(const grid_ref_task_list *task_list, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
static void collocate_one_grid_level(const grid_ref_task_list *task_list, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
void grid_ref_free_task_list(grid_ref_task_list *task_list)
Deallocates given task list, basis_sets have to be freed separately.
void grid_ref_integrate_task_list(const grid_ref_task_list *task_list, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
void grid_ref_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out)
Allocates a task list for the reference backend. See grid_task_list.h for details.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
static void integrate_one_grid_level(const grid_ref_task_list *task_list, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
int * last_level_block_task
grid_ref_layout * layouts
int * first_level_block_task
grid_basis_set ** basis_sets
size_t * threadlocal_sizes
Internal representation of a task.
Internal representation of a buffer.