16 #include "../common/grid_common.h"
27 if (task_a->
level != task_b->level) {
28 return task_a->
level - task_b->level;
29 }
else if (task_a->
block_num != task_b->block_num) {
30 return task_a->
block_num - task_b->block_num;
31 }
else if (task_a->
iset != task_b->iset) {
32 return task_a->
iset - task_b->iset;
34 return task_a->
jset - task_b->jset;
43 const bool orthorhombic,
const int ntasks,
const int nlevels,
44 const int natoms,
const int nkinds,
const int nblocks,
45 const int block_offsets[nblocks],
const double atom_positions[natoms][3],
46 const int atom_kinds[natoms],
const grid_basis_set *basis_sets[nkinds],
47 const int level_list[ntasks],
const int iatom_list[ntasks],
48 const int jatom_list[ntasks],
const int iset_list[ntasks],
49 const int jset_list[ntasks],
const int ipgf_list[ntasks],
50 const int jpgf_list[ntasks],
const int border_mask_list[ntasks],
51 const int block_num_list[ntasks],
const double radius_list[ntasks],
52 const double rab_list[ntasks][3],
const int npts_global[nlevels][3],
53 const int npts_local[nlevels][3],
const int shift_local[nlevels][3],
54 const int border_width[nlevels][3],
const double dh[nlevels][3][3],
57 if (*task_list_out != NULL) {
65 task_list->
ntasks = ntasks;
67 task_list->
natoms = natoms;
68 task_list->
nkinds = nkinds;
71 size_t size = nblocks *
sizeof(int);
75 size = 3 * natoms *
sizeof(double);
79 size = natoms *
sizeof(int);
81 memcpy(task_list->
atom_kinds, atom_kinds, size);
85 memcpy(task_list->
basis_sets, basis_sets, size);
88 task_list->
tasks = malloc(size);
89 for (
int i = 0;
i < ntasks;
i++) {
107 task_list->
layouts = malloc(size);
108 for (
int level = 0; level < nlevels; level++) {
109 for (
int i = 0;
i < 3;
i++) {
114 for (
int j = 0; j < 3; j++) {
115 task_list->
layouts[level].
dh[
i][j] = dh[level][
i][j];
125 size = nlevels * nblocks *
sizeof(int);
128 for (
int i = 0;
i < nlevels * nblocks;
i++) {
132 for (
int itask = 0; itask < ntasks; itask++) {
133 const int level = task_list->
tasks[itask].
level - 1;
135 if (itask == 0 || task_list->
tasks[itask - 1].
level - 1 != level ||
143 task_list->
maxco = 0;
144 for (
int i = 0;
i < nkinds;
i++) {
149 size = omp_get_max_threads() *
sizeof(
double *);
152 size = omp_get_max_threads() *
sizeof(size_t);
156 *task_list_out = task_list;
168 free(task_list->
tasks);
172 for (
int i = 0;
i < omp_get_max_threads();
i++) {
186 void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
187 const int *k,
const double *alpha,
const double *
a,
const int *lda,
188 const double *
b,
const int *ldb,
const double *beta,
double *
c,
195 static void dgemm(
const char transa,
const char transb,
const int m,
196 const int n,
const int k,
const double alpha,
const double *
a,
197 const int lda,
const double *
b,
const int ldb,
198 const double beta,
double *
c,
const int ldc) {
199 dgemm_(&transb, &transa, &n, &m, &k, &alpha,
b, &ldb,
a, &lda, &beta,
c,
208 const int iset,
const int jset,
const bool transpose,
209 const double *block,
double *pab) {
212 const int ncoseta =
ncoset(ibasis->
lmax[iset]);
213 const int ncosetb =
ncoset(jbasis->
lmax[jset]);
214 const int ncoa = ibasis->
npgf[iset] * ncoseta;
215 const int ncob = jbasis->
npgf[jset] * ncosetb;
217 const int nsgf_seta = ibasis->
nsgf_set[iset];
218 const int nsgf_setb = jbasis->
nsgf_set[jset];
219 const int nsgfa = ibasis->
nsgf;
220 const int nsgfb = jbasis->
nsgf;
221 const int sgfa = ibasis->
first_sgf[iset] - 1;
222 const int sgfb = jbasis->
first_sgf[jset] - 1;
223 const int maxcoa = ibasis->
maxco;
224 const int maxcob = jbasis->
maxco;
226 double work[nsgf_setb * ncoa];
229 dgemm(
'N',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
230 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->
sphi[sgfa * maxcoa],
231 maxcoa, 0.0, work, ncoa);
234 dgemm(
'T',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
235 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->
sphi[sgfa * maxcoa],
236 maxcoa, 0.0, work, ncoa);
239 dgemm(
'T',
'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->
sphi[sgfb * maxcob],
240 maxcob, work, ncoa, 0.0, pab, ncoa);
249 const int *last_block_task,
const enum grid_func func,
250 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
251 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
256 #pragma omp parallel default(shared)
258 const int ithread = omp_get_thread_num();
259 const int nthreads = omp_get_num_threads();
262 int old_offset = -1, old_iset = -1, old_jset = -1;
265 double pab[task_list->
maxco * task_list->
maxco];
269 const size_t grid_size = npts_local_total *
sizeof(double);
279 double *
const my_grid = task_list->
threadlocals[ithread];
280 memset(my_grid, 0, grid_size);
283 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
284 #pragma omp for schedule(dynamic, chunk_size)
285 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
286 const int first_task = first_block_task[block_num];
287 const int last_task = last_block_task[block_num];
289 for (
int itask = first_task; itask <= last_task; itask++) {
292 const int iatom = task->
iatom - 1;
293 const int jatom = task->
jatom - 1;
294 const int iset = task->
iset - 1;
295 const int jset = task->
jset - 1;
296 const int ipgf = task->
ipgf - 1;
297 const int jpgf = task->
jpgf - 1;
298 const int ikind = task_list->
atom_kinds[iatom] - 1;
299 const int jkind = task_list->
atom_kinds[jatom] - 1;
302 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
303 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
304 const int ncoseta =
ncoset(ibasis->
lmax[iset]);
305 const int ncosetb =
ncoset(jbasis->
lmax[jset]);
306 const int ncoa = ibasis->
npgf[iset] * ncoseta;
307 const int ncob = jbasis->
npgf[jset] * ncosetb;
308 const int block_num = task->
block_num - 1;
309 const int block_offset = task_list->
block_offsets[block_num];
310 const double *block = &pab_blocks[block_offset];
311 const bool transpose = (iatom <= jatom);
315 if (block_offset != old_offset || iset != old_iset ||
317 old_offset = block_offset;
320 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
333 (iatom == jatom) ? 1 : 2,
347 (
const double(*)[ncoa])pab,
358 const int nreduction_cycles = ceil(log(nthreads) / log(2));
359 for (
int icycle = 1; icycle <= nreduction_cycles; icycle++) {
362 const int group_size = 1 << icycle;
363 const int igroup = ithread / group_size;
364 const int dest_thread = igroup * group_size;
365 const int src_thread = dest_thread + group_size / 2;
367 const int actual_group_size =
imin(group_size, nthreads - dest_thread);
369 const int rank =
modulo(ithread, group_size);
370 const int lb = (npts_local_total * rank) / actual_group_size;
371 const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
372 if (src_thread < nthreads) {
373 for (
int i = lb;
i <
ub;
i++) {
382 const int lb = (npts_local_total * ithread) / nthreads;
383 const int ub = (npts_local_total * (ithread + 1)) / nthreads;
384 for (
int i = lb;
i <
ub;
i++) {
397 const enum grid_func func,
const int nlevels,
401 assert(task_list->
nlevels == nlevels);
403 for (
int level = 0; level < task_list->
nlevels; level++) {
409 task_list, first_block_task, last_block_task, func, layout->
npts_global,
421 const int jset,
const bool transpose,
422 const double *hab,
double *block) {
425 const int ncoseta =
ncoset(ibasis->
lmax[iset]);
426 const int ncosetb =
ncoset(jbasis->
lmax[jset]);
427 const int ncoa = ibasis->
npgf[iset] * ncoseta;
428 const int ncob = jbasis->
npgf[jset] * ncosetb;
430 const int nsgf_seta = ibasis->
nsgf_set[iset];
431 const int nsgf_setb = jbasis->
nsgf_set[jset];
432 const int nsgfa = ibasis->
nsgf;
433 const int nsgfb = jbasis->
nsgf;
434 const int sgfa = ibasis->
first_sgf[iset] - 1;
435 const int sgfb = jbasis->
first_sgf[jset] - 1;
436 const int maxcoa = ibasis->
maxco;
437 const int maxcob = jbasis->
maxco;
439 double work[nsgf_setb * ncoa];
442 dgemm(
'N',
'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->
sphi[sgfb * maxcob],
443 maxcob, hab, ncoa, 0.0, work, ncoa);
447 dgemm(
'N',
'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
448 &ibasis->
sphi[sgfa * maxcoa], maxcoa, 1.0,
449 &block[sgfb * nsgfa + sgfa], nsgfa);
452 dgemm(
'N',
'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
453 &ibasis->
sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
454 &block[sgfa * nsgfb + sgfb], nsgfb);
464 const int *last_block_task,
const bool compute_tau,
const int natoms,
465 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
466 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
468 offload_buffer *hab_blocks,
double forces[natoms][3],
double virial[3][3]) {
472 #pragma omp parallel default(shared)
475 int old_offset = -1, old_iset = -1, old_jset = -1;
477 bool old_transpose =
false;
480 double pab[task_list->
maxco * task_list->
maxco];
481 double hab[task_list->
maxco * task_list->
maxco];
484 const int nthreads = omp_get_num_threads();
485 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
486 #pragma omp for schedule(dynamic, chunk_size)
487 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
488 const int first_task = first_block_task[block_num];
489 const int last_task = last_block_task[block_num];
492 const int iatom = task_list->
tasks[first_task].
iatom - 1;
493 const int jatom = task_list->
tasks[first_task].
jatom - 1;
494 double my_forces[2][3] = {0};
495 double my_virials[2][3][3] = {0};
497 for (
int itask = first_task; itask <= last_task; itask++) {
500 assert(task->
block_num - 1 == block_num);
501 assert(task->
iatom - 1 == iatom && task->
jatom - 1 == jatom);
502 const int ikind = task_list->
atom_kinds[iatom] - 1;
503 const int jkind = task_list->
atom_kinds[jatom] - 1;
506 const int iset = task->
iset - 1;
507 const int jset = task->
jset - 1;
508 const int ipgf = task->
ipgf - 1;
509 const int jpgf = task->
jpgf - 1;
510 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
511 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
512 const int ncoseta =
ncoset(ibasis->
lmax[iset]);
513 const int ncosetb =
ncoset(jbasis->
lmax[jset]);
514 const int ncoa = ibasis->
npgf[iset] * ncoseta;
515 const int ncob = jbasis->
npgf[jset] * ncosetb;
516 const int block_offset = task_list->
block_offsets[block_num];
517 const bool transpose = (iatom <= jatom);
518 const bool pab_required = (forces != NULL || virial != NULL);
522 if (block_offset != old_offset || iset != old_iset ||
525 load_pab(ibasis, jbasis, iset, jset, transpose,
528 if (old_offset >= 0) {
529 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
532 memset(hab, 0, ncoa * ncob *
sizeof(
double));
533 old_offset = block_offset;
538 old_transpose = transpose;
565 (
double(*)[ncoa])hab,
566 (pab_required) ? (
const double(*)[ncoa])pab : NULL,
567 (forces != NULL) ? my_forces : NULL,
568 (virial != NULL) ? my_virials : NULL,
577 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
578 if (forces != NULL) {
579 #pragma omp critical(forces)
580 for (
int i = 0;
i < 3;
i++) {
581 forces[iatom][
i] += scalef * my_forces[0][
i];
582 forces[jatom][
i] += scalef * my_forces[1][
i];
585 if (virial != NULL) {
586 #pragma omp critical(virial)
587 for (
int i = 0;
i < 3;
i++) {
588 for (
int j = 0; j < 3; j++) {
589 virial[
i][j] += scalef * my_virials[0][
i][j];
590 virial[
i][j] += scalef * my_virials[1][
i][j];
598 if (old_offset >= 0) {
599 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
613 const int natoms,
const int nlevels,
const offload_buffer *pab_blocks,
615 double forces[natoms][3],
double virial[3][3]) {
617 assert(task_list->
nlevels == nlevels);
618 assert(task_list->
natoms == natoms);
622 if (forces != NULL) {
623 memset(forces, 0, natoms * 3 *
sizeof(
double));
625 if (virial != NULL) {
626 memset(virial, 0, 9 *
sizeof(
double));
629 for (
int level = 0; level < task_list->
nlevels; level++) {
635 task_list, first_block_task, last_block_task, compute_tau, natoms,
638 grids[level], hab_blocks, forces, virial);
static int imax(int x, int y)
Returns the larger of two given integer (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
static GRID_HOST_DEVICE int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_ref_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Collocates a single product of primitiv Gaussians. See grid_ref_collocate.h for details.
void grid_ref_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_ref_integrate.h for details.
void grid_ref_collocate_task_list(const grid_ref_task_list *task_list, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
static void collocate_one_grid_level(const grid_ref_task_list *task_list, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
void grid_ref_free_task_list(grid_ref_task_list *task_list)
Deallocates given task list, basis_sets have to be freed separately.
void grid_ref_integrate_task_list(const grid_ref_task_list *task_list, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
void grid_ref_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out)
Allocates a task list for the reference backend. See grid_task_list.h for details.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
static void integrate_one_grid_level(const grid_ref_task_list *task_list, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
int * last_level_block_task
grid_ref_layout * layouts
int * first_level_block_task
grid_basis_set ** basis_sets
size_t * threadlocal_sizes
Internal representation of a task.
Internal representation of a buffer.