16#include "../common/grid_common.h"
27 if (task_a->
level != task_b->level) {
28 return task_a->
level - task_b->level;
29 }
else if (task_a->
block_num != task_b->block_num) {
30 return task_a->
block_num - task_b->block_num;
31 }
else if (task_a->
iset != task_b->iset) {
32 return task_a->
iset - task_b->iset;
34 return task_a->
jset - task_b->jset;
43 const bool orthorhombic,
const int ntasks,
const int nlevels,
44 const int natoms,
const int nkinds,
const int nblocks,
45 const int block_offsets[nblocks],
const double atom_positions[natoms][3],
46 const int atom_kinds[natoms],
const grid_basis_set *basis_sets[nkinds],
47 const int level_list[ntasks],
const int iatom_list[ntasks],
48 const int jatom_list[ntasks],
const int iset_list[ntasks],
49 const int jset_list[ntasks],
const int ipgf_list[ntasks],
50 const int jpgf_list[ntasks],
const int border_mask_list[ntasks],
51 const int block_num_list[ntasks],
const double radius_list[ntasks],
52 const double rab_list[ntasks][3],
const int npts_global[nlevels][3],
53 const int npts_local[nlevels][3],
const int shift_local[nlevels][3],
54 const int border_width[nlevels][3],
const double dh[nlevels][3][3],
57 if (*task_list_out != NULL) {
63 assert(task_list != NULL);
66 task_list->
ntasks = ntasks;
68 task_list->
natoms = natoms;
69 task_list->
nkinds = nkinds;
72 size_t size = nblocks *
sizeof(int);
79 size = 3 * natoms *
sizeof(double);
86 size = natoms *
sizeof(int);
88 assert(task_list->
atom_kinds != NULL || size == 0);
90 memcpy(task_list->
atom_kinds, atom_kinds, size);
95 assert(task_list->
basis_sets != NULL || size == 0);
97 memcpy(task_list->
basis_sets, basis_sets, size);
101 task_list->
tasks = malloc(size);
102 assert(task_list->
tasks != NULL || size == 0);
103 for (
int i = 0;
i < ntasks;
i++) {
121 task_list->
layouts = malloc(size);
122 assert(task_list->
layouts != NULL || size == 0);
123 for (
int level = 0; level < nlevels; level++) {
124 for (
int i = 0;
i < 3;
i++) {
129 for (
int j = 0; j < 3; j++) {
130 task_list->
layouts[level].
dh[
i][j] = dh[level][
i][j];
140 size = nlevels * nblocks *
sizeof(int);
145 for (
int i = 0;
i < nlevels * nblocks;
i++) {
149 for (
int itask = 0; itask < ntasks; itask++) {
150 const int level = task_list->
tasks[itask].
level - 1;
152 if (itask == 0 || task_list->
tasks[itask - 1].
level - 1 != level ||
160 task_list->
maxco = 0;
161 for (
int i = 0;
i < nkinds;
i++) {
166 size = omp_get_max_threads() *
sizeof(
double *);
170 size = omp_get_max_threads() *
sizeof(size_t);
175 *task_list_out = task_list;
187 free(task_list->
tasks);
191 for (
int i = 0;
i < omp_get_max_threads();
i++) {
205void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
206 const int *k,
const double *alpha,
const double *a,
const int *lda,
207 const double *b,
const int *ldb,
const double *beta,
double *c,
214static void dgemm(
const char transa,
const char transb,
const int m,
215 const int n,
const int k,
const double alpha,
const double *a,
216 const int lda,
const double *b,
const int ldb,
217 const double beta,
double *c,
const int ldc) {
218 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
227 const int iset,
const int jset,
const bool transpose,
228 const double *block,
double *pab) {
231 const int ncoseta = ncoset(ibasis->
lmax[iset]);
232 const int ncosetb = ncoset(jbasis->
lmax[jset]);
233 const int ncoa = ibasis->
npgf[iset] * ncoseta;
234 const int ncob = jbasis->
npgf[jset] * ncosetb;
236 const int nsgf_seta = ibasis->
nsgf_set[iset];
237 const int nsgf_setb = jbasis->
nsgf_set[jset];
238 const int nsgfa = ibasis->
nsgf;
239 const int nsgfb = jbasis->
nsgf;
240 const int sgfa = ibasis->
first_sgf[iset] - 1;
241 const int sgfb = jbasis->
first_sgf[jset] - 1;
242 const int maxcoa = ibasis->
maxco;
243 const int maxcob = jbasis->
maxco;
245 double work[nsgf_setb * ncoa];
248 dgemm(
'N',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
249 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->
sphi[sgfa * maxcoa],
250 maxcoa, 0.0, work, ncoa);
253 dgemm(
'T',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
254 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->
sphi[sgfa * maxcoa],
255 maxcoa, 0.0, work, ncoa);
258 dgemm(
'T',
'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->
sphi[sgfb * maxcob],
259 maxcob, work, ncoa, 0.0, pab, ncoa);
268 const int *last_block_task,
const enum grid_func func,
269 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
270 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
275#pragma omp parallel default(shared)
277 const int ithread = omp_get_thread_num();
278 const int nthreads = omp_get_num_threads();
281 int old_offset = -1, old_iset = -1, old_jset = -1;
288 const size_t grid_size = npts_local_total *
sizeof(double);
299 double *
const my_grid = task_list->
threadlocals[ithread];
300 memset(my_grid, 0, grid_size);
303 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
304#pragma omp for schedule(dynamic, chunk_size)
305 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
306 const int first_task = first_block_task[block_num];
307 const int last_task = last_block_task[block_num];
309 for (
int itask = first_task; itask <= last_task; itask++) {
312 const int iatom = task->
iatom - 1;
313 const int jatom = task->
jatom - 1;
314 const int iset = task->
iset - 1;
315 const int jset = task->
jset - 1;
316 const int ipgf = task->
ipgf - 1;
317 const int jpgf = task->
jpgf - 1;
318 const int ikind = task_list->
atom_kinds[iatom] - 1;
319 const int jkind = task_list->
atom_kinds[jatom] - 1;
322 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
323 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
324 const int ncoseta = ncoset(ibasis->
lmax[iset]);
325 const int ncosetb = ncoset(jbasis->
lmax[jset]);
326 const int ncoa = ibasis->
npgf[iset] * ncoseta;
327 const int ncob = jbasis->
npgf[jset] * ncosetb;
328 const int block_num = task->
block_num - 1;
329 const int block_offset = task_list->
block_offsets[block_num];
330 const double *block = &pab_blocks[block_offset];
331 const bool transpose = (iatom <= jatom);
335 if (block_offset != old_offset || iset != old_iset ||
337 old_offset = block_offset;
340 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
353 (iatom == jatom) ? 1 : 2,
367 (
const double(*)[ncoa])pab,
378 const int nreduction_cycles = ceil(log(nthreads) / log(2));
379 for (
int icycle = 1; icycle <= nreduction_cycles; icycle++) {
382 const int group_size = 1 << icycle;
383 const int igroup = ithread / group_size;
384 const int dest_thread = igroup * group_size;
385 const int src_thread = dest_thread + group_size / 2;
387 const int actual_group_size =
imin(group_size, nthreads - dest_thread);
389 const int rank =
modulo(ithread, group_size);
390 const int64_t lb = ((int64_t)npts_local_total * rank) / actual_group_size;
392 ((int64_t)npts_local_total * (rank + 1)) / actual_group_size;
393 if (src_thread < nthreads) {
394 for (
int i = (
int)lb;
i < (int)
ub;
i++) {
403 const int64_t lb = ((int64_t)npts_local_total * ithread) / nthreads;
404 const int64_t
ub = ((int64_t)npts_local_total * (ithread + 1)) / nthreads;
405 for (
int i = (
int)lb;
i < (int)
ub;
i++) {
418 const enum grid_func func,
const int nlevels,
422 assert(task_list->
nlevels == nlevels);
424 for (
int level = 0; level < task_list->
nlevels; level++) {
430 task_list, first_block_task, last_block_task, func, layout->
npts_global,
442 const int jset,
const bool transpose,
443 const double *hab,
double *block) {
446 const int ncoseta = ncoset(ibasis->
lmax[iset]);
447 const int ncosetb = ncoset(jbasis->
lmax[jset]);
448 const int ncoa = ibasis->
npgf[iset] * ncoseta;
449 const int ncob = jbasis->
npgf[jset] * ncosetb;
451 const int nsgf_seta = ibasis->
nsgf_set[iset];
452 const int nsgf_setb = jbasis->
nsgf_set[jset];
453 const int nsgfa = ibasis->
nsgf;
454 const int nsgfb = jbasis->
nsgf;
455 const int sgfa = ibasis->
first_sgf[iset] - 1;
456 const int sgfb = jbasis->
first_sgf[jset] - 1;
457 const int maxcoa = ibasis->
maxco;
458 const int maxcob = jbasis->
maxco;
460 double work[nsgf_setb * ncoa];
463 dgemm(
'N',
'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->
sphi[sgfb * maxcob],
464 maxcob, hab, ncoa, 0.0, work, ncoa);
468 dgemm(
'N',
'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
469 &ibasis->
sphi[sgfa * maxcoa], maxcoa, 1.0,
470 &block[sgfb * nsgfa + sgfa], nsgfa);
473 dgemm(
'N',
'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
474 &ibasis->
sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
475 &block[sgfa * nsgfb + sgfb], nsgfb);
485 const int *last_block_task,
const bool compute_tau,
const int natoms,
486 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
487 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
489 offload_buffer *hab_blocks,
double forces[natoms][3],
double virial[3][3]) {
493#pragma omp parallel default(shared)
496 int old_offset = -1, old_iset = -1, old_jset = -1;
498 bool old_transpose =
false;
505 const int nthreads = omp_get_num_threads();
506 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
507#pragma omp for schedule(dynamic, chunk_size)
508 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
509 const int first_task = first_block_task[block_num];
510 const int last_task = last_block_task[block_num];
513 const int iatom = task_list->
tasks[first_task].
iatom - 1;
514 const int jatom = task_list->
tasks[first_task].
jatom - 1;
515 double my_forces[2][3] = {0};
516 double my_virials[2][3][3] = {0};
518 for (
int itask = first_task; itask <= last_task; itask++) {
521 assert(task->
block_num - 1 == block_num);
522 assert(task->
iatom - 1 == iatom && task->
jatom - 1 == jatom);
523 const int ikind = task_list->
atom_kinds[iatom] - 1;
524 const int jkind = task_list->
atom_kinds[jatom] - 1;
527 const int iset = task->
iset - 1;
528 const int jset = task->
jset - 1;
529 const int ipgf = task->
ipgf - 1;
530 const int jpgf = task->
jpgf - 1;
531 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
532 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
533 const int ncoseta = ncoset(ibasis->
lmax[iset]);
534 const int ncosetb = ncoset(jbasis->
lmax[jset]);
535 const int ncoa = ibasis->
npgf[iset] * ncoseta;
536 const int ncob = jbasis->
npgf[jset] * ncosetb;
537 const int block_offset = task_list->
block_offsets[block_num];
538 const bool transpose = (iatom <= jatom);
539 const bool pab_required = (forces != NULL || virial != NULL);
543 if (block_offset != old_offset || iset != old_iset ||
546 load_pab(ibasis, jbasis, iset, jset, transpose,
549 if (old_offset >= 0) {
550 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
553 memset(hab, 0, ncoa * ncob *
sizeof(
double));
554 old_offset = block_offset;
559 old_transpose = transpose;
586 (
double(*)[ncoa])hab,
587 (pab_required) ? (
const double(*)[ncoa])pab : NULL,
588 (forces != NULL) ? my_forces : NULL,
589 (virial != NULL) ? my_virials : NULL,
598 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
599 if (forces != NULL) {
600#pragma omp critical(forces)
601 for (
int i = 0;
i < 3;
i++) {
602 forces[iatom][
i] += scalef * my_forces[0][
i];
603 forces[jatom][
i] += scalef * my_forces[1][
i];
606 if (virial != NULL) {
607#pragma omp critical(virial)
608 for (
int i = 0;
i < 3;
i++) {
609 for (
int j = 0; j < 3; j++) {
610 virial[
i][j] += scalef * my_virials[0][
i][j];
611 virial[
i][j] += scalef * my_virials[1][
i][j];
619 if (old_offset >= 0) {
620 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
634 const int natoms,
const int nlevels,
const offload_buffer *pab_blocks,
636 double forces[natoms][3],
double virial[3][3]) {
638 assert(task_list->
nlevels == nlevels);
639 assert(task_list->
natoms == natoms);
640 assert(hab_blocks != NULL);
643 if (hab_blocks->
size != 0) {
646 if (forces != NULL) {
647 memset(forces, 0, natoms * 3 *
sizeof(
double));
649 if (virial != NULL) {
650 memset(virial, 0, 9 *
sizeof(
double));
653 for (
int level = 0; level < task_list->
nlevels; level++) {
659 task_list, first_block_task, last_block_task, compute_tau, natoms,
662 grids[level], hab_blocks, forces, virial);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_cpu_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Public entry point. A thin wrapper with the only purpose of calling write_task_file when DUMP_TASKS =...
void grid_cpu_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_cpu_integrate.h for details.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
void grid_cpu_free_task_list(grid_cpu_task_list *task_list)
Deallocates given task list, basis_sets have to be freed separately.
void grid_cpu_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out)
Allocates a task list for the cpu backend. See grid_task_list.h for details.
static void integrate_one_grid_level(const grid_cpu_task_list *task_list, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static void collocate_one_grid_level(const grid_cpu_task_list *task_list, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
void grid_cpu_collocate_task_list(const grid_cpu_task_list *task_list, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
void grid_cpu_integrate_task_list(const grid_cpu_task_list *task_list, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
grid_cpu_layout * layouts
int * first_level_block_task
grid_basis_set ** basis_sets
int * last_level_block_task
size_t * threadlocal_sizes
Internal representation of a task.
Internal representation of a buffer.