16#include "../common/grid_common.h"
28 if (task_a->
level != task_b->level) {
29 return task_a->
level - task_b->level;
30 }
else if (task_a->
block_num != task_b->block_num) {
31 return task_a->
block_num - task_b->block_num;
32 }
else if (task_a->
iset != task_b->iset) {
33 return task_a->
iset - task_b->iset;
35 return task_a->
jset - task_b->jset;
44 const bool orthorhombic,
const int ntasks,
const int nlevels,
45 const int natoms,
const int nkinds,
const int nblocks,
46 const int block_offsets[nblocks],
const double atom_positions[natoms][3],
47 const int atom_kinds[natoms],
const grid_basis_set *basis_sets[nkinds],
48 const int level_list[ntasks],
const int iatom_list[ntasks],
49 const int jatom_list[ntasks],
const int iset_list[ntasks],
50 const int jset_list[ntasks],
const int ipgf_list[ntasks],
51 const int jpgf_list[ntasks],
const int border_mask_list[ntasks],
52 const int block_num_list[ntasks],
const double radius_list[ntasks],
53 const double rab_list[ntasks][3],
const int npts_global[nlevels][3],
54 const int npts_local[nlevels][3],
const int shift_local[nlevels][3],
55 const int border_width[nlevels][3],
const double dh[nlevels][3][3],
58 if (*task_list_out != NULL) {
65 assert(task_list != NULL);
68 task_list->
ntasks = ntasks;
70 task_list->
natoms = natoms;
71 task_list->
nkinds = nkinds;
74 size_t size = nblocks *
sizeof(int);
81 size = 3 * natoms *
sizeof(double);
88 size = natoms *
sizeof(int);
90 assert(task_list->
atom_kinds != NULL || size == 0);
92 memcpy(task_list->
atom_kinds, atom_kinds, size);
97 assert(task_list->
basis_sets != NULL || size == 0);
99 memcpy(task_list->
basis_sets, basis_sets, size);
103 task_list->
tasks = malloc(size);
104 assert(task_list->
tasks != NULL || size == 0);
105#pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
106 for (
int i = 0;
i < ntasks;
i++) {
124 task_list->
layouts = malloc(size);
125 assert(task_list->
layouts != NULL || size == 0);
126 for (
int level = 0; level < nlevels; level++) {
127 for (
int i = 0;
i < 3;
i++) {
132 for (
int j = 0; j < 3; j++) {
133 task_list->
layouts[level].
dh[
i][j] = dh[level][
i][j];
143 size = nlevels * nblocks *
sizeof(int);
148 const int nlevel_blocks = nlevels * nblocks;
149#pragma omp parallel for schedule(static) if (nlevel_blocks > \
150 GRID_OMP_MIN_ITERATIONS)
151 for (
int i = 0;
i < nlevel_blocks;
i++) {
155 for (
int itask = 0; itask < ntasks; itask++) {
156 const int level = task_list->
tasks[itask].
level - 1;
158 if (itask == 0 || task_list->
tasks[itask - 1].
level - 1 != level ||
166 task_list->
maxco = 0;
167 for (
int i = 0;
i < nkinds;
i++) {
172 size = omp_get_max_threads() *
sizeof(
double *);
176 size = omp_get_max_threads() *
sizeof(size_t);
181 *task_list_out = task_list;
198 free(task_list->
tasks);
202 for (
int i = 0;
i < omp_get_max_threads();
i++) {
216void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
217 const int *k,
const double *alpha,
const double *a,
const int *lda,
218 const double *b,
const int *ldb,
const double *beta,
double *c,
225static void dgemm(
const char transa,
const char transb,
const int m,
226 const int n,
const int k,
const double alpha,
const double *a,
227 const int lda,
const double *b,
const int ldb,
228 const double beta,
double *c,
const int ldc) {
229 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
238 const int iset,
const int jset,
const bool transpose,
239 const double *block,
double *pab) {
242 const int ncoseta = ncoset(ibasis->
lmax[iset]);
243 const int ncosetb = ncoset(jbasis->
lmax[jset]);
244 const int ncoa = ibasis->
npgf[iset] * ncoseta;
245 const int ncob = jbasis->
npgf[jset] * ncosetb;
247 const int nsgf_seta = ibasis->
nsgf_set[iset];
248 const int nsgf_setb = jbasis->
nsgf_set[jset];
249 const int nsgfa = ibasis->
nsgf;
250 const int nsgfb = jbasis->
nsgf;
251 const int sgfa = ibasis->
first_sgf[iset] - 1;
252 const int sgfb = jbasis->
first_sgf[jset] - 1;
253 const int maxcoa = ibasis->
maxco;
254 const int maxcob = jbasis->
maxco;
256 double work[nsgf_setb * ncoa];
259 dgemm(
'N',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
260 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->
sphi[sgfa * maxcoa],
261 maxcoa, 0.0, work, ncoa);
264 dgemm(
'T',
'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
265 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->
sphi[sgfa * maxcoa],
266 maxcoa, 0.0, work, ncoa);
269 dgemm(
'T',
'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->
sphi[sgfb * maxcob],
270 maxcob, work, ncoa, 0.0, pab, ncoa);
279 const int *last_block_task,
const enum grid_func func,
280 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
281 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
291#pragma omp parallel default(shared)
293 const int ithread = omp_get_thread_num();
294 const int nthreads = omp_get_num_threads();
297 int old_offset = -1, old_iset = -1, old_jset = -1;
304 const size_t grid_size = npts_local_total *
sizeof(double);
315 double *
const my_grid = task_list->
threadlocals[ithread];
316 memset(my_grid, 0, grid_size);
319 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
320#pragma omp for schedule(dynamic, chunk_size)
321 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
322 const int first_task = first_block_task[block_num];
323 const int last_task = last_block_task[block_num];
325 for (
int itask = first_task; itask <= last_task; itask++) {
328 const int iatom = task->
iatom - 1;
329 const int jatom = task->
jatom - 1;
330 const int iset = task->
iset - 1;
331 const int jset = task->
jset - 1;
332 const int ipgf = task->
ipgf - 1;
333 const int jpgf = task->
jpgf - 1;
334 const int ikind = task_list->
atom_kinds[iatom] - 1;
335 const int jkind = task_list->
atom_kinds[jatom] - 1;
338 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
339 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
340 const int ncoseta = ncoset(ibasis->
lmax[iset]);
341 const int ncosetb = ncoset(jbasis->
lmax[jset]);
342 const int ncoa = ibasis->
npgf[iset] * ncoseta;
343 const int ncob = jbasis->
npgf[jset] * ncosetb;
344 const int block_num = task->
block_num - 1;
345 const int block_offset = task_list->
block_offsets[block_num];
346 const double *block = &pab_blocks[block_offset];
347 const bool transpose = (iatom <= jatom);
351 if (block_offset != old_offset || iset != old_iset ||
353 old_offset = block_offset;
356 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
369 (iatom == jatom) ? 1 : 2,
383 (
const double(*)[ncoa])pab,
394 const int nreduction_cycles = ceil(log(nthreads) / log(2));
395 for (
int icycle = 1; icycle <= nreduction_cycles; icycle++) {
398 const int group_size = 1 << icycle;
399 const int igroup = ithread / group_size;
400 const int dest_thread = igroup * group_size;
401 const int src_thread = dest_thread + group_size / 2;
403 const int actual_group_size =
imin(group_size, nthreads - dest_thread);
405 const int rank =
modulo(ithread, group_size);
406 const int64_t lb = ((int64_t)npts_local_total * rank) / actual_group_size;
408 ((int64_t)npts_local_total * (rank + 1)) / actual_group_size;
409 if (src_thread < nthreads) {
411 for (
int i = (
int)lb;
i < (int)
ub;
i++) {
420 const int64_t lb = ((int64_t)npts_local_total * ithread) / nthreads;
421 const int64_t
ub = ((int64_t)npts_local_total * (ithread + 1)) / nthreads;
423 for (
int i = (
int)lb;
i < (int)
ub;
i++) {
436 const enum grid_func func,
const int nlevels,
443 assert(task_list->
nlevels == nlevels);
445 for (
int level = 0; level < task_list->
nlevels; level++) {
451 task_list, first_block_task, last_block_task, func, layout->
npts_global,
463 const int jset,
const bool transpose,
464 const double *hab,
double *block) {
467 const int ncoseta = ncoset(ibasis->
lmax[iset]);
468 const int ncosetb = ncoset(jbasis->
lmax[jset]);
469 const int ncoa = ibasis->
npgf[iset] * ncoseta;
470 const int ncob = jbasis->
npgf[jset] * ncosetb;
472 const int nsgf_seta = ibasis->
nsgf_set[iset];
473 const int nsgf_setb = jbasis->
nsgf_set[jset];
474 const int nsgfa = ibasis->
nsgf;
475 const int nsgfb = jbasis->
nsgf;
476 const int sgfa = ibasis->
first_sgf[iset] - 1;
477 const int sgfb = jbasis->
first_sgf[jset] - 1;
478 const int maxcoa = ibasis->
maxco;
479 const int maxcob = jbasis->
maxco;
481 double work[nsgf_setb * ncoa];
484 dgemm(
'N',
'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->
sphi[sgfb * maxcob],
485 maxcob, hab, ncoa, 0.0, work, ncoa);
489 dgemm(
'N',
'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
490 &ibasis->
sphi[sgfa * maxcoa], maxcoa, 1.0,
491 &block[sgfb * nsgfa + sgfa], nsgfa);
494 dgemm(
'N',
'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
495 &ibasis->
sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
496 &block[sgfa * nsgfb + sgfb], nsgfb);
506 const int *last_block_task,
const bool compute_tau,
const int natoms,
507 const int npts_global[3],
const int npts_local[3],
const int shift_local[3],
508 const int border_width[3],
const double dh[3][3],
const double dh_inv[3][3],
510 offload_buffer *hab_blocks,
double forces[natoms][3],
double virial[3][3]) {
519#pragma omp parallel default(shared)
522 int old_offset = -1, old_iset = -1, old_jset = -1;
524 bool old_transpose =
false;
531 const int nthreads = omp_get_num_threads();
532 const int chunk_size =
imax(1, task_list->
nblocks / (nthreads * 50));
533#pragma omp for schedule(dynamic, chunk_size)
534 for (
int block_num = 0; block_num < task_list->
nblocks; block_num++) {
535 const int first_task = first_block_task[block_num];
536 const int last_task = last_block_task[block_num];
539 const int iatom = task_list->
tasks[first_task].
iatom - 1;
540 const int jatom = task_list->
tasks[first_task].
jatom - 1;
541 double my_forces[2][3] = {0};
542 double my_virials[2][3][3] = {0};
544 for (
int itask = first_task; itask <= last_task; itask++) {
547 assert(task->
block_num - 1 == block_num);
548 assert(task->
iatom - 1 == iatom && task->
jatom - 1 == jatom);
549 const int ikind = task_list->
atom_kinds[iatom] - 1;
550 const int jkind = task_list->
atom_kinds[jatom] - 1;
553 const int iset = task->
iset - 1;
554 const int jset = task->
jset - 1;
555 const int ipgf = task->
ipgf - 1;
556 const int jpgf = task->
jpgf - 1;
557 const double zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
558 const double zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
559 const int ncoseta = ncoset(ibasis->
lmax[iset]);
560 const int ncosetb = ncoset(jbasis->
lmax[jset]);
561 const int ncoa = ibasis->
npgf[iset] * ncoseta;
562 const int ncob = jbasis->
npgf[jset] * ncosetb;
563 const int block_offset = task_list->
block_offsets[block_num];
564 const bool transpose = (iatom <= jatom);
565 const bool pab_required = (forces != NULL || virial != NULL);
569 if (block_offset != old_offset || iset != old_iset ||
572 load_pab(ibasis, jbasis, iset, jset, transpose,
575 if (old_offset >= 0) {
576 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
579 memset(hab, 0, ncoa * ncob *
sizeof(
double));
580 old_offset = block_offset;
585 old_transpose = transpose;
612 (
double(*)[ncoa])hab,
613 (pab_required) ? (
const double(*)[ncoa])pab : NULL,
614 (forces != NULL) ? my_forces : NULL,
615 (virial != NULL) ? my_virials : NULL,
624 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
625 if (forces != NULL) {
626#pragma omp critical(forces)
627 for (
int i = 0;
i < 3;
i++) {
628 forces[iatom][
i] += scalef * my_forces[0][
i];
629 forces[jatom][
i] += scalef * my_forces[1][
i];
632 if (virial != NULL) {
633#pragma omp critical(virial)
634 for (
int i = 0;
i < 3;
i++) {
635 for (
int j = 0; j < 3; j++) {
636 virial[
i][j] += scalef * my_virials[0][
i][j];
637 virial[
i][j] += scalef * my_virials[1][
i][j];
645 if (old_offset >= 0) {
646 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
662 double forces[natoms][3],
double virial[3][3]) {
669 assert(task_list->
nlevels == nlevels);
670 assert(task_list->
natoms == natoms);
671 assert(hab_blocks != NULL);
674 if (hab_blocks->
size != 0) {
677 if (forces != NULL) {
678 memset(forces, 0, natoms * 3 *
sizeof(
double));
680 if (virial != NULL) {
681 memset(virial, 0, 9 *
sizeof(
double));
684 for (
int level = 0; level < task_list->
nlevels; level++) {
690 task_list, first_block_task, last_block_task, compute_tau, natoms,
693 grids[level], hab_blocks, forces, virial);
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
#define GRID_PRAGMA_SIMD_LOOP
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_cpu_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Public entry point. A thin wrapper with the only purpose of calling write_task_file when DUMP_TASKS =...
void grid_cpu_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_cpu_integrate.h for details.
static void collocate_one_grid_level(const grid_cpu_task_list *ptr, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
void grid_cpu_collocate_task_list(const grid_cpu_task_list *ptr, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
void grid_cpu_free_task_list(grid_cpu_task_list *ptr)
Deallocates given task list, basis_sets have to be freed separately.
void grid_cpu_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out)
Allocates a task list for the cpu backend. See grid_task_list.h for details.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
static void integrate_one_grid_level(const grid_cpu_task_list *ptr, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
void grid_cpu_integrate_task_list(const grid_cpu_task_list *ptr, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
size_t * threadlocal_sizes
grid_cpu_layout * layouts
grid_basis_set ** basis_sets
int * first_level_block_task
int * last_level_block_task
Internal representation of a task.
Internal representation of a buffer.