grid_ref_task_list.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations         */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "../common/grid_common.h"
#include "grid_ref_collocate.h"
#include "grid_ref_integrate.h"
#include "grid_ref_task_list.h"

/*******************************************************************************
 * \brief Comparator passed to qsort to compare two tasks.
 * \author Ole Schuett
 ******************************************************************************/
static int compare_tasks(const void *a, const void *b) {
  const grid_ref_task *task_a = a, *task_b = b;
  if (task_a->level != task_b->level) {
    return task_a->level - task_b->level;
  } else if (task_a->block_num != task_b->block_num) {
    return task_a->block_num - task_b->block_num;
  } else if (task_a->iset != task_b->iset) {
    return task_a->iset - task_b->iset;
  } else {
    return task_a->jset - task_b->jset;
  }
}
/*******************************************************************************
 * \brief Allocates a task list for the reference backend.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_ref_create_task_list(
    const bool orthorhombic, const int ntasks, const int nlevels,
    const int natoms, const int nkinds, const int nblocks,
    const int block_offsets[nblocks], const double atom_positions[natoms][3],
    const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
    const int level_list[ntasks], const int iatom_list[ntasks],
    const int jatom_list[ntasks], const int iset_list[ntasks],
    const int jset_list[ntasks], const int ipgf_list[ntasks],
    const int jpgf_list[ntasks], const int border_mask_list[ntasks],
    const int block_num_list[ntasks], const double radius_list[ntasks],
    const double rab_list[ntasks][3], const int npts_global[nlevels][3],
    const int npts_local[nlevels][3], const int shift_local[nlevels][3],
    const int border_width[nlevels][3], const double dh[nlevels][3][3],
    const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out) {

  if (*task_list_out != NULL) {
    // This is actually an opportunity to reuse some buffers.
    grid_ref_free_task_list(*task_list_out);
  }

  grid_ref_task_list *task_list = malloc(sizeof(grid_ref_task_list));

  task_list->orthorhombic = orthorhombic;
  task_list->ntasks = ntasks;
  task_list->nlevels = nlevels;
  task_list->natoms = natoms;
  task_list->nkinds = nkinds;
  task_list->nblocks = nblocks;

  size_t size = nblocks * sizeof(int);
  task_list->block_offsets = malloc(size);
  memcpy(task_list->block_offsets, block_offsets, size);

  size = 3 * natoms * sizeof(double);
  task_list->atom_positions = malloc(size);
  memcpy(task_list->atom_positions, atom_positions, size);

  size = natoms * sizeof(int);
  task_list->atom_kinds = malloc(size);
  memcpy(task_list->atom_kinds, atom_kinds, size);

  size = nkinds * sizeof(grid_basis_set *);
  task_list->basis_sets = malloc(size);
  memcpy(task_list->basis_sets, basis_sets, size);

  size = ntasks * sizeof(grid_ref_task);
  task_list->tasks = malloc(size);
  for (int i = 0; i < ntasks; i++) {
    task_list->tasks[i].level = level_list[i];
    task_list->tasks[i].iatom = iatom_list[i];
    task_list->tasks[i].jatom = jatom_list[i];
    task_list->tasks[i].iset = iset_list[i];
    task_list->tasks[i].jset = jset_list[i];
    task_list->tasks[i].ipgf = ipgf_list[i];
    task_list->tasks[i].jpgf = jpgf_list[i];
    task_list->tasks[i].border_mask = border_mask_list[i];
    task_list->tasks[i].block_num = block_num_list[i];
    task_list->tasks[i].radius = radius_list[i];
    task_list->tasks[i].rab[0] = rab_list[i][0];
    task_list->tasks[i].rab[1] = rab_list[i][1];
    task_list->tasks[i].rab[2] = rab_list[i][2];
  }

  // Store grid layouts.
  size = nlevels * sizeof(grid_ref_layout);
  task_list->layouts = malloc(size);
  for (int level = 0; level < nlevels; level++) {
    for (int i = 0; i < 3; i++) {
      task_list->layouts[level].npts_global[i] = npts_global[level][i];
      task_list->layouts[level].npts_local[i] = npts_local[level][i];
      task_list->layouts[level].shift_local[i] = shift_local[level][i];
      task_list->layouts[level].border_width[i] = border_width[level][i];
      for (int j = 0; j < 3; j++) {
        task_list->layouts[level].dh[i][j] = dh[level][i][j];
        task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
      }
    }
  }

  // Sort tasks by level, block_num, iset, and jset.
  qsort(task_list->tasks, ntasks, sizeof(grid_ref_task), &compare_tasks);

  // Find first and last task for each level and block.
  size = nlevels * nblocks * sizeof(int);
  task_list->first_level_block_task = malloc(size);
  task_list->last_level_block_task = malloc(size);
  for (int i = 0; i < nlevels * nblocks; i++) {
    task_list->first_level_block_task[i] = 0;
    task_list->last_level_block_task[i] = -1; // last < first means no tasks
  }
  for (int itask = 0; itask < ntasks; itask++) {
    const int level = task_list->tasks[itask].level - 1;
    const int block_num = task_list->tasks[itask].block_num - 1;
    if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
        task_list->tasks[itask - 1].block_num - 1 != block_num) {
      task_list->first_level_block_task[level * nblocks + block_num] = itask;
    }
    task_list->last_level_block_task[level * nblocks + block_num] = itask;
  }
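  // Layout note: both lookup tables are flattened 2D arrays indexed as
  // [level * nblocks + block_num] (zero-based). Because the tasks were just
  // sorted by (level, block_num, ...), all tasks of a given pair occupy the
  // contiguous range [first, last] recorded here.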

  // Find largest Cartesian subblock size.
  task_list->maxco = 0;
  for (int i = 0; i < nkinds; i++) {
    task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
  }

  // Initialize thread-local storage.
  size = omp_get_max_threads() * sizeof(double *);
  task_list->threadlocals = malloc(size);
  memset(task_list->threadlocals, 0, size);
  size = omp_get_max_threads() * sizeof(size_t);
  task_list->threadlocal_sizes = malloc(size);
  memset(task_list->threadlocal_sizes, 0, size);

  *task_list_out = task_list;
}

/*******************************************************************************
 * \brief Deallocates given task list; basis_sets have to be freed separately.
 * \author Ole Schuett
 ******************************************************************************/
void grid_ref_free_task_list(grid_ref_task_list *task_list) {
  free(task_list->block_offsets);
  free(task_list->atom_positions);
  free(task_list->atom_kinds);
  free(task_list->basis_sets);
  free(task_list->tasks);
  free(task_list->layouts);
  free(task_list->first_level_block_task);
  free(task_list->last_level_block_task);
  for (int i = 0; i < omp_get_max_threads(); i++) {
    if (task_list->threadlocals[i] != NULL) {
      free(task_list->threadlocals[i]);
    }
  }
  free(task_list->threadlocals);
  free(task_list->threadlocal_sizes);
  free(task_list);
}

/*******************************************************************************
 * \brief Prototype for BLAS dgemm.
 * \author Ole Schuett
 ******************************************************************************/
void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
            const int *k, const double *alpha, const double *a, const int *lda,
            const double *b, const int *ldb, const double *beta, double *c,
            const int *ldc);

/*******************************************************************************
 * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
 * \author Ole Schuett
 ******************************************************************************/
static void dgemm(const char transa, const char transb, const int m,
                  const int n, const int k, const double alpha, const double *a,
                  const int lda, const double *b, const int ldb,
                  const double beta, double *c, const int ldc) {
  dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
         &ldc);
}
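// Why the swap works: BLAS dgemm_ expects column-major arrays, while this file
// uses row-major. A row-major matrix M reinterpreted as column-major is exactly
// TRANSPOSE(M), and C = A*B is equivalent to TRANSPOSE(C) = TRANSPOSE(B) *
// TRANSPOSE(A). So computing B*A with the dimensions m and n swapped yields
// the desired row-major C.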

/*******************************************************************************
 * \brief Transforms pab from contracted spherical to prim. Cartesian basis.
 * \author Ole Schuett
 ******************************************************************************/
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
                     const int iset, const int jset, const bool transpose,
                     const double *block, double *pab) {

  // Define some more convenient aliases.
  const int ncoseta = ncoset(ibasis->lmax[iset]);
  const int ncosetb = ncoset(jbasis->lmax[jset]);
  const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
  const int ncob = jbasis->npgf[jset] * ncosetb;

  const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
  const int nsgf_setb = jbasis->nsgf_set[jset];
  const int nsgfa = ibasis->nsgf; // size of entire spherical basis
  const int nsgfb = jbasis->nsgf;
  const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
  const int sgfb = jbasis->first_sgf[jset] - 1;
  const int maxcoa = ibasis->maxco;
  const int maxcob = jbasis->maxco;

  double work[nsgf_setb * ncoa];
  if (transpose) {
    // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
    dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
          &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
          maxcoa, 0.0, work, ncoa);
  } else {
    // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
    dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
          &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
          maxcoa, 0.0, work, ncoa);
  }
  // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
  dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
        maxcob, work, ncoa, 0.0, pab, ncoa);
}
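// Net effect of the two dgemm calls above (a summary, not extra computation):
// pab[ncob][ncoa] = TRANSPOSE(jbasis->sphi) * S * ibasis->sphi, where S is the
// spherical subblock in (b, a) orientation; for transpose == false the stored
// (a, b)-oriented subblock is transposed on the fly by the first dgemm.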

/*******************************************************************************
 * \brief Collocate a range of tasks which are destined for the same grid level.
 * \author Ole Schuett
 ******************************************************************************/
static void collocate_one_grid_level(
    const grid_ref_task_list *task_list, const int *first_block_task,
    const int *last_block_task, const enum grid_func func,
    const int npts_global[3], const int npts_local[3], const int shift_local[3],
    const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
    const double *pab_blocks, offload_buffer *grid) {

// Using default(shared) because with GCC 9 the behavior around const changed:
// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
#pragma omp parallel default(shared)
  {
    const int ithread = omp_get_thread_num();
    const int nthreads = omp_get_num_threads();

    // Initialize variables to detect when a new subblock has to be fetched.
    int old_offset = -1, old_iset = -1, old_jset = -1;

    // Matrix pab is re-used across tasks.
    double pab[task_list->maxco * task_list->maxco];
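    // Note: maxco is the largest Cartesian set size over all kinds, so this
    // scratch buffer can hold the ncob x ncoa matrix of any task.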

    // Ensure that grid can fit into thread-local storage, reallocate if needed.
    const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
    const size_t grid_size = npts_local_total * sizeof(double);
    if (task_list->threadlocal_sizes[ithread] < grid_size) {
      if (task_list->threadlocals[ithread] != NULL) {
        free(task_list->threadlocals[ithread]);
      }
      task_list->threadlocals[ithread] = malloc(grid_size);
      task_list->threadlocal_sizes[ithread] = grid_size;
    }

    // Zero thread-local copy of the grid.
    double *const my_grid = task_list->threadlocals[ithread];
    memset(my_grid, 0, grid_size);

    // Parallelize over blocks to avoid unnecessary calls to load_pab.
    const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
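    // Heuristic: roughly 50 chunks per thread keeps the dynamic schedule's
    // overhead low while still balancing blocks with uneven task counts.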
#pragma omp for schedule(dynamic, chunk_size)
    for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
      const int first_task = first_block_task[block_num];
      const int last_task = last_block_task[block_num];

      for (int itask = first_task; itask <= last_task; itask++) {
        // Define some convenient aliases.
        const grid_ref_task *task = &task_list->tasks[itask];
        const int iatom = task->iatom - 1;
        const int jatom = task->jatom - 1;
        const int iset = task->iset - 1;
        const int jset = task->jset - 1;
        const int ipgf = task->ipgf - 1;
        const int jpgf = task->jpgf - 1;
        const int ikind = task_list->atom_kinds[iatom] - 1;
        const int jkind = task_list->atom_kinds[jatom] - 1;
        const grid_basis_set *ibasis = task_list->basis_sets[ikind];
        const grid_basis_set *jbasis = task_list->basis_sets[jkind];
        const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
        const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
        const int ncoseta = ncoset(ibasis->lmax[iset]);
        const int ncosetb = ncoset(jbasis->lmax[jset]);
        const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
        const int ncob = jbasis->npgf[jset] * ncosetb;
        const int block_num = task->block_num - 1;
        const int block_offset = task_list->block_offsets[block_num];
        const double *block = &pab_blocks[block_offset];
        const bool transpose = (iatom <= jatom);

        // Load subblock from buffer and decontract into Cartesian subblock pab.
        // The previous pab can be reused when only ipgf or jpgf has changed.
        if (block_offset != old_offset || iset != old_iset ||
            jset != old_jset) {
          old_offset = block_offset;
          old_iset = iset;
          old_jset = jset;
          load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
        }

        grid_ref_collocate_pgf_product(
            /*orthorhombic=*/task_list->orthorhombic,
            /*border_mask=*/task->border_mask,
            /*func=*/func,
            /*la_max=*/ibasis->lmax[iset],
            /*la_min=*/ibasis->lmin[iset],
            /*lb_max=*/jbasis->lmax[jset],
            /*lb_min=*/jbasis->lmin[jset],
            /*zeta=*/zeta,
            /*zetb=*/zetb,
            /*rscale=*/(iatom == jatom) ? 1 : 2,
            /*dh=*/dh,
            /*dh_inv=*/dh_inv,
            /*ra=*/&task_list->atom_positions[3 * iatom],
            /*rab=*/task->rab,
            /*npts_global=*/npts_global,
            /*npts_local=*/npts_local,
            /*shift_local=*/shift_local,
            /*border_width=*/border_width,
            /*radius=*/task->radius,
            /*o1=*/ipgf * ncoseta,
            /*o2=*/jpgf * ncosetb,
            /*n1=*/ncoa,
            /*n2=*/ncob,
            /*pab=*/(const double(*)[ncoa])pab,
            /*grid=*/my_grid);

      } // end of task loop
    } // end of block loop

// While there should be an implicit barrier at the end of the block loop, this
// explicit barrier eliminates occasional seg faults with icc compiled binaries.
#pragma omp barrier

    // Merge thread-local grids via an efficient tree reduction.
    const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
    for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
      // Threads are divided into groups, whose size doubles with each cycle.
      // After a cycle the reduced data is stored at first thread of each group.
      const int group_size = 1 << icycle; // 2**icycle
      const int igroup = ithread / group_size;
      const int dest_thread = igroup * group_size;
      const int src_thread = dest_thread + group_size / 2;
      // The last group might actually be a bit smaller.
      const int actual_group_size = imin(group_size, nthreads - dest_thread);
      // Parallelize summation by dividing grid points across group members.
      const int rank = modulo(ithread, group_size); // position within the group
      const int lb = (npts_local_total * rank) / actual_group_size;
      const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
      if (src_thread < nthreads) {
        for (int i = lb; i < ub; i++) {
          task_list->threadlocals[dest_thread][i] +=
              task_list->threadlocals[src_thread][i];
        }
      }
#pragma omp barrier
    }
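    // Example with nthreads = 4: cycle 1 forms groups {0,1} and {2,3}; each
    // pair sums grid(1) into grid(0) and grid(3) into grid(2), the two group
    // members splitting the point range between them. Cycle 2 forms the group
    // {0,1,2,3}; all four threads cooperate to sum grid(2) into grid(0),
    // which then holds the full result.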

    // Copy final result from first thread into shared grid.
    const int lb = (npts_local_total * ithread) / nthreads;
    const int ub = (npts_local_total * (ithread + 1)) / nthreads;
    for (int i = lb; i < ub; i++) {
      grid->host_buffer[i] = task_list->threadlocals[0][i];
    }

  } // end of omp parallel region
}

/*******************************************************************************
 * \brief Collocate all tasks of the given list onto the given grids.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_ref_collocate_task_list(const grid_ref_task_list *task_list,
                                  const enum grid_func func, const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  offload_buffer *grids[nlevels]) {

  assert(task_list->nlevels == nlevels);

  for (int level = 0; level < task_list->nlevels; level++) {
    const int idx = level * task_list->nblocks;
    const int *first_block_task = &task_list->first_level_block_task[idx];
    const int *last_block_task = &task_list->last_level_block_task[idx];
    const grid_ref_layout *layout = &task_list->layouts[level];
    collocate_one_grid_level(
        task_list, first_block_task, last_block_task, func, layout->npts_global,
        layout->npts_local, layout->shift_local, layout->border_width,
        layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
  }
}

/*******************************************************************************
 * \brief Transforms hab from prim. Cartesian to contracted spherical basis.
 * \author Ole Schuett
 ******************************************************************************/
static inline void store_hab(const grid_basis_set *ibasis,
                             const grid_basis_set *jbasis, const int iset,
                             const int jset, const bool transpose,
                             const double *hab, double *block) {

  // Define some more convenient aliases.
  const int ncoseta = ncoset(ibasis->lmax[iset]);
  const int ncosetb = ncoset(jbasis->lmax[jset]);
  const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
  const int ncob = jbasis->npgf[jset] * ncosetb;

  const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
  const int nsgf_setb = jbasis->nsgf_set[jset];
  const int nsgfa = ibasis->nsgf; // size of entire spherical basis
  const int nsgfb = jbasis->nsgf;
  const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
  const int sgfb = jbasis->first_sgf[jset] - 1;
  const int maxcoa = ibasis->maxco;
  const int maxcob = jbasis->maxco;

  double work[nsgf_setb * ncoa];

  // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
  dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
        maxcob, hab, ncoa, 0.0, work, ncoa);

  if (transpose) {
    // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
    dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
          &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
          &block[sgfb * nsgfa + sgfa], nsgfa);
  } else {
    // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
    dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
          &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
          &block[sgfa * nsgfb + sgfb], nsgfb);
  }
}
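// store_hab is the counterpart of load_pab: the two dgemm calls above
// accumulate jbasis->sphi * hab * TRANSPOSE(ibasis->sphi) into the spherical
// subblock (note beta = 1.0), honoring the same `transpose` storage
// convention as the load path.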

/*******************************************************************************
 * \brief Integrate a range of tasks that belong to the same grid level.
 * \author Ole Schuett
 ******************************************************************************/
static void integrate_one_grid_level(
    const grid_ref_task_list *task_list, const int *first_block_task,
    const int *last_block_task, const bool compute_tau, const int natoms,
    const int npts_global[3], const int npts_local[3], const int shift_local[3],
    const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
    const offload_buffer *pab_blocks, const offload_buffer *grid,
    offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {

// Using default(shared) because with GCC 9 the behavior around const changed:
// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
#pragma omp parallel default(shared)
  {
    // Initialize variables to detect when a new subblock has to be fetched.
    int old_offset = -1, old_iset = -1, old_jset = -1;
    grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
    bool old_transpose = false;

    // Matrices pab and hab are re-used across tasks.
    double pab[task_list->maxco * task_list->maxco];
    double hab[task_list->maxco * task_list->maxco];

    // Parallelize over blocks to avoid concurrent access to hab_blocks.
    const int nthreads = omp_get_num_threads();
    const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
#pragma omp for schedule(dynamic, chunk_size)
    for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
      const int first_task = first_block_task[block_num];
      const int last_task = last_block_task[block_num];

      // Accumulate forces per block as it corresponds to a pair of atoms.
      const int iatom = task_list->tasks[first_task].iatom - 1;
      const int jatom = task_list->tasks[first_task].jatom - 1;
      double my_forces[2][3] = {0};
      double my_virials[2][3][3] = {0};

      for (int itask = first_task; itask <= last_task; itask++) {
        // Define some convenient aliases.
        const grid_ref_task *task = &task_list->tasks[itask];
        assert(task->block_num - 1 == block_num);
        assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
        const int ikind = task_list->atom_kinds[iatom] - 1;
        const int jkind = task_list->atom_kinds[jatom] - 1;
        grid_basis_set *ibasis = task_list->basis_sets[ikind];
        grid_basis_set *jbasis = task_list->basis_sets[jkind];
        const int iset = task->iset - 1;
        const int jset = task->jset - 1;
        const int ipgf = task->ipgf - 1;
        const int jpgf = task->jpgf - 1;
        const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
        const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
        const int ncoseta = ncoset(ibasis->lmax[iset]);
        const int ncosetb = ncoset(jbasis->lmax[jset]);
        const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
        const int ncob = jbasis->npgf[jset] * ncosetb;
        const int block_offset = task_list->block_offsets[block_num];
        const bool transpose = (iatom <= jatom);
        const bool pab_required = (forces != NULL || virial != NULL);

        // Load pab and store hab subblocks when needed.
        // Previous hab and pab can be reused when only ipgf or jpgf changed.
        if (block_offset != old_offset || iset != old_iset ||
            jset != old_jset) {
          if (pab_required) {
            load_pab(ibasis, jbasis, iset, jset, transpose,
                     &pab_blocks->host_buffer[block_offset], pab);
          }
          if (old_offset >= 0) { // skip first iteration
            store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
                      hab, &hab_blocks->host_buffer[old_offset]);
          }
          memset(hab, 0, ncoa * ncob * sizeof(double));
          old_offset = block_offset;
          old_iset = iset;
          old_jset = jset;
          old_ibasis = ibasis;
          old_jbasis = jbasis;
          old_transpose = transpose;
        }
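        // Deferred write-back: hab accumulates contributions from all tasks
        // sharing (block, iset, jset); it is flushed via store_hab when the
        // key changes here and once more after the block loop ends below.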

        grid_ref_integrate_pgf_product(
            /*orthorhombic=*/task_list->orthorhombic,
            /*compute_tau=*/compute_tau,
            /*border_mask=*/task->border_mask,
            /*la_max=*/ibasis->lmax[iset],
            /*la_min=*/ibasis->lmin[iset],
            /*lb_max=*/jbasis->lmax[jset],
            /*lb_min=*/jbasis->lmin[jset],
            /*zeta=*/zeta,
            /*zetb=*/zetb,
            /*dh=*/dh,
            /*dh_inv=*/dh_inv,
            /*ra=*/&task_list->atom_positions[3 * iatom],
            /*rab=*/task->rab,
            /*npts_global=*/npts_global,
            /*npts_local=*/npts_local,
            /*shift_local=*/shift_local,
            /*border_width=*/border_width,
            /*radius=*/task->radius,
            /*o1=*/ipgf * ncoseta,
            /*o2=*/jpgf * ncosetb,
            /*n1=*/ncoa,
            /*n2=*/ncob,
            /*grid=*/grid->host_buffer,
            /*hab=*/(double(*)[ncoa])hab,
            /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
            /*forces=*/(forces != NULL) ? my_forces : NULL,
            /*virials=*/(virial != NULL) ? my_virials : NULL,
            /*hdab=*/NULL,
            /*hadb=*/NULL,
            /*a_hdab=*/NULL);

      } // end of task loop

      // Merge thread-local forces and virial into shared ones.
      // It does not seem worth the trouble to accumulate them thread-locally.
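      // The factor scalef mirrors rscale in the collocation path: an
      // off-diagonal block represents both the (i,j) and (j,i) atom pairs,
      // so its contribution counts twice.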
      const double scalef = (iatom == jatom) ? 1.0 : 2.0;
      if (forces != NULL) {
#pragma omp critical(forces)
        for (int i = 0; i < 3; i++) {
          forces[iatom][i] += scalef * my_forces[0][i];
          forces[jatom][i] += scalef * my_forces[1][i];
        }
      }
      if (virial != NULL) {
#pragma omp critical(virial)
        for (int i = 0; i < 3; i++) {
          for (int j = 0; j < 3; j++) {
            virial[i][j] += scalef * my_virials[0][i][j];
            virial[i][j] += scalef * my_virials[1][i][j];
          }
        }
      }

    } // end of block loop

    // Store final hab.
    if (old_offset >= 0) {
      store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
                &hab_blocks->host_buffer[old_offset]);
    }

  } // end of omp parallel region
}

/*******************************************************************************
 * \brief Integrate all tasks of the given list from the given grids.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_ref_integrate_task_list(
    const grid_ref_task_list *task_list, const bool compute_tau,
    const int natoms, const int nlevels, const offload_buffer *pab_blocks,
    const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
    double forces[natoms][3], double virial[3][3]) {

  assert(task_list->nlevels == nlevels);
  assert(task_list->natoms == natoms);

  // Zero result arrays.
  memset(hab_blocks->host_buffer, 0, hab_blocks->size);
  if (forces != NULL) {
    memset(forces, 0, natoms * 3 * sizeof(double));
  }
  if (virial != NULL) {
    memset(virial, 0, 9 * sizeof(double));
  }

  for (int level = 0; level < task_list->nlevels; level++) {
    const int idx = level * task_list->nblocks;
    const int *first_block_task = &task_list->first_level_block_task[idx];
    const int *last_block_task = &task_list->last_level_block_task[idx];
    const grid_ref_layout *layout = &task_list->layouts[level];
    integrate_one_grid_level(
        task_list, first_block_task, last_block_task, compute_tau, natoms,
        layout->npts_global, layout->npts_local, layout->shift_local,
        layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
        grids[level], hab_blocks, forces, virial);
  }
}
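
// Typical call sequence (a caller-side sketch; the parameter arrays and
// offload buffers shown are placeholders whose setup is omitted, and func
// stands for any enum grid_func value):
//
//   grid_ref_task_list *tl = NULL;
//   grid_ref_create_task_list(orthorhombic, ntasks, nlevels, natoms, nkinds,
//                             nblocks, ..., &tl);
//   grid_ref_collocate_task_list(tl, func, nlevels, pab_blocks, grids);
//   grid_ref_integrate_task_list(tl, /*compute_tau=*/false, natoms, nlevels,
//                                pab_blocks, grids, hab_blocks, forces,
//                                virial);
//   grid_ref_free_task_list(tl);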

// EOF