(git:0d88fc2)
Loading...
Searching...
No Matches
grid_ref_task_list.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <math.h>
10#include <omp.h>
11#include <stdint.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15
16#include "../common/grid_common.h"
17#include "grid_ref_collocate.h"
18#include "grid_ref_integrate.h"
19#include "grid_ref_task_list.h"
21
22/*******************************************************************************
23 * \brief Comperator passed to qsort to compare two tasks.
24 * \author Ole Schuett
25 ******************************************************************************/
26static int compare_tasks(const void *a, const void *b) {
27 const grid_ref_task *task_a = a, *task_b = b;
28 if (task_a->level != task_b->level) {
29 return task_a->level - task_b->level;
30 } else if (task_a->block_num != task_b->block_num) {
31 return task_a->block_num - task_b->block_num;
32 } else if (task_a->iset != task_b->iset) {
33 return task_a->iset - task_b->iset;
34 } else {
35 return task_a->jset - task_b->jset;
36 }
37}
38/*******************************************************************************
39 * \brief Allocates a task list for the reference backend.
40 * See grid_task_list.h for details.
41 * \author Ole Schuett
42 ******************************************************************************/
44 const bool orthorhombic, const int ntasks, const int nlevels,
45 const int natoms, const int nkinds, const int nblocks,
46 const int block_offsets[nblocks], const double atom_positions[natoms][3],
47 const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
48 const int level_list[ntasks], const int iatom_list[ntasks],
49 const int jatom_list[ntasks], const int iset_list[ntasks],
50 const int jset_list[ntasks], const int ipgf_list[ntasks],
51 const int jpgf_list[ntasks], const int border_mask_list[ntasks],
52 const int block_num_list[ntasks], const double radius_list[ntasks],
53 const double rab_list[ntasks][3], const int npts_global[nlevels][3],
54 const int npts_local[nlevels][3], const int shift_local[nlevels][3],
55 const int border_width[nlevels][3], const double dh[nlevels][3][3],
56 const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out) {
57
58 if (*task_list_out != NULL) {
59 // This is actually an opportunity to reuse some buffers.
60 grid_ref_free_task_list(*task_list_out);
61 }
62
64 malloc(sizeof(grid_ref_task_list_internal));
65 assert(task_list != NULL);
66
67 task_list->orthorhombic = orthorhombic;
68 task_list->ntasks = ntasks;
69 task_list->nlevels = nlevels;
70 task_list->natoms = natoms;
71 task_list->nkinds = nkinds;
72 task_list->nblocks = nblocks;
73
74 size_t size = nblocks * sizeof(int);
75 task_list->block_offsets = malloc(size);
76 assert(task_list->block_offsets != NULL || size == 0);
77 if (size != 0) {
78 memcpy(task_list->block_offsets, block_offsets, size);
79 }
80
81 size = 3 * natoms * sizeof(double);
82 task_list->atom_positions = malloc(size);
83 assert(task_list->atom_positions != NULL || size == 0);
84 if (size != 0) {
85 memcpy(task_list->atom_positions, atom_positions, size);
86 }
87
88 size = natoms * sizeof(int);
89 task_list->atom_kinds = malloc(size);
90 assert(task_list->atom_kinds != NULL || size == 0);
91 if (size != 0) {
92 memcpy(task_list->atom_kinds, atom_kinds, size);
93 }
94
95 size = nkinds * sizeof(grid_basis_set *);
96 task_list->basis_sets = malloc(size);
97 assert(task_list->basis_sets != NULL || size == 0);
98 if (size != 0) {
99 memcpy(task_list->basis_sets, basis_sets, size);
100 }
101
102 size = ntasks * sizeof(grid_ref_task);
103 task_list->tasks = malloc(size);
104 assert(task_list->tasks != NULL || size == 0);
105#pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
106 for (int i = 0; i < ntasks; i++) {
107 task_list->tasks[i].level = level_list[i];
108 task_list->tasks[i].iatom = iatom_list[i];
109 task_list->tasks[i].jatom = jatom_list[i];
110 task_list->tasks[i].iset = iset_list[i];
111 task_list->tasks[i].jset = jset_list[i];
112 task_list->tasks[i].ipgf = ipgf_list[i];
113 task_list->tasks[i].jpgf = jpgf_list[i];
114 task_list->tasks[i].border_mask = border_mask_list[i];
115 task_list->tasks[i].block_num = block_num_list[i];
116 task_list->tasks[i].radius = radius_list[i];
117 task_list->tasks[i].rab[0] = rab_list[i][0];
118 task_list->tasks[i].rab[1] = rab_list[i][1];
119 task_list->tasks[i].rab[2] = rab_list[i][2];
120 }
121
122 // Store grid layouts.
123 size = nlevels * sizeof(grid_ref_layout);
124 task_list->layouts = malloc(size);
125 assert(task_list->layouts != NULL || size == 0);
126 for (int level = 0; level < nlevels; level++) {
127 for (int i = 0; i < 3; i++) {
128 task_list->layouts[level].npts_global[i] = npts_global[level][i];
129 task_list->layouts[level].npts_local[i] = npts_local[level][i];
130 task_list->layouts[level].shift_local[i] = shift_local[level][i];
131 task_list->layouts[level].border_width[i] = border_width[level][i];
132 for (int j = 0; j < 3; j++) {
133 task_list->layouts[level].dh[i][j] = dh[level][i][j];
134 task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
135 }
136 }
137 }
138
139 // Sort tasks by level, block_num, iset, and jset.
140 qsort(task_list->tasks, ntasks, sizeof(grid_ref_task), &compare_tasks);
141
142 // Find first and last task for each level and block.
143 size = nlevels * nblocks * sizeof(int);
144 task_list->first_level_block_task = malloc(size);
145 assert(task_list->first_level_block_task != NULL || size == 0);
146 task_list->last_level_block_task = malloc(size);
147 assert(task_list->last_level_block_task != NULL || size == 0);
148 const int nlevel_blocks = nlevels * nblocks;
149#pragma omp parallel for schedule(static) if (nlevel_blocks > \
150 GRID_OMP_MIN_ITERATIONS)
151 for (int i = 0; i < nlevel_blocks; i++) {
152 task_list->first_level_block_task[i] = 0;
153 task_list->last_level_block_task[i] = -1; // last < first means no tasks
154 }
155 for (int itask = 0; itask < ntasks; itask++) {
156 const int level = task_list->tasks[itask].level - 1;
157 const int block_num = task_list->tasks[itask].block_num - 1;
158 if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
159 task_list->tasks[itask - 1].block_num - 1 != block_num) {
160 task_list->first_level_block_task[level * nblocks + block_num] = itask;
161 }
162 task_list->last_level_block_task[level * nblocks + block_num] = itask;
163 }
164
165 // Find largest Cartesian subblock size.
166 task_list->maxco = 0;
167 for (int i = 0; i < nkinds; i++) {
168 task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
169 }
170
171 // Initialize thread-local storage.
172 size = omp_get_max_threads() * sizeof(double *);
173 task_list->threadlocals = malloc(size);
174 assert(task_list->threadlocals != NULL);
175 memset(task_list->threadlocals, 0, size);
176 size = omp_get_max_threads() * sizeof(size_t);
177 task_list->threadlocal_sizes = malloc(size);
178 assert(task_list->threadlocal_sizes != NULL);
179 memset(task_list->threadlocal_sizes, 0, size);
180
181 *task_list_out = task_list;
182}
183
184/*******************************************************************************
185 * \brief Deallocates given task list, basis_sets have to be freed separately.
186 * \author Ole Schuett
187 ******************************************************************************/
189 if (ptr == NULL)
190 return;
191
193
194 free(task_list->block_offsets);
195 free(task_list->atom_positions);
196 free(task_list->atom_kinds);
197 free(task_list->basis_sets);
198 free(task_list->tasks);
199 free(task_list->layouts);
200 free(task_list->first_level_block_task);
201 free(task_list->last_level_block_task);
202 for (int i = 0; i < omp_get_max_threads(); i++) {
203 if (task_list->threadlocals[i] != NULL) {
204 free(task_list->threadlocals[i]);
205 }
206 }
207 free(task_list->threadlocals);
208 free(task_list->threadlocal_sizes);
209 free(task_list);
210}
211
212/*******************************************************************************
213 * \brief Prototype for BLAS dgemm.
214 * \author Ole Schuett
215 ******************************************************************************/
216void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
217 const int *k, const double *alpha, const double *a, const int *lda,
218 const double *b, const int *ldb, const double *beta, double *c,
219 const int *ldc);
220
221/*******************************************************************************
222 * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
223 * \author Ole Schuett
224 ******************************************************************************/
225static void dgemm(const char transa, const char transb, const int m,
226 const int n, const int k, const double alpha, const double *a,
227 const int lda, const double *b, const int ldb,
228 const double beta, double *c, const int ldc) {
229 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
230 &ldc);
231}
232
233/*******************************************************************************
234 * \brief Transforms pab from contracted spherical to prim. cartesian basis.
235 * \author Ole Schuett
236 ******************************************************************************/
237static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
238 const int iset, const int jset, const bool transpose,
239 const double *block, double *pab) {
240
241 // Define some more convenient aliases.
242 const int ncoseta = ncoset(ibasis->lmax[iset]);
243 const int ncosetb = ncoset(jbasis->lmax[jset]);
244 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
245 const int ncob = jbasis->npgf[jset] * ncosetb;
246
247 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
248 const int nsgf_setb = jbasis->nsgf_set[jset];
249 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
250 const int nsgfb = jbasis->nsgf;
251 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
252 const int sgfb = jbasis->first_sgf[jset] - 1;
253 const int maxcoa = ibasis->maxco;
254 const int maxcob = jbasis->maxco;
255
256 double work[nsgf_setb * ncoa];
257 if (transpose) {
258 // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
259 dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
260 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
261 maxcoa, 0.0, work, ncoa);
262 } else {
263 // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
264 dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
265 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
266 maxcoa, 0.0, work, ncoa);
267 }
268 // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
269 dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
270 maxcob, work, ncoa, 0.0, pab, ncoa);
271}
272
273/*******************************************************************************
274 * \brief Collocate a range of tasks which are destined for the same grid level.
275 * \author Ole Schuett
276 ******************************************************************************/
278 const grid_ref_task_list *ptr, const int *first_block_task,
279 const int *last_block_task, const enum grid_func func,
280 const int npts_global[3], const int npts_local[3], const int shift_local[3],
281 const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
282 const double *pab_blocks, offload_buffer *grid) {
283
284 if (ptr == NULL)
285 return;
286
288
289// Using default(shared) because with GCC 9 the behavior around const changed:
290// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
291#pragma omp parallel default(shared)
292 {
293 const int ithread = omp_get_thread_num();
294 const int nthreads = omp_get_num_threads();
295
296 // Initialize variables to detect when a new subblock has to be fetched.
297 int old_offset = -1, old_iset = -1, old_jset = -1;
298
299 // Matrix pab is re-used across tasks.
300 double pab[imax(task_list->maxco * task_list->maxco, 1)];
301
302 // Ensure that grid can fit into thread-local storage, reallocate if needed.
303 const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
304 const size_t grid_size = npts_local_total * sizeof(double);
305 if (task_list->threadlocal_sizes[ithread] < grid_size) {
306 if (task_list->threadlocals[ithread] != NULL) {
307 free(task_list->threadlocals[ithread]);
308 }
309 task_list->threadlocals[ithread] = malloc(grid_size);
310 assert(task_list->threadlocals[ithread] != NULL);
311 task_list->threadlocal_sizes[ithread] = grid_size;
312 }
313
314 // Zero thread-local copy of the grid.
315 double *const my_grid = task_list->threadlocals[ithread];
316 memset(my_grid, 0, grid_size);
317
318 // Parallelize over blocks to avoid unnecessary calls to load_pab.
319 const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
320#pragma omp for schedule(dynamic, chunk_size)
321 for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
322 const int first_task = first_block_task[block_num];
323 const int last_task = last_block_task[block_num];
324
325 for (int itask = first_task; itask <= last_task; itask++) {
326 // Define some convenient aliases.
327 const grid_ref_task *task = &task_list->tasks[itask];
328 const int iatom = task->iatom - 1;
329 const int jatom = task->jatom - 1;
330 const int iset = task->iset - 1;
331 const int jset = task->jset - 1;
332 const int ipgf = task->ipgf - 1;
333 const int jpgf = task->jpgf - 1;
334 const int ikind = task_list->atom_kinds[iatom] - 1;
335 const int jkind = task_list->atom_kinds[jatom] - 1;
336 const grid_basis_set *ibasis = task_list->basis_sets[ikind];
337 const grid_basis_set *jbasis = task_list->basis_sets[jkind];
338 const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
339 const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
340 const int ncoseta = ncoset(ibasis->lmax[iset]);
341 const int ncosetb = ncoset(jbasis->lmax[jset]);
342 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
343 const int ncob = jbasis->npgf[jset] * ncosetb;
344 const int block_num = task->block_num - 1;
345 const int block_offset = task_list->block_offsets[block_num];
346 const double *block = &pab_blocks[block_offset];
347 const bool transpose = (iatom <= jatom);
348
349 // Load subblock from buffer and decontract into Cartesian sublock pab.
350 // The previous pab can be reused when only ipgf or jpgf has changed.
351 if (block_offset != old_offset || iset != old_iset ||
352 jset != old_jset) {
353 old_offset = block_offset;
354 old_iset = iset;
355 old_jset = jset;
356 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
357 }
358
360 /*orthorhombic=*/task_list->orthorhombic,
361 /*border_mask=*/task->border_mask,
362 /*func=*/func,
363 /*la_max=*/ibasis->lmax[iset],
364 /*la_min=*/ibasis->lmin[iset],
365 /*lb_max=*/jbasis->lmax[jset],
366 /*lb_min=*/jbasis->lmin[jset],
367 /*zeta=*/zeta,
368 /*zetb=*/zetb,
369 /*rscale=*/(iatom == jatom) ? 1 : 2,
370 /*dh=*/dh,
371 /*dh_inv=*/dh_inv,
372 /*ra=*/&task_list->atom_positions[3 * iatom],
373 /*rab=*/task->rab,
374 /*npts_global=*/npts_global,
375 /*npts_local=*/npts_local,
376 /*shift_local=*/shift_local,
377 /*border_width=*/border_width,
378 /*radius=*/task->radius,
379 /*o1=*/ipgf * ncoseta,
380 /*o2=*/jpgf * ncosetb,
381 /*n1=*/ncoa,
382 /*n2=*/ncob,
383 /*pab=*/(const double(*)[ncoa])pab,
384 /*grid=*/my_grid);
385
386 } // end of task loop
387 } // end of block loop
388
389// While there should be an implicit barrier at the end of the block loop, this
390// explicit barrier eliminates occasional seg faults with icc compiled binaries.
391#pragma omp barrier
392
393 // Merge thread-local grids via an efficient tree reduction.
394 const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
395 for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
396 // Threads are divided into groups, whose size doubles with each cycle.
397 // After a cycle the reduced data is stored at first thread of each group.
398 const int group_size = 1 << icycle; // 2**icycle
399 const int igroup = ithread / group_size;
400 const int dest_thread = igroup * group_size;
401 const int src_thread = dest_thread + group_size / 2;
402 // The last group might actually be a bit smaller.
403 const int actual_group_size = imin(group_size, nthreads - dest_thread);
404 // Parallelize summation by dividing grid points across group members.
405 const int rank = modulo(ithread, group_size); // position within the group
406 const int lb = (npts_local_total * rank) / actual_group_size;
407 const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
408 if (src_thread < nthreads) {
410 for (int i = lb; i < ub; i++) {
411 task_list->threadlocals[dest_thread][i] +=
412 task_list->threadlocals[src_thread][i];
413 }
414 }
415#pragma omp barrier
416 }
417
418 // Copy final result from first thread into shared grid.
419 const int lb = (npts_local_total * ithread) / nthreads;
420 const int ub = (npts_local_total * (ithread + 1)) / nthreads;
422 for (int i = lb; i < ub; i++) {
423 grid->host_buffer[i] = task_list->threadlocals[0][i];
424 }
425
426 } // end of omp parallel region
427}
428
429/*******************************************************************************
430 * \brief Collocate all tasks of in given list onto given grids.
431 * See grid_task_list.h for details.
432 * \author Ole Schuett
433 ******************************************************************************/
435 const enum grid_func func, const int nlevels,
436 const offload_buffer *pab_blocks,
437 offload_buffer *grids[nlevels]) {
438
439 if (ptr == NULL)
440 return;
441
443
444 assert(task_list->nlevels == nlevels);
445
446 for (int level = 0; level < task_list->nlevels; level++) {
447 const int idx = level * task_list->nblocks;
448 const int *first_block_task = &task_list->first_level_block_task[idx];
449 const int *last_block_task = &task_list->last_level_block_task[idx];
450 const grid_ref_layout *layout = &task_list->layouts[level];
452 task_list, first_block_task, last_block_task, func, layout->npts_global,
453 layout->npts_local, layout->shift_local, layout->border_width,
454 layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
455 }
456}
457
458/*******************************************************************************
459 * \brief Transforms hab from prim. cartesian to contracted spherical basis.
460 * \author Ole Schuett
461 ******************************************************************************/
462static inline void store_hab(const grid_basis_set *ibasis,
463 const grid_basis_set *jbasis, const int iset,
464 const int jset, const bool transpose,
465 const double *hab, double *block) {
466
467 // Define some more convenient aliases.
468 const int ncoseta = ncoset(ibasis->lmax[iset]);
469 const int ncosetb = ncoset(jbasis->lmax[jset]);
470 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
471 const int ncob = jbasis->npgf[jset] * ncosetb;
472
473 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
474 const int nsgf_setb = jbasis->nsgf_set[jset];
475 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
476 const int nsgfb = jbasis->nsgf;
477 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
478 const int sgfb = jbasis->first_sgf[jset] - 1;
479 const int maxcoa = ibasis->maxco;
480 const int maxcob = jbasis->maxco;
481
482 double work[nsgf_setb * ncoa];
483
484 // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
485 dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
486 maxcob, hab, ncoa, 0.0, work, ncoa);
487
488 if (transpose) {
489 // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
490 dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
491 &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
492 &block[sgfb * nsgfa + sgfa], nsgfa);
493 } else {
494 // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
495 dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
496 &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
497 &block[sgfa * nsgfb + sgfb], nsgfb);
498 }
499}
500
501/*******************************************************************************
502 * \brief Integrate a range of tasks that belong to the same grid level.
503 * \author Ole Schuett
504 ******************************************************************************/
506 const grid_ref_task_list *ptr, const int *first_block_task,
507 const int *last_block_task, const bool compute_tau, const int natoms,
508 const int npts_global[3], const int npts_local[3], const int shift_local[3],
509 const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
510 const offload_buffer *pab_blocks, const offload_buffer *grid,
511 offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
512
513 if (ptr == NULL)
514 return;
515
517
518// Using default(shared) because with GCC 9 the behavior around const changed:
519// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
520#pragma omp parallel default(shared)
521 {
522 // Initialize variables to detect when a new subblock has to be fetched.
523 int old_offset = -1, old_iset = -1, old_jset = -1;
524 grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
525 bool old_transpose = false;
526
527 // Matrix pab and hab are re-used across tasks.
528 double pab[imax(task_list->maxco * task_list->maxco, 1)];
529 double hab[imax(task_list->maxco * task_list->maxco, 1)];
530
531 // Parallelize over blocks to avoid concurred access to hab_blocks.
532 const int nthreads = omp_get_num_threads();
533 const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
534#pragma omp for schedule(dynamic, chunk_size)
535 for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
536 const int first_task = first_block_task[block_num];
537 const int last_task = last_block_task[block_num];
538
539 // Accumulate forces per block as it corresponds to a pair of atoms.
540 const int iatom = task_list->tasks[first_task].iatom - 1;
541 const int jatom = task_list->tasks[first_task].jatom - 1;
542 double my_forces[2][3] = {0};
543 double my_virials[2][3][3] = {0};
544
545 for (int itask = first_task; itask <= last_task; itask++) {
546 // Define some convenient aliases.
547 const grid_ref_task *task = &task_list->tasks[itask];
548 assert(task->block_num - 1 == block_num);
549 assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
550 const int ikind = task_list->atom_kinds[iatom] - 1;
551 const int jkind = task_list->atom_kinds[jatom] - 1;
552 grid_basis_set *ibasis = task_list->basis_sets[ikind];
553 grid_basis_set *jbasis = task_list->basis_sets[jkind];
554 const int iset = task->iset - 1;
555 const int jset = task->jset - 1;
556 const int ipgf = task->ipgf - 1;
557 const int jpgf = task->jpgf - 1;
558 const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
559 const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
560 const int ncoseta = ncoset(ibasis->lmax[iset]);
561 const int ncosetb = ncoset(jbasis->lmax[jset]);
562 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
563 const int ncob = jbasis->npgf[jset] * ncosetb;
564 const int block_offset = task_list->block_offsets[block_num];
565 const bool transpose = (iatom <= jatom);
566 const bool pab_required = (forces != NULL || virial != NULL);
567
568 // Load pab and store hab subblocks when needed.
569 // Previous hab and pab can be reused when only ipgf or jpgf changed.
570 if (block_offset != old_offset || iset != old_iset ||
571 jset != old_jset) {
572 if (pab_required) {
573 load_pab(ibasis, jbasis, iset, jset, transpose,
574 &pab_blocks->host_buffer[block_offset], pab);
575 }
576 if (old_offset >= 0) { // skip first iteration
577 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
578 hab, &hab_blocks->host_buffer[old_offset]);
579 }
580 memset(hab, 0, ncoa * ncob * sizeof(double));
581 old_offset = block_offset;
582 old_iset = iset;
583 old_jset = jset;
584 old_ibasis = ibasis;
585 old_jbasis = jbasis;
586 old_transpose = transpose;
587 }
588
590 /*orthorhombic=*/task_list->orthorhombic,
591 /*compute_tau=*/compute_tau,
592 /*border_mask=*/task->border_mask,
593 /*la_max=*/ibasis->lmax[iset],
594 /*la_min=*/ibasis->lmin[iset],
595 /*lb_max=*/jbasis->lmax[jset],
596 /*lb_min=*/jbasis->lmin[jset],
597 /*zeta=*/zeta,
598 /*zetb=*/zetb,
599 /*dh=*/dh,
600 /*dh_inv=*/dh_inv,
601 /*ra=*/&task_list->atom_positions[3 * iatom],
602 /*rab=*/task->rab,
603 /*npts_global=*/npts_global,
604 /*npts_local=*/npts_local,
605 /*shift_local=*/shift_local,
606 /*border_width=*/border_width,
607 /*radius=*/task->radius,
608 /*o1=*/ipgf * ncoseta,
609 /*o2=*/jpgf * ncosetb,
610 /*n1=*/ncoa,
611 /*n2=*/ncob,
612 /*grid=*/grid->host_buffer,
613 /*hab=*/(double(*)[ncoa])hab,
614 /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
615 /*forces=*/(forces != NULL) ? my_forces : NULL,
616 /*virials=*/(virial != NULL) ? my_virials : NULL,
617 /*hdab=*/NULL,
618 /*hadb=*/NULL,
619 /*a_hdab=*/NULL);
620
621 } // end of task loop
622
623 // Merge thread-local forces and virial into shared ones.
624 // It does not seem worth the trouble to accumulate them thread-locally.
625 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
626 if (forces != NULL) {
627#pragma omp critical(forces)
628 for (int i = 0; i < 3; i++) {
629 forces[iatom][i] += scalef * my_forces[0][i];
630 forces[jatom][i] += scalef * my_forces[1][i];
631 }
632 }
633 if (virial != NULL) {
634#pragma omp critical(virial)
635 for (int i = 0; i < 3; i++) {
636 for (int j = 0; j < 3; j++) {
637 virial[i][j] += scalef * my_virials[0][i][j];
638 virial[i][j] += scalef * my_virials[1][i][j];
639 }
640 }
641 }
642
643 } // end of block loop
644
645 // store final hab
646 if (old_offset >= 0) {
647 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
648 &hab_blocks->host_buffer[old_offset]);
649 }
650
651 } // end of omp parallel region
652}
653
654/*******************************************************************************
655 * \brief Integrate all tasks of in given list from given grids.
656 * See grid_task_list.h for details.
657 * \author Ole Schuett
658 ******************************************************************************/
660 const grid_ref_task_list *ptr, const bool compute_tau, const int natoms,
661 const int nlevels, const offload_buffer *pab_blocks,
662 const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
663 double forces[natoms][3], double virial[3][3]) {
664 if (ptr == NULL)
665 return;
666
668 assert(task_list->nlevels == nlevels);
669 assert(task_list->natoms == natoms);
670
671 // Zero result arrays.
672 memset(hab_blocks->host_buffer, 0, hab_blocks->size);
673 if (forces != NULL) {
674 memset(forces, 0, natoms * 3 * sizeof(double));
675 }
676 if (virial != NULL) {
677 memset(virial, 0, 9 * sizeof(double));
678 }
679
680 for (int level = 0; level < task_list->nlevels; level++) {
681 const int idx = level * task_list->nblocks;
682 const int *first_block_task = &task_list->first_level_block_task[idx];
683 const int *last_block_task = &task_list->last_level_block_task[idx];
684 const grid_ref_layout *layout = &task_list->layouts[level];
686 task_list, first_block_task, last_block_task, compute_tau, natoms,
687 layout->npts_global, layout->npts_local, layout->shift_local,
688 layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
689 grids[level], hab_blocks, forces, virial);
690 }
691}
692
693// EOF
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
Definition dbm_miniapp.c:36
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
#define GRID_PRAGMA_SIMD_LOOP
Definition grid_common.h:25
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
grid_func
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
const int ub
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_ref_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Collocates a single product of primitiv Gaussians. See grid_ref_collocate.h for details.
void grid_ref_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_ref_integrate.h for details.
void grid_ref_free_task_list(grid_ref_task_list *ptr)
Deallocates given task list, basis_sets have to be freed separately.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
static void integrate_one_grid_level(const grid_ref_task_list *ptr, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
void grid_ref_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_ref_task_list **task_list_out)
Allocates a task list for the reference backend. See grid_task_list.h for details.
void grid_ref_integrate_task_list(const grid_ref_task_list *ptr, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
void grid_ref_collocate_task_list(const grid_ref_task_list *ptr, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
static void collocate_one_grid_level(const grid_ref_task_list *ptr, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
void grid_ref_task_list
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
Internal representation of a task.
Internal representation of a buffer.
double * host_buffer