grid_cpu_task_list.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
/* */
/* SPDX-License-Identifier: BSD-3-Clause */
/*----------------------------------------------------------------------------*/

#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "../common/grid_common.h"
#include "grid_cpu_collocate.h"
#include "grid_cpu_integrate.h"
#include "grid_cpu_task_list.h"

/*******************************************************************************
 * \brief Comparator passed to qsort to compare two tasks.
 * \author Ole Schuett
 ******************************************************************************/
static int compare_tasks(const void *a, const void *b) {
  const grid_cpu_task *task_a = a, *task_b = b;
  if (task_a->level != task_b->level) {
    return task_a->level - task_b->level;
  } else if (task_a->block_num != task_b->block_num) {
    return task_a->block_num - task_b->block_num;
  } else if (task_a->iset != task_b->iset) {
    return task_a->iset - task_b->iset;
  } else {
    return task_a->jset - task_b->jset;
  }
}
/*******************************************************************************
 * \brief Allocates a task list for the cpu backend.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_cpu_create_task_list(
    const bool orthorhombic, const int ntasks, const int nlevels,
    const int natoms, const int nkinds, const int nblocks,
    const int block_offsets[nblocks], const double atom_positions[natoms][3],
    const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
    const int level_list[ntasks], const int iatom_list[ntasks],
    const int jatom_list[ntasks], const int iset_list[ntasks],
    const int jset_list[ntasks], const int ipgf_list[ntasks],
    const int jpgf_list[ntasks], const int border_mask_list[ntasks],
    const int block_num_list[ntasks], const double radius_list[ntasks],
    const double rab_list[ntasks][3], const int npts_global[nlevels][3],
    const int npts_local[nlevels][3], const int shift_local[nlevels][3],
    const int border_width[nlevels][3], const double dh[nlevels][3][3],
    const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out) {

  if (*task_list_out != NULL) {
    // This is actually an opportunity to reuse some buffers.
    grid_cpu_free_task_list(*task_list_out);
  }

  grid_cpu_task_list *task_list = malloc(sizeof(grid_cpu_task_list));
  assert(task_list != NULL);

  task_list->orthorhombic = orthorhombic;
  task_list->ntasks = ntasks;
  task_list->nlevels = nlevels;
  task_list->natoms = natoms;
  task_list->nkinds = nkinds;
  task_list->nblocks = nblocks;

  size_t size = nblocks * sizeof(int);
  task_list->block_offsets = malloc(size);
  assert(task_list->block_offsets != NULL);
  memcpy(task_list->block_offsets, block_offsets, size);

  size = 3 * natoms * sizeof(double);
  task_list->atom_positions = malloc(size);
  assert(task_list->atom_positions != NULL);
  memcpy(task_list->atom_positions, atom_positions, size);

  size = natoms * sizeof(int);
  task_list->atom_kinds = malloc(size);
  assert(task_list->atom_kinds != NULL);
  memcpy(task_list->atom_kinds, atom_kinds, size);

  size = nkinds * sizeof(grid_basis_set *);
  task_list->basis_sets = malloc(size);
  assert(task_list->basis_sets != NULL);
  memcpy(task_list->basis_sets, basis_sets, size);

  size = ntasks * sizeof(grid_cpu_task);
  task_list->tasks = malloc(size);
  assert(task_list->tasks != NULL);
  for (int i = 0; i < ntasks; i++) {
    task_list->tasks[i].level = level_list[i];
    task_list->tasks[i].iatom = iatom_list[i];
    task_list->tasks[i].jatom = jatom_list[i];
    task_list->tasks[i].iset = iset_list[i];
    task_list->tasks[i].jset = jset_list[i];
    task_list->tasks[i].ipgf = ipgf_list[i];
    task_list->tasks[i].jpgf = jpgf_list[i];
    task_list->tasks[i].border_mask = border_mask_list[i];
    task_list->tasks[i].block_num = block_num_list[i];
    task_list->tasks[i].radius = radius_list[i];
    task_list->tasks[i].rab[0] = rab_list[i][0];
    task_list->tasks[i].rab[1] = rab_list[i][1];
    task_list->tasks[i].rab[2] = rab_list[i][2];
  }

  // Store grid layouts.
  size = nlevels * sizeof(grid_cpu_layout);
  task_list->layouts = malloc(size);
  assert(task_list->layouts != NULL);
  for (int level = 0; level < nlevels; level++) {
    for (int i = 0; i < 3; i++) {
      task_list->layouts[level].npts_global[i] = npts_global[level][i];
      task_list->layouts[level].npts_local[i] = npts_local[level][i];
      task_list->layouts[level].shift_local[i] = shift_local[level][i];
      task_list->layouts[level].border_width[i] = border_width[level][i];
      for (int j = 0; j < 3; j++) {
        task_list->layouts[level].dh[i][j] = dh[level][i][j];
        task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
      }
    }
  }

  // Sort tasks by level, block_num, iset, and jset.
  qsort(task_list->tasks, ntasks, sizeof(grid_cpu_task), &compare_tasks);

  // Find first and last task for each level and block.
  size = nlevels * nblocks * sizeof(int);
  task_list->first_level_block_task = malloc(size);
  assert(task_list->first_level_block_task != NULL);
  task_list->last_level_block_task = malloc(size);
  assert(task_list->last_level_block_task != NULL);
  for (int i = 0; i < nlevels * nblocks; i++) {
    task_list->first_level_block_task[i] = 0;
    task_list->last_level_block_task[i] = -1; // last < first means no tasks
  }
  for (int itask = 0; itask < ntasks; itask++) {
    const int level = task_list->tasks[itask].level - 1;
    const int block_num = task_list->tasks[itask].block_num - 1;
    if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
        task_list->tasks[itask - 1].block_num - 1 != block_num) {
      task_list->first_level_block_task[level * nblocks + block_num] = itask;
    }
    task_list->last_level_block_task[level * nblocks + block_num] = itask;
  }

  // Find largest Cartesian subblock size.
  task_list->maxco = 0;
  for (int i = 0; i < nkinds; i++) {
    task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
  }

  // Initialize thread-local storage.
  size = omp_get_max_threads() * sizeof(double *);
  task_list->threadlocals = malloc(size);
  assert(task_list->threadlocals != NULL);
  memset(task_list->threadlocals, 0, size);
  size = omp_get_max_threads() * sizeof(size_t);
  task_list->threadlocal_sizes = malloc(size);
  assert(task_list->threadlocal_sizes != NULL);
  memset(task_list->threadlocal_sizes, 0, size);

  *task_list_out = task_list;
}

/*******************************************************************************
 * \brief Deallocates given task list; basis_sets have to be freed separately.
 * \author Ole Schuett
 ******************************************************************************/
void grid_cpu_free_task_list(grid_cpu_task_list *task_list) {
  free(task_list->block_offsets);
  free(task_list->atom_positions);
  free(task_list->atom_kinds);
  free(task_list->basis_sets);
  free(task_list->tasks);
  free(task_list->layouts);
  free(task_list->first_level_block_task);
  free(task_list->last_level_block_task);
  for (int i = 0; i < omp_get_max_threads(); i++) {
    if (task_list->threadlocals[i] != NULL) {
      free(task_list->threadlocals[i]);
    }
  }
  free(task_list->threadlocals);
  free(task_list->threadlocal_sizes);
  free(task_list);
}

/*******************************************************************************
 * \brief Prototype for BLAS dgemm.
 * \author Ole Schuett
 ******************************************************************************/
void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
            const int *k, const double *alpha, const double *a, const int *lda,
            const double *b, const int *ldb, const double *beta, double *c,
            const int *ldc);

/*******************************************************************************
 * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
 * \author Ole Schuett
 ******************************************************************************/
static void dgemm(const char transa, const char transb, const int m,
                  const int n, const int k, const double alpha, const double *a,
                  const int lda, const double *b, const int ldb,
                  const double beta, double *c, const int ldc) {
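  // dgemm_ expects column-major arrays while this code uses row-major ones.
  // Since (A*B)^T = B^T * A^T, computing the transposed product with swapped
  // operands and swapped leading dimensions yields the desired row-major C.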
  dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
         &ldc);
}

/*******************************************************************************
 * \brief Transforms pab from contracted spherical to prim. Cartesian basis.
 * \author Ole Schuett
 ******************************************************************************/
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
                     const int iset, const int jset, const bool transpose,
                     const double *block, double *pab) {

  // Define some more convenient aliases.
  const int ncoseta = ncoset(ibasis->lmax[iset]);
  const int ncosetb = ncoset(jbasis->lmax[jset]);
  const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
  const int ncob = jbasis->npgf[jset] * ncosetb;

  const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
  const int nsgf_setb = jbasis->nsgf_set[jset];
  const int nsgfa = ibasis->nsgf; // size of entire spherical basis
  const int nsgfb = jbasis->nsgf;
  const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
  const int sgfb = jbasis->first_sgf[jset] - 1;
  const int maxcoa = ibasis->maxco;
  const int maxcob = jbasis->maxco;

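  // Two dgemm calls decontract the spherical density subblock: first with
  // ibasis->sphi (giving work), then with jbasis->sphi, yielding pab in the
  // primitive Cartesian basis.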
  double work[nsgf_setb * ncoa];
  if (transpose) {
    // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
    dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
          &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
          maxcoa, 0.0, work, ncoa);
  } else {
    // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
    dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
          &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
          maxcoa, 0.0, work, ncoa);
  }
  // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
  dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
        maxcob, work, ncoa, 0.0, pab, ncoa);
}

/*******************************************************************************
 * \brief Collocate a range of tasks which are destined for the same grid level.
 * \author Ole Schuett
 ******************************************************************************/
static void collocate_one_grid_level(
    const grid_cpu_task_list *task_list, const int *first_block_task,
    const int *last_block_task, const enum grid_func func,
    const int npts_global[3], const int npts_local[3], const int shift_local[3],
    const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
    const double *pab_blocks, offload_buffer *grid) {

// Using default(shared) because with GCC 9 the behavior around const changed:
// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
#pragma omp parallel default(shared)
  {
    const int ithread = omp_get_thread_num();
    const int nthreads = omp_get_num_threads();

    // Initialize variables to detect when a new subblock has to be fetched.
    int old_offset = -1, old_iset = -1, old_jset = -1;

    // Matrix pab is re-used across tasks.
    double pab[task_list->maxco * task_list->maxco];

    // Ensure that grid can fit into thread-local storage, reallocate if needed.
    const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
    const size_t grid_size = npts_local_total * sizeof(double);
    if (task_list->threadlocal_sizes[ithread] < grid_size) {
      if (task_list->threadlocals[ithread] != NULL) {
        free(task_list->threadlocals[ithread]);
      }
      task_list->threadlocals[ithread] = malloc(grid_size);
      assert(task_list->threadlocals[ithread] != NULL);
      task_list->threadlocal_sizes[ithread] = grid_size;
    }

    // Zero thread-local copy of the grid.
    double *const my_grid = task_list->threadlocals[ithread];
    memset(my_grid, 0, grid_size);

    // Parallelize over blocks to avoid unnecessary calls to load_pab.
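    // Aim for roughly 50 chunks per thread so the dynamic schedule can
    // balance blocks of very different cost.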
    const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
#pragma omp for schedule(dynamic, chunk_size)
    for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
      const int first_task = first_block_task[block_num];
      const int last_task = last_block_task[block_num];

      for (int itask = first_task; itask <= last_task; itask++) {
        // Define some convenient aliases.
        const grid_cpu_task *task = &task_list->tasks[itask];
        const int iatom = task->iatom - 1;
        const int jatom = task->jatom - 1;
        const int iset = task->iset - 1;
        const int jset = task->jset - 1;
        const int ipgf = task->ipgf - 1;
        const int jpgf = task->jpgf - 1;
        const int ikind = task_list->atom_kinds[iatom] - 1;
        const int jkind = task_list->atom_kinds[jatom] - 1;
        const grid_basis_set *ibasis = task_list->basis_sets[ikind];
        const grid_basis_set *jbasis = task_list->basis_sets[jkind];
        const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
        const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
        const int ncoseta = ncoset(ibasis->lmax[iset]);
        const int ncosetb = ncoset(jbasis->lmax[jset]);
        const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
        const int ncob = jbasis->npgf[jset] * ncosetb;
        const int block_num = task->block_num - 1;
        const int block_offset = task_list->block_offsets[block_num];
        const double *block = &pab_blocks[block_offset];
        const bool transpose = (iatom <= jatom);

        // Load subblock from buffer and decontract into Cartesian subblock pab.
        // The previous pab can be reused when only ipgf or jpgf has changed.
        if (block_offset != old_offset || iset != old_iset ||
            jset != old_jset) {
          old_offset = block_offset;
          old_iset = iset;
          old_jset = jset;
          load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
        }

        grid_cpu_collocate_pgf_product(
            /*orthorhombic=*/task_list->orthorhombic,
            /*border_mask=*/task->border_mask,
            /*func=*/func,
            /*la_max=*/ibasis->lmax[iset],
            /*la_min=*/ibasis->lmin[iset],
            /*lb_max=*/jbasis->lmax[jset],
            /*lb_min=*/jbasis->lmin[jset],
            /*zeta=*/zeta,
            /*zetb=*/zetb,
            /*rscale=*/(iatom == jatom) ? 1 : 2,
            /*dh=*/dh,
            /*dh_inv=*/dh_inv,
            /*ra=*/&task_list->atom_positions[3 * iatom],
            /*rab=*/task->rab,
            /*npts_global=*/npts_global,
            /*npts_local=*/npts_local,
            /*shift_local=*/shift_local,
            /*border_width=*/border_width,
            /*radius=*/task->radius,
            /*o1=*/ipgf * ncoseta,
            /*o2=*/jpgf * ncosetb,
            /*n1=*/ncoa,
            /*n2=*/ncob,
            /*pab=*/(const double(*)[ncoa])pab,
            /*grid=*/my_grid);

      } // end of task loop
    } // end of block loop

// While there should be an implicit barrier at the end of the block loop, this
// explicit barrier eliminates occasional seg faults with icc-compiled binaries.
#pragma omp barrier

    // Merge thread-local grids via an efficient tree reduction.
    const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
    for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
      // Threads are divided into groups, whose size doubles with each cycle.
      // After a cycle the reduced data is stored at first thread of each group.
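      // E.g. with 4 threads: cycle 1 adds thread 1 into 0 and thread 3 into 2,
      // cycle 2 then adds thread 2 into 0, leaving the total on thread 0.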
      const int group_size = 1 << icycle; // 2**icycle
      const int igroup = ithread / group_size;
      const int dest_thread = igroup * group_size;
      const int src_thread = dest_thread + group_size / 2;
      // The last group might actually be a bit smaller.
      const int actual_group_size = imin(group_size, nthreads - dest_thread);
      // Parallelize summation by dividing grid points across group members.
      const int rank = modulo(ithread, group_size); // position within the group
      const int64_t lb = ((int64_t)npts_local_total * rank) / actual_group_size;
      const int64_t ub =
          ((int64_t)npts_local_total * (rank + 1)) / actual_group_size;
      if (src_thread < nthreads) {
        for (int i = (int)lb; i < (int)ub; i++) {
          task_list->threadlocals[dest_thread][i] +=
              task_list->threadlocals[src_thread][i];
        }
      }
#pragma omp barrier
    }

    // Copy final result from first thread into shared grid.
    const int64_t lb = ((int64_t)npts_local_total * ithread) / nthreads;
    const int64_t ub = ((int64_t)npts_local_total * (ithread + 1)) / nthreads;
    for (int i = (int)lb; i < (int)ub; i++) {
      grid->host_buffer[i] = task_list->threadlocals[0][i];
    }

  } // end of omp parallel region
}

/*******************************************************************************
 * \brief Collocate all tasks of the given list onto given grids.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_cpu_collocate_task_list(const grid_cpu_task_list *task_list,
                                  const enum grid_func func, const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  offload_buffer *grids[nlevels]) {

  assert(task_list->nlevels == nlevels);

  for (int level = 0; level < task_list->nlevels; level++) {
    const int idx = level * task_list->nblocks;
    const int *first_block_task = &task_list->first_level_block_task[idx];
    const int *last_block_task = &task_list->last_level_block_task[idx];
    const grid_cpu_layout *layout = &task_list->layouts[level];
    collocate_one_grid_level(
        task_list, first_block_task, last_block_task, func, layout->npts_global,
        layout->npts_local, layout->shift_local, layout->border_width,
        layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
  }
}

/*******************************************************************************
 * \brief Transforms hab from prim. Cartesian to contracted spherical basis.
 * \author Ole Schuett
 ******************************************************************************/
static inline void store_hab(const grid_basis_set *ibasis,
                             const grid_basis_set *jbasis, const int iset,
                             const int jset, const bool transpose,
                             const double *hab, double *block) {

  // Define some more convenient aliases.
  const int ncoseta = ncoset(ibasis->lmax[iset]);
  const int ncosetb = ncoset(jbasis->lmax[jset]);
  const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
  const int ncob = jbasis->npgf[jset] * ncosetb;

  const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
  const int nsgf_setb = jbasis->nsgf_set[jset];
  const int nsgfa = ibasis->nsgf; // size of entire spherical basis
  const int nsgfb = jbasis->nsgf;
  const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
  const int sgfb = jbasis->first_sgf[jset] - 1;
  const int maxcoa = ibasis->maxco;
  const int maxcob = jbasis->maxco;

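  // Two dgemm calls contract the primitive Cartesian hab back to the
  // contracted spherical basis: first with jbasis->sphi (giving work), then
  // with ibasis->sphi, accumulating into the spherical subblock.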
  double work[nsgf_setb * ncoa];

  // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
  dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
        maxcob, hab, ncoa, 0.0, work, ncoa);

  if (transpose) {
    // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
    dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
          &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
          &block[sgfb * nsgfa + sgfa], nsgfa);
  } else {
    // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
    dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
          &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
          &block[sgfa * nsgfb + sgfb], nsgfb);
  }
}

/*******************************************************************************
 * \brief Integrate a range of tasks that belong to the same grid level.
 * \author Ole Schuett
 ******************************************************************************/
static void integrate_one_grid_level(
    const grid_cpu_task_list *task_list, const int *first_block_task,
    const int *last_block_task, const bool compute_tau, const int natoms,
    const int npts_global[3], const int npts_local[3], const int shift_local[3],
    const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
    const offload_buffer *pab_blocks, const offload_buffer *grid,
    offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {

// Using default(shared) because with GCC 9 the behavior around const changed:
// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
#pragma omp parallel default(shared)
  {
    // Initialize variables to detect when a new subblock has to be fetched.
    int old_offset = -1, old_iset = -1, old_jset = -1;
    grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
    bool old_transpose = false;

    // Matrix pab and hab are re-used across tasks.
    double pab[task_list->maxco * task_list->maxco];
    double hab[task_list->maxco * task_list->maxco];

    // Parallelize over blocks to avoid concurrent access to hab_blocks.
    const int nthreads = omp_get_num_threads();
    const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
#pragma omp for schedule(dynamic, chunk_size)
    for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
      const int first_task = first_block_task[block_num];
      const int last_task = last_block_task[block_num];

      // Accumulate forces per block as it corresponds to a pair of atoms.
      const int iatom = task_list->tasks[first_task].iatom - 1;
      const int jatom = task_list->tasks[first_task].jatom - 1;
      double my_forces[2][3] = {0};
      double my_virials[2][3][3] = {0};

      for (int itask = first_task; itask <= last_task; itask++) {
        // Define some convenient aliases.
        const grid_cpu_task *task = &task_list->tasks[itask];
        assert(task->block_num - 1 == block_num);
        assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
        const int ikind = task_list->atom_kinds[iatom] - 1;
        const int jkind = task_list->atom_kinds[jatom] - 1;
        grid_basis_set *ibasis = task_list->basis_sets[ikind];
        grid_basis_set *jbasis = task_list->basis_sets[jkind];
        const int iset = task->iset - 1;
        const int jset = task->jset - 1;
        const int ipgf = task->ipgf - 1;
        const int jpgf = task->jpgf - 1;
        const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
        const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
        const int ncoseta = ncoset(ibasis->lmax[iset]);
        const int ncosetb = ncoset(jbasis->lmax[jset]);
        const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
        const int ncob = jbasis->npgf[jset] * ncosetb;
        const int block_offset = task_list->block_offsets[block_num];
        const bool transpose = (iatom <= jatom);
        const bool pab_required = (forces != NULL || virial != NULL);

        // Load pab and store hab subblocks when needed.
        // Previous hab and pab can be reused when only ipgf or jpgf changed.
        if (block_offset != old_offset || iset != old_iset ||
            jset != old_jset) {
          if (pab_required) {
            load_pab(ibasis, jbasis, iset, jset, transpose,
                     &pab_blocks->host_buffer[block_offset], pab);
          }
          if (old_offset >= 0) { // skip first iteration
            store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
                      hab, &hab_blocks->host_buffer[old_offset]);
          }
          memset(hab, 0, ncoa * ncob * sizeof(double));
          old_offset = block_offset;
          old_iset = iset;
          old_jset = jset;
          old_ibasis = ibasis;
          old_jbasis = jbasis;
          old_transpose = transpose;
        }

        grid_cpu_integrate_pgf_product(
            /*orthorhombic=*/task_list->orthorhombic,
            /*compute_tau=*/compute_tau,
            /*border_mask=*/task->border_mask,
            /*la_max=*/ibasis->lmax[iset],
            /*la_min=*/ibasis->lmin[iset],
            /*lb_max=*/jbasis->lmax[jset],
            /*lb_min=*/jbasis->lmin[jset],
            /*zeta=*/zeta,
            /*zetb=*/zetb,
            /*dh=*/dh,
            /*dh_inv=*/dh_inv,
            /*ra=*/&task_list->atom_positions[3 * iatom],
            /*rab=*/task->rab,
            /*npts_global=*/npts_global,
            /*npts_local=*/npts_local,
            /*shift_local=*/shift_local,
            /*border_width=*/border_width,
            /*radius=*/task->radius,
            /*o1=*/ipgf * ncoseta,
            /*o2=*/jpgf * ncosetb,
            /*n1=*/ncoa,
            /*n2=*/ncob,
            /*grid=*/grid->host_buffer,
            /*hab=*/(double(*)[ncoa])hab,
            /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
            /*forces=*/(forces != NULL) ? my_forces : NULL,
            /*virials=*/(virial != NULL) ? my_virials : NULL,
            /*hdab=*/NULL,
            /*hadb=*/NULL,
            /*a_hdab=*/NULL);

      } // end of task loop

      // Merge thread-local forces and virial into shared ones.
      // It does not seem worth the trouble to accumulate them thread-locally.
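      // Off-diagonal blocks stand for both (iatom,jatom) and (jatom,iatom),
      // so their contributions are counted twice.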
      const double scalef = (iatom == jatom) ? 1.0 : 2.0;
      if (forces != NULL) {
#pragma omp critical(forces)
        for (int i = 0; i < 3; i++) {
          forces[iatom][i] += scalef * my_forces[0][i];
          forces[jatom][i] += scalef * my_forces[1][i];
        }
      }
      if (virial != NULL) {
#pragma omp critical(virial)
        for (int i = 0; i < 3; i++) {
          for (int j = 0; j < 3; j++) {
            virial[i][j] += scalef * my_virials[0][i][j];
            virial[i][j] += scalef * my_virials[1][i][j];
          }
        }
      }

    } // end of block loop

    // store final hab
    if (old_offset >= 0) {
      store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
                &hab_blocks->host_buffer[old_offset]);
    }

  } // end of omp parallel region
}

/*******************************************************************************
 * \brief Integrate all tasks of the given list from given grids.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_cpu_integrate_task_list(
    const grid_cpu_task_list *task_list, const bool compute_tau,
    const int natoms, const int nlevels, const offload_buffer *pab_blocks,
    const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
    double forces[natoms][3], double virial[3][3]) {

  assert(task_list->nlevels == nlevels);
  assert(task_list->natoms == natoms);

  // Zero result arrays.
  memset(hab_blocks->host_buffer, 0, hab_blocks->size);
  if (forces != NULL) {
    memset(forces, 0, natoms * 3 * sizeof(double));
  }
  if (virial != NULL) {
    memset(virial, 0, 9 * sizeof(double));
  }

  for (int level = 0; level < task_list->nlevels; level++) {
    const int idx = level * task_list->nblocks;
    const int *first_block_task = &task_list->first_level_block_task[idx];
    const int *last_block_task = &task_list->last_level_block_task[idx];
    const grid_cpu_layout *layout = &task_list->layouts[level];
    integrate_one_grid_level(
        task_list, first_block_task, last_block_task, compute_tau, natoms,
        layout->npts_global, layout->npts_local, layout->shift_local,
        layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
        grids[level], hab_blocks, forces, virial);
  }
}

// EOF