(git:d43dca4)
Loading...
Searching...
No Matches
grid_cpu_task_list.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <math.h>
10#include <omp.h>
11#include <stdint.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15
16#include "../common/grid_common.h"
17#include "grid_cpu_collocate.h"
18#include "grid_cpu_integrate.h"
19#include "grid_cpu_task_list.h"
20
21/*******************************************************************************
 22 * \brief Comparator passed to qsort to compare two tasks.
23 * \author Ole Schuett
24 ******************************************************************************/
25static int compare_tasks(const void *a, const void *b) {
26 const grid_cpu_task *task_a = a, *task_b = b;
27 if (task_a->level != task_b->level) {
28 return task_a->level - task_b->level;
29 } else if (task_a->block_num != task_b->block_num) {
30 return task_a->block_num - task_b->block_num;
31 } else if (task_a->iset != task_b->iset) {
32 return task_a->iset - task_b->iset;
33 } else {
34 return task_a->jset - task_b->jset;
35 }
36}
37/*******************************************************************************
38 * \brief Allocates a task list for the cpu backend.
39 * See grid_task_list.h for details.
40 * \author Ole Schuett
41 ******************************************************************************/
43 const bool orthorhombic, const int ntasks, const int nlevels,
44 const int natoms, const int nkinds, const int nblocks,
45 const int block_offsets[nblocks], const double atom_positions[natoms][3],
46 const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
47 const int level_list[ntasks], const int iatom_list[ntasks],
48 const int jatom_list[ntasks], const int iset_list[ntasks],
49 const int jset_list[ntasks], const int ipgf_list[ntasks],
50 const int jpgf_list[ntasks], const int border_mask_list[ntasks],
51 const int block_num_list[ntasks], const double radius_list[ntasks],
52 const double rab_list[ntasks][3], const int npts_global[nlevels][3],
53 const int npts_local[nlevels][3], const int shift_local[nlevels][3],
54 const int border_width[nlevels][3], const double dh[nlevels][3][3],
55 const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out) {
56
57 if (*task_list_out != NULL) {
58 // This is actually an opportunity to reuse some buffers.
59 grid_cpu_free_task_list(*task_list_out);
60 }
61
62 grid_cpu_task_list *task_list = malloc(sizeof(grid_cpu_task_list));
63 assert(task_list != NULL);
64
65 task_list->orthorhombic = orthorhombic;
66 task_list->ntasks = ntasks;
67 task_list->nlevels = nlevels;
68 task_list->natoms = natoms;
69 task_list->nkinds = nkinds;
70 task_list->nblocks = nblocks;
71
72 size_t size = nblocks * sizeof(int);
73 task_list->block_offsets = malloc(size);
74 assert(task_list->block_offsets != NULL || size == 0);
75 if (size != 0) {
76 memcpy(task_list->block_offsets, block_offsets, size);
77 }
78
79 size = 3 * natoms * sizeof(double);
80 task_list->atom_positions = malloc(size);
81 assert(task_list->atom_positions != NULL || size == 0);
82 if (size != 0) {
83 memcpy(task_list->atom_positions, atom_positions, size);
84 }
85
86 size = natoms * sizeof(int);
87 task_list->atom_kinds = malloc(size);
88 assert(task_list->atom_kinds != NULL || size == 0);
89 if (size != 0) {
90 memcpy(task_list->atom_kinds, atom_kinds, size);
91 }
92
93 size = nkinds * sizeof(grid_basis_set *);
94 task_list->basis_sets = malloc(size);
95 assert(task_list->basis_sets != NULL || size == 0);
96 if (size != 0) {
97 memcpy(task_list->basis_sets, basis_sets, size);
98 }
99
100 size = ntasks * sizeof(grid_cpu_task);
101 task_list->tasks = malloc(size);
102 assert(task_list->tasks != NULL || size == 0);
103#pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
104 for (int i = 0; i < ntasks; i++) {
105 task_list->tasks[i].level = level_list[i];
106 task_list->tasks[i].iatom = iatom_list[i];
107 task_list->tasks[i].jatom = jatom_list[i];
108 task_list->tasks[i].iset = iset_list[i];
109 task_list->tasks[i].jset = jset_list[i];
110 task_list->tasks[i].ipgf = ipgf_list[i];
111 task_list->tasks[i].jpgf = jpgf_list[i];
112 task_list->tasks[i].border_mask = border_mask_list[i];
113 task_list->tasks[i].block_num = block_num_list[i];
114 task_list->tasks[i].radius = radius_list[i];
115 task_list->tasks[i].rab[0] = rab_list[i][0];
116 task_list->tasks[i].rab[1] = rab_list[i][1];
117 task_list->tasks[i].rab[2] = rab_list[i][2];
118 }
119
120 // Store grid layouts.
121 size = nlevels * sizeof(grid_cpu_layout);
122 task_list->layouts = malloc(size);
123 assert(task_list->layouts != NULL || size == 0);
124 for (int level = 0; level < nlevels; level++) {
125 for (int i = 0; i < 3; i++) {
126 task_list->layouts[level].npts_global[i] = npts_global[level][i];
127 task_list->layouts[level].npts_local[i] = npts_local[level][i];
128 task_list->layouts[level].shift_local[i] = shift_local[level][i];
129 task_list->layouts[level].border_width[i] = border_width[level][i];
130 for (int j = 0; j < 3; j++) {
131 task_list->layouts[level].dh[i][j] = dh[level][i][j];
132 task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
133 }
134 }
135 }
136
137 // Sort tasks by level, block_num, iset, and jset.
138 qsort(task_list->tasks, ntasks, sizeof(grid_cpu_task), &compare_tasks);
139
140 // Find first and last task for each level and block.
141 size = nlevels * nblocks * sizeof(int);
142 task_list->first_level_block_task = malloc(size);
143 assert(task_list->first_level_block_task != NULL || size == 0);
144 task_list->last_level_block_task = malloc(size);
145 assert(task_list->last_level_block_task != NULL || size == 0);
146 const int nlevel_blocks = nlevels * nblocks;
147#pragma omp parallel for schedule(static) if (nlevel_blocks > \
148 GRID_OMP_MIN_ITERATIONS)
149 for (int i = 0; i < nlevel_blocks; i++) {
150 task_list->first_level_block_task[i] = 0;
151 task_list->last_level_block_task[i] = -1; // last < first means no tasks
152 }
153 for (int itask = 0; itask < ntasks; itask++) {
154 const int level = task_list->tasks[itask].level - 1;
155 const int block_num = task_list->tasks[itask].block_num - 1;
156 if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
157 task_list->tasks[itask - 1].block_num - 1 != block_num) {
158 task_list->first_level_block_task[level * nblocks + block_num] = itask;
159 }
160 task_list->last_level_block_task[level * nblocks + block_num] = itask;
161 }
162
163 // Find largest Cartesian subblock size.
164 task_list->maxco = 0;
165 for (int i = 0; i < nkinds; i++) {
166 task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
167 }
168
169 // Initialize thread-local storage.
170 size = omp_get_max_threads() * sizeof(double *);
171 task_list->threadlocals = malloc(size);
172 assert(task_list->threadlocals != NULL);
173 memset(task_list->threadlocals, 0, size);
174 size = omp_get_max_threads() * sizeof(size_t);
175 task_list->threadlocal_sizes = malloc(size);
176 assert(task_list->threadlocal_sizes != NULL);
177 memset(task_list->threadlocal_sizes, 0, size);
178
179 *task_list_out = task_list;
180}
181
182/*******************************************************************************
183 * \brief Deallocates given task list, basis_sets have to be freed separately.
184 * \author Ole Schuett
185 ******************************************************************************/
187 free(task_list->block_offsets);
188 free(task_list->atom_positions);
189 free(task_list->atom_kinds);
190 free(task_list->basis_sets);
191 free(task_list->tasks);
192 free(task_list->layouts);
193 free(task_list->first_level_block_task);
194 free(task_list->last_level_block_task);
195 for (int i = 0; i < omp_get_max_threads(); i++) {
196 if (task_list->threadlocals[i] != NULL) {
197 free(task_list->threadlocals[i]);
198 }
199 }
200 free(task_list->threadlocals);
201 free(task_list->threadlocal_sizes);
202 free(task_list);
203}
204
205/*******************************************************************************
206 * \brief Prototype for BLAS dgemm.
207 * \author Ole Schuett
208 ******************************************************************************/
void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
            const int *k, const double *alpha, const double *a, const int *lda,
            const double *b, const int *ldb, const double *beta, double *c,
            const int *ldc);
213
214/*******************************************************************************
215 * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
216 * \author Ole Schuett
217 ******************************************************************************/
static void dgemm(const char transa, const char transb, const int m,
                  const int n, const int k, const double alpha, const double *a,
                  const int lda, const double *b, const int ldb,
                  const double beta, double *c, const int ldc) {
  // Fortran dgemm is column-major; computing C^T = B^T * A^T (swapping the
  // a/b operands and the m/n dimensions) yields row-major C = A * B.
  dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
         &ldc);
}
225
226/*******************************************************************************
227 * \brief Transforms pab from contracted spherical to prim. cartesian basis.
228 * \author Ole Schuett
229 ******************************************************************************/
230static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
231 const int iset, const int jset, const bool transpose,
232 const double *block, double *pab) {
233
234 // Define some more convenient aliases.
235 const int ncoseta = ncoset(ibasis->lmax[iset]);
236 const int ncosetb = ncoset(jbasis->lmax[jset]);
237 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
238 const int ncob = jbasis->npgf[jset] * ncosetb;
239
240 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
241 const int nsgf_setb = jbasis->nsgf_set[jset];
242 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
243 const int nsgfb = jbasis->nsgf;
244 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
245 const int sgfb = jbasis->first_sgf[jset] - 1;
246 const int maxcoa = ibasis->maxco;
247 const int maxcob = jbasis->maxco;
248
249 double work[nsgf_setb * ncoa];
250 if (transpose) {
251 // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
252 dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
253 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
254 maxcoa, 0.0, work, ncoa);
255 } else {
256 // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
257 dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
258 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
259 maxcoa, 0.0, work, ncoa);
260 }
261 // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
262 dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
263 maxcob, work, ncoa, 0.0, pab, ncoa);
264}
265
266/*******************************************************************************
267 * \brief Collocate a range of tasks which are destined for the same grid level.
268 * \author Ole Schuett
269 ******************************************************************************/
271 const grid_cpu_task_list *task_list, const int *first_block_task,
272 const int *last_block_task, const enum grid_func func,
273 const int npts_global[3], const int npts_local[3], const int shift_local[3],
274 const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
275 const double *pab_blocks, offload_buffer *grid) {
276
277// Using default(shared) because with GCC 9 the behavior around const changed:
278// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
279#pragma omp parallel default(shared)
280 {
281 const int ithread = omp_get_thread_num();
282 const int nthreads = omp_get_num_threads();
283
284 // Initialize variables to detect when a new subblock has to be fetched.
285 int old_offset = -1, old_iset = -1, old_jset = -1;
286
287 // Matrix pab is re-used across tasks.
288 double pab[imax(task_list->maxco * task_list->maxco, 1)];
289
290 // Ensure that grid can fit into thread-local storage, reallocate if needed.
291 const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
292 const size_t grid_size = npts_local_total * sizeof(double);
293 if (task_list->threadlocal_sizes[ithread] < grid_size) {
294 if (task_list->threadlocals[ithread] != NULL) {
295 free(task_list->threadlocals[ithread]);
296 }
297 task_list->threadlocals[ithread] = malloc(grid_size);
298 assert(task_list->threadlocals[ithread] != NULL);
299 task_list->threadlocal_sizes[ithread] = grid_size;
300 }
301
302 // Zero thread-local copy of the grid.
303 double *const my_grid = task_list->threadlocals[ithread];
304 memset(my_grid, 0, grid_size);
305
306 // Parallelize over blocks to avoid unnecessary calls to load_pab.
307 const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
308#pragma omp for schedule(dynamic, chunk_size)
309 for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
310 const int first_task = first_block_task[block_num];
311 const int last_task = last_block_task[block_num];
312
313 for (int itask = first_task; itask <= last_task; itask++) {
314 // Define some convenient aliases.
315 const grid_cpu_task *task = &task_list->tasks[itask];
316 const int iatom = task->iatom - 1;
317 const int jatom = task->jatom - 1;
318 const int iset = task->iset - 1;
319 const int jset = task->jset - 1;
320 const int ipgf = task->ipgf - 1;
321 const int jpgf = task->jpgf - 1;
322 const int ikind = task_list->atom_kinds[iatom] - 1;
323 const int jkind = task_list->atom_kinds[jatom] - 1;
324 const grid_basis_set *ibasis = task_list->basis_sets[ikind];
325 const grid_basis_set *jbasis = task_list->basis_sets[jkind];
326 const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
327 const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
328 const int ncoseta = ncoset(ibasis->lmax[iset]);
329 const int ncosetb = ncoset(jbasis->lmax[jset]);
330 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
331 const int ncob = jbasis->npgf[jset] * ncosetb;
332 const int block_num = task->block_num - 1;
333 const int block_offset = task_list->block_offsets[block_num];
334 const double *block = &pab_blocks[block_offset];
335 const bool transpose = (iatom <= jatom);
336
337 // Load subblock from buffer and decontract into Cartesian sublock pab.
338 // The previous pab can be reused when only ipgf or jpgf has changed.
339 if (block_offset != old_offset || iset != old_iset ||
340 jset != old_jset) {
341 old_offset = block_offset;
342 old_iset = iset;
343 old_jset = jset;
344 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
345 }
346
348 /*orthorhombic=*/task_list->orthorhombic,
349 /*border_mask=*/task->border_mask,
350 /*func=*/func,
351 /*la_max=*/ibasis->lmax[iset],
352 /*la_min=*/ibasis->lmin[iset],
353 /*lb_max=*/jbasis->lmax[jset],
354 /*lb_min=*/jbasis->lmin[jset],
355 /*zeta=*/zeta,
356 /*zetb=*/zetb,
357 /*rscale=*/(iatom == jatom) ? 1 : 2,
358 /*dh=*/dh,
359 /*dh_inv=*/dh_inv,
360 /*ra=*/&task_list->atom_positions[3 * iatom],
361 /*rab=*/task->rab,
362 /*npts_global=*/npts_global,
363 /*npts_local=*/npts_local,
364 /*shift_local=*/shift_local,
365 /*border_width=*/border_width,
366 /*radius=*/task->radius,
367 /*o1=*/ipgf * ncoseta,
368 /*o2=*/jpgf * ncosetb,
369 /*n1=*/ncoa,
370 /*n2=*/ncob,
371 /*pab=*/(const double(*)[ncoa])pab,
372 /*grid=*/my_grid);
373
374 } // end of task loop
375 } // end of block loop
376
377// While there should be an implicit barrier at the end of the block loop, this
378// explicit barrier eliminates occasional seg faults with icc compiled binaries.
379#pragma omp barrier
380
381 // Merge thread-local grids via an efficient tree reduction.
382 const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
383 for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
384 // Threads are divided into groups, whose size doubles with each cycle.
385 // After a cycle the reduced data is stored at first thread of each group.
386 const int group_size = 1 << icycle; // 2**icycle
387 const int igroup = ithread / group_size;
388 const int dest_thread = igroup * group_size;
389 const int src_thread = dest_thread + group_size / 2;
390 // The last group might actually be a bit smaller.
391 const int actual_group_size = imin(group_size, nthreads - dest_thread);
392 // Parallelize summation by dividing grid points across group members.
393 const int rank = modulo(ithread, group_size); // position within the group
394 const int64_t lb = ((int64_t)npts_local_total * rank) / actual_group_size;
395 const int64_t ub =
396 ((int64_t)npts_local_total * (rank + 1)) / actual_group_size;
397 if (src_thread < nthreads) {
399 for (int i = (int)lb; i < (int)ub; i++) {
400 task_list->threadlocals[dest_thread][i] +=
401 task_list->threadlocals[src_thread][i];
402 }
403 }
404#pragma omp barrier
405 }
406
407 // Copy final result from first thread into shared grid.
408 const int64_t lb = ((int64_t)npts_local_total * ithread) / nthreads;
409 const int64_t ub = ((int64_t)npts_local_total * (ithread + 1)) / nthreads;
411 for (int i = (int)lb; i < (int)ub; i++) {
412 grid->host_buffer[i] = task_list->threadlocals[0][i];
413 }
414
415 } // end of omp parallel region
416}
417
418/*******************************************************************************
 419 * \brief Collocate all tasks of the given list onto the given grids.
420 * See grid_task_list.h for details.
421 * \author Ole Schuett
422 ******************************************************************************/
424 const enum grid_func func, const int nlevels,
425 const offload_buffer *pab_blocks,
426 offload_buffer *grids[nlevels]) {
427
428 assert(task_list->nlevels == nlevels);
429
430 for (int level = 0; level < task_list->nlevels; level++) {
431 const int idx = level * task_list->nblocks;
432 const int *first_block_task = &task_list->first_level_block_task[idx];
433 const int *last_block_task = &task_list->last_level_block_task[idx];
434 const grid_cpu_layout *layout = &task_list->layouts[level];
436 task_list, first_block_task, last_block_task, func, layout->npts_global,
437 layout->npts_local, layout->shift_local, layout->border_width,
438 layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
439 }
440}
441
442/*******************************************************************************
443 * \brief Transforms hab from prim. cartesian to contracted spherical basis.
444 * \author Ole Schuett
445 ******************************************************************************/
446static inline void store_hab(const grid_basis_set *ibasis,
447 const grid_basis_set *jbasis, const int iset,
448 const int jset, const bool transpose,
449 const double *hab, double *block) {
450
451 // Define some more convenient aliases.
452 const int ncoseta = ncoset(ibasis->lmax[iset]);
453 const int ncosetb = ncoset(jbasis->lmax[jset]);
454 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
455 const int ncob = jbasis->npgf[jset] * ncosetb;
456
457 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
458 const int nsgf_setb = jbasis->nsgf_set[jset];
459 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
460 const int nsgfb = jbasis->nsgf;
461 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
462 const int sgfb = jbasis->first_sgf[jset] - 1;
463 const int maxcoa = ibasis->maxco;
464 const int maxcob = jbasis->maxco;
465
466 double work[nsgf_setb * ncoa];
467
468 // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
469 dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
470 maxcob, hab, ncoa, 0.0, work, ncoa);
471
472 if (transpose) {
473 // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
474 dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
475 &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
476 &block[sgfb * nsgfa + sgfa], nsgfa);
477 } else {
478 // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
479 dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
480 &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
481 &block[sgfa * nsgfb + sgfb], nsgfb);
482 }
483}
484
485/*******************************************************************************
486 * \brief Integrate a range of tasks that belong to the same grid level.
487 * \author Ole Schuett
488 ******************************************************************************/
490 const grid_cpu_task_list *task_list, const int *first_block_task,
491 const int *last_block_task, const bool compute_tau, const int natoms,
492 const int npts_global[3], const int npts_local[3], const int shift_local[3],
493 const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
494 const offload_buffer *pab_blocks, const offload_buffer *grid,
495 offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
496
497// Using default(shared) because with GCC 9 the behavior around const changed:
498// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
499#pragma omp parallel default(shared)
500 {
501 // Initialize variables to detect when a new subblock has to be fetched.
502 int old_offset = -1, old_iset = -1, old_jset = -1;
503 grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
504 bool old_transpose = false;
505
506 // Matrix pab and hab are re-used across tasks.
507 double pab[imax(task_list->maxco * task_list->maxco, 1)];
508 double hab[imax(task_list->maxco * task_list->maxco, 1)];
509
510 // Parallelize over blocks to avoid concurred access to hab_blocks.
511 const int nthreads = omp_get_num_threads();
512 const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
513#pragma omp for schedule(dynamic, chunk_size)
514 for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
515 const int first_task = first_block_task[block_num];
516 const int last_task = last_block_task[block_num];
517
518 // Accumulate forces per block as it corresponds to a pair of atoms.
519 const int iatom = task_list->tasks[first_task].iatom - 1;
520 const int jatom = task_list->tasks[first_task].jatom - 1;
521 double my_forces[2][3] = {0};
522 double my_virials[2][3][3] = {0};
523
524 for (int itask = first_task; itask <= last_task; itask++) {
525 // Define some convenient aliases.
526 const grid_cpu_task *task = &task_list->tasks[itask];
527 assert(task->block_num - 1 == block_num);
528 assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
529 const int ikind = task_list->atom_kinds[iatom] - 1;
530 const int jkind = task_list->atom_kinds[jatom] - 1;
531 grid_basis_set *ibasis = task_list->basis_sets[ikind];
532 grid_basis_set *jbasis = task_list->basis_sets[jkind];
533 const int iset = task->iset - 1;
534 const int jset = task->jset - 1;
535 const int ipgf = task->ipgf - 1;
536 const int jpgf = task->jpgf - 1;
537 const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
538 const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
539 const int ncoseta = ncoset(ibasis->lmax[iset]);
540 const int ncosetb = ncoset(jbasis->lmax[jset]);
541 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
542 const int ncob = jbasis->npgf[jset] * ncosetb;
543 const int block_offset = task_list->block_offsets[block_num];
544 const bool transpose = (iatom <= jatom);
545 const bool pab_required = (forces != NULL || virial != NULL);
546
547 // Load pab and store hab subblocks when needed.
548 // Previous hab and pab can be reused when only ipgf or jpgf changed.
549 if (block_offset != old_offset || iset != old_iset ||
550 jset != old_jset) {
551 if (pab_required) {
552 load_pab(ibasis, jbasis, iset, jset, transpose,
553 &pab_blocks->host_buffer[block_offset], pab);
554 }
555 if (old_offset >= 0) { // skip first iteration
556 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
557 hab, &hab_blocks->host_buffer[old_offset]);
558 }
559 memset(hab, 0, ncoa * ncob * sizeof(double));
560 old_offset = block_offset;
561 old_iset = iset;
562 old_jset = jset;
563 old_ibasis = ibasis;
564 old_jbasis = jbasis;
565 old_transpose = transpose;
566 }
567
569 /*orthorhombic=*/task_list->orthorhombic,
570 /*compute_tau=*/compute_tau,
571 /*border_mask=*/task->border_mask,
572 /*la_max=*/ibasis->lmax[iset],
573 /*la_min=*/ibasis->lmin[iset],
574 /*lb_max=*/jbasis->lmax[jset],
575 /*lb_min=*/jbasis->lmin[jset],
576 /*zeta=*/zeta,
577 /*zetb=*/zetb,
578 /*dh=*/dh,
579 /*dh_inv=*/dh_inv,
580 /*ra=*/&task_list->atom_positions[3 * iatom],
581 /*rab=*/task->rab,
582 /*npts_global=*/npts_global,
583 /*npts_local=*/npts_local,
584 /*shift_local=*/shift_local,
585 /*border_width=*/border_width,
586 /*radius=*/task->radius,
587 /*o1=*/ipgf * ncoseta,
588 /*o2=*/jpgf * ncosetb,
589 /*n1=*/ncoa,
590 /*n2=*/ncob,
591 /*grid=*/grid->host_buffer,
592 /*hab=*/(double(*)[ncoa])hab,
593 /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
594 /*forces=*/(forces != NULL) ? my_forces : NULL,
595 /*virials=*/(virial != NULL) ? my_virials : NULL,
596 /*hdab=*/NULL,
597 /*hadb=*/NULL,
598 /*a_hdab=*/NULL);
599
600 } // end of task loop
601
602 // Merge thread-local forces and virial into shared ones.
603 // It does not seem worth the trouble to accumulate them thread-locally.
604 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
605 if (forces != NULL) {
606#pragma omp critical(forces)
607 for (int i = 0; i < 3; i++) {
608 forces[iatom][i] += scalef * my_forces[0][i];
609 forces[jatom][i] += scalef * my_forces[1][i];
610 }
611 }
612 if (virial != NULL) {
613#pragma omp critical(virial)
614 for (int i = 0; i < 3; i++) {
615 for (int j = 0; j < 3; j++) {
616 virial[i][j] += scalef * my_virials[0][i][j];
617 virial[i][j] += scalef * my_virials[1][i][j];
618 }
619 }
620 }
621
622 } // end of block loop
623
624 // store final hab
625 if (old_offset >= 0) {
626 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
627 &hab_blocks->host_buffer[old_offset]);
628 }
629
630 } // end of omp parallel region
631}
632
633/*******************************************************************************
 634 * \brief Integrate all tasks of the given list from the given grids.
635 * See grid_task_list.h for details.
636 * \author Ole Schuett
637 ******************************************************************************/
639 const grid_cpu_task_list *task_list, const bool compute_tau,
640 const int natoms, const int nlevels, const offload_buffer *pab_blocks,
641 const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
642 double forces[natoms][3], double virial[3][3]) {
643
644 assert(task_list->nlevels == nlevels);
645 assert(task_list->natoms == natoms);
646 assert(hab_blocks != NULL);
647
648 // Zero result arrays.
649 if (hab_blocks->size != 0) {
650 memset(hab_blocks->host_buffer, 0, hab_blocks->size);
651 }
652 if (forces != NULL) {
653 memset(forces, 0, natoms * 3 * sizeof(double));
654 }
655 if (virial != NULL) {
656 memset(virial, 0, 9 * sizeof(double));
657 }
658
659 for (int level = 0; level < task_list->nlevels; level++) {
660 const int idx = level * task_list->nblocks;
661 const int *first_block_task = &task_list->first_level_block_task[idx];
662 const int *last_block_task = &task_list->last_level_block_task[idx];
663 const grid_cpu_layout *layout = &task_list->layouts[level];
665 task_list, first_block_task, last_block_task, compute_tau, natoms,
666 layout->npts_global, layout->npts_local, layout->shift_local,
667 layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
668 grids[level], hab_blocks, forces, virial);
669 }
670}
671
672// EOF
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
Definition dbm_miniapp.c:40
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
#define GRID_PRAGMA_SIMD_LOOP
Definition grid_common.h:25
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
grid_func
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
const int ub
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_cpu_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Public entry point. A thin wrapper with the only purpose of calling write_task_file when DUMP_TASKS =...
void grid_cpu_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_cpu_integrate.h for details.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
void grid_cpu_free_task_list(grid_cpu_task_list *task_list)
Deallocates given task list, basis_sets have to be freed separately.
void grid_cpu_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out)
Allocates a task list for the cpu backend. See grid_task_list.h for details.
static void integrate_one_grid_level(const grid_cpu_task_list *task_list, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static void collocate_one_grid_level(const grid_cpu_task_list *task_list, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
static int compare_tasks(const void *a, const void *b)
Comparator passed to qsort to compare two tasks.
void grid_cpu_collocate_task_list(const grid_cpu_task_list *task_list, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
void grid_cpu_integrate_task_list(const grid_cpu_task_list *task_list, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
grid_cpu_layout * layouts
grid_basis_set ** basis_sets
grid_cpu_task * tasks
Internal representation of a task.
Internal representation of a buffer.
double * host_buffer