(git:6250c82)
Loading...
Searching...
No Matches
grid_cpu_task_list.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <math.h>
10#include <omp.h>
11#include <stdint.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15
16#include "../common/grid_common.h"
17#include "grid_cpu_collocate.h"
18#include "grid_cpu_integrate.h"
19#include "grid_cpu_task_list.h"
21
22/*******************************************************************************
23 * \brief Comperator passed to qsort to compare two tasks.
24 * \author Ole Schuett
25 ******************************************************************************/
26static int compare_tasks(const void *a, const void *b) {
27 const grid_cpu_task *task_a = a, *task_b = b;
28 if (task_a->level != task_b->level) {
29 return task_a->level - task_b->level;
30 } else if (task_a->block_num != task_b->block_num) {
31 return task_a->block_num - task_b->block_num;
32 } else if (task_a->iset != task_b->iset) {
33 return task_a->iset - task_b->iset;
34 } else {
35 return task_a->jset - task_b->jset;
36 }
37}
38/*******************************************************************************
39 * \brief Allocates a task list for the cpu backend.
40 * See grid_task_list.h for details.
41 * \author Ole Schuett
42 ******************************************************************************/
44 const bool orthorhombic, const int ntasks, const int nlevels,
45 const int natoms, const int nkinds, const int nblocks,
46 const int block_offsets[nblocks], const double atom_positions[natoms][3],
47 const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
48 const int level_list[ntasks], const int iatom_list[ntasks],
49 const int jatom_list[ntasks], const int iset_list[ntasks],
50 const int jset_list[ntasks], const int ipgf_list[ntasks],
51 const int jpgf_list[ntasks], const int border_mask_list[ntasks],
52 const int block_num_list[ntasks], const double radius_list[ntasks],
53 const double rab_list[ntasks][3], const int npts_global[nlevels][3],
54 const int npts_local[nlevels][3], const int shift_local[nlevels][3],
55 const int border_width[nlevels][3], const double dh[nlevels][3][3],
56 const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out) {
57
58 if (*task_list_out != NULL) {
59 // This is actually an opportunity to reuse some buffers.
60 grid_cpu_free_task_list(*task_list_out);
61 }
62
64 malloc(sizeof(grid_cpu_task_list_internal));
65 assert(task_list != NULL);
66
67 task_list->orthorhombic = orthorhombic;
68 task_list->ntasks = ntasks;
69 task_list->nlevels = nlevels;
70 task_list->natoms = natoms;
71 task_list->nkinds = nkinds;
72 task_list->nblocks = nblocks;
73
74 size_t size = nblocks * sizeof(int);
75 task_list->block_offsets = malloc(size);
76 assert(task_list->block_offsets != NULL || size == 0);
77 if (size != 0) {
78 memcpy(task_list->block_offsets, block_offsets, size);
79 }
80
81 size = 3 * natoms * sizeof(double);
82 task_list->atom_positions = malloc(size);
83 assert(task_list->atom_positions != NULL || size == 0);
84 if (size != 0) {
85 memcpy(task_list->atom_positions, atom_positions, size);
86 }
87
88 size = natoms * sizeof(int);
89 task_list->atom_kinds = malloc(size);
90 assert(task_list->atom_kinds != NULL || size == 0);
91 if (size != 0) {
92 memcpy(task_list->atom_kinds, atom_kinds, size);
93 }
94
95 size = nkinds * sizeof(grid_basis_set *);
96 task_list->basis_sets = malloc(size);
97 assert(task_list->basis_sets != NULL || size == 0);
98 if (size != 0) {
99 memcpy(task_list->basis_sets, basis_sets, size);
100 }
101
102 size = ntasks * sizeof(grid_cpu_task);
103 task_list->tasks = malloc(size);
104 assert(task_list->tasks != NULL || size == 0);
105#pragma omp parallel for schedule(static) if (ntasks > GRID_OMP_MIN_ITERATIONS)
106 for (int i = 0; i < ntasks; i++) {
107 task_list->tasks[i].level = level_list[i];
108 task_list->tasks[i].iatom = iatom_list[i];
109 task_list->tasks[i].jatom = jatom_list[i];
110 task_list->tasks[i].iset = iset_list[i];
111 task_list->tasks[i].jset = jset_list[i];
112 task_list->tasks[i].ipgf = ipgf_list[i];
113 task_list->tasks[i].jpgf = jpgf_list[i];
114 task_list->tasks[i].border_mask = border_mask_list[i];
115 task_list->tasks[i].block_num = block_num_list[i];
116 task_list->tasks[i].radius = radius_list[i];
117 task_list->tasks[i].rab[0] = rab_list[i][0];
118 task_list->tasks[i].rab[1] = rab_list[i][1];
119 task_list->tasks[i].rab[2] = rab_list[i][2];
120 }
121
122 // Store grid layouts.
123 size = nlevels * sizeof(grid_cpu_layout);
124 task_list->layouts = malloc(size);
125 assert(task_list->layouts != NULL || size == 0);
126 for (int level = 0; level < nlevels; level++) {
127 for (int i = 0; i < 3; i++) {
128 task_list->layouts[level].npts_global[i] = npts_global[level][i];
129 task_list->layouts[level].npts_local[i] = npts_local[level][i];
130 task_list->layouts[level].shift_local[i] = shift_local[level][i];
131 task_list->layouts[level].border_width[i] = border_width[level][i];
132 for (int j = 0; j < 3; j++) {
133 task_list->layouts[level].dh[i][j] = dh[level][i][j];
134 task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
135 }
136 }
137 }
138
139 // Sort tasks by level, block_num, iset, and jset.
140 qsort(task_list->tasks, ntasks, sizeof(grid_cpu_task), &compare_tasks);
141
142 // Find first and last task for each level and block.
143 size = nlevels * nblocks * sizeof(int);
144 task_list->first_level_block_task = malloc(size);
145 assert(task_list->first_level_block_task != NULL || size == 0);
146 task_list->last_level_block_task = malloc(size);
147 assert(task_list->last_level_block_task != NULL || size == 0);
148 const int nlevel_blocks = nlevels * nblocks;
149#pragma omp parallel for schedule(static) if (nlevel_blocks > \
150 GRID_OMP_MIN_ITERATIONS)
151 for (int i = 0; i < nlevel_blocks; i++) {
152 task_list->first_level_block_task[i] = 0;
153 task_list->last_level_block_task[i] = -1; // last < first means no tasks
154 }
155 for (int itask = 0; itask < ntasks; itask++) {
156 const int level = task_list->tasks[itask].level - 1;
157 const int block_num = task_list->tasks[itask].block_num - 1;
158 if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
159 task_list->tasks[itask - 1].block_num - 1 != block_num) {
160 task_list->first_level_block_task[level * nblocks + block_num] = itask;
161 }
162 task_list->last_level_block_task[level * nblocks + block_num] = itask;
163 }
164
165 // Find largest Cartesian subblock size.
166 task_list->maxco = 0;
167 for (int i = 0; i < nkinds; i++) {
168 task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
169 }
170
171 // Initialize thread-local storage.
172 size = omp_get_max_threads() * sizeof(double *);
173 task_list->threadlocals = malloc(size);
174 assert(task_list->threadlocals != NULL);
175 memset(task_list->threadlocals, 0, size);
176 size = omp_get_max_threads() * sizeof(size_t);
177 task_list->threadlocal_sizes = malloc(size);
178 assert(task_list->threadlocal_sizes != NULL);
179 memset(task_list->threadlocal_sizes, 0, size);
180
181 *task_list_out = task_list;
182}
183
184/*******************************************************************************
185 * \brief Deallocates given task list, basis_sets have to be freed separately.
186 * \author Ole Schuett
187 ******************************************************************************/
189 if (ptr == NULL)
190 return;
191
193
194 free(task_list->block_offsets);
195 free(task_list->atom_positions);
196 free(task_list->atom_kinds);
197 free(task_list->basis_sets);
198 free(task_list->tasks);
199 free(task_list->layouts);
200 free(task_list->first_level_block_task);
201 free(task_list->last_level_block_task);
202 for (int i = 0; i < omp_get_max_threads(); i++) {
203 if (task_list->threadlocals[i] != NULL) {
204 free(task_list->threadlocals[i]);
205 }
206 }
207 free(task_list->threadlocals);
208 free(task_list->threadlocal_sizes);
209 free(task_list);
210}
211
212/*******************************************************************************
213 * \brief Prototype for BLAS dgemm.
214 * \author Ole Schuett
215 ******************************************************************************/
216void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
217 const int *k, const double *alpha, const double *a, const int *lda,
218 const double *b, const int *ldb, const double *beta, double *c,
219 const int *ldc);
220
221/*******************************************************************************
222 * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
223 * \author Ole Schuett
224 ******************************************************************************/
225static void dgemm(const char transa, const char transb, const int m,
226 const int n, const int k, const double alpha, const double *a,
227 const int lda, const double *b, const int ldb,
228 const double beta, double *c, const int ldc) {
229 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
230 &ldc);
231}
232
233/*******************************************************************************
234 * \brief Transforms pab from contracted spherical to prim. cartesian basis.
235 * \author Ole Schuett
236 ******************************************************************************/
237static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
238 const int iset, const int jset, const bool transpose,
239 const double *block, double *pab) {
240
241 // Define some more convenient aliases.
242 const int ncoseta = ncoset(ibasis->lmax[iset]);
243 const int ncosetb = ncoset(jbasis->lmax[jset]);
244 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
245 const int ncob = jbasis->npgf[jset] * ncosetb;
246
247 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
248 const int nsgf_setb = jbasis->nsgf_set[jset];
249 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
250 const int nsgfb = jbasis->nsgf;
251 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
252 const int sgfb = jbasis->first_sgf[jset] - 1;
253 const int maxcoa = ibasis->maxco;
254 const int maxcob = jbasis->maxco;
255
256 double work[nsgf_setb * ncoa];
257 if (transpose) {
258 // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
259 dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
260 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
261 maxcoa, 0.0, work, ncoa);
262 } else {
263 // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
264 dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
265 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
266 maxcoa, 0.0, work, ncoa);
267 }
268 // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
269 dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
270 maxcob, work, ncoa, 0.0, pab, ncoa);
271}
272
273/*******************************************************************************
274 * \brief Collocate a range of tasks which are destined for the same grid level.
275 * \author Ole Schuett
276 ******************************************************************************/
278 const grid_cpu_task_list *ptr, const int *first_block_task,
279 const int *last_block_task, const enum grid_func func,
280 const int npts_global[3], const int npts_local[3], const int shift_local[3],
281 const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
282 const double *pab_blocks, offload_buffer *grid) {
283
284 if (ptr == NULL)
285 return;
286
288
289// Using default(shared) because with GCC 9 the behavior around const changed:
290// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
291#pragma omp parallel default(shared)
292 {
293 const int ithread = omp_get_thread_num();
294 const int nthreads = omp_get_num_threads();
295
296 // Initialize variables to detect when a new subblock has to be fetched.
297 int old_offset = -1, old_iset = -1, old_jset = -1;
298
299 // Matrix pab is re-used across tasks.
300 double pab[imax(task_list->maxco * task_list->maxco, 1)];
301
302 // Ensure that grid can fit into thread-local storage, reallocate if needed.
303 const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
304 const size_t grid_size = npts_local_total * sizeof(double);
305 if (task_list->threadlocal_sizes[ithread] < grid_size) {
306 if (task_list->threadlocals[ithread] != NULL) {
307 free(task_list->threadlocals[ithread]);
308 }
309 task_list->threadlocals[ithread] = malloc(grid_size);
310 assert(task_list->threadlocals[ithread] != NULL);
311 task_list->threadlocal_sizes[ithread] = grid_size;
312 }
313
314 // Zero thread-local copy of the grid.
315 double *const my_grid = task_list->threadlocals[ithread];
316 memset(my_grid, 0, grid_size);
317
318 // Parallelize over blocks to avoid unnecessary calls to load_pab.
319 const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
320#pragma omp for schedule(dynamic, chunk_size)
321 for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
322 const int first_task = first_block_task[block_num];
323 const int last_task = last_block_task[block_num];
324
325 for (int itask = first_task; itask <= last_task; itask++) {
326 // Define some convenient aliases.
327 const grid_cpu_task *task = &task_list->tasks[itask];
328 const int iatom = task->iatom - 1;
329 const int jatom = task->jatom - 1;
330 const int iset = task->iset - 1;
331 const int jset = task->jset - 1;
332 const int ipgf = task->ipgf - 1;
333 const int jpgf = task->jpgf - 1;
334 const int ikind = task_list->atom_kinds[iatom] - 1;
335 const int jkind = task_list->atom_kinds[jatom] - 1;
336 const grid_basis_set *ibasis = task_list->basis_sets[ikind];
337 const grid_basis_set *jbasis = task_list->basis_sets[jkind];
338 const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
339 const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
340 const int ncoseta = ncoset(ibasis->lmax[iset]);
341 const int ncosetb = ncoset(jbasis->lmax[jset]);
342 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
343 const int ncob = jbasis->npgf[jset] * ncosetb;
344 const int block_num = task->block_num - 1;
345 const int block_offset = task_list->block_offsets[block_num];
346 const double *block = &pab_blocks[block_offset];
347 const bool transpose = (iatom <= jatom);
348
349 // Load subblock from buffer and decontract into Cartesian sublock pab.
350 // The previous pab can be reused when only ipgf or jpgf has changed.
351 if (block_offset != old_offset || iset != old_iset ||
352 jset != old_jset) {
353 old_offset = block_offset;
354 old_iset = iset;
355 old_jset = jset;
356 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
357 }
358
360 /*orthorhombic=*/task_list->orthorhombic,
361 /*border_mask=*/task->border_mask,
362 /*func=*/func,
363 /*la_max=*/ibasis->lmax[iset],
364 /*la_min=*/ibasis->lmin[iset],
365 /*lb_max=*/jbasis->lmax[jset],
366 /*lb_min=*/jbasis->lmin[jset],
367 /*zeta=*/zeta,
368 /*zetb=*/zetb,
369 /*rscale=*/(iatom == jatom) ? 1 : 2,
370 /*dh=*/dh,
371 /*dh_inv=*/dh_inv,
372 /*ra=*/&task_list->atom_positions[3 * iatom],
373 /*rab=*/task->rab,
374 /*npts_global=*/npts_global,
375 /*npts_local=*/npts_local,
376 /*shift_local=*/shift_local,
377 /*border_width=*/border_width,
378 /*radius=*/task->radius,
379 /*o1=*/ipgf * ncoseta,
380 /*o2=*/jpgf * ncosetb,
381 /*n1=*/ncoa,
382 /*n2=*/ncob,
383 /*pab=*/(const double(*)[ncoa])pab,
384 /*grid=*/my_grid);
385
386 } // end of task loop
387 } // end of block loop
388
389// While there should be an implicit barrier at the end of the block loop, this
390// explicit barrier eliminates occasional seg faults with icc compiled binaries.
391#pragma omp barrier
392
393 // Merge thread-local grids via an efficient tree reduction.
394 const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
395 for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
396 // Threads are divided into groups, whose size doubles with each cycle.
397 // After a cycle the reduced data is stored at first thread of each group.
398 const int group_size = 1 << icycle; // 2**icycle
399 const int igroup = ithread / group_size;
400 const int dest_thread = igroup * group_size;
401 const int src_thread = dest_thread + group_size / 2;
402 // The last group might actually be a bit smaller.
403 const int actual_group_size = imin(group_size, nthreads - dest_thread);
404 // Parallelize summation by dividing grid points across group members.
405 const int rank = modulo(ithread, group_size); // position within the group
406 const int64_t lb = ((int64_t)npts_local_total * rank) / actual_group_size;
407 const int64_t ub =
408 ((int64_t)npts_local_total * (rank + 1)) / actual_group_size;
409 if (src_thread < nthreads) {
411 for (int i = (int)lb; i < (int)ub; i++) {
412 task_list->threadlocals[dest_thread][i] +=
413 task_list->threadlocals[src_thread][i];
414 }
415 }
416#pragma omp barrier
417 }
418
419 // Copy final result from first thread into shared grid.
420 const int64_t lb = ((int64_t)npts_local_total * ithread) / nthreads;
421 const int64_t ub = ((int64_t)npts_local_total * (ithread + 1)) / nthreads;
423 for (int i = (int)lb; i < (int)ub; i++) {
424 grid->host_buffer[i] = task_list->threadlocals[0][i];
425 }
426
427 } // end of omp parallel region
428}
429
430/*******************************************************************************
431 * \brief Collocate all tasks of in given list onto given grids.
432 * See grid_task_list.h for details.
433 * \author Ole Schuett
434 ******************************************************************************/
436 const enum grid_func func, const int nlevels,
437 const offload_buffer *pab_blocks,
438 offload_buffer *grids[nlevels]) {
439 if (ptr == NULL)
440 return;
441
443 assert(task_list->nlevels == nlevels);
444
445 for (int level = 0; level < task_list->nlevels; level++) {
446 const int idx = level * task_list->nblocks;
447 const int *first_block_task = &task_list->first_level_block_task[idx];
448 const int *last_block_task = &task_list->last_level_block_task[idx];
449 const grid_cpu_layout *layout = &task_list->layouts[level];
451 task_list, first_block_task, last_block_task, func, layout->npts_global,
452 layout->npts_local, layout->shift_local, layout->border_width,
453 layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
454 }
455}
456
457/*******************************************************************************
458 * \brief Transforms hab from prim. cartesian to contracted spherical basis.
459 * \author Ole Schuett
460 ******************************************************************************/
461static inline void store_hab(const grid_basis_set *ibasis,
462 const grid_basis_set *jbasis, const int iset,
463 const int jset, const bool transpose,
464 const double *hab, double *block) {
465
466 // Define some more convenient aliases.
467 const int ncoseta = ncoset(ibasis->lmax[iset]);
468 const int ncosetb = ncoset(jbasis->lmax[jset]);
469 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
470 const int ncob = jbasis->npgf[jset] * ncosetb;
471
472 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
473 const int nsgf_setb = jbasis->nsgf_set[jset];
474 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
475 const int nsgfb = jbasis->nsgf;
476 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
477 const int sgfb = jbasis->first_sgf[jset] - 1;
478 const int maxcoa = ibasis->maxco;
479 const int maxcob = jbasis->maxco;
480
481 double work[nsgf_setb * ncoa];
482
483 // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
484 dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
485 maxcob, hab, ncoa, 0.0, work, ncoa);
486
487 if (transpose) {
488 // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
489 dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
490 &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
491 &block[sgfb * nsgfa + sgfa], nsgfa);
492 } else {
493 // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
494 dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
495 &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
496 &block[sgfa * nsgfb + sgfb], nsgfb);
497 }
498}
499
500/*******************************************************************************
501 * \brief Integrate a range of tasks that belong to the same grid level.
502 * \author Ole Schuett
503 ******************************************************************************/
505 const grid_cpu_task_list *ptr, const int *first_block_task,
506 const int *last_block_task, const bool compute_tau, const int natoms,
507 const int npts_global[3], const int npts_local[3], const int shift_local[3],
508 const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
509 const offload_buffer *pab_blocks, const offload_buffer *grid,
510 offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
511
512 if (ptr == NULL)
513 return;
514
516
517// Using default(shared) because with GCC 9 the behavior around const changed:
518// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
519#pragma omp parallel default(shared)
520 {
521 // Initialize variables to detect when a new subblock has to be fetched.
522 int old_offset = -1, old_iset = -1, old_jset = -1;
523 grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
524 bool old_transpose = false;
525
526 // Matrix pab and hab are re-used across tasks.
527 double pab[imax(task_list->maxco * task_list->maxco, 1)];
528 double hab[imax(task_list->maxco * task_list->maxco, 1)];
529
530 // Parallelize over blocks to avoid concurred access to hab_blocks.
531 const int nthreads = omp_get_num_threads();
532 const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
533#pragma omp for schedule(dynamic, chunk_size)
534 for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
535 const int first_task = first_block_task[block_num];
536 const int last_task = last_block_task[block_num];
537
538 // Accumulate forces per block as it corresponds to a pair of atoms.
539 const int iatom = task_list->tasks[first_task].iatom - 1;
540 const int jatom = task_list->tasks[first_task].jatom - 1;
541 double my_forces[2][3] = {0};
542 double my_virials[2][3][3] = {0};
543
544 for (int itask = first_task; itask <= last_task; itask++) {
545 // Define some convenient aliases.
546 const grid_cpu_task *task = &task_list->tasks[itask];
547 assert(task->block_num - 1 == block_num);
548 assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
549 const int ikind = task_list->atom_kinds[iatom] - 1;
550 const int jkind = task_list->atom_kinds[jatom] - 1;
551 grid_basis_set *ibasis = task_list->basis_sets[ikind];
552 grid_basis_set *jbasis = task_list->basis_sets[jkind];
553 const int iset = task->iset - 1;
554 const int jset = task->jset - 1;
555 const int ipgf = task->ipgf - 1;
556 const int jpgf = task->jpgf - 1;
557 const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
558 const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
559 const int ncoseta = ncoset(ibasis->lmax[iset]);
560 const int ncosetb = ncoset(jbasis->lmax[jset]);
561 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
562 const int ncob = jbasis->npgf[jset] * ncosetb;
563 const int block_offset = task_list->block_offsets[block_num];
564 const bool transpose = (iatom <= jatom);
565 const bool pab_required = (forces != NULL || virial != NULL);
566
567 // Load pab and store hab subblocks when needed.
568 // Previous hab and pab can be reused when only ipgf or jpgf changed.
569 if (block_offset != old_offset || iset != old_iset ||
570 jset != old_jset) {
571 if (pab_required) {
572 load_pab(ibasis, jbasis, iset, jset, transpose,
573 &pab_blocks->host_buffer[block_offset], pab);
574 }
575 if (old_offset >= 0) { // skip first iteration
576 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
577 hab, &hab_blocks->host_buffer[old_offset]);
578 }
579 memset(hab, 0, ncoa * ncob * sizeof(double));
580 old_offset = block_offset;
581 old_iset = iset;
582 old_jset = jset;
583 old_ibasis = ibasis;
584 old_jbasis = jbasis;
585 old_transpose = transpose;
586 }
587
589 /*orthorhombic=*/task_list->orthorhombic,
590 /*compute_tau=*/compute_tau,
591 /*border_mask=*/task->border_mask,
592 /*la_max=*/ibasis->lmax[iset],
593 /*la_min=*/ibasis->lmin[iset],
594 /*lb_max=*/jbasis->lmax[jset],
595 /*lb_min=*/jbasis->lmin[jset],
596 /*zeta=*/zeta,
597 /*zetb=*/zetb,
598 /*dh=*/dh,
599 /*dh_inv=*/dh_inv,
600 /*ra=*/&task_list->atom_positions[3 * iatom],
601 /*rab=*/task->rab,
602 /*npts_global=*/npts_global,
603 /*npts_local=*/npts_local,
604 /*shift_local=*/shift_local,
605 /*border_width=*/border_width,
606 /*radius=*/task->radius,
607 /*o1=*/ipgf * ncoseta,
608 /*o2=*/jpgf * ncosetb,
609 /*n1=*/ncoa,
610 /*n2=*/ncob,
611 /*grid=*/grid->host_buffer,
612 /*hab=*/(double(*)[ncoa])hab,
613 /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
614 /*forces=*/(forces != NULL) ? my_forces : NULL,
615 /*virials=*/(virial != NULL) ? my_virials : NULL,
616 /*hdab=*/NULL,
617 /*hadb=*/NULL,
618 /*a_hdab=*/NULL);
619
620 } // end of task loop
621
622 // Merge thread-local forces and virial into shared ones.
623 // It does not seem worth the trouble to accumulate them thread-locally.
624 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
625 if (forces != NULL) {
626#pragma omp critical(forces)
627 for (int i = 0; i < 3; i++) {
628 forces[iatom][i] += scalef * my_forces[0][i];
629 forces[jatom][i] += scalef * my_forces[1][i];
630 }
631 }
632 if (virial != NULL) {
633#pragma omp critical(virial)
634 for (int i = 0; i < 3; i++) {
635 for (int j = 0; j < 3; j++) {
636 virial[i][j] += scalef * my_virials[0][i][j];
637 virial[i][j] += scalef * my_virials[1][i][j];
638 }
639 }
640 }
641
642 } // end of block loop
643
644 // store final hab
645 if (old_offset >= 0) {
646 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
647 &hab_blocks->host_buffer[old_offset]);
648 }
649
650 } // end of omp parallel region
651}
652
653/*******************************************************************************
654 * \brief Integrate all tasks of in given list from given grids.
655 * See grid_task_list.h for details.
656 * \author Ole Schuett
657 ******************************************************************************/
659 const grid_cpu_task_list *ptr, const bool compute_tau, const int natoms,
660 const int nlevels, const offload_buffer *pab_blocks,
661 const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
662 double forces[natoms][3], double virial[3][3]) {
663
664 if (ptr == NULL)
665 return;
666
668
669 assert(task_list->nlevels == nlevels);
670 assert(task_list->natoms == natoms);
671 assert(hab_blocks != NULL);
672
673 // Zero result arrays.
674 if (hab_blocks->size != 0) {
675 memset(hab_blocks->host_buffer, 0, hab_blocks->size);
676 }
677 if (forces != NULL) {
678 memset(forces, 0, natoms * 3 * sizeof(double));
679 }
680 if (virial != NULL) {
681 memset(virial, 0, 9 * sizeof(double));
682 }
683
684 for (int level = 0; level < task_list->nlevels; level++) {
685 const int idx = level * task_list->nblocks;
686 const int *first_block_task = &task_list->first_level_block_task[idx];
687 const int *last_block_task = &task_list->last_level_block_task[idx];
688 const grid_cpu_layout *layout = &task_list->layouts[level];
690 task_list, first_block_task, last_block_task, compute_tau, natoms,
691 layout->npts_global, layout->npts_local, layout->shift_local,
692 layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
693 grids[level], hab_blocks, forces, virial);
694 }
695}
696
697// EOF
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
Definition dbm_miniapp.c:36
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
#define GRID_PRAGMA_SIMD_LOOP
Definition grid_common.h:25
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
grid_func
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
const int ub
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_cpu_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Public entry point. A thin wrapper with the only purpose of calling write_task_file when DUMP_TASKS =...
void grid_cpu_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_cpu_integrate.h for details.
static void collocate_one_grid_level(const grid_cpu_task_list *ptr, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
void grid_cpu_collocate_task_list(const grid_cpu_task_list *ptr, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
void grid_cpu_free_task_list(grid_cpu_task_list *ptr)
Deallocates given task list, basis_sets have to be freed separately.
void grid_cpu_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out)
Allocates a task list for the cpu backend. See grid_task_list.h for details.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
static void integrate_one_grid_level(const grid_cpu_task_list *ptr, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
void grid_cpu_integrate_task_list(const grid_cpu_task_list *ptr, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
void grid_cpu_task_list
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
Internal representation of a task.
Internal representation of a buffer.
double * host_buffer