(git:97501a3)
Loading...
Searching...
No Matches
grid_cpu_task_list.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <math.h>
10#include <omp.h>
11#include <stdint.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15
16#include "../common/grid_common.h"
17#include "grid_cpu_collocate.h"
18#include "grid_cpu_integrate.h"
19#include "grid_cpu_task_list.h"
20
21/*******************************************************************************
22 * \brief Comperator passed to qsort to compare two tasks.
23 * \author Ole Schuett
24 ******************************************************************************/
25static int compare_tasks(const void *a, const void *b) {
26 const grid_cpu_task *task_a = a, *task_b = b;
27 if (task_a->level != task_b->level) {
28 return task_a->level - task_b->level;
29 } else if (task_a->block_num != task_b->block_num) {
30 return task_a->block_num - task_b->block_num;
31 } else if (task_a->iset != task_b->iset) {
32 return task_a->iset - task_b->iset;
33 } else {
34 return task_a->jset - task_b->jset;
35 }
36}
37/*******************************************************************************
38 * \brief Allocates a task list for the cpu backend.
39 * See grid_task_list.h for details.
40 * \author Ole Schuett
41 ******************************************************************************/
43 const bool orthorhombic, const int ntasks, const int nlevels,
44 const int natoms, const int nkinds, const int nblocks,
45 const int block_offsets[nblocks], const double atom_positions[natoms][3],
46 const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
47 const int level_list[ntasks], const int iatom_list[ntasks],
48 const int jatom_list[ntasks], const int iset_list[ntasks],
49 const int jset_list[ntasks], const int ipgf_list[ntasks],
50 const int jpgf_list[ntasks], const int border_mask_list[ntasks],
51 const int block_num_list[ntasks], const double radius_list[ntasks],
52 const double rab_list[ntasks][3], const int npts_global[nlevels][3],
53 const int npts_local[nlevels][3], const int shift_local[nlevels][3],
54 const int border_width[nlevels][3], const double dh[nlevels][3][3],
55 const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out) {
56
57 if (*task_list_out != NULL) {
58 // This is actually an opportunity to reuse some buffers.
59 grid_cpu_free_task_list(*task_list_out);
60 }
61
62 grid_cpu_task_list *task_list = malloc(sizeof(grid_cpu_task_list));
63 assert(task_list != NULL);
64
65 task_list->orthorhombic = orthorhombic;
66 task_list->ntasks = ntasks;
67 task_list->nlevels = nlevels;
68 task_list->natoms = natoms;
69 task_list->nkinds = nkinds;
70 task_list->nblocks = nblocks;
71
72 size_t size = nblocks * sizeof(int);
73 task_list->block_offsets = malloc(size);
74 assert(task_list->block_offsets != NULL || size == 0);
75 if (size != 0) {
76 memcpy(task_list->block_offsets, block_offsets, size);
77 }
78
79 size = 3 * natoms * sizeof(double);
80 task_list->atom_positions = malloc(size);
81 assert(task_list->atom_positions != NULL || size == 0);
82 if (size != 0) {
83 memcpy(task_list->atom_positions, atom_positions, size);
84 }
85
86 size = natoms * sizeof(int);
87 task_list->atom_kinds = malloc(size);
88 assert(task_list->atom_kinds != NULL || size == 0);
89 if (size != 0) {
90 memcpy(task_list->atom_kinds, atom_kinds, size);
91 }
92
93 size = nkinds * sizeof(grid_basis_set *);
94 task_list->basis_sets = malloc(size);
95 assert(task_list->basis_sets != NULL || size == 0);
96 if (size != 0) {
97 memcpy(task_list->basis_sets, basis_sets, size);
98 }
99
100 size = ntasks * sizeof(grid_cpu_task);
101 task_list->tasks = malloc(size);
102 assert(task_list->tasks != NULL || size == 0);
103 for (int i = 0; i < ntasks; i++) {
104 task_list->tasks[i].level = level_list[i];
105 task_list->tasks[i].iatom = iatom_list[i];
106 task_list->tasks[i].jatom = jatom_list[i];
107 task_list->tasks[i].iset = iset_list[i];
108 task_list->tasks[i].jset = jset_list[i];
109 task_list->tasks[i].ipgf = ipgf_list[i];
110 task_list->tasks[i].jpgf = jpgf_list[i];
111 task_list->tasks[i].border_mask = border_mask_list[i];
112 task_list->tasks[i].block_num = block_num_list[i];
113 task_list->tasks[i].radius = radius_list[i];
114 task_list->tasks[i].rab[0] = rab_list[i][0];
115 task_list->tasks[i].rab[1] = rab_list[i][1];
116 task_list->tasks[i].rab[2] = rab_list[i][2];
117 }
118
119 // Store grid layouts.
120 size = nlevels * sizeof(grid_cpu_layout);
121 task_list->layouts = malloc(size);
122 assert(task_list->layouts != NULL || size == 0);
123 for (int level = 0; level < nlevels; level++) {
124 for (int i = 0; i < 3; i++) {
125 task_list->layouts[level].npts_global[i] = npts_global[level][i];
126 task_list->layouts[level].npts_local[i] = npts_local[level][i];
127 task_list->layouts[level].shift_local[i] = shift_local[level][i];
128 task_list->layouts[level].border_width[i] = border_width[level][i];
129 for (int j = 0; j < 3; j++) {
130 task_list->layouts[level].dh[i][j] = dh[level][i][j];
131 task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
132 }
133 }
134 }
135
136 // Sort tasks by level, block_num, iset, and jset.
137 qsort(task_list->tasks, ntasks, sizeof(grid_cpu_task), &compare_tasks);
138
139 // Find first and last task for each level and block.
140 size = nlevels * nblocks * sizeof(int);
141 task_list->first_level_block_task = malloc(size);
142 assert(task_list->first_level_block_task != NULL || size == 0);
143 task_list->last_level_block_task = malloc(size);
144 assert(task_list->last_level_block_task != NULL || size == 0);
145 for (int i = 0; i < nlevels * nblocks; i++) {
146 task_list->first_level_block_task[i] = 0;
147 task_list->last_level_block_task[i] = -1; // last < first means no tasks
148 }
149 for (int itask = 0; itask < ntasks; itask++) {
150 const int level = task_list->tasks[itask].level - 1;
151 const int block_num = task_list->tasks[itask].block_num - 1;
152 if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
153 task_list->tasks[itask - 1].block_num - 1 != block_num) {
154 task_list->first_level_block_task[level * nblocks + block_num] = itask;
155 }
156 task_list->last_level_block_task[level * nblocks + block_num] = itask;
157 }
158
159 // Find largest Cartesian subblock size.
160 task_list->maxco = 0;
161 for (int i = 0; i < nkinds; i++) {
162 task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
163 }
164
165 // Initialize thread-local storage.
166 size = omp_get_max_threads() * sizeof(double *);
167 task_list->threadlocals = malloc(size);
168 assert(task_list->threadlocals != NULL);
169 memset(task_list->threadlocals, 0, size);
170 size = omp_get_max_threads() * sizeof(size_t);
171 task_list->threadlocal_sizes = malloc(size);
172 assert(task_list->threadlocal_sizes != NULL);
173 memset(task_list->threadlocal_sizes, 0, size);
174
175 *task_list_out = task_list;
176}
177
178/*******************************************************************************
179 * \brief Deallocates given task list, basis_sets have to be freed separately.
180 * \author Ole Schuett
181 ******************************************************************************/
183 free(task_list->block_offsets);
184 free(task_list->atom_positions);
185 free(task_list->atom_kinds);
186 free(task_list->basis_sets);
187 free(task_list->tasks);
188 free(task_list->layouts);
189 free(task_list->first_level_block_task);
190 free(task_list->last_level_block_task);
191 for (int i = 0; i < omp_get_max_threads(); i++) {
192 if (task_list->threadlocals[i] != NULL) {
193 free(task_list->threadlocals[i]);
194 }
195 }
196 free(task_list->threadlocals);
197 free(task_list->threadlocal_sizes);
198 free(task_list);
199}
200
201/*******************************************************************************
202 * \brief Prototype for BLAS dgemm.
203 * \author Ole Schuett
204 ******************************************************************************/
205void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
206 const int *k, const double *alpha, const double *a, const int *lda,
207 const double *b, const int *ldb, const double *beta, double *c,
208 const int *ldc);
209
210/*******************************************************************************
211 * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
212 * \author Ole Schuett
213 ******************************************************************************/
214static void dgemm(const char transa, const char transb, const int m,
215 const int n, const int k, const double alpha, const double *a,
216 const int lda, const double *b, const int ldb,
217 const double beta, double *c, const int ldc) {
218 dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
219 &ldc);
220}
221
222/*******************************************************************************
223 * \brief Transforms pab from contracted spherical to prim. cartesian basis.
224 * \author Ole Schuett
225 ******************************************************************************/
226static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
227 const int iset, const int jset, const bool transpose,
228 const double *block, double *pab) {
229
230 // Define some more convenient aliases.
231 const int ncoseta = ncoset(ibasis->lmax[iset]);
232 const int ncosetb = ncoset(jbasis->lmax[jset]);
233 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
234 const int ncob = jbasis->npgf[jset] * ncosetb;
235
236 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
237 const int nsgf_setb = jbasis->nsgf_set[jset];
238 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
239 const int nsgfb = jbasis->nsgf;
240 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
241 const int sgfb = jbasis->first_sgf[jset] - 1;
242 const int maxcoa = ibasis->maxco;
243 const int maxcob = jbasis->maxco;
244
245 double work[nsgf_setb * ncoa];
246 if (transpose) {
247 // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
248 dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
249 &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
250 maxcoa, 0.0, work, ncoa);
251 } else {
252 // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
253 dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
254 &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
255 maxcoa, 0.0, work, ncoa);
256 }
257 // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
258 dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
259 maxcob, work, ncoa, 0.0, pab, ncoa);
260}
261
262/*******************************************************************************
263 * \brief Collocate a range of tasks which are destined for the same grid level.
264 * \author Ole Schuett
265 ******************************************************************************/
267 const grid_cpu_task_list *task_list, const int *first_block_task,
268 const int *last_block_task, const enum grid_func func,
269 const int npts_global[3], const int npts_local[3], const int shift_local[3],
270 const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
271 const double *pab_blocks, offload_buffer *grid) {
272
273// Using default(shared) because with GCC 9 the behavior around const changed:
274// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
275#pragma omp parallel default(shared)
276 {
277 const int ithread = omp_get_thread_num();
278 const int nthreads = omp_get_num_threads();
279
280 // Initialize variables to detect when a new subblock has to be fetched.
281 int old_offset = -1, old_iset = -1, old_jset = -1;
282
283 // Matrix pab is re-used across tasks.
284 double pab[imax(task_list->maxco * task_list->maxco, 1)];
285
286 // Ensure that grid can fit into thread-local storage, reallocate if needed.
287 const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
288 const size_t grid_size = npts_local_total * sizeof(double);
289 if (task_list->threadlocal_sizes[ithread] < grid_size) {
290 if (task_list->threadlocals[ithread] != NULL) {
291 free(task_list->threadlocals[ithread]);
292 }
293 task_list->threadlocals[ithread] = malloc(grid_size);
294 assert(task_list->threadlocals[ithread] != NULL);
295 task_list->threadlocal_sizes[ithread] = grid_size;
296 }
297
298 // Zero thread-local copy of the grid.
299 double *const my_grid = task_list->threadlocals[ithread];
300 memset(my_grid, 0, grid_size);
301
302 // Parallelize over blocks to avoid unnecessary calls to load_pab.
303 const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
304#pragma omp for schedule(dynamic, chunk_size)
305 for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
306 const int first_task = first_block_task[block_num];
307 const int last_task = last_block_task[block_num];
308
309 for (int itask = first_task; itask <= last_task; itask++) {
310 // Define some convenient aliases.
311 const grid_cpu_task *task = &task_list->tasks[itask];
312 const int iatom = task->iatom - 1;
313 const int jatom = task->jatom - 1;
314 const int iset = task->iset - 1;
315 const int jset = task->jset - 1;
316 const int ipgf = task->ipgf - 1;
317 const int jpgf = task->jpgf - 1;
318 const int ikind = task_list->atom_kinds[iatom] - 1;
319 const int jkind = task_list->atom_kinds[jatom] - 1;
320 const grid_basis_set *ibasis = task_list->basis_sets[ikind];
321 const grid_basis_set *jbasis = task_list->basis_sets[jkind];
322 const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
323 const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
324 const int ncoseta = ncoset(ibasis->lmax[iset]);
325 const int ncosetb = ncoset(jbasis->lmax[jset]);
326 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
327 const int ncob = jbasis->npgf[jset] * ncosetb;
328 const int block_num = task->block_num - 1;
329 const int block_offset = task_list->block_offsets[block_num];
330 const double *block = &pab_blocks[block_offset];
331 const bool transpose = (iatom <= jatom);
332
333 // Load subblock from buffer and decontract into Cartesian sublock pab.
334 // The previous pab can be reused when only ipgf or jpgf has changed.
335 if (block_offset != old_offset || iset != old_iset ||
336 jset != old_jset) {
337 old_offset = block_offset;
338 old_iset = iset;
339 old_jset = jset;
340 load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
341 }
342
344 /*orthorhombic=*/task_list->orthorhombic,
345 /*border_mask=*/task->border_mask,
346 /*func=*/func,
347 /*la_max=*/ibasis->lmax[iset],
348 /*la_min=*/ibasis->lmin[iset],
349 /*lb_max=*/jbasis->lmax[jset],
350 /*lb_min=*/jbasis->lmin[jset],
351 /*zeta=*/zeta,
352 /*zetb=*/zetb,
353 /*rscale=*/(iatom == jatom) ? 1 : 2,
354 /*dh=*/dh,
355 /*dh_inv=*/dh_inv,
356 /*ra=*/&task_list->atom_positions[3 * iatom],
357 /*rab=*/task->rab,
358 /*npts_global=*/npts_global,
359 /*npts_local=*/npts_local,
360 /*shift_local=*/shift_local,
361 /*border_width=*/border_width,
362 /*radius=*/task->radius,
363 /*o1=*/ipgf * ncoseta,
364 /*o2=*/jpgf * ncosetb,
365 /*n1=*/ncoa,
366 /*n2=*/ncob,
367 /*pab=*/(const double(*)[ncoa])pab,
368 /*grid=*/my_grid);
369
370 } // end of task loop
371 } // end of block loop
372
373// While there should be an implicit barrier at the end of the block loop, this
374// explicit barrier eliminates occasional seg faults with icc compiled binaries.
375#pragma omp barrier
376
377 // Merge thread-local grids via an efficient tree reduction.
378 const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
379 for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
380 // Threads are divided into groups, whose size doubles with each cycle.
381 // After a cycle the reduced data is stored at first thread of each group.
382 const int group_size = 1 << icycle; // 2**icycle
383 const int igroup = ithread / group_size;
384 const int dest_thread = igroup * group_size;
385 const int src_thread = dest_thread + group_size / 2;
386 // The last group might actually be a bit smaller.
387 const int actual_group_size = imin(group_size, nthreads - dest_thread);
388 // Parallelize summation by dividing grid points across group members.
389 const int rank = modulo(ithread, group_size); // position within the group
390 const int64_t lb = ((int64_t)npts_local_total * rank) / actual_group_size;
391 const int64_t ub =
392 ((int64_t)npts_local_total * (rank + 1)) / actual_group_size;
393 if (src_thread < nthreads) {
394 for (int i = (int)lb; i < (int)ub; i++) {
395 task_list->threadlocals[dest_thread][i] +=
396 task_list->threadlocals[src_thread][i];
397 }
398 }
399#pragma omp barrier
400 }
401
402 // Copy final result from first thread into shared grid.
403 const int64_t lb = ((int64_t)npts_local_total * ithread) / nthreads;
404 const int64_t ub = ((int64_t)npts_local_total * (ithread + 1)) / nthreads;
405 for (int i = (int)lb; i < (int)ub; i++) {
406 grid->host_buffer[i] = task_list->threadlocals[0][i];
407 }
408
409 } // end of omp parallel region
410}
411
412/*******************************************************************************
413 * \brief Collocate all tasks of in given list onto given grids.
414 * See grid_task_list.h for details.
415 * \author Ole Schuett
416 ******************************************************************************/
418 const enum grid_func func, const int nlevels,
419 const offload_buffer *pab_blocks,
420 offload_buffer *grids[nlevels]) {
421
422 assert(task_list->nlevels == nlevels);
423
424 for (int level = 0; level < task_list->nlevels; level++) {
425 const int idx = level * task_list->nblocks;
426 const int *first_block_task = &task_list->first_level_block_task[idx];
427 const int *last_block_task = &task_list->last_level_block_task[idx];
428 const grid_cpu_layout *layout = &task_list->layouts[level];
430 task_list, first_block_task, last_block_task, func, layout->npts_global,
431 layout->npts_local, layout->shift_local, layout->border_width,
432 layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
433 }
434}
435
436/*******************************************************************************
437 * \brief Transforms hab from prim. cartesian to contracted spherical basis.
438 * \author Ole Schuett
439 ******************************************************************************/
440static inline void store_hab(const grid_basis_set *ibasis,
441 const grid_basis_set *jbasis, const int iset,
442 const int jset, const bool transpose,
443 const double *hab, double *block) {
444
445 // Define some more convenient aliases.
446 const int ncoseta = ncoset(ibasis->lmax[iset]);
447 const int ncosetb = ncoset(jbasis->lmax[jset]);
448 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
449 const int ncob = jbasis->npgf[jset] * ncosetb;
450
451 const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
452 const int nsgf_setb = jbasis->nsgf_set[jset];
453 const int nsgfa = ibasis->nsgf; // size of entire spherical basis
454 const int nsgfb = jbasis->nsgf;
455 const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
456 const int sgfb = jbasis->first_sgf[jset] - 1;
457 const int maxcoa = ibasis->maxco;
458 const int maxcob = jbasis->maxco;
459
460 double work[nsgf_setb * ncoa];
461
462 // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
463 dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
464 maxcob, hab, ncoa, 0.0, work, ncoa);
465
466 if (transpose) {
467 // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
468 dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
469 &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
470 &block[sgfb * nsgfa + sgfa], nsgfa);
471 } else {
472 // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
473 dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
474 &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
475 &block[sgfa * nsgfb + sgfb], nsgfb);
476 }
477}
478
479/*******************************************************************************
480 * \brief Integrate a range of tasks that belong to the same grid level.
481 * \author Ole Schuett
482 ******************************************************************************/
484 const grid_cpu_task_list *task_list, const int *first_block_task,
485 const int *last_block_task, const bool compute_tau, const int natoms,
486 const int npts_global[3], const int npts_local[3], const int shift_local[3],
487 const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
488 const offload_buffer *pab_blocks, const offload_buffer *grid,
489 offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
490
491// Using default(shared) because with GCC 9 the behavior around const changed:
492// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
493#pragma omp parallel default(shared)
494 {
495 // Initialize variables to detect when a new subblock has to be fetched.
496 int old_offset = -1, old_iset = -1, old_jset = -1;
497 grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
498 bool old_transpose = false;
499
500 // Matrix pab and hab are re-used across tasks.
501 double pab[imax(task_list->maxco * task_list->maxco, 1)];
502 double hab[imax(task_list->maxco * task_list->maxco, 1)];
503
504 // Parallelize over blocks to avoid concurred access to hab_blocks.
505 const int nthreads = omp_get_num_threads();
506 const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
507#pragma omp for schedule(dynamic, chunk_size)
508 for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
509 const int first_task = first_block_task[block_num];
510 const int last_task = last_block_task[block_num];
511
512 // Accumulate forces per block as it corresponds to a pair of atoms.
513 const int iatom = task_list->tasks[first_task].iatom - 1;
514 const int jatom = task_list->tasks[first_task].jatom - 1;
515 double my_forces[2][3] = {0};
516 double my_virials[2][3][3] = {0};
517
518 for (int itask = first_task; itask <= last_task; itask++) {
519 // Define some convenient aliases.
520 const grid_cpu_task *task = &task_list->tasks[itask];
521 assert(task->block_num - 1 == block_num);
522 assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
523 const int ikind = task_list->atom_kinds[iatom] - 1;
524 const int jkind = task_list->atom_kinds[jatom] - 1;
525 grid_basis_set *ibasis = task_list->basis_sets[ikind];
526 grid_basis_set *jbasis = task_list->basis_sets[jkind];
527 const int iset = task->iset - 1;
528 const int jset = task->jset - 1;
529 const int ipgf = task->ipgf - 1;
530 const int jpgf = task->jpgf - 1;
531 const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
532 const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
533 const int ncoseta = ncoset(ibasis->lmax[iset]);
534 const int ncosetb = ncoset(jbasis->lmax[jset]);
535 const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
536 const int ncob = jbasis->npgf[jset] * ncosetb;
537 const int block_offset = task_list->block_offsets[block_num];
538 const bool transpose = (iatom <= jatom);
539 const bool pab_required = (forces != NULL || virial != NULL);
540
541 // Load pab and store hab subblocks when needed.
542 // Previous hab and pab can be reused when only ipgf or jpgf changed.
543 if (block_offset != old_offset || iset != old_iset ||
544 jset != old_jset) {
545 if (pab_required) {
546 load_pab(ibasis, jbasis, iset, jset, transpose,
547 &pab_blocks->host_buffer[block_offset], pab);
548 }
549 if (old_offset >= 0) { // skip first iteration
550 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
551 hab, &hab_blocks->host_buffer[old_offset]);
552 }
553 memset(hab, 0, ncoa * ncob * sizeof(double));
554 old_offset = block_offset;
555 old_iset = iset;
556 old_jset = jset;
557 old_ibasis = ibasis;
558 old_jbasis = jbasis;
559 old_transpose = transpose;
560 }
561
563 /*orthorhombic=*/task_list->orthorhombic,
564 /*compute_tau=*/compute_tau,
565 /*border_mask=*/task->border_mask,
566 /*la_max=*/ibasis->lmax[iset],
567 /*la_min=*/ibasis->lmin[iset],
568 /*lb_max=*/jbasis->lmax[jset],
569 /*lb_min=*/jbasis->lmin[jset],
570 /*zeta=*/zeta,
571 /*zetb=*/zetb,
572 /*dh=*/dh,
573 /*dh_inv=*/dh_inv,
574 /*ra=*/&task_list->atom_positions[3 * iatom],
575 /*rab=*/task->rab,
576 /*npts_global=*/npts_global,
577 /*npts_local=*/npts_local,
578 /*shift_local=*/shift_local,
579 /*border_width=*/border_width,
580 /*radius=*/task->radius,
581 /*o1=*/ipgf * ncoseta,
582 /*o2=*/jpgf * ncosetb,
583 /*n1=*/ncoa,
584 /*n2=*/ncob,
585 /*grid=*/grid->host_buffer,
586 /*hab=*/(double(*)[ncoa])hab,
587 /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
588 /*forces=*/(forces != NULL) ? my_forces : NULL,
589 /*virials=*/(virial != NULL) ? my_virials : NULL,
590 /*hdab=*/NULL,
591 /*hadb=*/NULL,
592 /*a_hdab=*/NULL);
593
594 } // end of task loop
595
596 // Merge thread-local forces and virial into shared ones.
597 // It does not seem worth the trouble to accumulate them thread-locally.
598 const double scalef = (iatom == jatom) ? 1.0 : 2.0;
599 if (forces != NULL) {
600#pragma omp critical(forces)
601 for (int i = 0; i < 3; i++) {
602 forces[iatom][i] += scalef * my_forces[0][i];
603 forces[jatom][i] += scalef * my_forces[1][i];
604 }
605 }
606 if (virial != NULL) {
607#pragma omp critical(virial)
608 for (int i = 0; i < 3; i++) {
609 for (int j = 0; j < 3; j++) {
610 virial[i][j] += scalef * my_virials[0][i][j];
611 virial[i][j] += scalef * my_virials[1][i][j];
612 }
613 }
614 }
615
616 } // end of block loop
617
618 // store final hab
619 if (old_offset >= 0) {
620 store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
621 &hab_blocks->host_buffer[old_offset]);
622 }
623
624 } // end of omp parallel region
625}
626
627/*******************************************************************************
628 * \brief Integrate all tasks of in given list from given grids.
629 * See grid_task_list.h for details.
630 * \author Ole Schuett
631 ******************************************************************************/
633 const grid_cpu_task_list *task_list, const bool compute_tau,
634 const int natoms, const int nlevels, const offload_buffer *pab_blocks,
635 const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
636 double forces[natoms][3], double virial[3][3]) {
637
638 assert(task_list->nlevels == nlevels);
639 assert(task_list->natoms == natoms);
640 assert(hab_blocks != NULL);
641
642 // Zero result arrays.
643 if (hab_blocks->size != 0) {
644 memset(hab_blocks->host_buffer, 0, hab_blocks->size);
645 }
646 if (forces != NULL) {
647 memset(forces, 0, natoms * 3 * sizeof(double));
648 }
649 if (virial != NULL) {
650 memset(virial, 0, 9 * sizeof(double));
651 }
652
653 for (int level = 0; level < task_list->nlevels; level++) {
654 const int idx = level * task_list->nblocks;
655 const int *first_block_task = &task_list->first_level_block_task[idx];
656 const int *last_block_task = &task_list->last_level_block_task[idx];
657 const grid_cpu_layout *layout = &task_list->layouts[level];
659 task_list, first_block_task, last_block_task, compute_tau, natoms,
660 layout->npts_global, layout->npts_local, layout->shift_local,
661 layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
662 grids[level], hab_blocks, forces, virial);
663 }
664}
665
666// EOF
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
Definition dbm_miniapp.c:38
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
grid_func
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
const int ub
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_cpu_collocate_pgf_product(const bool orthorhombic, const int border_mask, const enum grid_func func, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double rscale, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double pab[n2][n1], double *grid)
Public entry point. A thin wrapper with the only purpose of calling write_task_file when DUMP_TASKS =...
void grid_cpu_integrate_pgf_product(const bool orthorhombic, const bool compute_tau, const int border_mask, const int la_max, const int la_min, const int lb_max, const int lb_min, const double zeta, const double zetb, const double dh[3][3], const double dh_inv[3][3], const double ra[3], const double rab[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, const int o1, const int o2, const int n1, const int n2, const double *grid, double hab[n2][n1], const double pab[n2][n1], double forces[2][3], double virials[2][3][3], double hdab[n2][n1][3], double hadb[n2][n1][3], double a_hdab[n2][n1][3][3])
Integrates a single task. See grid_cpu_integrate.h for details.
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *block, double *pab)
Transforms pab from contracted spherical to prim. cartesian basis.
void grid_cpu_free_task_list(grid_cpu_task_list *task_list)
Deallocates given task list, basis_sets have to be freed separately.
void grid_cpu_create_task_list(const bool orthorhombic, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int block_offsets[nblocks], const double atom_positions[natoms][3], const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds], const int level_list[ntasks], const int iatom_list[ntasks], const int jatom_list[ntasks], const int iset_list[ntasks], const int jset_list[ntasks], const int ipgf_list[ntasks], const int jpgf_list[ntasks], const int border_mask_list[ntasks], const int block_num_list[ntasks], const double radius_list[ntasks], const double rab_list[ntasks][3], const int npts_global[nlevels][3], const int npts_local[nlevels][3], const int shift_local[nlevels][3], const int border_width[nlevels][3], const double dh[nlevels][3][3], const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out)
Allocates a task list for the cpu backend. See grid_task_list.h for details.
static void integrate_one_grid_level(const grid_cpu_task_list *task_list, const int *first_block_task, const int *last_block_task, const bool compute_tau, const int natoms, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const offload_buffer *pab_blocks, const offload_buffer *grid, offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate a range of tasks that belong to the same grid level.
static void collocate_one_grid_level(const grid_cpu_task_list *task_list, const int *first_block_task, const int *last_block_task, const enum grid_func func, const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double dh[3][3], const double dh_inv[3][3], const double *pab_blocks, offload_buffer *grid)
Collocate a range of tasks which are destined for the same grid level.
static int compare_tasks(const void *a, const void *b)
Comperator passed to qsort to compare two tasks.
void grid_cpu_collocate_task_list(const grid_cpu_task_list *task_list, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer *grids[nlevels])
Collocate all tasks of in given list onto given grids. See grid_task_list.h for details.
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
void grid_cpu_integrate_task_list(const grid_cpu_task_list *task_list, const bool compute_tau, const int natoms, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer *grids[nlevels], offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3])
Integrate all tasks of in given list from given grids. See grid_task_list.h for details.
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
static void dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
Internal representation of a basis set.
Internal representation of a grid layout.
Internal representation of a task list.
grid_cpu_layout * layouts
grid_basis_set ** basis_sets
grid_cpu_task * tasks
Internal representation of a task.
Internal representation of a buffer.
double * host_buffer