grid_cpu_task_list.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations         */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "../common/grid_common.h"
#include "grid_cpu_collocate.h"
#include "grid_cpu_integrate.h"
#include "grid_cpu_task_list.h"
/*******************************************************************************
 * \brief Comparator passed to qsort to compare two tasks.
 * \author Ole Schuett
 ******************************************************************************/
static int compare_tasks(const void *a, const void *b) {
  const grid_cpu_task *task_a = a, *task_b = b;
  if (task_a->level != task_b->level) {
    return task_a->level - task_b->level;
  } else if (task_a->block_num != task_b->block_num) {
    return task_a->block_num - task_b->block_num;
  } else if (task_a->iset != task_b->iset) {
    return task_a->iset - task_b->iset;
  } else {
    return task_a->jset - task_b->jset;
  }
}
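
// Note: this ordering makes tasks that share a (level, block_num) pair
// contiguous after sorting, so each group can be addressed as a simple
// [first, last] index range (see first/last_level_block_task below).
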
/*******************************************************************************
 * \brief Allocates a task list for the cpu backend.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_cpu_create_task_list(
    const bool orthorhombic, const int ntasks, const int nlevels,
    const int natoms, const int nkinds, const int nblocks,
    const int block_offsets[nblocks], const double atom_positions[natoms][3],
    const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
    const int level_list[ntasks], const int iatom_list[ntasks],
    const int jatom_list[ntasks], const int iset_list[ntasks],
    const int jset_list[ntasks], const int ipgf_list[ntasks],
    const int jpgf_list[ntasks], const int border_mask_list[ntasks],
    const int block_num_list[ntasks], const double radius_list[ntasks],
    const double rab_list[ntasks][3], const int npts_global[nlevels][3],
    const int npts_local[nlevels][3], const int shift_local[nlevels][3],
    const int border_width[nlevels][3], const double dh[nlevels][3][3],
    const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out) {

  if (*task_list_out != NULL) {
    // This is actually an opportunity to reuse some buffers.
    grid_cpu_free_task_list(*task_list_out);
  }

  grid_cpu_task_list *task_list = malloc(sizeof(grid_cpu_task_list));

  task_list->orthorhombic = orthorhombic;
  task_list->ntasks = ntasks;
  task_list->nlevels = nlevels;
  task_list->natoms = natoms;
  task_list->nkinds = nkinds;
  task_list->nblocks = nblocks;

  size_t size = nblocks * sizeof(int);
  task_list->block_offsets = malloc(size);
  memcpy(task_list->block_offsets, block_offsets, size);

  size = 3 * natoms * sizeof(double);
  task_list->atom_positions = malloc(size);
  memcpy(task_list->atom_positions, atom_positions, size);

  size = natoms * sizeof(int);
  task_list->atom_kinds = malloc(size);
  memcpy(task_list->atom_kinds, atom_kinds, size);

  size = nkinds * sizeof(grid_basis_set *);
  task_list->basis_sets = malloc(size);
  memcpy(task_list->basis_sets, basis_sets, size);

  size = ntasks * sizeof(grid_cpu_task);
  task_list->tasks = malloc(size);
  for (int i = 0; i < ntasks; i++) {
    task_list->tasks[i].level = level_list[i];
    task_list->tasks[i].iatom = iatom_list[i];
    task_list->tasks[i].jatom = jatom_list[i];
    task_list->tasks[i].iset = iset_list[i];
    task_list->tasks[i].jset = jset_list[i];
    task_list->tasks[i].ipgf = ipgf_list[i];
    task_list->tasks[i].jpgf = jpgf_list[i];
    task_list->tasks[i].border_mask = border_mask_list[i];
    task_list->tasks[i].block_num = block_num_list[i];
    task_list->tasks[i].radius = radius_list[i];
    task_list->tasks[i].rab[0] = rab_list[i][0];
    task_list->tasks[i].rab[1] = rab_list[i][1];
    task_list->tasks[i].rab[2] = rab_list[i][2];
  }

  // Store grid layouts.
  size = nlevels * sizeof(grid_cpu_layout);
  task_list->layouts = malloc(size);
  for (int level = 0; level < nlevels; level++) {
    for (int i = 0; i < 3; i++) {
      task_list->layouts[level].npts_global[i] = npts_global[level][i];
      task_list->layouts[level].npts_local[i] = npts_local[level][i];
      task_list->layouts[level].shift_local[i] = shift_local[level][i];
      task_list->layouts[level].border_width[i] = border_width[level][i];
      for (int j = 0; j < 3; j++) {
        task_list->layouts[level].dh[i][j] = dh[level][i][j];
        task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
      }
    }
  }

  // Sort tasks by level, block_num, iset, and jset.
  qsort(task_list->tasks, ntasks, sizeof(grid_cpu_task), &compare_tasks);

  // Find first and last task for each level and block.
  size = nlevels * nblocks * sizeof(int);
  task_list->first_level_block_task = malloc(size);
  task_list->last_level_block_task = malloc(size);
  for (int i = 0; i < nlevels * nblocks; i++) {
    task_list->first_level_block_task[i] = 0;
    task_list->last_level_block_task[i] = -1; // last < first means no tasks
  }
  for (int itask = 0; itask < ntasks; itask++) {
    const int level = task_list->tasks[itask].level - 1;
    const int block_num = task_list->tasks[itask].block_num - 1;
    if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
        task_list->tasks[itask - 1].block_num - 1 != block_num) {
      task_list->first_level_block_task[level * nblocks + block_num] = itask;
    }
    task_list->last_level_block_task[level * nblocks + block_num] = itask;
  }
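
  // Example: after the sort above, level l and block b (both 0-based) own
  // exactly the contiguous tasks with indices in
  //   [first_level_block_task[l * nblocks + b],
  //    last_level_block_task[l * nblocks + b]]
  // (inclusive); an empty range is encoded as last < first.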

  // Find largest Cartesian subblock size.
  task_list->maxco = 0;
  for (int i = 0; i < nkinds; i++) {
    task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
  }

  // Initialize thread-local storage.
  size = omp_get_max_threads() * sizeof(double *);
  task_list->threadlocals = malloc(size);
  memset(task_list->threadlocals, 0, size);
  size = omp_get_max_threads() * sizeof(size_t);
  task_list->threadlocal_sizes = malloc(size);
  memset(task_list->threadlocal_sizes, 0, size);

  *task_list_out = task_list;
}

/*******************************************************************************
 * \brief Deallocates the given task list; basis_sets have to be freed
 *        separately.
 * \author Ole Schuett
 ******************************************************************************/
void grid_cpu_free_task_list(grid_cpu_task_list *task_list) {
  free(task_list->block_offsets);
  free(task_list->atom_positions);
  free(task_list->atom_kinds);
  free(task_list->basis_sets);
  free(task_list->tasks);
  free(task_list->layouts);
  free(task_list->first_level_block_task);
  free(task_list->last_level_block_task);
  for (int i = 0; i < omp_get_max_threads(); i++) {
    if (task_list->threadlocals[i] != NULL) {
      free(task_list->threadlocals[i]);
    }
  }
  free(task_list->threadlocals);
  free(task_list->threadlocal_sizes);
  free(task_list);
}
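
// Note: the threadlocals[i] buffers freed above are allocated lazily (and
// resized on demand) in collocate_one_grid_level, hence the NULL checks.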

/*******************************************************************************
 * \brief Prototype for BLAS dgemm.
 * \author Ole Schuett
 ******************************************************************************/
void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
            const int *k, const double *alpha, const double *a, const int *lda,
            const double *b, const int *ldb, const double *beta, double *c,
            const int *ldc);

/*******************************************************************************
 * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
 * \author Ole Schuett
 ******************************************************************************/
static void dgemm(const char transa, const char transb, const int m,
                  const int n, const int k, const double alpha, const double *a,
                  const int lda, const double *b, const int ldb,
                  const double beta, double *c, const int ldc) {
  dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
         &ldc);
}
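
// Why swapping works: Fortran's dgemm_ assumes column-major storage, while
// this file uses row-major C arrays. A row-major matrix is the transpose of
// the same memory read column-major, so computing b^T * a^T = (a * b)^T in
// column-major order yields exactly a * b when read back as row-major.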

/*******************************************************************************
 * \brief Transforms pab from contracted spherical to prim. Cartesian basis.
 * \author Ole Schuett
 ******************************************************************************/
static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
                     const int iset, const int jset, const bool transpose,
                     const double *block, double *pab) {

  // Define some more convenient aliases.
  const int ncoseta = ncoset(ibasis->lmax[iset]);
  const int ncosetb = ncoset(jbasis->lmax[jset]);
  const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
  const int ncob = jbasis->npgf[jset] * ncosetb;

  const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
  const int nsgf_setb = jbasis->nsgf_set[jset];
  const int nsgfa = ibasis->nsgf; // size of entire spherical basis
  const int nsgfb = jbasis->nsgf;
  const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
  const int sgfb = jbasis->first_sgf[jset] - 1;
  const int maxcoa = ibasis->maxco;
  const int maxcob = jbasis->maxco;

  double work[nsgf_setb * ncoa];
  if (transpose) {
    // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
    dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
          &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
          maxcoa, 0.0, work, ncoa);
  } else {
    // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
    dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
          &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
          maxcoa, 0.0, work, ncoa);
  }
  // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
  dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
        maxcob, work, ncoa, 0.0, pab, ncoa);
}
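
// In matrix form: pab = TRANSPOSE(jbasis->sphi) * subblock(^T) * ibasis->sphi,
// i.e. the density subblock is decontracted from the spherical into the
// primitive Cartesian basis with one dgemm per side.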

/*******************************************************************************
 * \brief Collocate a range of tasks which are destined for the same grid level.
 * \author Ole Schuett
 ******************************************************************************/
static void collocate_one_grid_level(
    const grid_cpu_task_list *task_list, const int *first_block_task,
    const int *last_block_task, const enum grid_func func,
    const int npts_global[3], const int npts_local[3], const int shift_local[3],
    const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
    const double *pab_blocks, offload_buffer *grid) {

// Using default(shared) because with GCC 9 the behavior around const changed:
// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
#pragma omp parallel default(shared)
  {
    const int ithread = omp_get_thread_num();
    const int nthreads = omp_get_num_threads();

    // Initialize variables to detect when a new subblock has to be fetched.
    int old_offset = -1, old_iset = -1, old_jset = -1;

    // Matrix pab is re-used across tasks.
    double pab[task_list->maxco * task_list->maxco];

    // Ensure that grid can fit into thread-local storage, reallocate if needed.
    const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
    const size_t grid_size = npts_local_total * sizeof(double);
    if (task_list->threadlocal_sizes[ithread] < grid_size) {
      if (task_list->threadlocals[ithread] != NULL) {
        free(task_list->threadlocals[ithread]);
      }
      task_list->threadlocals[ithread] = malloc(grid_size);
      task_list->threadlocal_sizes[ithread] = grid_size;
    }

    // Zero thread-local copy of the grid.
    double *const my_grid = task_list->threadlocals[ithread];
    memset(my_grid, 0, grid_size);

    // Parallelize over blocks to avoid unnecessary calls to load_pab.
    const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
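    // (The divisor targets roughly 50 chunks per thread: chunks small enough
    // for dynamic load balancing, large enough to limit scheduling overhead.)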
#pragma omp for schedule(dynamic, chunk_size)
    for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
      const int first_task = first_block_task[block_num];
      const int last_task = last_block_task[block_num];

      for (int itask = first_task; itask <= last_task; itask++) {
        // Define some convenient aliases.
        const grid_cpu_task *task = &task_list->tasks[itask];
        const int iatom = task->iatom - 1;
        const int jatom = task->jatom - 1;
        const int iset = task->iset - 1;
        const int jset = task->jset - 1;
        const int ipgf = task->ipgf - 1;
        const int jpgf = task->jpgf - 1;
        const int ikind = task_list->atom_kinds[iatom] - 1;
        const int jkind = task_list->atom_kinds[jatom] - 1;
        const grid_basis_set *ibasis = task_list->basis_sets[ikind];
        const grid_basis_set *jbasis = task_list->basis_sets[jkind];
        const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
        const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
        const int ncoseta = ncoset(ibasis->lmax[iset]);
        const int ncosetb = ncoset(jbasis->lmax[jset]);
        const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
        const int ncob = jbasis->npgf[jset] * ncosetb;
        const int block_num = task->block_num - 1;
        const int block_offset = task_list->block_offsets[block_num];
        const double *block = &pab_blocks[block_offset];
        const bool transpose = (iatom <= jatom);

        // Load subblock from buffer and decontract into Cartesian subblock pab.
        // The previous pab can be reused when only ipgf or jpgf has changed.
        if (block_offset != old_offset || iset != old_iset ||
            jset != old_jset) {
          old_offset = block_offset;
          old_iset = iset;
          old_jset = jset;
          load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
        }

        grid_cpu_collocate_pgf_product(
            /*orthorhombic=*/task_list->orthorhombic,
            /*border_mask=*/task->border_mask,
            /*func=*/func,
            /*la_max=*/ibasis->lmax[iset],
            /*la_min=*/ibasis->lmin[iset],
            /*lb_max=*/jbasis->lmax[jset],
            /*lb_min=*/jbasis->lmin[jset],
            /*zeta=*/zeta,
            /*zetb=*/zetb,
            /*rscale=*/(iatom == jatom) ? 1 : 2,
            /*dh=*/dh,
            /*dh_inv=*/dh_inv,
            /*ra=*/&task_list->atom_positions[3 * iatom],
            /*rab=*/task->rab,
            /*npts_global=*/npts_global,
            /*npts_local=*/npts_local,
            /*shift_local=*/shift_local,
            /*border_width=*/border_width,
            /*radius=*/task->radius,
            /*o1=*/ipgf * ncoseta,
            /*o2=*/jpgf * ncosetb,
            /*n1=*/ncoa,
            /*n2=*/ncob,
            /*pab=*/(const double(*)[ncoa])pab,
            /*grid=*/my_grid);

      } // end of task loop
    } // end of block loop

// While there should be an implicit barrier at the end of the block loop, this
// explicit barrier eliminates occasional seg faults with icc compiled binaries.
#pragma omp barrier

    // Merge thread-local grids via an efficient tree reduction.
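    // Example with nthreads = 4: cycle 1 (group_size 2) adds the grid of
    // thread 1 into thread 0 and the grid of thread 3 into thread 2; cycle 2
    // (group_size 4) adds the grid of thread 2 into thread 0. Each addition is
    // itself split across all threads of the group.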
    const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
    for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
      // Threads are divided into groups, whose size doubles with each cycle.
      // After a cycle the reduced data is stored at first thread of each group.
      const int group_size = 1 << icycle; // 2**icycle
      const int igroup = ithread / group_size;
      const int dest_thread = igroup * group_size;
      const int src_thread = dest_thread + group_size / 2;
      // The last group might actually be a bit smaller.
      const int actual_group_size = imin(group_size, nthreads - dest_thread);
      // Parallelize summation by dividing grid points across group members.
      const int rank = modulo(ithread, group_size); // position within the group
      const int lb = (npts_local_total * rank) / actual_group_size;
      const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
      if (src_thread < nthreads) {
        for (int i = lb; i < ub; i++) {
          task_list->threadlocals[dest_thread][i] +=
              task_list->threadlocals[src_thread][i];
        }
      }
#pragma omp barrier
    }

    // Copy final result from first thread into shared grid.
    const int lb = (npts_local_total * ithread) / nthreads;
    const int ub = (npts_local_total * (ithread + 1)) / nthreads;
    for (int i = lb; i < ub; i++) {
      grid->host_buffer[i] = task_list->threadlocals[0][i];
    }

  } // end of omp parallel region
}

/*******************************************************************************
 * \brief Collocate all tasks of the given list onto the given grids.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_cpu_collocate_task_list(const grid_cpu_task_list *task_list,
                                  const enum grid_func func, const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  offload_buffer *grids[nlevels]) {

  assert(task_list->nlevels == nlevels);

  for (int level = 0; level < task_list->nlevels; level++) {
    const int idx = level * task_list->nblocks;
    const int *first_block_task = &task_list->first_level_block_task[idx];
    const int *last_block_task = &task_list->last_level_block_task[idx];
    const grid_cpu_layout *layout = &task_list->layouts[level];
    collocate_one_grid_level(
        task_list, first_block_task, last_block_task, func, layout->npts_global,
        layout->npts_local, layout->shift_local, layout->border_width,
        layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
  }
}

/*******************************************************************************
 * \brief Transforms hab from prim. Cartesian to contracted spherical basis.
 * \author Ole Schuett
 ******************************************************************************/
static inline void store_hab(const grid_basis_set *ibasis,
                             const grid_basis_set *jbasis, const int iset,
                             const int jset, const bool transpose,
                             const double *hab, double *block) {

  // Define some more convenient aliases.
  const int ncoseta = ncoset(ibasis->lmax[iset]);
  const int ncosetb = ncoset(jbasis->lmax[jset]);
  const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
  const int ncob = jbasis->npgf[jset] * ncosetb;

  const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
  const int nsgf_setb = jbasis->nsgf_set[jset];
  const int nsgfa = ibasis->nsgf; // size of entire spherical basis
  const int nsgfb = jbasis->nsgf;
  const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
  const int sgfb = jbasis->first_sgf[jset] - 1;
  const int maxcoa = ibasis->maxco;
  const int maxcob = jbasis->maxco;

  double work[nsgf_setb * ncoa];

  // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
  dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
        maxcob, hab, ncoa, 0.0, work, ncoa);

  if (transpose) {
    // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
    dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
          &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
          &block[sgfb * nsgfa + sgfa], nsgfa);
  } else {
    // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
    dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
          &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
          &block[sgfa * nsgfb + sgfb], nsgfb);
  }
}
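
// In matrix form: subblock += jbasis->sphi * hab * TRANSPOSE(ibasis->sphi)
// (or its transpose), i.e. the inverse of load_pab: the primitive Cartesian
// integrals in hab are contracted back into the spherical basis and
// accumulated onto the block.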

/*******************************************************************************
 * \brief Integrate a range of tasks that belong to the same grid level.
 * \author Ole Schuett
 ******************************************************************************/
static void integrate_one_grid_level(
    const grid_cpu_task_list *task_list, const int *first_block_task,
    const int *last_block_task, const bool compute_tau, const int natoms,
    const int npts_global[3], const int npts_local[3], const int shift_local[3],
    const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
    const offload_buffer *pab_blocks, const offload_buffer *grid,
    offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {

// Using default(shared) because with GCC 9 the behavior around const changed:
// https://www.gnu.org/software/gcc/gcc-9/porting_to.html
#pragma omp parallel default(shared)
  {
    // Initialize variables to detect when a new subblock has to be fetched.
    int old_offset = -1, old_iset = -1, old_jset = -1;
    grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
    bool old_transpose = false;

    // Matrix pab and hab are re-used across tasks.
    double pab[task_list->maxco * task_list->maxco];
    double hab[task_list->maxco * task_list->maxco];

    // Parallelize over blocks to avoid concurrent access to hab_blocks.
    const int nthreads = omp_get_num_threads();
    const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
#pragma omp for schedule(dynamic, chunk_size)
    for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
      const int first_task = first_block_task[block_num];
      const int last_task = last_block_task[block_num];

      // Accumulate forces per block as it corresponds to a pair of atoms.
      const int iatom = task_list->tasks[first_task].iatom - 1;
      const int jatom = task_list->tasks[first_task].jatom - 1;
      double my_forces[2][3] = {0};
      double my_virials[2][3][3] = {0};

      for (int itask = first_task; itask <= last_task; itask++) {
        // Define some convenient aliases.
        const grid_cpu_task *task = &task_list->tasks[itask];
        assert(task->block_num - 1 == block_num);
        assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
        const int ikind = task_list->atom_kinds[iatom] - 1;
        const int jkind = task_list->atom_kinds[jatom] - 1;
        grid_basis_set *ibasis = task_list->basis_sets[ikind];
        grid_basis_set *jbasis = task_list->basis_sets[jkind];
        const int iset = task->iset - 1;
        const int jset = task->jset - 1;
        const int ipgf = task->ipgf - 1;
        const int jpgf = task->jpgf - 1;
        const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
        const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
        const int ncoseta = ncoset(ibasis->lmax[iset]);
        const int ncosetb = ncoset(jbasis->lmax[jset]);
        const int ncoa = ibasis->npgf[iset] * ncoseta; // size of Cartesian set
        const int ncob = jbasis->npgf[jset] * ncosetb;
        const int block_offset = task_list->block_offsets[block_num];
        const bool transpose = (iatom <= jatom);
        const bool pab_required = (forces != NULL || virial != NULL);

        // Load pab and store hab subblocks when needed.
        // Previous hab and pab can be reused when only ipgf or jpgf changed.
        if (block_offset != old_offset || iset != old_iset ||
            jset != old_jset) {
          if (pab_required) {
            load_pab(ibasis, jbasis, iset, jset, transpose,
                     &pab_blocks->host_buffer[block_offset], pab);
          }
          if (old_offset >= 0) { // skip first iteration
            store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
                      hab, &hab_blocks->host_buffer[old_offset]);
          }
          memset(hab, 0, ncoa * ncob * sizeof(double));
          old_offset = block_offset;
          old_iset = iset;
          old_jset = jset;
          old_ibasis = ibasis;
          old_jbasis = jbasis;
          old_transpose = transpose;
        }

        grid_cpu_integrate_pgf_product(
            /*orthorhombic=*/task_list->orthorhombic,
            /*compute_tau=*/compute_tau,
            /*border_mask=*/task->border_mask,
            /*la_max=*/ibasis->lmax[iset],
            /*la_min=*/ibasis->lmin[iset],
            /*lb_max=*/jbasis->lmax[jset],
            /*lb_min=*/jbasis->lmin[jset],
            /*zeta=*/zeta,
            /*zetb=*/zetb,
            /*dh=*/dh,
            /*dh_inv=*/dh_inv,
            /*ra=*/&task_list->atom_positions[3 * iatom],
            /*rab=*/task->rab,
            /*npts_global=*/npts_global,
            /*npts_local=*/npts_local,
            /*shift_local=*/shift_local,
            /*border_width=*/border_width,
            /*radius=*/task->radius,
            /*o1=*/ipgf * ncoseta,
            /*o2=*/jpgf * ncosetb,
            /*n1=*/ncoa,
            /*n2=*/ncob,
            /*grid=*/grid->host_buffer,
            /*hab=*/(double(*)[ncoa])hab,
            /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
            /*forces=*/(forces != NULL) ? my_forces : NULL,
            /*virials=*/(virial != NULL) ? my_virials : NULL,
            /*hdab=*/NULL,
            /*hadb=*/NULL,
            /*a_hdab=*/NULL);

      } // end of task loop

      // Merge thread-local forces and virial into shared ones.
      // It does not seem worth the trouble to accumulate them thread-locally.
      const double scalef = (iatom == jatom) ? 1.0 : 2.0;
      if (forces != NULL) {
#pragma omp critical(forces)
        for (int i = 0; i < 3; i++) {
          forces[iatom][i] += scalef * my_forces[0][i];
          forces[jatom][i] += scalef * my_forces[1][i];
        }
      }
      if (virial != NULL) {
#pragma omp critical(virial)
        for (int i = 0; i < 3; i++) {
          for (int j = 0; j < 3; j++) {
            virial[i][j] += scalef * my_virials[0][i][j];
            virial[i][j] += scalef * my_virials[1][i][j];
          }
        }
      }

    } // end of block loop

    // Store final hab.
    if (old_offset >= 0) {
      store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
                &hab_blocks->host_buffer[old_offset]);
    }

  } // end of omp parallel region
}

/*******************************************************************************
 * \brief Integrate all tasks of the given list from the given grids.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_cpu_integrate_task_list(
    const grid_cpu_task_list *task_list, const bool compute_tau,
    const int natoms, const int nlevels, const offload_buffer *pab_blocks,
    const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
    double forces[natoms][3], double virial[3][3]) {

  assert(task_list->nlevels == nlevels);
  assert(task_list->natoms == natoms);

  // Zero result arrays.
  memset(hab_blocks->host_buffer, 0, hab_blocks->size);
  if (forces != NULL) {
    memset(forces, 0, natoms * 3 * sizeof(double));
  }
  if (virial != NULL) {
    memset(virial, 0, 9 * sizeof(double));
  }

  for (int level = 0; level < task_list->nlevels; level++) {
    const int idx = level * task_list->nblocks;
    const int *first_block_task = &task_list->first_level_block_task[idx];
    const int *last_block_task = &task_list->last_level_block_task[idx];
    const grid_cpu_layout *layout = &task_list->layouts[level];
    integrate_one_grid_level(
        task_list, first_block_task, last_block_task, compute_tau, natoms,
        layout->npts_global, layout->npts_local, layout->shift_local,
        layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
        grids[level], hab_blocks, forces, virial);
  }
}
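
/*******************************************************************************
 * Typical call sequence (illustrative sketch only; the variables pab_blocks,
 * grids, hab_blocks, forces, and virial are hypothetical caller-owned
 * buffers):
 *
 *   grid_cpu_task_list *tl = NULL;
 *   grid_cpu_create_task_list(..., &tl);  // builds, copies, and sorts tasks
 *   grid_cpu_collocate_task_list(tl, func, nlevels, pab_blocks, grids);
 *   grid_cpu_integrate_task_list(tl, compute_tau, natoms, nlevels, pab_blocks,
 *                                grids, hab_blocks, forces, virial);
 *   grid_cpu_free_task_list(tl);  // basis_sets are freed separately
 ******************************************************************************/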

// EOF