grid_hip_context.cu
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations         */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

/*
 * Authors :
 - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
 - Advanced Micro Devices, Inc.
*/

#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <hip/hip_runtime_api.h>
#include <iostream>

#include "../../offload/offload_library.h"
extern "C" {
#include "../common/grid_basis_set.h"
#include "../common/grid_constants.h"
#include "../common/grid_library.h"
}

#include "grid_hip_context.h"

#include "grid_hip_task_list.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

/*******************************************************************************
 * \brief Allocates a task list for the GPU backend.
 *        See grid_ctx.h for details.
 ******************************************************************************/
extern "C" void grid_hip_create_task_list(
    const bool ortho, const int ntasks, const int nlevels, const int natoms,
    const int nkinds, const int nblocks, const int *block_offsets,
    const double *atom_positions, const int *atom_kinds,
    const grid_basis_set **basis_sets, const int *level_list,
    const int *iatom_list, const int *jatom_list, const int *iset_list,
    const int *jset_list, const int *ipgf_list, const int *jpgf_list,
    const int *border_mask_list, const int *block_num_list,
    const double *radius_list, const double *rab_list, const int *npts_global,
    const int *npts_local, const int *shift_local, const int *border_width,
    const double *dh, const double *dh_inv, void *ptr) {

  rocm_backend::context_info **ctx_out = (rocm_backend::context_info **)ptr;
  // Select GPU device.
  rocm_backend::context_info *ctx = nullptr;
  if (*ctx_out == nullptr) {
    ctx = new rocm_backend::context_info(offload_get_chosen_device());
    *ctx_out = ctx;
  } else {
    ctx = *ctx_out;
    // verify that the object is the right one
    ctx->verify_checksum();
  }

  ctx->ntasks = ntasks;
  ctx->nlevels = nlevels;
  ctx->natoms = natoms;
  ctx->nblocks = nblocks;

  ctx->grid_.resize(nlevels);
  ctx->set_device();
  std::vector<double> dh_max(ctx->nlevels, 0);

  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->grid_[level].resize(npts_global + 3 * level, npts_local + 3 * level,
                             shift_local + 3 * level, border_width + 3 * level);
    ctx->grid_[level].is_distributed(false);
    ctx->grid_[level].set_lattice_vectors(&dh[9 * level], &dh_inv[9 * level]);
    ctx->grid_[level].check_orthorhombicity(ortho);
    for (int i = 0; i < 9; i++)
      dh_max[level] = std::max(dh_max[level], std::abs(dh[9 * level + i]));
  }
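  // dh_max[level] is now the largest absolute entry of that level's
  // lattice-vector matrix dh; it is used below in the skip_task criterion
  // (a task is skipped when 2 * radius < dh_max[level]).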

  ctx->block_offsets_dev.resize(nblocks);
  ctx->block_offsets_dev.copy_to_gpu(block_offsets);
  ctx->initialize_basis_sets(basis_sets, nkinds);

  ctx->first_task_per_level_.resize(nlevels, 0);
  ctx->number_of_tasks_per_level_.resize(nlevels, 0);

  memset(ctx->first_task_per_level_.data(), 0, sizeof(int) * nlevels);
  memset(ctx->number_of_tasks_per_level_.data(), 0, sizeof(int) * nlevels);

  std::vector<rocm_backend::task_info> tasks_host(ntasks);

  size_t coef_size = 0;
  size_t cab_size_ = 0;
  for (int i = 0; i < ntasks; i++) {
    const int level = level_list[i] - 1;

    // count the number of tasks per level
    ctx->number_of_tasks_per_level_[level]++;

    const int iatom = iatom_list[i] - 1;
    const int jatom = jatom_list[i] - 1;
    const int iset = iset_list[i] - 1;
    const int jset = jset_list[i] - 1;
    const int ipgf = ipgf_list[i] - 1;
    const int jpgf = jpgf_list[i] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;

    /* set parameters related to the atom kinds, basis sets, etc. */
    const grid_basis_set *ibasis = basis_sets[ikind];
    const grid_basis_set *jbasis = basis_sets[jkind];

    tasks_host[i] = {};
    tasks_host[i].level = level;
    tasks_host[i].iatom = iatom;
    tasks_host[i].jatom = jatom;
    tasks_host[i].iset = iset;
    tasks_host[i].jset = jset;
    tasks_host[i].ipgf = ipgf;
    tasks_host[i].jpgf = jpgf;
    tasks_host[i].ikind = ikind;
    tasks_host[i].jkind = jkind;
    tasks_host[i].border_mask = border_mask_list[i];
    tasks_host[i].block_num = block_num_list[i] - 1;

    if (border_mask_list[i]) {
      ctx->grid_[level].is_distributed(true);
    }
    /* parameters for the gaussian */
    tasks_host[i].radius = radius_list[i];
    tasks_host[i].rab[0] = rab_list[3 * i];
    tasks_host[i].rab[1] = rab_list[3 * i + 1];
    tasks_host[i].rab[2] = rab_list[3 * i + 2];
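    // Gaussian product theorem: the product of the two primitives with
    // exponents zeta and zetb centered at ra and rb = ra + rab is a single
    // Gaussian with exponent zetp = zeta + zetb, centered at
    // rp = ra + (zetb / zetp) * rab and scaled by
    // prefactor = exp(-(zeta * zetb / zetp) * |rab|^2), which is exactly what
    // is assembled below.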
    tasks_host[i].zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
    tasks_host[i].zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
    tasks_host[i].zetp = tasks_host[i].zeta + tasks_host[i].zetb;
    const double f = tasks_host[i].zetb / tasks_host[i].zetp;
    tasks_host[i].rab2 = 0.0;
    for (int d = 0; d < 3; d++) {
      tasks_host[i].rab[d] = tasks_host[i].rab[d];
      tasks_host[i].rab2 += tasks_host[i].rab[d] * tasks_host[i].rab[d];
      tasks_host[i].ra[d] = atom_positions[3 * iatom + d];
      tasks_host[i].rb[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d];
      tasks_host[i].rp[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d] * f;
    }

    tasks_host[i].skip_task = (2 * tasks_host[i].radius < dh_max[level]);
    tasks_host[i].prefactor = exp(-tasks_host[i].zeta * f * tasks_host[i].rab2);

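    // off-diagonal atom pairs appear only once in the task list but represent
    // both symmetric blocks, hence (presumably) the factor of 2 below.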
    tasks_host[i].off_diag_twice = (iatom == jatom) ? 1.0 : 2.0;
    // angular momentum range of the basis set
    const int la_max_basis = ibasis->lmax[iset];
    const int lb_max_basis = jbasis->lmax[jset];
    const int la_min_basis = ibasis->lmin[iset];
    const int lb_min_basis = jbasis->lmin[jset];

    // angular momentum range for the actual collocate/integrate operation.
    tasks_host[i].la_max = la_max_basis;
    tasks_host[i].lb_max = lb_max_basis;
    tasks_host[i].la_min = la_min_basis;
    tasks_host[i].lb_min = lb_min_basis;

    // start of the decontracted set, i.e. pab and hab
    tasks_host[i].first_coseta =
        (la_min_basis > 0) ? rocm_backend::ncoset(la_min_basis - 1) : 0;
    tasks_host[i].first_cosetb =
        (lb_min_basis > 0) ? rocm_backend::ncoset(lb_min_basis - 1) : 0;

    // size of the decontracted set, i.e. pab and hab
    tasks_host[i].ncoseta = rocm_backend::ncoset(la_max_basis);
    tasks_host[i].ncosetb = rocm_backend::ncoset(lb_max_basis);
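    // ncoset(l) is the number of Cartesian orbitals with angular momentum up
    // to l, i.e. (l + 1) * (l + 2) * (l + 3) / 6.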

    // size of entire spherical basis
    tasks_host[i].nsgfa = ibasis->nsgf;
    tasks_host[i].nsgfb = jbasis->nsgf;

    // size of spherical set
    tasks_host[i].nsgf_seta = ibasis->nsgf_set[iset];
    tasks_host[i].nsgf_setb = jbasis->nsgf_set[jset];

    // strides of the sphi transformation matrices
    tasks_host[i].maxcoa = ibasis->maxco;
    tasks_host[i].maxcob = jbasis->maxco;

    tasks_host[i].sgfa = ibasis->first_sgf[iset] - 1;
    tasks_host[i].sgfb = jbasis->first_sgf[jset] - 1;

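    // The block buffer presumably stores only one of the two symmetric
    // (iatom, jatom) / (jatom, iatom) blocks; when iatom > jatom the stored
    // block is the transposed one, hence the swapped sub-block offset below.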
    tasks_host[i].block_transposed = (iatom > jatom);
    tasks_host[i].subblock_offset =
        (tasks_host[i].block_transposed)
            ? (tasks_host[i].sgfa * tasks_host[i].nsgfb + tasks_host[i].sgfb)
            : (tasks_host[i].sgfb * tasks_host[i].nsgfa + tasks_host[i].sgfa);

    /* the constant 6 is important here since we do not know ahead of time
     * which specific operation we will be doing: collocate functions can go
     * up to 4 while integrate can go up to 5 (but we use 6 to be safe) */

    /* this block is only used as temporary scratch for calculating the
     * coefficients. Doing it this way avoids a lot of atomic operations,
     * which are costly on hardware that has only partial support for them.
     * For better performance we should most probably align the offsets as
     * well; the alignment is 256 bytes on MI100 and above */
    tasks_host[i].lp_max = tasks_host[i].lb_max + tasks_host[i].la_max + 6;
    if (i == 0) {
      tasks_host[i].coef_offset = 0;
    } else {
      tasks_host[i].coef_offset =
          tasks_host[i - 1].coef_offset +
          rocm_backend::ncoset(tasks_host[i - 1].lp_max);
    }
    coef_size += rocm_backend::ncoset(tasks_host[i].lp_max);

    if (i == 0)
      tasks_host[i].cab_offset = 0;
    else
      tasks_host[i].cab_offset =
          tasks_host[i - 1].cab_offset +
          rocm_backend::ncoset(tasks_host[i - 1].la_max + 3) *
              rocm_backend::ncoset(tasks_host[i - 1].lb_max + 3);

    cab_size_ += rocm_backend::ncoset(tasks_host[i].la_max + 3) *
                 rocm_backend::ncoset(tasks_host[i].lb_max + 3);
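    // coef_offset and cab_offset are running prefix sums: each task gets its
    // own slice of the shared coefficient and cab scratch buffers, whose
    // total sizes are accumulated in coef_size and cab_size_.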

    auto &grid = ctx->grid_[tasks_host[i].level];
    // compute the cube properties

    tasks_host[i].apply_border_mask = (tasks_host[i].border_mask != 0);

    if (grid.is_orthorhombic() && (tasks_host[i].border_mask == 0)) {
      tasks_host[i].discrete_radius =
          rocm_backend::compute_cube_properties<double, double3, true>(
              tasks_host[i].radius, grid.dh(), grid.dh_inv(),
              (double3 *)tasks_host[i].rp, // center of the gaussian
              &tasks_host[i]
                   .roffset, // offset compared to the closest grid point
              &tasks_host[i].cube_center, // center coordinates in grid space
              &tasks_host[i].lb_cube,     // lower boundary
              &tasks_host[i].cube_size);
    } else {
      tasks_host[i].discrete_radius =
          rocm_backend::compute_cube_properties<double, double3, false>(
              tasks_host[i].radius, grid.dh(), grid.dh_inv(),
              (double3 *)tasks_host[i].rp, // center of the gaussian
              &tasks_host[i]
                   .roffset, // offset compared to the closest grid point
              &tasks_host[i].cube_center, // center coordinates in grid space
              &tasks_host[i].lb_cube,     // lower boundary
              &tasks_host[i].cube_size);
    }
  }

  // we need to sort the task list although I expect it to be sorted already
  /*
   * sorting with this lambda does not work
  std::sort(tasks_host.begin(), tasks_host.end(),
            [](rocm_backend::task_info a, rocm_backend::task_info b) {
              if (a.level == b.level) {
                if (a.block_num <= b.block_num)
                  return true;
                else
                  return false;
              } else {
                return (a.level < b.level);
              }
            });
  */

  // this is effectively an exclusive scan (prefix sum) over the per-level
  // task counts
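  // e.g. number_of_tasks_per_level_ = {4, 2, 3}
  //  ->  first_task_per_level_      = {0, 4, 6}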
  for (int level = 1; level < (int)ctx->number_of_tasks_per_level_.size();
       level++) {
    ctx->first_task_per_level_[level] =
        ctx->first_task_per_level_[level - 1] +
        ctx->number_of_tasks_per_level_[level - 1];
  }

  ctx->tasks_dev.clear();
  ctx->tasks_dev.resize(tasks_host.size());
  ctx->tasks_dev.copy_to_gpu(tasks_host);

  /* Sort the blocks */
  std::vector<std::vector<int>> task_sorted_by_block(nblocks);
  std::vector<int> sorted_blocks(ntasks, 0);
  std::vector<int> num_tasks_per_block(nblocks, 0);
  std::vector<int> sorted_blocks_offset(nblocks, 0);
  for (auto &block : task_sorted_by_block)
    block.clear();

  for (int i = 0; i < ntasks; i++) {
    task_sorted_by_block[block_num_list[i] - 1].push_back(i);
    num_tasks_per_block[block_num_list[i] - 1]++;
  }

  int offset = 0;
  // flatten task_sorted_by_block and compute the offsets
  for (int i = 0; i < (int)task_sorted_by_block.size(); i++) {
    auto &task_list = task_sorted_by_block[i];

    // take care of the case where a block is not associated with any task
    // (also a workaround for the grid_replay.c file)
    if (!task_list.empty()) {
      memcpy(&sorted_blocks[offset], &task_list[0],
             sizeof(int) * task_list.size());
    }
    sorted_blocks_offset[i] = offset;
    offset += task_list.size();
  }
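  // at this point sorted_blocks lists the task indices grouped by block and
  // sorted_blocks_offset[i] points to the first task of block i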

  // copy the block offsets
  ctx->sorted_blocks_offset_dev.resize(sorted_blocks_offset.size());
  ctx->sorted_blocks_offset_dev.copy_to_gpu(sorted_blocks_offset);

  // copy the task list sorted by block (not by level) to the gpu
  ctx->task_sorted_by_blocks_dev.resize(sorted_blocks.size());
  ctx->task_sorted_by_blocks_dev.copy_to_gpu(sorted_blocks);

  for (int i = 0; i < (int)sorted_blocks_offset.size(); i++) {
    int num_tasks = 0;
    if (i == (int)sorted_blocks_offset.size() - 1)
      num_tasks = ntasks - sorted_blocks_offset[i];
    else
      num_tasks = sorted_blocks_offset[i + 1] - sorted_blocks_offset[i];

    // sanity check: both ways of counting the tasks should agree
    assert(num_tasks == num_tasks_per_block[i]);

    // check that all tasks point to the same block
    for (int tk = 0; tk < num_tasks; tk++)
      assert(
          tasks_host[sorted_blocks[tk + sorted_blocks_offset[i]]].block_num ==
          i);
  }
  for (auto &block : task_sorted_by_block)
    block.clear();
  task_sorted_by_block.clear();

  sorted_blocks.clear();
  sorted_blocks_offset.clear();

  ctx->num_tasks_per_block_dev_.resize(num_tasks_per_block.size());
  ctx->num_tasks_per_block_dev_.copy_to_gpu(num_tasks_per_block);

  // collect stats
  memset(ctx->stats, 0, 2 * 20 * sizeof(int));
  for (int itask = 0; itask < ntasks; itask++) {
    const int iatom = iatom_list[itask] - 1;
    const int jatom = jatom_list[itask] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int la_max = basis_sets[ikind]->lmax[iset];
    const int lb_max = basis_sets[jkind]->lmax[jset];
    const int lp = std::min(la_max + lb_max, 19);
    const bool has_border_mask = (border_mask_list[itask] != 0);
    ctx->stats[has_border_mask][lp]++;
  }
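  // ctx->stats is a 2 x 20 histogram of tasks indexed by
  // [has_border_mask][min(la_max + lb_max, 19)]; it is read back in the
  // collocate/integrate entry points to update the library counters.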

  ctx->create_streams();

  tasks_host.clear();
  ctx->coef_dev_.resize(coef_size);
  ctx->cab_dev_.resize(cab_size_);
  ctx->compute_checksum();
  // return newly created or updated context
  *ctx_out = ctx;
}

/*******************************************************************************
 * \brief Destroy a context.
 ******************************************************************************/
extern "C" void grid_hip_free_task_list(void *ptr) {

  rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;
  // Select GPU device.
  if (ctx == nullptr)
    return;
  ctx->verify_checksum();
  ctx->set_device();
  delete ctx;
}

/*******************************************************************************
 * \brief Collocate all tasks of the given list onto the given grids.
 ******************************************************************************/
extern "C" void grid_hip_collocate_task_list(const void *ptr,
                                             const enum grid_func func,
                                             const int nlevels,
                                             const offload_buffer *pab_blocks,
                                             offload_buffer **grids) {
  rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;

  if (ptr == nullptr)
    return;

  ctx->verify_checksum();
  assert(ctx->nlevels == nlevels);
  ctx->set_device();

  ctx->pab_block_.associate(pab_blocks->host_buffer, pab_blocks->device_buffer,
                            pab_blocks->size / sizeof(double));

  /*
    There are 3 scenarios here:
    - Mi300: no copy will happen as the two buffers have the same address
    - Mi250X: an internal copy will happen; we do not need to do anything
      explicit
    - no unified memory: an explicit copy will happen
  */
  ctx->pab_block_.copy_to_gpu(ctx->main_stream);

  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->grid_[level].associate(grids[level]->host_buffer,
                                grids[level]->device_buffer,
                                grids[level]->size / sizeof(double));
    ctx->grid_[level].zero(ctx->level_streams[level]);
  }
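  // each grid level has its own stream, so zeroing the different levels can
  // proceed concurrently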

  ctx->pab_block_.associate(pab_blocks->host_buffer, pab_blocks->device_buffer,
                            pab_blocks->size / sizeof(double));

  int lp_diff = -1;

  ctx->synchronize(ctx->main_stream);

  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->collocate_one_grid_level(level, func, &lp_diff);
  }

  // Update the counters while we wait for the kernels to finish. This is not
  // thread safe at all since grid_library_counter_add has global static
  // state. We need a better mechanism than this, for instance moving this
  // information one level up and encapsulating it in the context associated
  // with the library.

  if (lp_diff > -1) {
    for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
      for (int lp = 0; lp < 20; lp++) {
        const int count = ctx->stats[has_border_mask][lp];
        if (ctx->grid_[0].is_orthorhombic() && !has_border_mask) {
          grid_library_counter_add(lp + lp_diff, GRID_BACKEND_HIP,
                                   GRID_COLLOCATE_ORTHO, count);
        } else {
          grid_library_counter_add(lp + lp_diff, GRID_BACKEND_HIP,
                                   GRID_COLLOCATE_GENERAL, count);
        }
      }
    }
  }

  // download result from device to host.
  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->grid_[level].copy_to_host(ctx->level_streams[level]);
  }

  // need to wait for all streams to finish
  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->synchronize(ctx->level_streams[level]);
  }
}

/*******************************************************************************
 * \brief Integrate all tasks of the given list on the given grids.
 *        See grid_ctx.h for details.
 ******************************************************************************/
extern "C" void grid_hip_integrate_task_list(
    const void *ptr, const bool compute_tau, const int nlevels,
    const offload_buffer *pab_blocks, const offload_buffer **grids,
    offload_buffer *hab_blocks, double *forces, double *virial) {

  rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;

  if (ptr == nullptr)
    return;
  assert(ctx->nlevels == nlevels);

  ctx->verify_checksum();
  // Select GPU device.
  ctx->set_device();

  // ctx->coef_dev_.zero(ctx->level_streams[0]);

  for (int level = 0; level < ctx->nlevels; level++) {
    if (ctx->number_of_tasks_per_level_[level]) {
      ctx->grid_[level].associate(grids[level]->host_buffer,
                                  grids[level]->device_buffer,
                                  grids[level]->size / sizeof(double));
      ctx->grid_[level].copy_to_gpu(ctx->level_streams[level]);
    }
  }

  if ((forces != nullptr) || (virial != nullptr)) {
    ctx->pab_block_.associate(pab_blocks->host_buffer,
                              pab_blocks->device_buffer,
                              pab_blocks->size / sizeof(double));
    ctx->pab_block_.copy_to_gpu(ctx->main_stream);
  }

  // we do not need to wait for this to start the computations since the
  // matrix elements are computed only after all coefficients are calculated.
  ctx->hab_block_.associate(hab_blocks->host_buffer, hab_blocks->device_buffer,
                            hab_blocks->size / sizeof(double));
  ctx->hab_block_.zero(ctx->main_stream);

  ctx->calculate_forces = (forces != nullptr);
  ctx->calculate_virial = (virial != nullptr);
  ctx->compute_tau = compute_tau;
  if (forces != nullptr) {
    ctx->forces_.resize(3 * ctx->natoms);
    ctx->forces_.zero(ctx->main_stream);
  }

  if (virial != nullptr) {
    ctx->virial_.resize(9);
    ctx->virial_.zero(ctx->main_stream);
  }
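  // forces_ holds 3 components per atom and virial_ the 9 components of a
  // 3 x 3 matrix; both are zeroed on the main stream before the integration
  // kernels accumulate into them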

  int lp_diff = -1;

  // we can actually treat the full task list without bothering about the
  // level at this stage; this can be taken care of inside the kernel.

  for (int level = 0; level < ctx->nlevels; level++) {
    // launch the kernel, but only after the grid has arrived
    ctx->integrate_one_grid_level(level, &lp_diff);
  }

  if (lp_diff > -1) {
    // update the counters while we wait for the kernels to finish
    for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
      for (int lp = 0; lp < 20; lp++) {
        const int count = ctx->stats[has_border_mask][lp];
        if (ctx->grid_[0].is_orthorhombic() && !has_border_mask) {
          grid_library_counter_add(lp + lp_diff, GRID_BACKEND_HIP,
                                   GRID_INTEGRATE_ORTHO, count);
        } else {
          grid_library_counter_add(lp + lp_diff, GRID_BACKEND_HIP,
                                   GRID_INTEGRATE_GENERAL, count);
        }
      }
    }
  }

  // need to wait for all streams to finish
  for (int level = 0; level < ctx->nlevels; level++) {
    if (ctx->number_of_tasks_per_level_[level])
      ctx->synchronize(ctx->level_streams[level]);
  }

  // computing the hab coefficients does not depend on the number of grids so
  // we can run these calculations on the main stream

  if (forces != NULL) {
    ctx->forces_.copy_from_gpu(forces, ctx->main_stream);
  }
  if (virial != NULL) {
    ctx->virial_.copy_from_gpu(virial, ctx->main_stream);
  }

  ctx->synchronize(ctx->main_stream);
}

#endif // defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)