grid_hip_context.cu
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
/* */
/* SPDX-License-Identifier: BSD-3-Clause */
/*----------------------------------------------------------------------------*/

/*
 * Authors :
   - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
   - Advanced Micro Devices, Inc.
*/

#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <hip/hip_runtime_api.h>
#include <iostream>

#include "../../offload/offload_library.h"
extern "C" {
#include "../common/grid_basis_set.h"
#include "../common/grid_constants.h"
#include "../common/grid_library.h"
}

#include "grid_hip_context.h"

#include "grid_hip_task_list.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

/*******************************************************************************
 * \brief Allocates a task list for the GPU backend.
 *        See grid_ctx.h for details.
 ******************************************************************************/
extern "C" void grid_hip_create_task_list(
    const bool ortho, const int ntasks, const int nlevels, const int natoms,
    const int nkinds, const int nblocks, const int *block_offsets,
    const double *atom_positions, const int *atom_kinds,
    const grid_basis_set **basis_sets, const int *level_list,
    const int *iatom_list, const int *jatom_list, const int *iset_list,
    const int *jset_list, const int *ipgf_list, const int *jpgf_list,
    const int *border_mask_list, const int *block_num_list,
    const double *radius_list, const double *rab_list, const int *npts_global,
    const int *npts_local, const int *shift_local, const int *border_width,
    const double *dh, const double *dh_inv, void *ptr) {

  rocm_backend::context_info **ctx_out = (rocm_backend::context_info **)ptr;
  // Select GPU device.
  rocm_backend::context_info *ctx = nullptr;
  if (*ctx_out == nullptr) {
    ctx = new rocm_backend::context_info(offload_get_chosen_device());
    *ctx_out = ctx;
  } else {
    ctx = *ctx_out;
    // verify that the object is the right one
    ctx->verify_checksum();
  }

  ctx->ntasks = ntasks;
  ctx->nlevels = nlevels;
  ctx->natoms = natoms;
  ctx->nblocks = nblocks;

  ctx->grid_.resize(nlevels);
  ctx->set_device();
  std::vector<double> dh_max(ctx->nlevels, 0);

  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->grid_[level].resize(npts_global + 3 * level, npts_local + 3 * level,
                             shift_local + 3 * level, border_width + 3 * level);
    ctx->grid_[level].is_distributed(false);
    ctx->grid_[level].set_lattice_vectors(&dh[9 * level], &dh_inv[9 * level]);
    ctx->grid_[level].check_orthorhombicity(ortho);
    for (int i = 0; i < 9; i++)
      dh_max[level] = std::max(dh_max[level], std::abs(dh[9 * level + i]));
  }
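
  // dh_max[level] is the largest element (in absolute value) of this level's
  // lattice-vector matrix dh, i.e. an upper bound on the grid spacing; it is
  // used below to flag tasks whose Gaussian radius is smaller than the grid
  // resolution (see skip_task).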

  ctx->block_offsets_dev.resize(nblocks);
  ctx->block_offsets_dev.copy_to_gpu(block_offsets);
  ctx->initialize_basis_sets(basis_sets, nkinds);

  ctx->first_task_per_level_.resize(nlevels, 0);
  ctx->number_of_tasks_per_level_.resize(nlevels, 0);

  memset(ctx->first_task_per_level_.data(), 0, sizeof(int) * nlevels);
  memset(ctx->number_of_tasks_per_level_.data(), 0, sizeof(int) * nlevels);

  std::vector<rocm_backend::task_info> tasks_host(ntasks);

  size_t coef_size = 0;
  size_t cab_size_ = 0;
  for (int i = 0; i < ntasks; i++) {
    const int level = level_list[i] - 1;

    // count the number of tasks per level
    ctx->number_of_tasks_per_level_[level]++;

    const int iatom = iatom_list[i] - 1;
    const int jatom = jatom_list[i] - 1;
    const int iset = iset_list[i] - 1;
    const int jset = jset_list[i] - 1;
    const int ipgf = ipgf_list[i] - 1;
    const int jpgf = jpgf_list[i] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;

    /* set parameters related to the atom type, orbitals, etc. */
    const grid_basis_set *ibasis = basis_sets[ikind];
    const grid_basis_set *jbasis = basis_sets[jkind];

    tasks_host[i] = {};
    tasks_host[i].level = level;
    tasks_host[i].iatom = iatom;
    tasks_host[i].jatom = jatom;
    tasks_host[i].iset = iset;
    tasks_host[i].jset = jset;
    tasks_host[i].ipgf = ipgf;
    tasks_host[i].jpgf = jpgf;
    tasks_host[i].ikind = ikind;
    tasks_host[i].jkind = jkind;
    tasks_host[i].border_mask = border_mask_list[i];
    tasks_host[i].block_num = block_num_list[i] - 1;

    if (border_mask_list[i]) {
      ctx->grid_[level].is_distributed(true);
    }
    /* parameters of the gaussian */
    tasks_host[i].radius = radius_list[i];
    tasks_host[i].rab[0] = rab_list[3 * i];
    tasks_host[i].rab[1] = rab_list[3 * i + 1];
    tasks_host[i].rab[2] = rab_list[3 * i + 2];
    tasks_host[i].zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
    tasks_host[i].zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
    tasks_host[i].zetp = tasks_host[i].zeta + tasks_host[i].zetb;
    const double f = tasks_host[i].zetb / tasks_host[i].zetp;
    tasks_host[i].rab2 = 0.0;
    for (int d = 0; d < 3; d++) {
      tasks_host[i].rab2 += tasks_host[i].rab[d] * tasks_host[i].rab[d];
      tasks_host[i].ra[d] = atom_positions[3 * iatom + d];
      tasks_host[i].rb[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d];
      tasks_host[i].rp[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d] * f;
    }

    tasks_host[i].skip_task = (2 * tasks_host[i].radius < dh_max[level]);
    tasks_host[i].prefactor = exp(-tasks_host[i].zeta * f * tasks_host[i].rab2);
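    // Gaussian product theorem: the product of the primitives with exponents
    // zeta (centered at ra) and zetb (centered at rb = ra + rab) is a single
    // Gaussian with exponent zetp = zeta + zetb centered at
    //   rp = ra + f * rab,  with  f = zetb / zetp,
    // scaled by prefactor = exp(-(zeta * zetb / zetp) * |rab|^2)
    //                     = exp(-zeta * f * rab2), as computed above.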

    tasks_host[i].off_diag_twice = (iatom == jatom) ? 1.0 : 2.0;
    // angular momentum range of basis set
    const int la_max_basis = ibasis->lmax[iset];
    const int lb_max_basis = jbasis->lmax[jset];
    const int la_min_basis = ibasis->lmin[iset];
    const int lb_min_basis = jbasis->lmin[jset];

    // angular momentum range for the actual collocate/integrate operation.
    tasks_host[i].la_max = la_max_basis;
    tasks_host[i].lb_max = lb_max_basis;
    tasks_host[i].la_min = la_min_basis;
    tasks_host[i].lb_min = lb_min_basis;

    // start of the decontracted set, i.e. pab and hab
    tasks_host[i].first_coseta =
        (la_min_basis > 0) ? rocm_backend::ncoset(la_min_basis - 1) : 0;
    tasks_host[i].first_cosetb =
        (lb_min_basis > 0) ? rocm_backend::ncoset(lb_min_basis - 1) : 0;

    // size of the decontracted set, i.e. pab and hab
    tasks_host[i].ncoseta = rocm_backend::ncoset(la_max_basis);
    tasks_host[i].ncosetb = rocm_backend::ncoset(lb_max_basis);
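    // ncoset(l) counts all Cartesian orbitals up to angular momentum l,
    // i.e. ncoset(l) = (l + 1) * (l + 2) * (l + 3) / 6:
    //   ncoset(0) = 1 (s), ncoset(1) = 4 (s + p), ncoset(2) = 10 (s + p + d).
    // first_coseta/b is therefore the index of the first Cartesian function
    // with l >= la_min/lb_min inside the decontracted block.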

    // size of entire spherical basis
    tasks_host[i].nsgfa = ibasis->nsgf;
    tasks_host[i].nsgfb = jbasis->nsgf;

    // size of spherical set
    tasks_host[i].nsgf_seta = ibasis->nsgf_set[iset];
    tasks_host[i].nsgf_setb = jbasis->nsgf_set[jset];

    // strides of the sphi transformation matrices
    tasks_host[i].maxcoa = ibasis->maxco;
    tasks_host[i].maxcob = jbasis->maxco;

    tasks_host[i].sgfa = ibasis->first_sgf[iset] - 1;
    tasks_host[i].sgfb = jbasis->first_sgf[jset] - 1;

    tasks_host[i].block_transposed = (iatom > jatom);
    tasks_host[i].subblock_offset =
        (tasks_host[i].block_transposed)
            ? (tasks_host[i].sgfa * tasks_host[i].nsgfb + tasks_host[i].sgfb)
            : (tasks_host[i].sgfb * tasks_host[i].nsgfa + tasks_host[i].sgfa);
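    // When iatom > jatom the stored block is the transpose of the sub-block
    // this task addresses, so the row/column roles of sgfa/sgfb and
    // nsgfa/nsgfb are swapped when computing the sub-block offset.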

    /* the constant 6 is important here since we do not know ahead of time
     * which specific operation will be performed: collocate functions can go
     * up to 4 while integrate can go up to 5 (we take 6 to be safe). */

    /* this block is only used as temporary scratch for calculating the
     * coefficients. Doing this avoids a lot of atomic operations that are
     * costly on hardware that only has partial support for them. For better
     * performance we should most probably align the offsets as well; the
     * alignment is 256 bytes on Mi100 and above. */
    tasks_host[i].lp_max = tasks_host[i].lb_max + tasks_host[i].la_max + 6;
    if (i == 0) {
      tasks_host[i].coef_offset = 0;
    } else {
      tasks_host[i].coef_offset =
          tasks_host[i - 1].coef_offset +
          rocm_backend::ncoset(tasks_host[i - 1].lp_max);
    }
    coef_size += rocm_backend::ncoset(tasks_host[i].lp_max);
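    // coef_offset is the exclusive prefix sum of the per-task coefficient
    // counts ncoset(lp_max): task i owns the slice
    // [coef_offset, coef_offset + ncoset(lp_max)) of the global coef buffer,
    // so no two tasks write to the same location.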

    if (i == 0)
      tasks_host[i].cab_offset = 0;
    else
      tasks_host[i].cab_offset =
          tasks_host[i - 1].cab_offset +
          rocm_backend::ncoset(tasks_host[i - 1].la_max + 3) *
              rocm_backend::ncoset(tasks_host[i - 1].lb_max + 3);

    cab_size_ += rocm_backend::ncoset(tasks_host[i].la_max + 3) *
                 rocm_backend::ncoset(tasks_host[i].lb_max + 3);
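    // Analogously, cab_offset carves out a private scratch slab of
    // ncoset(la_max + 3) * ncoset(lb_max + 3) entries per task in the global
    // cab buffer; the +3 headroom on each side matches the +6 added to lp_max
    // above: (la_max + 3) + (lb_max + 3) = la_max + lb_max + 6.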

    auto &grid = ctx->grid_[tasks_host[i].level];
    // compute the cube properties

    tasks_host[i].apply_border_mask = (tasks_host[i].border_mask != 0);

    if (grid.is_orthorhombic() && (tasks_host[i].border_mask == 0)) {
      tasks_host[i].discrete_radius =
          rocm_backend::compute_cube_properties<double, double3, true>(
              tasks_host[i].radius, grid.dh(), grid.dh_inv(),
              (double3 *)tasks_host[i].rp, // center of the gaussian
              &tasks_host[i]
                   .roffset, // offset compared to the closest grid point
              &tasks_host[i].cube_center, // center coordinates in grid space
              &tasks_host[i].lb_cube,     // lower boundary
              &tasks_host[i].cube_size);
    } else {
      tasks_host[i].discrete_radius =
          rocm_backend::compute_cube_properties<double, double3, false>(
              tasks_host[i].radius, grid.dh(), grid.dh_inv(),
              (double3 *)tasks_host[i].rp, // center of the gaussian
              &tasks_host[i]
                   .roffset, // offset compared to the closest grid point
              &tasks_host[i].cube_center, // center coordinates in grid space
              &tasks_host[i].lb_cube,     // lower boundary
              &tasks_host[i].cube_size);
    }
  }

  // we need to sort the task list although I expect it to be sorted already
  /*
   * sorting with this lambda does not work
  std::sort(tasks_host.begin(), tasks_host.end(),
            [](rocm_backend::task_info a, rocm_backend::task_info b) {
              if (a.level == b.level) {
                if (a.block_num <= b.block_num)
                  return true;
                else
                  return false;
              } else {
                return (a.level < b.level);
              }
            });
  */

  // this is actually an exclusive scan
  for (int level = 1; level < (int)ctx->number_of_tasks_per_level_.size();
       level++) {
    ctx->first_task_per_level_[level] =
        ctx->first_task_per_level_[level - 1] +
        ctx->number_of_tasks_per_level_[level - 1];
  }
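  // Example: with number_of_tasks_per_level_ = {4, 2, 3} this yields
  // first_task_per_level_ = {0, 4, 6}, i.e. the starting index of each
  // level's contiguous range in the level-ordered task list.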

  ctx->tasks_dev.clear();
  ctx->tasks_dev.resize(tasks_host.size());
  ctx->tasks_dev.copy_to_gpu(tasks_host);

  /* Sort the blocks */
  std::vector<std::vector<int>> task_sorted_by_block(nblocks);
  std::vector<int> sorted_blocks(ntasks, 0);
  std::vector<int> num_tasks_per_block(nblocks, 0);
  std::vector<int> sorted_blocks_offset(nblocks, 0);
  for (auto &block : task_sorted_by_block)
    block.clear();

  for (int i = 0; i < ntasks; i++) {
    task_sorted_by_block[block_num_list[i] - 1].push_back(i);
    num_tasks_per_block[block_num_list[i] - 1]++;
  }

  int offset = 0;
  // flatten task_sorted_by_block and compute the offsets
  for (int i = 0; i < (int)task_sorted_by_block.size(); i++) {
    auto &task_list = task_sorted_by_block[i];

    // take care of the case where a block is not associated with any task
    // (this is also a workaround for the grid_replay.c file)
    if (!task_list.empty()) {
      memcpy(&sorted_blocks[offset], &task_list[0],
             sizeof(int) * task_list.size());
    }
    sorted_blocks_offset[i] = offset;
    offset += task_list.size();
  }
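  // The result is a CSR-like layout: sorted_blocks stores the task indices
  // grouped by block, and sorted_blocks_offset[i] is the position where the
  // tasks of block i start (their count is num_tasks_per_block[i]).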

  // copy the block offsets
  ctx->sorted_blocks_offset_dev.resize(sorted_blocks_offset.size());
  ctx->sorted_blocks_offset_dev.copy_to_gpu(sorted_blocks_offset);

  // copy the task list sorted by block (not by level) to the gpu
  ctx->task_sorted_by_blocks_dev.resize(sorted_blocks.size());
  ctx->task_sorted_by_blocks_dev.copy_to_gpu(sorted_blocks);

  for (int i = 0; i < (int)sorted_blocks_offset.size(); i++) {
    int num_tasks = 0;
    if (i == (int)sorted_blocks_offset.size() - 1)
      num_tasks = ntasks - sorted_blocks_offset[i];
    else
      num_tasks = sorted_blocks_offset[i + 1] - sorted_blocks_offset[i];

    // sanity check: both counts should be equal
    assert(num_tasks == num_tasks_per_block[i]);

    // check that all tasks point to the same block
    for (int tk = 0; tk < num_tasks; tk++)
      assert(
          tasks_host[sorted_blocks[tk + sorted_blocks_offset[i]]].block_num ==
          i);
  }
  for (auto &block : task_sorted_by_block)
    block.clear();
  task_sorted_by_block.clear();

  sorted_blocks.clear();
  sorted_blocks_offset.clear();

  ctx->num_tasks_per_block_dev_.resize(num_tasks_per_block.size());
  ctx->num_tasks_per_block_dev_.copy_to_gpu(num_tasks_per_block);

  // collect stats
  memset(ctx->stats, 0, 2 * 20 * sizeof(int));
  for (int itask = 0; itask < ntasks; itask++) {
    const int iatom = iatom_list[itask] - 1;
    const int jatom = jatom_list[itask] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int la_max = basis_sets[ikind]->lmax[iset];
    const int lb_max = basis_sets[jkind]->lmax[jset];
    const int lp = std::min(la_max + lb_max, 19);
    const bool has_border_mask = (border_mask_list[itask] != 0);
    ctx->stats[has_border_mask][lp]++;
  }
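  // ctx->stats is a 2 x 20 histogram: the first index distinguishes tasks
  // with and without a border mask, the second is the combined angular
  // momentum lp = la_max + lb_max, capped at 19 to fit the table. It is used
  // by the collocate/integrate entry points below to update the library
  // counters.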

  ctx->create_streams();

  tasks_host.clear();
  ctx->coef_dev_.resize(coef_size);
  ctx->cab_dev_.resize(cab_size_);
  ctx->compute_checksum();
  // return the newly created or updated context
  *ctx_out = ctx;
}

/*******************************************************************************
 * \brief Destroy a context.
 ******************************************************************************/
extern "C" void grid_hip_free_task_list(void *ptr) {

  rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;
  // Select GPU device.
  if (ctx == nullptr)
    return;
  ctx->verify_checksum();
  ctx->set_device();
  delete ctx;
}

/*******************************************************************************
 * \brief Collocate all tasks in the given list onto the given grids.
 ******************************************************************************/
extern "C" void grid_hip_collocate_task_list(const void *ptr,
                                             const enum grid_func func,
                                             const int nlevels,
                                             const offload_buffer *pab_blocks,
                                             offload_buffer **grids) {
  rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;

  if (ptr == nullptr)
    return;

  ctx->verify_checksum();
  assert(ctx->nlevels == nlevels);
  ctx->set_device();

  ctx->pab_block_.associate(pab_blocks->host_buffer, pab_blocks->device_buffer,
                            pab_blocks->size / sizeof(double));

  /*
    There are 3 scenarios here:
      - Mi300 : no copy will happen as the two buffers have the same address
      - Mi250X : an internal copy will happen; we do not need to do anything
        explicit
      - no unified memory : an explicit copy will happen
  */

  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->grid_[level].associate(grids[level]->host_buffer,
                                grids[level]->device_buffer,
                                grids[level]->size / sizeof(double));
    ctx->grid_[level].zero(ctx->level_streams[level]);
  }

  ctx->pab_block_.associate(pab_blocks->host_buffer, pab_blocks->device_buffer,
                            pab_blocks->size / sizeof(double));

  int lp_diff = -1;

  ctx->synchronize(ctx->main_stream);

  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->collocate_one_grid_level(level, func, &lp_diff);
  }
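
  // Each grid level is handled on its own stream (level_streams[level]): the
  // level's grid is zeroed above and copied back to the host below on that
  // stream, so data movement and kernels for different levels can overlap on
  // the device.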

  // update the counters while we wait for the kernels to finish. This is not
  // thread safe at all since grid_library_counter_add has global static
  // state. We need a much better mechanism than this, for instance moving
  // this information one level up and encapsulating it in the context
  // associated with the library.

  if (lp_diff > -1) {
    for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
      for (int lp = 0; lp < 20; lp++) {
        const int count = ctx->stats[has_border_mask][lp];
        if (ctx->grid_[0].is_orthorhombic() && !has_border_mask) {
          grid_library_counter_add(lp, GRID_BACKEND_HIP,
                                   GRID_COLLOCATE_ORTHO, count);
        } else {
          grid_library_counter_add(lp, GRID_BACKEND_HIP,
                                   GRID_COLLOCATE_GENERAL, count);
        }
      }
    }
  }

  // download the result from device to host.
  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->grid_[level].copy_to_host(ctx->level_streams[level]);
  }

  // we need to wait for all streams to finish
  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->synchronize(ctx->level_streams[level]);
  }
}

/*******************************************************************************
 * \brief Integrate all tasks in the given list onto the given grids.
 *        See grid_ctx.h for details.
 ******************************************************************************/
extern "C" void grid_hip_integrate_task_list(
    const void *ptr, const bool compute_tau, const int nlevels,
    const offload_buffer *pab_blocks, const offload_buffer **grids,
    offload_buffer *hab_blocks, double *forces, double *virial) {

  rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;

  if (ptr == nullptr)
    return;
  assert(ctx->nlevels == nlevels);

  ctx->verify_checksum();
  // Select GPU device.
  ctx->set_device();

  // ctx->coef_dev_.zero(ctx->level_streams[0]);

  for (int level = 0; level < ctx->nlevels; level++) {
    if (ctx->number_of_tasks_per_level_[level]) {
      ctx->grid_[level].associate(grids[level]->host_buffer,
                                  grids[level]->device_buffer,
                                  grids[level]->size / sizeof(double));
      ctx->grid_[level].copy_to_gpu(ctx->level_streams[level]);
    }
  }

  if ((forces != nullptr) || (virial != nullptr)) {
    ctx->pab_block_.associate(pab_blocks->host_buffer,
                              pab_blocks->device_buffer,
                              pab_blocks->size / sizeof(double));
  }

  // we do not need to wait for this to start the computations since the matrix
  // elements are computed after all coefficients are calculated.
  ctx->hab_block_.associate(hab_blocks->host_buffer, hab_blocks->device_buffer,
                            hab_blocks->size / sizeof(double));
  ctx->hab_block_.zero(ctx->main_stream);

  ctx->calculate_forces = (forces != nullptr);
  ctx->calculate_virial = (virial != nullptr);
  ctx->compute_tau = compute_tau;
  if (forces != nullptr) {
    ctx->forces_.resize(3 * ctx->natoms);
    ctx->forces_.zero(ctx->main_stream);
  }

  if (virial != nullptr) {
    ctx->virial_.resize(9);
    ctx->virial_.zero(ctx->main_stream);
  }
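
  // calculate_forces / calculate_virial / compute_tau steer what the
  // integrate kernels accumulate; the forces_ (3 * natoms entries) and
  // virial_ (3x3) device buffers are zeroed on main_stream here and copied
  // back into the caller's arrays at the end of this function.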

  int lp_diff = -1;

  // we can actually treat the full task list without worrying about the level
  // at this stage; this can be taken care of inside the kernel.

  for (int level = 0; level < ctx->nlevels; level++) {
    // launch the kernel, but only after the grid has arrived
    ctx->integrate_one_grid_level(level, &lp_diff);
  }

  if (lp_diff > -1) {
    // update the counters while we wait for the kernels to finish
    for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
      for (int lp = 0; lp < 20; lp++) {
        const int count = ctx->stats[has_border_mask][lp];
        if (ctx->grid_[0].is_orthorhombic() && !has_border_mask) {
          grid_library_counter_add(lp, GRID_BACKEND_HIP,
                                   GRID_INTEGRATE_ORTHO, count);
        } else {
          grid_library_counter_add(lp, GRID_BACKEND_HIP,
                                   GRID_INTEGRATE_GENERAL, count);
        }
      }
    }
  }

  // we need to wait for all streams to finish
  for (int level = 0; level < ctx->nlevels; level++) {
    if (ctx->number_of_tasks_per_level_[level])
      ctx->synchronize(ctx->level_streams[level]);
  }

  // computing the hab coefficients does not depend on the number of grids, so
  // we can run these calculations on the main stream

  if (forces != NULL) {
    ctx->forces_.copy_from_gpu(forces, ctx->main_stream);
  }
  if (virial != NULL) {
    ctx->virial_.copy_from_gpu(virial, ctx->main_stream);
  }

  ctx->synchronize(ctx->main_stream);
}

#endif // defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)