(git:0d88fc2)
Loading...
Searching...
No Matches
grid_gpu_context.cu
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8/*
9 * Authors :
10 - Mathieu Taillefumier (ETH Zurich / CSCS)
11 - Advanced Micro Devices, Inc.
12 - Ole Schuett
13*/
14
15#include <cassert>
16#include <cstdio>
17#include <cstdlib>
18#include <cstring>
19#include <iostream>
20
21#include "../../offload/offload_library.h"
22extern "C" {
23#include "../common/grid_basis_set.h"
24#include "../common/grid_constants.h"
25#include "../common/grid_library.h"
26}
27
28#include "grid_gpu_context.h"
30
31#include "grid_gpu_task_list.h"
32
33#if defined(_OMP_H)
34#error "OpenMP should not be used in .cu files to accommodate HIP."
35#endif
36
37namespace rocm_backend {
38
39constexpr size_t align_up_elems(size_t n_elems, size_t elem_alignment) {
40 return (n_elems + elem_alignment - 1) & ~(elem_alignment - 1);
41}
42
43kernel_params
44context_info::set_kernel_parameters(const int level,
45 const smem_parameters &smem_params) {
46 kernel_params params;
47 params.cab_size_ = smem_params.cab_size();
48 params.first_task = 0;
49
50 params.la_min_diff = smem_params.ldiffs().la_min_diff;
51 params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
52
53 params.la_max_diff = smem_params.ldiffs().la_max_diff;
54 params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
55 params.tasks = this->tasks_dev.data();
56 params.task_sorted_by_blocks_dev = task_sorted_by_blocks_dev.data();
57 params.sorted_blocks_offset_dev = sorted_blocks_offset_dev.data();
58 params.num_tasks_per_block_dev = this->num_tasks_per_block_dev_.data();
59 params.block_offsets = this->block_offsets_dev.data();
60 params.la_min_diff = smem_params.ldiffs().la_min_diff;
61 params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
62 params.la_max_diff = smem_params.ldiffs().la_max_diff;
63 params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
64
65 params.ptr_dev[0] = pab_block_.data();
66
67 if (level >= 0) {
68 params.ptr_dev[1] = grid_[level].data();
69 memcpy(params.dh_, grid_[level].dh(), 9 * sizeof(double));
70 memcpy(params.dh_inv_, grid_[level].dh_inv(), 9 * sizeof(double));
71 params.first_task = first_task_per_level_[level];
72
73 params.grid_full_size_ = grid_[level].full_size();
74 params.grid_local_size_ = grid_[level].local_size();
75 params.grid_lower_corner_ = grid_[level].lower_corner();
76 params.grid_border_width_ = grid_[level].border_width();
77 }
78
79 params.ptr_dev[2] = this->coef_dev_.data();
80 params.ptr_dev[3] = hab_block_.data();
81 params.ptr_dev[4] = forces_.data();
82 params.ptr_dev[5] = virial_.data();
83 params.ptr_dev[6] = this->cab_dev_.data();
84 params.cab_block_offset_dev = this->cab_block_offset_dev.data();
85 params.sphi_dev = this->sphi_dev.data();
86 return params;
87}
88}; // namespace rocm_backend
89
90/*******************************************************************************
91 * \brief Allocates a task list for the GPU backend.
92 * See grid_ctx.h for details.
93 ******************************************************************************/
95 const bool ortho, const int ntasks, const int nlevels, const int natoms,
96 const int nkinds, const int nblocks, const int *block_offsets,
97 const double *atom_positions, const int *atom_kinds,
98 const grid_basis_set **basis_sets, const int *level_list,
99 const int *iatom_list, const int *jatom_list, const int *iset_list,
100 const int *jset_list, const int *ipgf_list, const int *jpgf_list,
101 const int *border_mask_list, const int *block_num_list,
102 const double *radius_list, const double *rab_list, const int *npts_global,
103 const int *npts_local, const int *shift_local, const int *border_width,
104 const double *dh, const double *dh_inv, grid_gpu_task_list **ptr) {
105
107
108 // Yes it makes no sense
109 if ((nblocks == 0) || (ntasks == 0)) {
110 *ptr = nullptr;
111 return;
112 }
113
114 // Select GPU device.
115 rocm_backend::context_info *ctx = nullptr;
116 if (*ctx_out == nullptr) {
117 ctx = new rocm_backend::context_info(offload_get_chosen_device());
118 *ctx_out = ctx;
119 } else {
120 ctx = *ctx_out;
121 // verify that the object is the right one
122 ctx->verify_checksum();
123 }
124
125 ctx->ntasks = ntasks;
126 ctx->nlevels = nlevels;
127 ctx->natoms = natoms;
128 ctx->nblocks = nblocks;
129 ctx->nkinds = nkinds;
130 ctx->grid_.resize(nlevels);
131 ctx->set_device();
132
133 std::vector<double> dh_max(ctx->nlevels, 0);
134
135 for (int level = 0; level < ctx->nlevels; level++) {
136 ctx->grid_[level].resize(npts_global + 3 * level, npts_local + 3 * level,
137 shift_local + 3 * level, border_width + 3 * level);
138 ctx->grid_[level].is_distributed(false);
139 ctx->grid_[level].set_lattice_vectors(&dh[9 * level], &dh_inv[9 * level]);
140 ctx->grid_[level].check_orthogonality(ortho);
141 for (int i = 0; i < 9; i++)
142 dh_max[level] = std::max(dh_max[level], std::abs(dh[9 * level + i]));
143 }
144
145 ctx->block_offsets_dev.resize(nblocks);
146 ctx->block_offsets_dev.copy_to_gpu(block_offsets);
147 ctx->initialize_basis_sets(basis_sets, nkinds);
148
149 ctx->first_task_per_level_.resize(nlevels, 0);
150 ctx->number_of_tasks_per_level_.resize(nlevels, 0);
151
152 memset(ctx->first_task_per_level_.data(), 0, sizeof(int) * nlevels);
153 memset(ctx->number_of_tasks_per_level_.data(), 0, sizeof(int) * nlevels);
154
155 std::vector<rocm_backend::task_info> tasks_host(ntasks);
156
157 size_t coef_size = 0;
158 int lmax_ = 0;
159
160 for (int i = 0; i < ntasks; i++) {
161 const int level = level_list[i] - 1;
162
163 // count the number of task per level
164 ctx->number_of_tasks_per_level_[level]++;
165
166 const int iatom = iatom_list[i] - 1;
167 const int jatom = jatom_list[i] - 1;
168 const int iset = iset_list[i] - 1;
169 const int jset = jset_list[i] - 1;
170 const int ipgf = ipgf_list[i] - 1;
171 const int jpgf = jpgf_list[i] - 1;
172 const int ikind = atom_kinds[iatom] - 1;
173 const int jkind = atom_kinds[jatom] - 1;
174
175 /* set parameters related to atom type orbital etc.... */
176 const grid_basis_set *ibasis = basis_sets[ikind];
177 const grid_basis_set *jbasis = basis_sets[jkind];
178
179 tasks_host[i] = {};
180 tasks_host[i].level = level;
181 tasks_host[i].iatom = iatom;
182 tasks_host[i].jatom = jatom;
183 tasks_host[i].iset = iset;
184 tasks_host[i].jset = jset;
185 tasks_host[i].ipgf = ipgf;
186 tasks_host[i].jpgf = jpgf;
187 tasks_host[i].ikind = ikind;
188 tasks_host[i].jkind = jkind;
189 tasks_host[i].border_mask = border_mask_list[i];
190 tasks_host[i].block_num = block_num_list[i] - 1;
191
192 if (border_mask_list[i]) {
193 ctx->grid_[level].is_distributed(true);
194 }
195 /* parameters for the gaussian */
196 tasks_host[i].radius = radius_list[i];
197 tasks_host[i].rab[0] = rab_list[3 * i];
198 tasks_host[i].rab[1] = rab_list[3 * i + 1];
199 tasks_host[i].rab[2] = rab_list[3 * i + 2];
200 tasks_host[i].zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
201 tasks_host[i].zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
202 tasks_host[i].zetp = tasks_host[i].zeta + tasks_host[i].zetb;
203 const double f = tasks_host[i].zetb / tasks_host[i].zetp;
204 tasks_host[i].rab2 = 0.0;
205 for (int d = 0; d < 3; d++) {
206 tasks_host[i].rab[d] = tasks_host[i].rab[d];
207 tasks_host[i].rab2 += tasks_host[i].rab[d] * tasks_host[i].rab[d];
208 tasks_host[i].ra[d] = atom_positions[3 * iatom + d];
209 tasks_host[i].rb[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d];
210 tasks_host[i].rp[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d] * f;
211 }
212
213 tasks_host[i].skip_task = (2 * tasks_host[i].radius < dh_max[level]);
214 tasks_host[i].prefactor = exp(-tasks_host[i].zeta * f * tasks_host[i].rab2);
215
216 tasks_host[i].off_diag_twice = (iatom == jatom) ? 1.0 : 2.0;
217 // angular momentum range of basis set
218 const int la_max_basis = ibasis->lmax[iset];
219 const int lb_max_basis = jbasis->lmax[jset];
220 const int la_min_basis = ibasis->lmin[iset];
221 const int lb_min_basis = jbasis->lmin[jset];
222
223 // angular momentum range for the actual collocate/integrate opteration.
224 tasks_host[i].la_max = la_max_basis;
225 tasks_host[i].lb_max = lb_max_basis;
226 tasks_host[i].la_min = la_min_basis;
227 tasks_host[i].lb_min = lb_min_basis;
228
229 lmax_ = std::max(lmax_, tasks_host[i].la_max);
230 lmax_ = std::max(lmax_, tasks_host[i].lb_max);
231
232 // start of decontracted set, ie. pab and hab
233 tasks_host[i].first_coseta =
234 (la_min_basis > 0) ? rocm_backend::ncoset(la_min_basis - 1) : 0;
235 tasks_host[i].first_cosetb =
236 (lb_min_basis > 0) ? rocm_backend::ncoset(lb_min_basis - 1) : 0;
237
238 // size of decontracted set, ie. pab and hab
239 tasks_host[i].ncoseta = rocm_backend::ncoset(la_max_basis);
240 tasks_host[i].ncosetb = rocm_backend::ncoset(lb_max_basis);
241 // it should lmax + 3 because calculating forces+stress+compute_tau requires
242 // l + 3
243 tasks_host[i].max_cab_size =
245 rocm_backend::ncoset(lb_max_basis + 3),
246 4);
247
248 // size of entire spherical basis
249 tasks_host[i].nsgfa = ibasis->nsgf;
250 tasks_host[i].nsgfb = jbasis->nsgf;
251
252 // size of spherical set
253 tasks_host[i].nsgf_seta = ibasis->nsgf_set[iset];
254 tasks_host[i].nsgf_setb = jbasis->nsgf_set[jset];
255
256 // strides of the sphi transformation matrices
257 tasks_host[i].maxcoa = ibasis->maxco;
258 tasks_host[i].maxcob = jbasis->maxco;
259
260 tasks_host[i].sgfa = ibasis->first_sgf[iset] - 1;
261 tasks_host[i].sgfb = jbasis->first_sgf[jset] - 1;
262
263 tasks_host[i].block_transposed = (iatom > jatom);
264 tasks_host[i].subblock_offset =
265 (tasks_host[i].block_transposed)
266 ? (tasks_host[i].sgfa * tasks_host[i].nsgfb + tasks_host[i].sgfb)
267 : (tasks_host[i].sgfb * tasks_host[i].nsgfa + tasks_host[i].sgfa);
268
269 /* the constant 6 is important here since we do not know ahead of time what
270 * specific operation we will be doing. collocate functions can go up to 4
271 * while integrate can go up to 5 (but put 6 for safety reasons) */
272
273 /* this block is only as temporary scratch for calculating the coefficients.
274 * Doing this avoid a lot of atomic operations that are costly on hardware
275 * that only have partial support of them. For better performance we should
276 * most probably align the offsets as well. it is 256 bytes on Mi100 and
277 * above */
278 tasks_host[i].lp_max = tasks_host[i].lb_max + tasks_host[i].la_max + 6;
279 if (i == 0) {
280 tasks_host[i].coef_offset = 0;
281 } else {
282 tasks_host[i].coef_offset =
283 tasks_host[i - 1].coef_offset +
285 rocm_backend::ncoset(tasks_host[i - 1].lp_max), 4);
286 }
287
288 // calculate the size such that the coef table is a multiple of 4.
289 coef_size += rocm_backend::align_up_elems(
290 rocm_backend::ncoset(tasks_host[i].lp_max), 4);
291
292 auto &grid = ctx->grid_[tasks_host[i].level];
293 // compute the cube properties
294
295 tasks_host[i].apply_border_mask = (tasks_host[i].border_mask != 0);
296
297 if (grid.is_orthogonal() && (tasks_host[i].border_mask == 0)) {
298 tasks_host[i].discrete_radius =
299 rocm_backend::compute_cube_properties<double, double3, true>(
300 tasks_host[i].radius, grid.dh(), grid.dh_inv(),
301 (double3 *)tasks_host[i].rp, // center of the gaussian
302 &tasks_host[i]
303 .roffset, // offset compared to the closest grid point
304 &tasks_host[i].cube_center, // center coordinates in grid space
305 &tasks_host[i].lb_cube, // lower boundary
306 &tasks_host[i].cube_size);
307 } else {
308 tasks_host[i].discrete_radius =
309 rocm_backend::compute_cube_properties<double, double3, false>(
310 tasks_host[i].radius, grid.dh(), grid.dh_inv(),
311 (double3 *)tasks_host[i].rp, // center of the gaussian
312 &tasks_host[i]
313 .roffset, // offset compared to the closest grid point
314 &tasks_host[i].cube_center, // center coordinates in grid space
315 &tasks_host[i].lb_cube, // lower boundary
316 &tasks_host[i].cube_size);
317 }
318 }
319
320 // we need to sort the task list although I expect it to be sorted already
321 // it is a exclusive scan actually
322 for (int level = 1; level < (int)ctx->number_of_tasks_per_level_.size();
323 level++) {
324 ctx->first_task_per_level_[level] =
325 ctx->first_task_per_level_[level - 1] +
326 ctx->number_of_tasks_per_level_[level - 1];
327 }
328
329 ctx->tasks_dev.clear();
330 ctx->tasks_dev.resize(tasks_host.size());
331 ctx->tasks_dev.copy_to_gpu(tasks_host);
332
333 /* Sort the blocks */
334 std::vector<std::vector<int>> task_sorted_by_block(nblocks);
335 std::vector<int> sorted_blocks(ntasks, 0);
336 std::vector<int> num_tasks_per_block(nblocks, 0);
337 std::vector<int> sorted_blocks_offset(nblocks, 0);
338 for (auto &block : task_sorted_by_block)
339 block.clear();
340
341 for (int i = 0; i < ntasks; i++) {
342 task_sorted_by_block[block_num_list[i] - 1].push_back(i);
343 num_tasks_per_block[block_num_list[i] - 1]++;
344 }
345
346 int offset = 0;
347 // flatten the task_sorted_by_block and compute the offsets
348 for (int i = 0; i < (int)task_sorted_by_block.size(); i++) {
349 auto &task_list = task_sorted_by_block[i];
350
351 // take care of the case where the blocks are not associated to a given
352 // task. (and also a workaround in the grid_replay.c file)
353 if (!task_list.empty()) {
354 memcpy(&sorted_blocks[offset], &task_list[0],
355 sizeof(int) * task_list.size());
356 }
357 sorted_blocks_offset[i] = offset;
358 offset += task_list.size();
359 }
360
361 // copy the blocks offsets
362 ctx->sorted_blocks_offset_dev.resize(sorted_blocks_offset.size());
363 ctx->sorted_blocks_offset_dev.copy_to_gpu(sorted_blocks_offset);
364
365 // copy the task list sorted by block (not by level) to the gpu
366 ctx->task_sorted_by_blocks_dev.resize(sorted_blocks.size());
367 ctx->task_sorted_by_blocks_dev.copy_to_gpu(sorted_blocks);
368
369 std::vector<int> cab_size_offset_tmp(nblocks, 0);
370 std::vector<int> cab_size_tmp(nblocks);
371
372 for (int i = 0; i < (int)sorted_blocks_offset.size(); i++) {
373 int num_tasks = 0;
374 if (i == (int)sorted_blocks_offset.size() - 1)
375 num_tasks = ntasks - sorted_blocks_offset[i];
376 else
377 num_tasks = sorted_blocks_offset[i + 1] - sorted_blocks_offset[i];
378
379 // invariants tests since they should be equal.
380 assert(num_tasks == num_tasks_per_block[i]);
381
382 int tmp_cab_size = 0;
383 for (int tk = 0; tk < num_tasks; tk++) {
384 auto &task = tasks_host[sorted_blocks[tk + sorted_blocks_offset[i]]];
385 // check that all tasks point to the same block
386 assert(
387 tasks_host[sorted_blocks[tk + sorted_blocks_offset[i]]].block_num ==
388 i);
389
390 // calculate the largest cab block needed for this block
391 tmp_cab_size = std::max(task.max_cab_size, tmp_cab_size);
392 }
393
394 // keep the size of the largest cab block
395 cab_size_tmp[i] = tmp_cab_size;
396 }
397
398 cab_size_offset_tmp[0] = 0;
399
400 for (int i = 1; i < cab_size_tmp.size(); i++) {
401 cab_size_offset_tmp[i] = cab_size_offset_tmp[i - 1] + cab_size_tmp[i - 1];
402 }
403
404 for (auto &block : task_sorted_by_block)
405 block.clear();
406 task_sorted_by_block.clear();
407
408 sorted_blocks.clear();
409 sorted_blocks_offset.clear();
410
411 ctx->cab_block_offset_dev.resize(cab_size_offset_tmp.size());
412 ctx->cab_block_offset_dev.copy_to_gpu(cab_size_offset_tmp);
413
414 ctx->num_tasks_per_block_dev_.resize(num_tasks_per_block.size());
415 ctx->num_tasks_per_block_dev_.copy_to_gpu(num_tasks_per_block);
416
417 // Calculate the total amount of workspace needed
418 size_t cab_size_total = 0;
419
420 for (auto &elem : cab_size_tmp)
421 cab_size_total += elem;
422
423 cab_size_offset_tmp.clear();
424 cab_size_tmp.clear();
425
426 // To avoid memory saturation, cab only depends on the number of blocks not
427 // the number of tasks. It forces us to compute all xyz coefficients before
428 // calling collocate. However it does not change the logic of the integrate
429 // counterpart.
430
431 ctx->cab_dev_.resize(cab_size_total);
432
433 // allocate workspace for the coefficients
434 ctx->coef_dev_.resize(coef_size);
435
436 // collect stats
437 memset(ctx->stats, 0, 2 * 20 * sizeof(int));
438 for (int itask = 0; itask < ntasks; itask++) {
439 const int iatom = iatom_list[itask] - 1;
440 const int jatom = jatom_list[itask] - 1;
441 const int ikind = atom_kinds[iatom] - 1;
442 const int jkind = atom_kinds[jatom] - 1;
443 const int iset = iset_list[itask] - 1;
444 const int jset = jset_list[itask] - 1;
445 const int la_max = basis_sets[ikind]->lmax[iset];
446 const int lb_max = basis_sets[jkind]->lmax[jset];
447 const int lp = std::min(la_max + lb_max, 19);
448 const bool has_border_mask = (border_mask_list[itask] != 0);
449 ctx->stats[has_border_mask][lp]++;
450 }
451
452 ctx->create_streams();
453 ctx->compute_checksum();
454
455 // cleanup
456 tasks_host.clear();
457
458 // return newly created or updated context
459 *ctx_out = ctx;
460}
461
462/*******************************************************************************
463 * \brief destroy a context
464 ******************************************************************************/
466
468 // Select GPU device.
469 if (ctx == nullptr)
470 return;
471 ctx->verify_checksum();
472 ctx->set_device();
473 delete ctx;
474}
475
476/*******************************************************************************
477 * \brief Collocate all tasks of in given list onto given grids.
478 ******************************************************************************/
480 const enum grid_func func,
481 const int nlevels,
482 const offload_buffer *pab_blocks,
483 offload_buffer **grids) {
485
486 if (ptr == nullptr)
487 return;
488
489 ctx->verify_checksum();
490
491 if ((ctx->nblocks == 0) || (ctx->ntasks == 0)) {
492 return;
493 }
494
495 assert(ctx->nlevels == nlevels);
496 ctx->set_device();
497
498 ctx->pab_block_.associate(pab_blocks->host_buffer, pab_blocks->device_buffer,
499 pab_blocks->size / sizeof(double));
500
501 /*
502 There are 3 scenario here.
503 - Mi300 : no copy will happen as the two buffers have the same address
504 - Mi250X : an internal copy will happen. We do not need to do anything
505 explicit
506 - no unified memory : Explicit copy will happen
507 */
508 int lp_diff = -1;
510 ctx->coef_dev_.zero(ctx->main_stream);
511 ctx->calculate_all_coefficients(func, &lp_diff);
512
513 for (int level = 0; level < ctx->nlevels; level++) {
514 ctx->grid_[level].associate(grids[level]->host_buffer,
515 grids[level]->device_buffer,
516 grids[level]->size / sizeof(double));
517 ctx->grid_[level].zero(ctx->level_streams[level]);
518 }
519
520 ctx->synchronize(ctx->main_stream);
521
522 for (int level = 0; level < ctx->nlevels; level++) {
523 ctx->collocate_one_grid_level(level, func, &lp_diff);
524 }
525
526 // download result from device to host.
527 for (int level = 0; level < ctx->nlevels; level++) {
528 ctx->grid_[level].copy_to_host(ctx->level_streams[level]);
529 }
530
531 // update counters while we wait for kernels to finish. It is not thread safe
532 // at all since the function grid_library_counter_add has global static
533 // states. We need a much better mechanism than this for instance move this
534 // information one level up and encapsulate it in the context associated to
535 // the library.
536
537 if (lp_diff > -1) {
538 for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
539 for (int lp = 0; lp < 20; lp++) {
540 const int count = ctx->stats[has_border_mask][lp];
541 if (ctx->grid_[0].is_orthogonal() && !has_border_mask) {
543 GRID_COLLOCATE_ORTHO, count);
544 } else {
547 }
548 }
549 }
550 }
551
552 // need to wait for all streams to finish
553 for (int level = 0; level < ctx->nlevels; level++) {
554 ctx->synchronize(ctx->level_streams[level]);
555 }
556}
557
558/*******************************************************************************
559 * \brief Integrate all tasks of in given list onto given grids.
560 * See grid_ctx.h for details.
561 ******************************************************************************/
563 const grid_gpu_task_list *ptr, const bool compute_tau, const int nlevels,
564 const offload_buffer *pab_blocks, const offload_buffer **grids,
565 offload_buffer *hab_blocks, double *forces, double *virial) {
566
568
569 if (ptr == nullptr)
570 return;
571 assert(ctx->nlevels == nlevels);
572
573 ctx->verify_checksum();
574 // Select GPU device.
575 ctx->set_device();
576
577 for (int level = 0; level < ctx->nlevels; level++) {
578 if (ctx->number_of_tasks_per_level_[level]) {
579 ctx->grid_[level].associate(grids[level]->host_buffer,
580 grids[level]->device_buffer,
581 grids[level]->size / sizeof(double));
582 ctx->grid_[level].copy_to_gpu(ctx->level_streams[level]);
583 }
584 }
585
586 if ((forces != nullptr) || (virial != nullptr)) {
587 ctx->pab_block_.associate(pab_blocks->host_buffer,
588 pab_blocks->device_buffer,
589 pab_blocks->size / sizeof(double));
591 }
592
593 // we do not need to wait for this to start the computations since the matrix
594 // elements are computed after all coefficients are calculated.
595 ctx->hab_block_.associate(hab_blocks->host_buffer, hab_blocks->device_buffer,
596 hab_blocks->size / sizeof(double));
597 ctx->hab_block_.zero(ctx->main_stream);
598
599 ctx->calculate_forces = (forces != nullptr);
600 ctx->calculate_virial = (virial != nullptr);
601 ctx->compute_tau = compute_tau;
602
603 if (forces != nullptr) {
604 ctx->forces_.resize(3 * ctx->natoms);
605 ctx->forces_.zero(ctx->main_stream);
606 }
607
608 if (virial != nullptr) {
609 ctx->virial_.resize(9);
610 ctx->virial_.zero(ctx->main_stream);
611 }
612
613 int lp_diff = -1;
614
615 // we can actually treat the full task list without bothering about the level
616 // at that stage. This can be taken care of inside the kernel.
617 for (int level = 0; level < ctx->nlevels; level++) {
618 // launch kernel, but only after grid has arrived
619 ctx->integrate_one_grid_level(level, &lp_diff);
620 }
621
622 if (lp_diff > -1) {
623 // update counters while we wait for kernels to finish
624 for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
625 for (int lp = 0; lp < 20; lp++) {
626 const int count = ctx->stats[has_border_mask][lp];
627 if (ctx->grid_[0].is_orthogonal() && !has_border_mask) {
629 GRID_INTEGRATE_ORTHO, count);
630 } else {
633 }
634 }
635 }
636 }
637
638 // need to wait for all streams to finish
639 for (int level = 0; level < ctx->nlevels; level++) {
640 ctx->synchronize(ctx->level_streams[level]);
641 }
642 // computing the hab coefficients does not depend on the number of grids so we
643 // can run these calculations on the main stream
646
647 if (forces != NULL) {
648 ctx->forces_.copy_from_gpu(forces, ctx->main_stream);
649 }
650 if (virial != NULL) {
651 ctx->virial_.copy_from_gpu(virial, ctx->main_stream);
652 }
653
654 ctx->synchronize(ctx->main_stream);
655}
std::vector< int > first_task_per_level_
void calculate_all_coefficients(const enum grid_func func, int *lp_diff)
gpu_vector< double > virial_
gpu_vector< int > sorted_blocks_offset_dev
gpu_vector< double > coef_dev_
gpu_vector< int > cab_block_offset_dev
gpu_vector< double > pab_block_
void collocate_one_grid_level(const int level, const enum grid_func func, int *lp_diff)
Launches the Cuda kernel that collocates all tasks of one grid level.
gpu_vector< int > block_offsets_dev
void synchronize(offloadStream_t &stream)
gpu_vector< double > cab_dev_
gpu_vector< task_info > tasks_dev
std::vector< grid_info< double > > grid_
gpu_vector< double > forces_
gpu_vector< double > hab_block_
std::vector< offloadStream_t > level_streams
void initialize_basis_sets(const grid_basis_set **basis_sets, const int nkinds__)
std::vector< int > number_of_tasks_per_level_
gpu_vector< double * > sphi_dev
gpu_vector< int > num_tasks_per_block_dev_
void integrate_one_grid_level(const int level, int *lp_diff)
Launches the Cuda kernel that integrates all tasks of one grid level.
gpu_vector< int > task_sorted_by_blocks_dev
void zero(offloadStream_t &stream__)
void copy_associated_host_to_gpu(offloadStream_t &stream__)
void resize(const size_t new_size__)
void copy_from_gpu(T *data__, offloadStream_t &stream__)
void copy_gpu_to_associated_host(offloadStream_t &stream__)
void associate(void *host_ptr__, void *device_ptr__, const size_t size__)
void copy_to_gpu(const T *data__)
@ GRID_BACKEND_GPU
grid_func
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_gpu_collocate_task_list(const grid_gpu_task_list *ptr, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer **grids)
Collocate all tasks of in given list onto given grids.
void grid_gpu_create_task_list(const bool ortho, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int *block_offsets, const double *atom_positions, const int *atom_kinds, const grid_basis_set **basis_sets, const int *level_list, const int *iatom_list, const int *jatom_list, const int *iset_list, const int *jset_list, const int *ipgf_list, const int *jpgf_list, const int *border_mask_list, const int *block_num_list, const double *radius_list, const double *rab_list, const int *npts_global, const int *npts_local, const int *shift_local, const int *border_width, const double *dh, const double *dh_inv, grid_gpu_task_list **ptr)
Allocates a task list for the GPU backend. See grid_ctx.h for details.
void grid_gpu_free_task_list(grid_gpu_task_list *ptr)
destroy a context
void grid_gpu_integrate_task_list(const grid_gpu_task_list *ptr, const bool compute_tau, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer **grids, offload_buffer *hab_blocks, double *forces, double *virial)
Integrate all tasks of in given list onto given grids. See grid_ctx.h for details.
void grid_gpu_task_list
void grid_library_counter_add(const int lp, const enum grid_backend backend, const enum grid_library_kernel kernel, const int increment)
Adds given increment to counter specified by lp, backend, and kernel.
@ GRID_INTEGRATE_GENERAL
@ GRID_COLLOCATE_ORTHO
@ GRID_COLLOCATE_GENERAL
@ GRID_INTEGRATE_ORTHO
__host__ __device__ __inline__ int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
constexpr size_t align_up_elems(size_t n_elems, size_t elem_alignment)
Internal representation of a basis set.
Internal representation of a buffer.
double * device_buffer
double * host_buffer