21#include "../../offload/offload_library.h"
23#include "../common/grid_basis_set.h"
24#include "../common/grid_constants.h"
25#include "../common/grid_library.h"
34#error "OpenMP should not be used in .cu files to accommodate HIP."
40 return (n_elems + elem_alignment - 1) & ~(elem_alignment - 1);
44context_info::set_kernel_parameters(
const int level,
45 const smem_parameters &smem_params) {
47 params.cab_size_ = smem_params.cab_size();
48 params.first_task = 0;
50 params.la_min_diff = smem_params.ldiffs().la_min_diff;
51 params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
53 params.la_max_diff = smem_params.ldiffs().la_max_diff;
54 params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
60 params.la_min_diff = smem_params.ldiffs().la_min_diff;
61 params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
62 params.la_max_diff = smem_params.ldiffs().la_max_diff;
63 params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
68 params.ptr_dev[1] =
grid_[level].data();
69 memcpy(params.dh_,
grid_[level].dh(), 9 *
sizeof(
double));
70 memcpy(params.dh_inv_,
grid_[level].dh_inv(), 9 *
sizeof(
double));
73 params.grid_full_size_ =
grid_[level].full_size();
74 params.grid_local_size_ =
grid_[level].local_size();
75 params.grid_lower_corner_ =
grid_[level].lower_corner();
76 params.grid_border_width_ =
grid_[level].border_width();
95 const bool ortho,
const int ntasks,
const int nlevels,
const int natoms,
96 const int nkinds,
const int nblocks,
const int *block_offsets,
97 const double *atom_positions,
const int *atom_kinds,
99 const int *iatom_list,
const int *jatom_list,
const int *iset_list,
100 const int *jset_list,
const int *ipgf_list,
const int *jpgf_list,
101 const int *border_mask_list,
const int *block_num_list,
102 const double *radius_list,
const double *rab_list,
const int *npts_global,
103 const int *
npts_local,
const int *shift_local,
const int *border_width,
109 if ((nblocks == 0) || (ntasks == 0)) {
116 if (*ctx_out ==
nullptr) {
130 ctx->
grid_.resize(nlevels);
133 std::vector<double> dh_max(ctx->
nlevels, 0);
135 for (
int level = 0; level < ctx->
nlevels; level++) {
136 ctx->
grid_[level].resize(npts_global + 3 * level,
npts_local + 3 * level,
137 shift_local + 3 * level, border_width + 3 * level);
138 ctx->
grid_[level].is_distributed(
false);
139 ctx->
grid_[level].set_lattice_vectors(&dh[9 * level], &dh_inv[9 * level]);
140 ctx->
grid_[level].check_orthogonality(ortho);
141 for (
int i = 0;
i < 9;
i++)
142 dh_max[level] = std::max(dh_max[level], std::abs(dh[9 * level +
i]));
155 std::vector<rocm_backend::task_info> tasks_host(ntasks);
157 size_t coef_size = 0;
160 for (
int i = 0;
i < ntasks;
i++) {
161 const int level = level_list[
i] - 1;
166 const int iatom = iatom_list[
i] - 1;
167 const int jatom = jatom_list[
i] - 1;
168 const int iset = iset_list[
i] - 1;
169 const int jset = jset_list[
i] - 1;
170 const int ipgf = ipgf_list[
i] - 1;
171 const int jpgf = jpgf_list[
i] - 1;
172 const int ikind = atom_kinds[iatom] - 1;
173 const int jkind = atom_kinds[jatom] - 1;
180 tasks_host[
i].level = level;
181 tasks_host[
i].iatom = iatom;
182 tasks_host[
i].jatom = jatom;
183 tasks_host[
i].iset = iset;
184 tasks_host[
i].jset = jset;
185 tasks_host[
i].ipgf = ipgf;
186 tasks_host[
i].jpgf = jpgf;
187 tasks_host[
i].ikind = ikind;
188 tasks_host[
i].jkind = jkind;
189 tasks_host[
i].border_mask = border_mask_list[
i];
190 tasks_host[
i].block_num = block_num_list[
i] - 1;
192 if (border_mask_list[
i]) {
193 ctx->
grid_[level].is_distributed(
true);
196 tasks_host[
i].radius = radius_list[
i];
197 tasks_host[
i].rab[0] = rab_list[3 *
i];
198 tasks_host[
i].rab[1] = rab_list[3 *
i + 1];
199 tasks_host[
i].rab[2] = rab_list[3 *
i + 2];
200 tasks_host[
i].zeta = ibasis->
zet[iset * ibasis->
maxpgf + ipgf];
201 tasks_host[
i].zetb = jbasis->
zet[jset * jbasis->
maxpgf + jpgf];
202 tasks_host[
i].zetp = tasks_host[
i].zeta + tasks_host[
i].zetb;
203 const double f = tasks_host[
i].zetb / tasks_host[
i].zetp;
204 tasks_host[
i].rab2 = 0.0;
205 for (
int d = 0; d < 3; d++) {
206 tasks_host[
i].rab[d] = tasks_host[
i].rab[d];
207 tasks_host[
i].rab2 += tasks_host[
i].rab[d] * tasks_host[
i].rab[d];
208 tasks_host[
i].ra[d] = atom_positions[3 * iatom + d];
209 tasks_host[
i].rb[d] = tasks_host[
i].ra[d] + tasks_host[
i].rab[d];
210 tasks_host[
i].rp[d] = tasks_host[
i].ra[d] + tasks_host[
i].rab[d] * f;
213 tasks_host[
i].skip_task = (2 * tasks_host[
i].radius < dh_max[level]);
214 tasks_host[
i].prefactor = exp(-tasks_host[
i].zeta * f * tasks_host[
i].rab2);
216 tasks_host[
i].off_diag_twice = (iatom == jatom) ? 1.0 : 2.0;
218 const int la_max_basis = ibasis->
lmax[iset];
219 const int lb_max_basis = jbasis->
lmax[jset];
220 const int la_min_basis = ibasis->
lmin[iset];
221 const int lb_min_basis = jbasis->
lmin[jset];
224 tasks_host[
i].la_max = la_max_basis;
225 tasks_host[
i].lb_max = lb_max_basis;
226 tasks_host[
i].la_min = la_min_basis;
227 tasks_host[
i].lb_min = lb_min_basis;
229 lmax_ = std::max(lmax_, tasks_host[
i].la_max);
230 lmax_ = std::max(lmax_, tasks_host[
i].lb_max);
233 tasks_host[
i].first_coseta =
235 tasks_host[
i].first_cosetb =
243 tasks_host[
i].max_cab_size =
249 tasks_host[
i].nsgfa = ibasis->
nsgf;
250 tasks_host[
i].nsgfb = jbasis->
nsgf;
253 tasks_host[
i].nsgf_seta = ibasis->
nsgf_set[iset];
254 tasks_host[
i].nsgf_setb = jbasis->
nsgf_set[jset];
257 tasks_host[
i].maxcoa = ibasis->
maxco;
258 tasks_host[
i].maxcob = jbasis->
maxco;
260 tasks_host[
i].sgfa = ibasis->
first_sgf[iset] - 1;
261 tasks_host[
i].sgfb = jbasis->
first_sgf[jset] - 1;
263 tasks_host[
i].block_transposed = (iatom > jatom);
264 tasks_host[
i].subblock_offset =
265 (tasks_host[
i].block_transposed)
266 ? (tasks_host[
i].sgfa * tasks_host[
i].nsgfb + tasks_host[
i].sgfb)
267 : (tasks_host[
i].sgfb * tasks_host[
i].nsgfa + tasks_host[
i].sgfa);
278 tasks_host[
i].lp_max = tasks_host[
i].lb_max + tasks_host[
i].la_max + 6;
280 tasks_host[
i].coef_offset = 0;
282 tasks_host[
i].coef_offset =
283 tasks_host[
i - 1].coef_offset +
295 tasks_host[
i].apply_border_mask = (tasks_host[
i].border_mask != 0);
297 if (
grid.is_orthogonal() && (tasks_host[
i].border_mask == 0)) {
298 tasks_host[
i].discrete_radius =
299 rocm_backend::compute_cube_properties<double, double3, true>(
300 tasks_host[
i].radius,
grid.dh(),
grid.dh_inv(),
301 (double3 *)tasks_host[
i].rp,
304 &tasks_host[
i].cube_center,
305 &tasks_host[
i].lb_cube,
306 &tasks_host[
i].cube_size);
308 tasks_host[
i].discrete_radius =
309 rocm_backend::compute_cube_properties<double, double3, false>(
310 tasks_host[
i].radius,
grid.dh(),
grid.dh_inv(),
311 (double3 *)tasks_host[
i].rp,
314 &tasks_host[
i].cube_center,
315 &tasks_host[
i].lb_cube,
316 &tasks_host[
i].cube_size);
330 ctx->
tasks_dev.resize(tasks_host.size());
334 std::vector<std::vector<int>> task_sorted_by_block(nblocks);
335 std::vector<int> sorted_blocks(ntasks, 0);
336 std::vector<int> num_tasks_per_block(nblocks, 0);
337 std::vector<int> sorted_blocks_offset(nblocks, 0);
338 for (
auto &block : task_sorted_by_block)
341 for (
int i = 0;
i < ntasks;
i++) {
342 task_sorted_by_block[block_num_list[
i] - 1].push_back(
i);
343 num_tasks_per_block[block_num_list[
i] - 1]++;
348 for (
int i = 0;
i < (int)task_sorted_by_block.size();
i++) {
349 auto &task_list = task_sorted_by_block[
i];
353 if (!task_list.empty()) {
354 memcpy(&sorted_blocks[offset], &task_list[0],
355 sizeof(
int) * task_list.size());
357 sorted_blocks_offset[
i] = offset;
358 offset += task_list.size();
369 std::vector<int> cab_size_offset_tmp(nblocks, 0);
370 std::vector<int> cab_size_tmp(nblocks);
372 for (
int i = 0;
i < (int)sorted_blocks_offset.size();
i++) {
374 if (
i == (
int)sorted_blocks_offset.size() - 1)
375 num_tasks = ntasks - sorted_blocks_offset[
i];
377 num_tasks = sorted_blocks_offset[
i + 1] - sorted_blocks_offset[
i];
380 assert(num_tasks == num_tasks_per_block[
i]);
382 int tmp_cab_size = 0;
383 for (
int tk = 0; tk < num_tasks; tk++) {
384 auto &task = tasks_host[sorted_blocks[tk + sorted_blocks_offset[
i]]];
387 tasks_host[sorted_blocks[tk + sorted_blocks_offset[
i]]].block_num ==
391 tmp_cab_size = std::max(task.max_cab_size, tmp_cab_size);
395 cab_size_tmp[
i] = tmp_cab_size;
398 cab_size_offset_tmp[0] = 0;
400 for (
int i = 1;
i < cab_size_tmp.size();
i++) {
401 cab_size_offset_tmp[
i] = cab_size_offset_tmp[
i - 1] + cab_size_tmp[
i - 1];
404 for (
auto &block : task_sorted_by_block)
406 task_sorted_by_block.clear();
408 sorted_blocks.clear();
409 sorted_blocks_offset.clear();
418 size_t cab_size_total = 0;
420 for (
auto &elem : cab_size_tmp)
421 cab_size_total += elem;
423 cab_size_offset_tmp.clear();
424 cab_size_tmp.clear();
437 memset(ctx->
stats, 0, 2 * 20 *
sizeof(
int));
438 for (
int itask = 0; itask < ntasks; itask++) {
439 const int iatom = iatom_list[itask] - 1;
440 const int jatom = jatom_list[itask] - 1;
441 const int ikind = atom_kinds[iatom] - 1;
442 const int jkind = atom_kinds[jatom] - 1;
443 const int iset = iset_list[itask] - 1;
444 const int jset = jset_list[itask] - 1;
445 const int la_max = basis_sets[ikind]->
lmax[iset];
446 const int lb_max = basis_sets[jkind]->
lmax[jset];
447 const int lp = std::min(la_max + lb_max, 19);
448 const bool has_border_mask = (border_mask_list[itask] != 0);
449 ctx->
stats[has_border_mask][lp]++;
495 assert(ctx->
nlevels == nlevels);
499 pab_blocks->
size /
sizeof(
double));
513 for (
int level = 0; level < ctx->
nlevels; level++) {
514 ctx->
grid_[level].associate(grids[level]->host_buffer,
515 grids[level]->device_buffer,
516 grids[level]->size /
sizeof(
double));
522 for (
int level = 0; level < ctx->
nlevels; level++) {
527 for (
int level = 0; level < ctx->
nlevels; level++) {
538 for (
int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
539 for (
int lp = 0; lp < 20; lp++) {
540 const int count = ctx->
stats[has_border_mask][lp];
541 if (ctx->
grid_[0].is_orthogonal() && !has_border_mask) {
553 for (
int level = 0; level < ctx->
nlevels; level++) {
571 assert(ctx->
nlevels == nlevels);
577 for (
int level = 0; level < ctx->
nlevels; level++) {
579 ctx->
grid_[level].associate(grids[level]->host_buffer,
580 grids[level]->device_buffer,
581 grids[level]->size /
sizeof(
double));
586 if ((forces !=
nullptr) || (virial !=
nullptr)) {
589 pab_blocks->
size /
sizeof(
double));
596 hab_blocks->
size /
sizeof(
double));
603 if (forces !=
nullptr) {
608 if (virial !=
nullptr) {
617 for (
int level = 0; level < ctx->
nlevels; level++) {
624 for (
int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
625 for (
int lp = 0; lp < 20; lp++) {
626 const int count = ctx->
stats[has_border_mask][lp];
627 if (ctx->
grid_[0].is_orthogonal() && !has_border_mask) {
639 for (
int level = 0; level < ctx->
nlevels; level++) {
647 if (forces != NULL) {
650 if (virial != NULL) {
std::vector< int > first_task_per_level_
void calculate_all_coefficients(const enum grid_func func, int *lp_diff)
gpu_vector< double > virial_
gpu_vector< int > sorted_blocks_offset_dev
gpu_vector< double > coef_dev_
gpu_vector< int > cab_block_offset_dev
gpu_vector< double > pab_block_
void collocate_one_grid_level(const int level, const enum grid_func func, int *lp_diff)
Launches the Cuda kernel that collocates all tasks of one grid level.
gpu_vector< int > block_offsets_dev
offloadStream_t main_stream
void synchronize(offloadStream_t &stream)
gpu_vector< double > cab_dev_
gpu_vector< task_info > tasks_dev
std::vector< grid_info< double > > grid_
gpu_vector< double > forces_
void compute_hab_coefficients()
gpu_vector< double > hab_block_
std::vector< offloadStream_t > level_streams
void initialize_basis_sets(const grid_basis_set **basis_sets, const int nkinds__)
std::vector< int > number_of_tasks_per_level_
gpu_vector< double * > sphi_dev
gpu_vector< int > num_tasks_per_block_dev_
void integrate_one_grid_level(const int level, int *lp_diff)
Launches the Cuda kernel that integrates all tasks of one grid level.
gpu_vector< int > task_sorted_by_blocks_dev
void zero(offloadStream_t &stream__)
void copy_associated_host_to_gpu(offloadStream_t &stream__)
void resize(const size_t new_size__)
void copy_from_gpu(T *data__, offloadStream_t &stream__)
void copy_gpu_to_associated_host(offloadStream_t &stream__)
void associate(void *host_ptr__, void *device_ptr__, const size_t size__)
void copy_to_gpu(const T *data__)
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int const int const int const int const int const double const int const int const int npts_local[3]
void grid_gpu_collocate_task_list(const grid_gpu_task_list *ptr, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer **grids)
Collocate all tasks of in given list onto given grids.
void grid_gpu_create_task_list(const bool ortho, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int *block_offsets, const double *atom_positions, const int *atom_kinds, const grid_basis_set **basis_sets, const int *level_list, const int *iatom_list, const int *jatom_list, const int *iset_list, const int *jset_list, const int *ipgf_list, const int *jpgf_list, const int *border_mask_list, const int *block_num_list, const double *radius_list, const double *rab_list, const int *npts_global, const int *npts_local, const int *shift_local, const int *border_width, const double *dh, const double *dh_inv, grid_gpu_task_list **ptr)
Allocates a task list for the GPU backend. See grid_ctx.h for details.
void grid_gpu_free_task_list(grid_gpu_task_list *ptr)
destroy a context
void grid_gpu_integrate_task_list(const grid_gpu_task_list *ptr, const bool compute_tau, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer **grids, offload_buffer *hab_blocks, double *forces, double *virial)
Integrate all tasks of in given list onto given grids. See grid_ctx.h for details.
void grid_library_counter_add(const int lp, const enum grid_backend backend, const enum grid_library_kernel kernel, const int increment)
Adds given increment to counter specified by lp, backend, and kernel.
__host__ __device__ __inline__ int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
constexpr size_t align_up_elems(size_t n_elems, size_t elem_alignment)
Internal representation of a basis set.
Internal representation of a buffer.