#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <hip/hip_runtime_api.h>
#include <vector>

#include "../../offload/offload_library.h"
#include "../common/grid_basis_set.h"
#include "../common/grid_constants.h"
#include "../common/grid_library.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif
42extern "C" void grid_hip_create_task_list(
43 const bool ortho,
const int ntasks,
const int nlevels,
const int natoms,
44 const int nkinds,
const int nblocks,
const int *block_offsets,
45 const double *atom_positions,
const int *atom_kinds,
47 const int *iatom_list,
const int *jatom_list,
const int *iset_list,
48 const int *jset_list,
const int *ipgf_list,
const int *jpgf_list,
49 const int *border_mask_list,
const int *block_num_list,
50 const double *radius_list,
const double *rab_list,
const int *npts_global,
51 const int *
npts_local,
const int *shift_local,
const int *border_width,
52 const double *dh,
const double *dh_inv,
void *ptr) {
  if (*ctx_out == nullptr) {
    // ...
  }

  // One grid descriptor per multigrid level.
  ctx->grid_.resize(nlevels);

  std::vector<double> dh_max(ctx->nlevels, 0);

  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->grid_[level].resize(npts_global + 3 * level, npts_local + 3 * level,
                             shift_local + 3 * level,
                             border_width + 3 * level);
    ctx->grid_[level].is_distributed(false);
    ctx->grid_[level].set_lattice_vectors(&dh[9 * level], &dh_inv[9 * level]);
    ctx->grid_[level].check_orthorhombicity(ortho);
    // Track the largest lattice-vector component of each level.
    for (int i = 0; i < 9; i++)
      dh_max[level] = std::max(dh_max[level], std::abs(dh[9 * level + i]));
  }
  std::vector<rocm_backend::task_info> tasks_host(ntasks);

  for (int i = 0; i < ntasks; i++) {
    // Convert the Fortran 1-based indices to 0-based.
    const int level = level_list[i] - 1;
    const int iatom = iatom_list[i] - 1;
    const int jatom = jatom_list[i] - 1;
    const int iset = iset_list[i] - 1;
    const int jset = jset_list[i] - 1;
    const int ipgf = ipgf_list[i] - 1;
    const int jpgf = jpgf_list[i] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;
    // ... (ibasis and jbasis refer to the basis sets of kinds ikind and jkind)

    tasks_host[i].level = level;
    tasks_host[i].iatom = iatom;
    tasks_host[i].jatom = jatom;
    tasks_host[i].iset = iset;
    tasks_host[i].jset = jset;
    tasks_host[i].ipgf = ipgf;
    tasks_host[i].jpgf = jpgf;
    tasks_host[i].ikind = ikind;
    tasks_host[i].jkind = jkind;
    tasks_host[i].border_mask = border_mask_list[i];
    tasks_host[i].block_num = block_num_list[i] - 1;

    // Any task with a border mask implies a distributed grid on this level.
    if (border_mask_list[i]) {
      ctx->grid_[level].is_distributed(true);
    }

    tasks_host[i].radius = radius_list[i];
    tasks_host[i].rab[0] = rab_list[3 * i];
    tasks_host[i].rab[1] = rab_list[3 * i + 1];
    tasks_host[i].rab[2] = rab_list[3 * i + 2];
    tasks_host[i].zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
    tasks_host[i].zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
    tasks_host[i].zetp = tasks_host[i].zeta + tasks_host[i].zetb;
    const double f = tasks_host[i].zetb / tasks_host[i].zetp;
    tasks_host[i].rab2 = 0.0;
    for (int d = 0; d < 3; d++) {
      tasks_host[i].rab[d] = tasks_host[i].rab[d];
      tasks_host[i].rab2 += tasks_host[i].rab[d] * tasks_host[i].rab[d];
      tasks_host[i].ra[d] = atom_positions[3 * iatom + d];
      tasks_host[i].rb[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d];
      tasks_host[i].rp[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d] * f;
    }
    tasks_host[i].skip_task = (2 * tasks_host[i].radius < dh_max[level]);
    tasks_host[i].prefactor =
        exp(-tasks_host[i].zeta * f * tasks_host[i].rab2);
    tasks_host[i].off_diag_twice = (iatom == jatom) ? 1.0 : 2.0;
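// The pair quantities above follow the product rule for two Gaussians: with
// zetp = zeta + zetb and f = zetb / zetp, the product of the primitives on
// atoms a and b is a single Gaussian centered at rp = ra + f * rab, carrying
// the prefactor exp(-zeta * zetb / zetp * |rab|^2). A self-contained
// restatement of that relation (struct and function names are illustrative,
// not part of the backend API):
struct gaussian_pair_sketch {
  double zetp;      // combined exponent
  double prefactor; // exp(-zeta * zetb / zetp * |rab|^2)
  double rp[3];     // center of the product Gaussian
};

static inline gaussian_pair_sketch
gaussian_product_sketch(const double zeta, const double zetb,
                        const double ra[3], const double rab[3]) {
  gaussian_pair_sketch g;
  g.zetp = zeta + zetb;
  const double f = zetb / g.zetp;
  double rab2 = 0.0;
  for (int d = 0; d < 3; d++) {
    g.rp[d] = ra[d] + f * rab[d];
    rab2 += rab[d] * rab[d];
  }
  g.prefactor = exp(-zeta * f * rab2);
  return g;
}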
    // Angular momentum ranges of the two shells.
    const int la_max_basis = ibasis->lmax[iset];
    const int lb_max_basis = jbasis->lmax[jset];
    const int la_min_basis = ibasis->lmin[iset];
    const int lb_min_basis = jbasis->lmin[jset];

    tasks_host[i].la_max = la_max_basis;
    tasks_host[i].lb_max = lb_max_basis;
    tasks_host[i].la_min = la_min_basis;
    tasks_host[i].lb_min = lb_min_basis;

    tasks_host[i].first_coseta =
        (la_min_basis > 0) ? ncoset(la_min_basis - 1) : 0;
    tasks_host[i].first_cosetb =
        (lb_min_basis > 0) ? ncoset(lb_min_basis - 1) : 0;

    tasks_host[i].nsgfa = ibasis->nsgf;
    tasks_host[i].nsgfb = jbasis->nsgf;

    tasks_host[i].nsgf_seta = ibasis->nsgf_set[iset];
    tasks_host[i].nsgf_setb = jbasis->nsgf_set[jset];

    tasks_host[i].maxcoa = ibasis->maxco;
    tasks_host[i].maxcob = jbasis->maxco;

    // first_sgf is 1-based (Fortran), hence the -1.
    tasks_host[i].sgfa = ibasis->first_sgf[iset] - 1;
    tasks_host[i].sgfb = jbasis->first_sgf[jset] - 1;

    tasks_host[i].block_transposed = (iatom > jatom);
    tasks_host[i].subblock_offset =
        (tasks_host[i].block_transposed)
            ? (tasks_host[i].sgfa * tasks_host[i].nsgfb + tasks_host[i].sgfb)
            : (tasks_host[i].sgfb * tasks_host[i].nsgfa + tasks_host[i].sgfa);

    tasks_host[i].lp_max = tasks_host[i].lb_max + tasks_host[i].la_max + 6;

    // Running offsets of each task's coefficient and cab scratch areas.
    if (i == 0) {
      tasks_host[i].coef_offset = 0;
    } else {
      tasks_host[i].coef_offset =
          tasks_host[i - 1].coef_offset + /* ... */;
    }

    if (i == 0) {
      tasks_host[i].cab_offset = 0;
    } else {
      tasks_host[i].cab_offset =
          tasks_host[i - 1].cab_offset + /* ... */;
    }

    tasks_host[i].apply_border_mask = (tasks_host[i].border_mask != 0);
    // grid is the descriptor of this task's level.
    const auto &grid = ctx->grid_[level];

    // Orthorhombic grids without a border mask take the specialized cube path
    // (template flag true); everything else uses the general path.
    if (grid.is_orthorhombic() && (tasks_host[i].border_mask == 0)) {
      tasks_host[i].discrete_radius =
          rocm_backend::compute_cube_properties<double, double3, true>(
              tasks_host[i].radius, grid.dh(), grid.dh_inv(),
              (double3 *)tasks_host[i].rp, /* ... */
              &tasks_host[i].cube_center, &tasks_host[i].lb_cube,
              &tasks_host[i].cube_size);
    } else {
      tasks_host[i].discrete_radius =
          rocm_backend::compute_cube_properties<double, double3, false>(
              tasks_host[i].radius, grid.dh(), grid.dh_inv(),
              (double3 *)tasks_host[i].rp, /* ... */
              &tasks_host[i].cube_center, &tasks_host[i].lb_cube,
              &tasks_host[i].cube_size);
    }
  }
  ctx->tasks_dev.resize(tasks_host.size());
  // ...

  // Group the tasks by the matrix block they touch.
  std::vector<std::vector<int>> task_sorted_by_block(nblocks);
  std::vector<int> sorted_blocks(ntasks, 0);
  std::vector<int> num_tasks_per_block(nblocks, 0);
  std::vector<int> sorted_blocks_offset(nblocks, 0);

  for (auto &block : task_sorted_by_block) {
    // ...
  }

  for (int i = 0; i < ntasks; i++) {
    task_sorted_by_block[block_num_list[i] - 1].push_back(i);
    num_tasks_per_block[block_num_list[i] - 1]++;
  }

  int offset = 0;
  for (int i = 0; i < (int)task_sorted_by_block.size(); i++) {
    auto &task_list = task_sorted_by_block[i];
    if (!task_list.empty()) {
      memcpy(&sorted_blocks[offset], &task_list[0],
             sizeof(int) * task_list.size());
      sorted_blocks_offset[i] = offset;
      offset += task_list.size();
    }
  }

  // Consistency check: every task listed under block i must point back to it.
  for (int i = 0; i < (int)sorted_blocks_offset.size(); i++) {
    int num_tasks = 0;
    if (i == (int)sorted_blocks_offset.size() - 1)
      num_tasks = ntasks - sorted_blocks_offset[i];
    else
      num_tasks = sorted_blocks_offset[i + 1] - sorted_blocks_offset[i];
    assert(num_tasks == num_tasks_per_block[i]);
    for (int tk = 0; tk < num_tasks; tk++)
      assert(tasks_host[sorted_blocks[tk + sorted_blocks_offset[i]]]
                 .block_num == i);
  }

  for (auto &block : task_sorted_by_block)
    block.clear();
  task_sorted_by_block.clear();
  sorted_blocks.clear();
  sorted_blocks_offset.clear();
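// The vectors built above amount to a CSR-style layout: sorted_blocks
// concatenates the task indices of every block and sorted_blocks_offset[b]
// points at the first task of block b. The same layout can be built without
// the intermediate vector-of-vectors; a minimal, self-contained sketch, where
// task_block[i] is the 0-based block index of task i (the helper name is
// illustrative, not part of the backend):
static void bucket_tasks_by_block_sketch(const std::vector<int> &task_block,
                                         const int nblocks,
                                         std::vector<int> &flat,
                                         std::vector<int> &offsets) {
  std::vector<int> count(nblocks, 0);
  for (const int b : task_block)
    count[b]++; // number of tasks touching each block
  offsets.assign(nblocks, 0);
  for (int b = 1; b < nblocks; b++)
    offsets[b] = offsets[b - 1] + count[b - 1]; // prefix sum
  flat.assign(task_block.size(), 0);
  std::vector<int> cursor(offsets);
  for (int i = 0; i < (int)task_block.size(); i++)
    flat[cursor[task_block[i]]++] = i; // stable fill of each block's slice
}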
  // Histogram of tasks per (border mask, total angular momentum) for the
  // library statistics; lp is capped at 19.
  memset(ctx->stats, 0, 2 * 20 * sizeof(int));
  for (int itask = 0; itask < ntasks; itask++) {
    const int iatom = iatom_list[itask] - 1;
    const int jatom = jatom_list[itask] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int la_max = basis_sets[ikind]->lmax[iset];
    const int lb_max = basis_sets[jkind]->lmax[jset];
    const int lp = std::min(la_max + lb_max, 19);
    const bool has_border_mask = (border_mask_list[itask] != 0);
    ctx->stats[has_border_mask][lp]++;
  }
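// Illustrative only: the histogram above is what later feeds the global
// library counters (see the counter loops in the collocate and integrate
// entry points below). A minimal sketch of that flush for the collocate case,
// assuming the enum values GRID_BACKEND_HIP, GRID_COLLOCATE_ORTHO and
// GRID_COLLOCATE_GENERAL from grid_constants.h / grid_library.h:
static void flush_collocate_stats_sketch(const int stats[2][20],
                                         const bool orthorhombic) {
  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
    for (int lp = 0; lp < 20; lp++) {
      const enum grid_library_kernel kernel =
          (orthorhombic && !has_border_mask) ? GRID_COLLOCATE_ORTHO
                                             : GRID_COLLOCATE_GENERAL;
      grid_library_counter_add(lp, GRID_BACKEND_HIP, kernel,
                               stats[has_border_mask][lp]);
    }
  }
}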
370extern "C" void grid_hip_free_task_list(
void *ptr) {
384extern "C" void grid_hip_collocate_task_list(
const void *ptr,
395 assert(ctx->
nlevels == nlevels);
399 pab_blocks->
size /
sizeof(
double));
410 for (
int level = 0; level < ctx->
nlevels; level++) {
411 ctx->
grid_[level].associate(grids[level]->host_buffer,
412 grids[level]->device_buffer,
413 grids[level]->size /
sizeof(
double));
421 for (
int level = 0; level < ctx->
nlevels; level++) {
426 for (
int level = 0; level < ctx->
nlevels; level++) {
437 for (
int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
438 for (
int lp = 0; lp < 20; lp++) {
439 const int count = ctx->
stats[has_border_mask][lp];
440 if (ctx->
grid_[0].is_orthorhombic() && !has_border_mask) {
452 for (
int level = 0; level < ctx->
nlevels; level++) {
461extern "C" void grid_hip_integrate_task_list(
462 const void *ptr,
const bool compute_tau,
const int nlevels,
470 assert(ctx->
nlevels == nlevels);
476 for (
int level = 0; level < ctx->
nlevels; level++) {
478 ctx->
grid_[level].associate(grids[level]->host_buffer,
479 grids[level]->device_buffer,
480 grids[level]->size /
sizeof(
double));
485 if ((forces !=
nullptr) || (virial !=
nullptr)) {
488 pab_blocks->
size /
sizeof(
double));
495 hab_blocks->
size /
sizeof(
double));
501 if (forces !=
nullptr) {
506 if (virial !=
nullptr) {
515 for (
int level = 0; level < ctx->
nlevels; level++) {
522 for (
int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
523 for (
int lp = 0; lp < 20; lp++) {
524 const int count = ctx->
stats[has_border_mask][lp];
525 if (ctx->
grid_[0].is_orthorhombic() && !has_border_mask) {
537 for (
int level = 0; level < ctx->
nlevels; level++) {
546 if (forces != NULL) {
549 if (virial != NULL) {