dd/ded/grid__gpu__context_8cu_source.html

/*----------------------------------------------------------------------------*/

/*  CP2K: A general program to perform molecular dynamics simulations         */

/*  Copyright 2000-2026 CP2K developers group <https://cp2k.org>              */

/*                                                                            */

/*  SPDX-License-Identifier: BSD-3-Clause                                     */

/*----------------------------------------------------------------------------*/


/*

 * Authors :

 - Mathieu Taillefumier (ETH Zurich / CSCS)

 - Advanced Micro Devices, Inc.

 - Ole Schuett

*/


#include <cassert>

#include <cstdio>

#include <cstdlib>

#include <cstring>

#include <iostream>


#include "../../offload/offload_library.h"

extern "C" {

#include "../common/grid_basis_set.h"

#include "../common/grid_constants.h"

#include "../common/grid_library.h"

}


#include "grid_gpu_context.h"

#include "grid_gpu_internal_header.h"


#include "grid_gpu_task_list.h"


#if defined(_OMP_H)

#error "OpenMP should not be used in .cu files to accommodate HIP."

#endif


namespace rocm_backend {


// Rounds n_elems up to a multiple of elem_alignment.
// The computation works on the low bits selected by (elem_alignment - 1), so
// the result is a true multiple only when elem_alignment is a power of two.
// Implementation: bump the count by (alignment - 1), then subtract whatever
// remainder the bumped value has with respect to the alignment mask.
constexpr size_t align_up_elems(size_t n_elems, size_t elem_alignment) {
  return (n_elems + elem_alignment - 1) -
         ((n_elems + elem_alignment - 1) & (elem_alignment - 1));
}


/*******************************************************************************
 * \brief Assembles the parameter bundle that is passed to the GPU kernels.
 *
 * \param level        Grid level whose buffer, lattice vectors, sizes and
 *                     first task are stored in the result. For a negative
 *                     level none of the grid related fields are filled in.
 * \param smem_params  Source of the cab size and of the la/lb min/max
 *                     angular momentum differences.
 * \return             The populated kernel_params object.
 *
 * Layout of ptr_dev as filled in here:
 *   [0] pab blocks, [1] grid of the level (only if level >= 0), [2] coef,
 *   [3] hab blocks, [4] forces, [5] virial, [6] cab workspace.
 ******************************************************************************/
kernel_params
context_info::set_kernel_parameters(const int level,
                                    const smem_parameters &smem_params) {
  // Value-initialize: the grid related fields are only written for
  // level >= 0 and would otherwise be left without a defined value.
  kernel_params params{};

  params.cab_size_ = smem_params.cab_size();
  params.first_task = 0;

  // Angular momentum offsets of the requested operation.
  const auto &ldiff_range = smem_params.ldiffs();
  params.la_min_diff = ldiff_range.la_min_diff;
  params.lb_min_diff = ldiff_range.lb_min_diff;
  params.la_max_diff = ldiff_range.la_max_diff;
  params.lb_max_diff = ldiff_range.lb_max_diff;

  // Task list and the block related lookup tables living on the device.
  params.tasks = this->tasks_dev.data();
  params.task_sorted_by_blocks_dev = this->task_sorted_by_blocks_dev.data();
  params.sorted_blocks_offset_dev = this->sorted_blocks_offset_dev.data();
  params.num_tasks_per_block_dev = this->num_tasks_per_block_dev_.data();
  params.block_offsets = this->block_offsets_dev.data();
  params.cab_block_offset_dev = this->cab_block_offset_dev.data();
  params.sphi_dev = this->sphi_dev.data();

  // Data buffers.
  params.ptr_dev[0] = this->pab_block_.data();
  params.ptr_dev[2] = this->coef_dev_.data();
  params.ptr_dev[3] = this->hab_block_.data();
  params.ptr_dev[4] = this->forces_.data();
  params.ptr_dev[5] = this->virial_.data();
  params.ptr_dev[6] = this->cab_dev_.data();

  // Everything that depends on one specific grid level.
  if (level >= 0) {
    params.ptr_dev[1] = grid_[level].data();
    memcpy(params.dh_, grid_[level].dh(), 9 * sizeof(double));
    memcpy(params.dh_inv_, grid_[level].dh_inv(), 9 * sizeof(double));
    params.first_task = first_task_per_level_[level];

    params.grid_full_size_ = grid_[level].full_size();
    params.grid_local_size_ = grid_[level].local_size();
    params.grid_lower_corner_ = grid_[level].lower_corner();
    params.grid_border_width_ = grid_[level].border_width();
  }

  return params;
}

}; // namespace rocm_backend


/*******************************************************************************

 * \brief Allocates a task list for the GPU backend.

 *        See grid_ctx.h for details.

 ******************************************************************************/


extern "C" void grid_gpu_create_task_list(

    const bool ortho, const int ntasks, const int nlevels, const int natoms,

    const int nkinds, const int nblocks, const int *block_offsets,

    const double *atom_positions, const int *atom_kinds,

    const grid_basis_set **basis_sets, const int *level_list,

    const int *iatom_list, const int *jatom_list, const int *iset_list,

    const int *jset_list, const int *ipgf_list, const int *jpgf_list,

    const int *border_mask_list, const int *block_num_list,

    const double *radius_list, const double *rab_list, const int *npts_global,

    const int *npts_local, const int *shift_local, const int *border_width,

    const double *dh, const double *dh_inv, grid_gpu_task_list **ptr) {


  rocm_backend::context_info **ctx_out = (rocm_backend::context_info **)ptr;


  // Yes it makes no sense

  if ((nblocks == 0) || (ntasks == 0)) {

    *ptr = nullptr;

    return;

  }


  // Select GPU device.

  rocm_backend::context_info *ctx = nullptr;

  if (*ctx_out == nullptr) {

    ctx = new rocm_backend::context_info(offload_get_chosen_device());

    *ctx_out = ctx;

  } else {

    ctx = *ctx_out;

    // verify that the object is the right one

    ctx->verify_checksum();

  }


  ctx->ntasks = ntasks;

  ctx->nlevels = nlevels;

  ctx->natoms = natoms;

  ctx->nblocks = nblocks;

  ctx->nkinds = nkinds;

  ctx->grid_.resize(nlevels);

  ctx->set_device();


  std::vector<double> dh_max(ctx->nlevels, 0);


  for (int level = 0; level < ctx->nlevels; level++) {

    ctx->grid_[level].resize(npts_global + 3 * level, npts_local + 3 * level,

                             shift_local + 3 * level, border_width + 3 * level);

    ctx->grid_[level].is_distributed(false);

    ctx->grid_[level].set_lattice_vectors(&dh[9 * level], &dh_inv[9 * level]);

    ctx->grid_[level].check_orthogonality(ortho);

    for (int i = 0; i < 9; i++)

      dh_max[level] = std::max(dh_max[level], std::abs(dh[9 * level + i]));

  }


  ctx->block_offsets_dev.resize(nblocks);

  ctx->block_offsets_dev.copy_to_gpu(block_offsets);

  ctx->initialize_basis_sets(basis_sets, nkinds);


  ctx->first_task_per_level_.resize(nlevels, 0);

  ctx->number_of_tasks_per_level_.resize(nlevels, 0);


  memset(ctx->first_task_per_level_.data(), 0, sizeof(int) * nlevels);

  memset(ctx->number_of_tasks_per_level_.data(), 0, sizeof(int) * nlevels);


  std::vector<rocm_backend::task_info> tasks_host(ntasks);


  size_t coef_size = 0;

  int lmax_ = 0;


  for (int i = 0; i < ntasks; i++) {

    const int level = level_list[i] - 1;


    // count the number of task per level

    ctx->number_of_tasks_per_level_[level]++;


    const int iatom = iatom_list[i] - 1;

    const int jatom = jatom_list[i] - 1;

    const int iset = iset_list[i] - 1;

    const int jset = jset_list[i] - 1;

    const int ipgf = ipgf_list[i] - 1;

    const int jpgf = jpgf_list[i] - 1;

    const int ikind = atom_kinds[iatom] - 1;

    const int jkind = atom_kinds[jatom] - 1;


    /* set parameters related to atom type orbital etc....  */

    const grid_basis_set *ibasis = basis_sets[ikind];

    const grid_basis_set *jbasis = basis_sets[jkind];


    tasks_host[i] = {};

    tasks_host[i].level = level;

    tasks_host[i].iatom = iatom;

    tasks_host[i].jatom = jatom;

    tasks_host[i].iset = iset;

    tasks_host[i].jset = jset;

    tasks_host[i].ipgf = ipgf;

    tasks_host[i].jpgf = jpgf;

    tasks_host[i].ikind = ikind;

    tasks_host[i].jkind = jkind;

    tasks_host[i].border_mask = border_mask_list[i];

    tasks_host[i].block_num = block_num_list[i] - 1;


    if (border_mask_list[i]) {

      ctx->grid_[level].is_distributed(true);

    }

    /* parameters for the gaussian  */

    tasks_host[i].radius = radius_list[i];

    tasks_host[i].rab[0] = rab_list[3 * i];

    tasks_host[i].rab[1] = rab_list[3 * i + 1];

    tasks_host[i].rab[2] = rab_list[3 * i + 2];

    tasks_host[i].zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];

    tasks_host[i].zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];

    tasks_host[i].zetp = tasks_host[i].zeta + tasks_host[i].zetb;

    const double f = tasks_host[i].zetb / tasks_host[i].zetp;

    tasks_host[i].rab2 = 0.0;

    for (int d = 0; d < 3; d++) {

      tasks_host[i].rab[d] = tasks_host[i].rab[d];

      tasks_host[i].rab2 += tasks_host[i].rab[d] * tasks_host[i].rab[d];

      tasks_host[i].ra[d] = atom_positions[3 * iatom + d];

      tasks_host[i].rb[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d];

      tasks_host[i].rp[d] = tasks_host[i].ra[d] + tasks_host[i].rab[d] * f;

    }


    tasks_host[i].skip_task = (2 * tasks_host[i].radius < dh_max[level]);

    tasks_host[i].prefactor = exp(-tasks_host[i].zeta * f * tasks_host[i].rab2);


    tasks_host[i].off_diag_twice = (iatom == jatom) ? 1.0 : 2.0;

    // angular momentum range of basis set

    const int la_max_basis = ibasis->lmax[iset];

    const int lb_max_basis = jbasis->lmax[jset];

    const int la_min_basis = ibasis->lmin[iset];

    const int lb_min_basis = jbasis->lmin[jset];


    // angular momentum range for the actual collocate/integrate opteration.

    tasks_host[i].la_max = la_max_basis;

    tasks_host[i].lb_max = lb_max_basis;

    tasks_host[i].la_min = la_min_basis;

    tasks_host[i].lb_min = lb_min_basis;


    lmax_ = std::max(lmax_, tasks_host[i].la_max);

    lmax_ = std::max(lmax_, tasks_host[i].lb_max);


    // start of decontracted set, ie. pab and hab

    tasks_host[i].first_coseta =

        (la_min_basis > 0) ? rocm_backend::ncoset(la_min_basis - 1) : 0;

    tasks_host[i].first_cosetb =

        (lb_min_basis > 0) ? rocm_backend::ncoset(lb_min_basis - 1) : 0;


    // size of decontracted set, ie. pab and hab

    tasks_host[i].ncoseta = rocm_backend::ncoset(la_max_basis);

    tasks_host[i].ncosetb = rocm_backend::ncoset(lb_max_basis);

    // it should lmax + 3 because calculating forces+stress+compute_tau requires

    // l + 3

    tasks_host[i].max_cab_size =

        rocm_backend::align_up_elems(rocm_backend::ncoset(la_max_basis + 3) *

                                         rocm_backend::ncoset(lb_max_basis + 3),

                                     4);


    // size of entire spherical basis

    tasks_host[i].nsgfa = ibasis->nsgf;

    tasks_host[i].nsgfb = jbasis->nsgf;


    // size of spherical set

    tasks_host[i].nsgf_seta = ibasis->nsgf_set[iset];

    tasks_host[i].nsgf_setb = jbasis->nsgf_set[jset];


    // strides of the sphi transformation matrices

    tasks_host[i].maxcoa = ibasis->maxco;

    tasks_host[i].maxcob = jbasis->maxco;


    tasks_host[i].sgfa = ibasis->first_sgf[iset] - 1;

    tasks_host[i].sgfb = jbasis->first_sgf[jset] - 1;


    tasks_host[i].block_transposed = (iatom > jatom);

    tasks_host[i].subblock_offset =

        (tasks_host[i].block_transposed)

            ? (tasks_host[i].sgfa * tasks_host[i].nsgfb + tasks_host[i].sgfb)

            : (tasks_host[i].sgfb * tasks_host[i].nsgfa + tasks_host[i].sgfa);


    /* the constant 6 is important here since we do not know ahead of time what

     * specific operation we will be doing. collocate functions can go up to 4

     * while integrate can go up to 5 (but put 6 for safety reasons) */


    /* this block is only as temporary scratch for calculating the coefficients.

     * Doing this avoid a lot of atomic operations that are costly on hardware

     * that only have partial support of them. For better performance we should

     * most probably align the offsets as well. it is 256 bytes on Mi100 and

     * above */

    tasks_host[i].lp_max = tasks_host[i].lb_max + tasks_host[i].la_max + 6;

    if (i == 0) {

      tasks_host[i].coef_offset = 0;

    } else {

      tasks_host[i].coef_offset =

          tasks_host[i - 1].coef_offset +

          rocm_backend::align_up_elems(

              rocm_backend::ncoset(tasks_host[i - 1].lp_max), 4);

    }


    // calculate the size such that the coef table is a multiple of 4.

    coef_size += rocm_backend::align_up_elems(

        rocm_backend::ncoset(tasks_host[i].lp_max), 4);


    auto &grid = ctx->grid_[tasks_host[i].level];

    // compute the cube properties


    tasks_host[i].apply_border_mask = (tasks_host[i].border_mask != 0);


    if (grid.is_orthogonal() && (tasks_host[i].border_mask == 0)) {

      tasks_host[i].discrete_radius =

          rocm_backend::compute_cube_properties<double, double3, true>(

              tasks_host[i].radius, grid.dh(), grid.dh_inv(),

              (double3 *)tasks_host[i].rp, // center of the gaussian

              &tasks_host[i]

                   .roffset, // offset compared to the closest grid point

              &tasks_host[i].cube_center, // center coordinates in grid space

              &tasks_host[i].lb_cube,     // lower boundary

              &tasks_host[i].cube_size);

    } else {

      tasks_host[i].discrete_radius =

          rocm_backend::compute_cube_properties<double, double3, false>(

              tasks_host[i].radius, grid.dh(), grid.dh_inv(),

              (double3 *)tasks_host[i].rp, // center of the gaussian

              &tasks_host[i]

                   .roffset, // offset compared to the closest grid point

              &tasks_host[i].cube_center, // center coordinates in grid space

              &tasks_host[i].lb_cube,     // lower boundary

              &tasks_host[i].cube_size);

    }

  }


  // we need to sort the task list although I expect it to be sorted already

  // it is a exclusive scan actually

  for (int level = 1; level < (int)ctx->number_of_tasks_per_level_.size();

       level++) {

    ctx->first_task_per_level_[level] =

        ctx->first_task_per_level_[level - 1] +

        ctx->number_of_tasks_per_level_[level - 1];

  }


  ctx->tasks_dev.clear();

  ctx->tasks_dev.resize(tasks_host.size());

  ctx->tasks_dev.copy_to_gpu(tasks_host);


  /* Sort the blocks */

  std::vector<std::vector<int>> task_sorted_by_block(nblocks);

  std::vector<int> sorted_blocks(ntasks, 0);

  std::vector<int> num_tasks_per_block(nblocks, 0);

  std::vector<int> sorted_blocks_offset(nblocks, 0);

  for (auto &block : task_sorted_by_block)

    block.clear();


  for (int i = 0; i < ntasks; i++) {

    task_sorted_by_block[block_num_list[i] - 1].push_back(i);

    num_tasks_per_block[block_num_list[i] - 1]++;

  }


  int offset = 0;

  // flatten the task_sorted_by_block and compute the offsets

  for (int i = 0; i < (int)task_sorted_by_block.size(); i++) {

    auto &task_list = task_sorted_by_block[i];


    // take care of the case where the blocks are not associated to a given

    // task. (and also a workaround in the grid_replay.c file)

    if (!task_list.empty()) {

      memcpy(&sorted_blocks[offset], &task_list[0],

             sizeof(int) * task_list.size());

    }

    sorted_blocks_offset[i] = offset;

    offset += task_list.size();

  }


  // copy the blocks offsets

  ctx->sorted_blocks_offset_dev.resize(sorted_blocks_offset.size());

  ctx->sorted_blocks_offset_dev.copy_to_gpu(sorted_blocks_offset);


  // copy the task list sorted by block (not by level) to the gpu

  ctx->task_sorted_by_blocks_dev.resize(sorted_blocks.size());

  ctx->task_sorted_by_blocks_dev.copy_to_gpu(sorted_blocks);


  std::vector<int> cab_size_offset_tmp(nblocks, 0);

  std::vector<int> cab_size_tmp(nblocks);


  for (int i = 0; i < (int)sorted_blocks_offset.size(); i++) {

    int num_tasks = 0;

    if (i == (int)sorted_blocks_offset.size() - 1)

      num_tasks = ntasks - sorted_blocks_offset[i];

    else

      num_tasks = sorted_blocks_offset[i + 1] - sorted_blocks_offset[i];


    // invariants tests since they should be equal.

    assert(num_tasks == num_tasks_per_block[i]);


    int tmp_cab_size = 0;

    for (int tk = 0; tk < num_tasks; tk++) {

      auto &task = tasks_host[sorted_blocks[tk + sorted_blocks_offset[i]]];

      // check that all tasks point to the same block

      assert(

          tasks_host[sorted_blocks[tk + sorted_blocks_offset[i]]].block_num ==

          i);


      // calculate the largest cab block needed for this block

      tmp_cab_size = std::max(task.max_cab_size, tmp_cab_size);

    }


    // keep the size of the largest cab block

    cab_size_tmp[i] = tmp_cab_size;

  }


  cab_size_offset_tmp[0] = 0;


  for (int i = 1; i < cab_size_tmp.size(); i++) {

    cab_size_offset_tmp[i] = cab_size_offset_tmp[i - 1] + cab_size_tmp[i - 1];

  }


  for (auto &block : task_sorted_by_block)

    block.clear();

  task_sorted_by_block.clear();


  sorted_blocks.clear();

  sorted_blocks_offset.clear();


  ctx->cab_block_offset_dev.resize(cab_size_offset_tmp.size());

  ctx->cab_block_offset_dev.copy_to_gpu(cab_size_offset_tmp);


  ctx->num_tasks_per_block_dev_.resize(num_tasks_per_block.size());

  ctx->num_tasks_per_block_dev_.copy_to_gpu(num_tasks_per_block);


  // Calculate the total amount of workspace needed

  size_t cab_size_total = 0;


  for (auto &elem : cab_size_tmp)

    cab_size_total += elem;


  cab_size_offset_tmp.clear();

  cab_size_tmp.clear();


  // To avoid memory saturation, cab only depends on the number of blocks not

  // the number of tasks. It forces us to compute all xyz coefficients before

  // calling collocate. However it does not change the logic of the integrate

  // counterpart.


  ctx->cab_dev_.resize(cab_size_total);


  // allocate workspace for the coefficients

  ctx->coef_dev_.resize(coef_size);


  // collect stats

  memset(ctx->stats, 0, 2 * 20 * sizeof(int));

  for (int itask = 0; itask < ntasks; itask++) {

    const int iatom = iatom_list[itask] - 1;

    const int jatom = jatom_list[itask] - 1;

    const int ikind = atom_kinds[iatom] - 1;

    const int jkind = atom_kinds[jatom] - 1;

    const int iset = iset_list[itask] - 1;

    const int jset = jset_list[itask] - 1;

    const int la_max = basis_sets[ikind]->lmax[iset];

    const int lb_max = basis_sets[jkind]->lmax[jset];

    const int lp = std::min(la_max + lb_max, 19);

    const bool has_border_mask = (border_mask_list[itask] != 0);

    ctx->stats[has_border_mask][lp]++;

  }


  ctx->create_streams();

  ctx->compute_checksum();


  // cleanup

  tasks_host.clear();


  // return newly created or updated context

  *ctx_out = ctx;

}


/*******************************************************************************
 * \brief Destroys a context created by grid_gpu_create_task_list.
 *
 * \param ptr  Handle of the context; a null handle is silently ignored.
 ******************************************************************************/
extern "C" void grid_gpu_free_task_list(grid_gpu_task_list *ptr) {
  // Nothing to release for an empty handle.
  if (ptr == nullptr) {
    return;
  }

  rocm_backend::context_info *context = (rocm_backend::context_info *)ptr;

  // Make sure the handle really refers to a context before touching it.
  context->verify_checksum();

  // Select GPU device, then destroy the context.
  context->set_device();
  delete context;
}


/*******************************************************************************

 * \brief Collocate all tasks of in given list onto given grids.

 ******************************************************************************/


extern "C" void grid_gpu_collocate_task_list(const grid_gpu_task_list *ptr,

                                             const enum grid_func func,

                                             const int nlevels,

                                             const offload_buffer *pab_blocks,

                                             offload_buffer **grids) {

  rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;


  if (ptr == nullptr)

    return;


  ctx->verify_checksum();


  if ((ctx->nblocks == 0) || (ctx->ntasks == 0)) {

    return;

  }


  assert(ctx->nlevels == nlevels);

  ctx->set_device();


  ctx->pab_block_.associate(pab_blocks->host_buffer, pab_blocks->device_buffer,

                            pab_blocks->size / sizeof(double));


  /*

      There are 3 scenario here.

        - Mi300 : no copy will happen as the two buffers have the same address

        - Mi250X : an internal copy will happen. We do not need to do anything

      explicit

        - no unified memory : Explicit copy will happen

    */

  int lp_diff = -1;

  ctx->pab_block_.copy_associated_host_to_gpu(ctx->main_stream);

  ctx->coef_dev_.zero(ctx->main_stream);

  ctx->calculate_all_coefficients(func, &lp_diff);


  for (int level = 0; level < ctx->nlevels; level++) {

    ctx->grid_[level].associate(grids[level]->host_buffer,

                                grids[level]->device_buffer,

                                grids[level]->size / sizeof(double));

    ctx->grid_[level].zero(ctx->level_streams[level]);

  }


  ctx->synchronize(ctx->main_stream);


  for (int level = 0; level < ctx->nlevels; level++) {

    ctx->collocate_one_grid_level(level, func, &lp_diff);

  }


  // download result from device to host.

  for (int level = 0; level < ctx->nlevels; level++) {

    ctx->grid_[level].copy_to_host(ctx->level_streams[level]);

  }


  // update counters while we wait for kernels to finish. It is not thread safe

  // at all since the function grid_library_counter_add has global static

  // states. We need a much better mechanism than this for instance move this

  // information one level up and encapsulate it in the context associated to

  // the library.


  if (lp_diff > -1) {

    for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {

      for (int lp = 0; lp < 20; lp++) {

        const int count = ctx->stats[has_border_mask][lp];

        if (ctx->grid_[0].is_orthogonal() && !has_border_mask) {

          grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,

                                   GRID_COLLOCATE_ORTHO, count);

        } else {

          grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,

                                   GRID_COLLOCATE_GENERAL, count);

        }

      }

    }

  }


  // need to wait for all streams to finish

  for (int level = 0; level < ctx->nlevels; level++) {

    ctx->synchronize(ctx->level_streams[level]);

  }

}


/*******************************************************************************
 * \brief Integrates all tasks of the given list from the given grids.
 *        See grid_ctx.h for details.
 *
 * \param ptr          Context handle; nothing is done for a null handle.
 * \param compute_tau  Stored in the context for the integration kernels.
 * \param nlevels      Number of grid levels, must match the context.
 * \param pab_blocks   Buffer holding the pab blocks; only uploaded when
 *                     forces or virial are requested.
 * \param grids        One buffer per grid level (input).
 * \param hab_blocks   Buffer receiving the hab blocks (zeroed, then filled).
 * \param forces       Output array of 3 * natoms doubles, or nullptr to skip.
 * \param virial       Output array of 9 doubles, or nullptr to skip.
 ******************************************************************************/
extern "C" void grid_gpu_integrate_task_list(
    const grid_gpu_task_list *ptr, const bool compute_tau, const int nlevels,
    const offload_buffer *pab_blocks, const offload_buffer **grids,
    offload_buffer *hab_blocks, double *forces, double *virial) {

  if (ptr == nullptr)
    return;

  rocm_backend::context_info *ctx = (rocm_backend::context_info *)ptr;

  // Validate the handle before reading any field of the context.
  ctx->verify_checksum();
  assert(ctx->nlevels == nlevels);

  // Select GPU device.
  ctx->set_device();

  const bool want_forces = (forces != nullptr);
  const bool want_virial = (virial != nullptr);

  // Upload the grids of all levels that actually have tasks.
  for (int level = 0; level < ctx->nlevels; level++) {
    if (ctx->number_of_tasks_per_level_[level]) {
      ctx->grid_[level].associate(grids[level]->host_buffer,
                                  grids[level]->device_buffer,
                                  grids[level]->size / sizeof(double));
      ctx->grid_[level].copy_to_gpu(ctx->level_streams[level]);
    }
  }

  // The pab blocks are only uploaded for forces and/or virial.
  if (want_forces || want_virial) {
    ctx->pab_block_.associate(pab_blocks->host_buffer,
                              pab_blocks->device_buffer,
                              pab_blocks->size / sizeof(double));
    ctx->pab_block_.copy_associated_host_to_gpu(ctx->main_stream);
  }

  // we do not need to wait for this to start the computations since the matrix
  // elements are computed after all coefficients are calculated.
  ctx->hab_block_.associate(hab_blocks->host_buffer, hab_blocks->device_buffer,
                            hab_blocks->size / sizeof(double));
  ctx->hab_block_.zero(ctx->main_stream);

  ctx->calculate_forces = want_forces;
  ctx->calculate_virial = want_virial;
  ctx->compute_tau = compute_tau;

  if (want_forces) {
    ctx->forces_.resize(3 * ctx->natoms);
    ctx->forces_.zero(ctx->main_stream);
  }

  if (want_virial) {
    ctx->virial_.resize(9);
    ctx->virial_.zero(ctx->main_stream);
  }

  int lp_diff = -1;

  // we can actually treat the full task list without bothering about the level
  // at that stage. This can be taken care of inside the kernel.
  for (int level = 0; level < ctx->nlevels; level++) {
    // launch kernel, but only after grid has arrived
    ctx->integrate_one_grid_level(level, &lp_diff);
  }

  if (lp_diff > -1) {
    // update counters while we wait for kernels to finish
    const bool orthogonal = ctx->grid_[0].is_orthogonal();
    for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
      // Only unmasked tasks on an orthogonal grid count as "ortho".
      const auto kernel = (orthogonal && !has_border_mask)
                              ? GRID_INTEGRATE_ORTHO
                              : GRID_INTEGRATE_GENERAL;
      for (int lp = 0; lp < 20; lp++) {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU, kernel,
                                 ctx->stats[has_border_mask][lp]);
      }
    }
  }

  // need to wait for all streams to finish
  for (int level = 0; level < ctx->nlevels; level++) {
    ctx->synchronize(ctx->level_streams[level]);
  }

  // computing the hab coefficients does not depend on the number of grids so we
  // can run these calculations on the main stream
  ctx->compute_hab_coefficients();
  ctx->hab_block_.copy_gpu_to_associated_host(ctx->main_stream);

  if (want_forces) {
    ctx->forces_.copy_from_gpu(forces, ctx->main_stream);
  }
  if (want_virial) {
    ctx->virial_.copy_from_gpu(virial, ctx->main_stream);
  }

  // The results are only valid on the host once the main stream has drained.
  ctx->synchronize(ctx->main_stream);
}


rocm_backend::context_info
Definition grid_gpu_context.h:449

rocm_backend::context_info::first_task_per_level_
std::vector< int > first_task_per_level_
Definition grid_gpu_context.h:480

rocm_backend::context_info::calculate_all_coefficients
void calculate_all_coefficients(const enum grid_func func, int *lp_diff)
Definition grid_gpu_collocate.cu:400

rocm_backend::context_info::virial_
gpu_vector< double > virial_
Definition grid_gpu_context.h:475

rocm_backend::context_info::sorted_blocks_offset_dev
gpu_vector< int > sorted_blocks_offset_dev
Definition grid_gpu_context.h:483

rocm_backend::context_info::compute_tau
bool compute_tau
Definition grid_gpu_context.h:486

rocm_backend::context_info::coef_dev_
gpu_vector< double > coef_dev_
Definition grid_gpu_context.h:470

rocm_backend::context_info::calculate_forces
bool calculate_forces
Definition grid_gpu_context.h:484

rocm_backend::context_info::cab_block_offset_dev
gpu_vector< int > cab_block_offset_dev
Definition grid_gpu_context.h:469

rocm_backend::context_info::stats
int stats[2][20]
Definition grid_gpu_context.h:465

rocm_backend::context_info::pab_block_
gpu_vector< double > pab_block_
Definition grid_gpu_context.h:472

rocm_backend::context_info::collocate_one_grid_level
void collocate_one_grid_level(const int level, const enum grid_func func, int *lp_diff)
Launches the Cuda kernel that collocates all tasks of one grid level.
Definition grid_gpu_collocate.cu:429

rocm_backend::context_info::block_offsets_dev
gpu_vector< int > block_offsets_dev
Definition grid_gpu_context.h:468

rocm_backend::context_info::verify_checksum
void verify_checksum()
Definition grid_gpu_context.h:728

rocm_backend::context_info::nblocks
int nblocks
Definition grid_gpu_context.h:460

rocm_backend::context_info::main_stream
offloadStream_t main_stream
Definition grid_gpu_context.h:464

rocm_backend::context_info::synchronize
void synchronize(offloadStream_t &stream)
Definition grid_gpu_context.h:707

rocm_backend::context_info::ntasks
int ntasks
Definition grid_gpu_context.h:456

rocm_backend::context_info::calculate_virial
bool calculate_virial
Definition grid_gpu_context.h:485

rocm_backend::context_info::nlevels
int nlevels
Definition grid_gpu_context.h:457

rocm_backend::context_info::cab_dev_
gpu_vector< double > cab_dev_
Definition grid_gpu_context.h:471

rocm_backend::context_info::create_streams
void create_streams()
Definition grid_gpu_context.h:694

rocm_backend::context_info::tasks_dev
gpu_vector< task_info > tasks_dev
Definition grid_gpu_context.h:476

rocm_backend::context_info::grid_
std::vector< grid_info< double > > grid_
Definition grid_gpu_context.h:478

rocm_backend::context_info::forces_
gpu_vector< double > forces_
Definition grid_gpu_context.h:474

rocm_backend::context_info::nkinds
int nkinds
Definition grid_gpu_context.h:459

rocm_backend::context_info::compute_hab_coefficients
void compute_hab_coefficients()
Definition grid_gpu_integrate.cu:735

rocm_backend::context_info::natoms
int natoms
Definition grid_gpu_context.h:458

rocm_backend::context_info::hab_block_
gpu_vector< double > hab_block_
Definition grid_gpu_context.h:473

rocm_backend::context_info::set_device
void set_device()
Definition grid_gpu_context.h:716

rocm_backend::context_info::compute_checksum
void compute_checksum()
Definition grid_gpu_context.h:727

rocm_backend::context_info::level_streams
std::vector< offloadStream_t > level_streams
Definition grid_gpu_context.h:463

rocm_backend::context_info::initialize_basis_sets
void initialize_basis_sets(const grid_basis_set **basis_sets, const int nkinds__)
Definition grid_gpu_context.h:655

rocm_backend::context_info::number_of_tasks_per_level_
std::vector< int > number_of_tasks_per_level_
Definition grid_gpu_context.h:479

rocm_backend::context_info::sphi_dev
gpu_vector< double * > sphi_dev
Definition grid_gpu_context.h:482

rocm_backend::context_info::num_tasks_per_block_dev_
gpu_vector< int > num_tasks_per_block_dev_
Definition grid_gpu_context.h:477

rocm_backend::context_info::integrate_one_grid_level
void integrate_one_grid_level(const int level, int *lp_diff)
Launches the Cuda kernel that integrates all tasks of one grid level.
Definition grid_gpu_integrate.cu:689

rocm_backend::context_info::task_sorted_by_blocks_dev
gpu_vector< int > task_sorted_by_blocks_dev
Definition grid_gpu_context.h:483

rocm_backend::gpu_vector::zero
void zero(offloadStream_t &stream__)
Definition grid_gpu_context.h:142

rocm_backend::gpu_vector::copy_associated_host_to_gpu
void copy_associated_host_to_gpu(offloadStream_t &stream__)
Definition grid_gpu_context.h:115

rocm_backend::gpu_vector::data
T * data()
Definition grid_gpu_context.h:212

rocm_backend::gpu_vector::resize
void resize(const size_t new_size__)
Definition grid_gpu_context.h:175

rocm_backend::gpu_vector::copy_from_gpu
void copy_from_gpu(T *data__, offloadStream_t &stream__)
Definition grid_gpu_context.h:125

rocm_backend::gpu_vector::copy_gpu_to_associated_host
void copy_gpu_to_associated_host(offloadStream_t &stream__)
Definition grid_gpu_context.h:132

rocm_backend::gpu_vector::associate
void associate(void *host_ptr__, void *device_ptr__, const size_t size__)
Definition grid_gpu_context.h:148

rocm_backend::gpu_vector::copy_to_gpu
void copy_to_gpu(const T *data__)
Definition grid_gpu_context.h:102

GRID_BACKEND_GPU
@ GRID_BACKEND_GPU
Definition grid_constants.h:53

grid_func
grid_func
Definition grid_constants.h:10

grid
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
Definition grid_cpu_collint.h:169

i
static void const int const int i
Definition grid_cpu_collint.h:38

npts_local
static void const int const int const int const int const int const double const int const int const int npts_local[3]
Definition grid_cpu_collint.h:167

grid_gpu_collocate_task_list
void grid_gpu_collocate_task_list(const grid_gpu_task_list *ptr, const enum grid_func func, const int nlevels, const offload_buffer *pab_blocks, offload_buffer **grids)
Collocate all tasks of in given list onto given grids.
Definition grid_gpu_context.cu:479

grid_gpu_create_task_list
void grid_gpu_create_task_list(const bool ortho, const int ntasks, const int nlevels, const int natoms, const int nkinds, const int nblocks, const int *block_offsets, const double *atom_positions, const int *atom_kinds, const grid_basis_set **basis_sets, const int *level_list, const int *iatom_list, const int *jatom_list, const int *iset_list, const int *jset_list, const int *ipgf_list, const int *jpgf_list, const int *border_mask_list, const int *block_num_list, const double *radius_list, const double *rab_list, const int *npts_global, const int *npts_local, const int *shift_local, const int *border_width, const double *dh, const double *dh_inv, grid_gpu_task_list **ptr)
Allocates a task list for the GPU backend. See grid_ctx.h for details.
Definition grid_gpu_context.cu:94

grid_gpu_free_task_list
void grid_gpu_free_task_list(grid_gpu_task_list *ptr)
destroy a context
Definition grid_gpu_context.cu:465

grid_gpu_integrate_task_list
void grid_gpu_integrate_task_list(const grid_gpu_task_list *ptr, const bool compute_tau, const int nlevels, const offload_buffer *pab_blocks, const offload_buffer **grids, offload_buffer *hab_blocks, double *forces, double *virial)
Integrate all tasks of in given list onto given grids. See grid_ctx.h for details.
Definition grid_gpu_context.cu:562

grid_gpu_context.h

grid_gpu_internal_header.h

grid_gpu_task_list.h

grid_gpu_task_list
void grid_gpu_task_list
Definition grid_gpu_task_list.h:20

grid_library_counter_add
void grid_library_counter_add(const int lp, const enum grid_backend backend, const enum grid_library_kernel kernel, const int increment)
Adds given increment to counter specified by lp, backend, and kernel.
Definition grid_library.c:135

GRID_INTEGRATE_GENERAL
@ GRID_INTEGRATE_GENERAL
Definition grid_library.h:70

GRID_COLLOCATE_ORTHO
@ GRID_COLLOCATE_ORTHO
Definition grid_library.h:67

GRID_COLLOCATE_GENERAL
@ GRID_COLLOCATE_GENERAL
Definition grid_library.h:69

GRID_INTEGRATE_ORTHO
@ GRID_INTEGRATE_ORTHO
Definition grid_library.h:68

rocm_backend
Definition grid_gpu_collocate.cu:30

rocm_backend::ncoset
__host__ __device__ __inline__ int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
Definition grid_gpu_internal_header.h:168

rocm_backend::align_up_elems
constexpr size_t align_up_elems(size_t n_elems, size_t elem_alignment)
Definition grid_gpu_context.cu:39

grid_basis_set
Internal representation of a basis set.
Definition grid_basis_set.h:14

grid_basis_set::nsgf
int nsgf
Definition grid_basis_set.h:16

grid_basis_set::maxco
int maxco
Definition grid_basis_set.h:17

grid_basis_set::lmax
int * lmax
Definition grid_basis_set.h:20

grid_basis_set::zet
double * zet
Definition grid_basis_set.h:25

grid_basis_set::first_sgf
int * first_sgf
Definition grid_basis_set.h:23

grid_basis_set::nsgf_set
int * nsgf_set
Definition grid_basis_set.h:22

grid_basis_set::maxpgf
int maxpgf
Definition grid_basis_set.h:18

grid_basis_set::lmin
int * lmin
Definition grid_basis_set.h:19

offload_buffer
Internal representation of a buffer.
Definition offload_buffer.h:16

offload_buffer::device_buffer
double * device_buffer
Definition offload_buffer.h:19

offload_buffer::host_buffer
double * host_buffer
Definition offload_buffer.h:18

offload_buffer::size
size_t size
Definition offload_buffer.h:17