#ifndef GRID_HIP_CONTEXT_H
#define GRID_HIP_CONTEXT_H

#include <hip/hip_runtime_api.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

#include "../common/grid_basis_set.h"
#include "../common/grid_constants.h"

#include "../../offload/offload_library.h"
#include "../../offload/offload_runtime.h"
// Small helper that manages a device buffer (and optionally an associated
// host buffer). It either owns its device allocation or wraps pointers that
// were allocated elsewhere.
template <typename T> class gpu_vector {
  size_t allocated_size_{0};
  size_t current_size_{0};
  bool allocated_outside_{false};
  bool internal_allocation_{false};
  T *device_ptr_ = nullptr;
  T *host_ptr_ = nullptr;

public:
  gpu_vector() {}
  gpu_vector(const size_t size__) {
    // pad the device allocation to a multiple of 16 elements
    allocated_size_ = (size__ / 16 + 1) * 16;
    current_size_ = size__;
    internal_allocation_ = true;
#ifndef __OFFLOAD_UNIFIED_MEMORY
    offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
#else
    hipMallocManaged((void **)&device_ptr_, sizeof(T) * allocated_size_);
#endif
  }
  // wrap an externally allocated device buffer of size__ elements
  gpu_vector(const size_t size__, const void *ptr__) {
    allocated_size_ = size__;
    current_size_ = size__;
    allocated_outside_ = true;
    device_ptr_ = static_cast<T *>(const_cast<void *>(ptr__));
  }
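  // Two ownership modes exist: buffers created by the size-only constructor
  // or by resize() are owned by this object (internal_allocation_) and freed
  // by it, while buffers wrapped by the constructor above or attached via
  // associate() remain owned by the caller (allocated_outside_) and are never
  // freed here.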
  inline size_t size() const { return current_size_; }

  // synchronous host-to-device copy of current_size_ elements
  inline void copy_to_gpu(const T *data__) {
    offloadMemcpyHtoD(device_ptr_, data__, sizeof(T) * current_size_);
  }

  // asynchronous variants operating on the given stream
  inline void copy_to_gpu(const T *data__, offloadStream_t &stream__) {
    offloadMemcpyAsyncHtoD(device_ptr_, data__, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_to_gpu(offloadStream_t &stream__) {
    offloadMemcpyAsyncHtoD(device_ptr_, host_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_from_gpu(T *data__, offloadStream_t &stream__) {
    offloadMemcpyAsyncDtoH(data__, device_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_from_gpu(offloadStream_t &stream__) {
    offloadMemcpyAsyncDtoH(host_ptr_, device_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void zero(offloadStream_t &stream__) {
    offloadMemsetAsync(device_ptr_, 0, sizeof(T) * current_size_, stream__);
  }
  inline void associate(void *host_ptr__, void *device_ptr__,
                        const size_t size__) {
    if (internal_allocation_) {
      offloadFree(device_ptr_);
      std::free(host_ptr_);
      internal_allocation_ = false;
    }
    allocated_outside_ = true;
    current_size_ = size__;
    device_ptr_ = static_cast<T *>(device_ptr__);
    host_ptr_ = static_cast<T *>(host_ptr__);
  }
  inline void zero() {
    offloadMemset(device_ptr_, 0, sizeof(T) * current_size_);
  }

  inline void copy_to_gpu(const std::vector<T> &data__) {
    assert(data__.size() == current_size_);
    offloadMemcpyHtoD(device_ptr_, data__.data(), sizeof(T) * data__.size());
  }
  inline void resize(const size_t new_size_) {
    if (allocated_outside_) {
      allocated_outside_ = false;
      device_ptr_ = nullptr;
    }

    if (allocated_size_ < new_size_) {
      if (device_ptr_ != nullptr)
        offloadFree(device_ptr_);
      // round the allocation up to the next multiple of 16 elements
      allocated_size_ = (new_size_ / 16 + (new_size_ % 16 != 0)) * 16;
      offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
      internal_allocation_ = true;
    }
    current_size_ = new_size_;
  }

  // forget the contents without releasing the device allocation
  inline void clear() { current_size_ = 0; }
  ~gpu_vector() {
    if (!allocated_outside_) {
      if (device_ptr_ != nullptr)
        offloadFree(device_ptr_);

      if (host_ptr_ != nullptr)
        std::free(host_ptr_);
    }
    device_ptr_ = nullptr;
    internal_allocation_ = false;
  }

  inline T *data() { return device_ptr_; }
};
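// A minimal usage sketch of gpu_vector (illustration only, not part of the
// original header); the function name, buffer size, and stream variable are
// assumptions made for the example.
inline void gpu_vector_usage_sketch(offloadStream_t &stream) {
  std::vector<double> host(128, 1.0);
  gpu_vector<double> buf(host.size());    // device allocation, padded to 16
  buf.copy_to_gpu(host.data(), stream);   // asynchronous H2D copy on stream
  buf.zero(stream);                       // asynchronous memset on stream
  buf.copy_from_gpu(host.data(), stream); // asynchronous D2H copy back
  offloadStreamSynchronize(stream);       // wait before reusing host memory
  buf.resize(256);                        // grow the device allocation
}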
// Device-resident representation of one realspace grid level together with
// its geometric metadata.
template <typename T> class grid_info {
  int full_size_[3] = {0, 0, 0};
  int local_size_[3] = {0, 0, 0};
  // global lower corner of the local part of the grid
  int lower_corner_[3] = {0, 0, 0};
  int border_width_[3] = {0, 0, 0};
  T dh_[9], dh_inv_[9];
  bool orthorhombic_{false};
  bool is_distributed_{false};
  gpu_vector<T> grid_;

public:
  grid_info() {}

  grid_info(const int *full_size__, const int *local_size__,
            const int *border_width__) {
    initialize(full_size__, local_size__, border_width__);
  }
  inline void resize(const int *full_size__, const int *local_size__,
                     const int *const roffset__,
                     const int *const border_width__) {
    initialize(full_size__, local_size__, roffset__, border_width__);
  }

  inline size_t size() const { return grid_.size(); }

  inline void zero(offloadStream_t &stream) { grid_.zero(stream); }
  inline void set_lattice_vectors(const double *dh__, const double *dh_inv__) {
    memcpy(dh_, dh__, sizeof(double) * 9);
    memcpy(dh_inv_, dh_inv__, sizeof(double) * 9);
  }

  inline T *dh() { return dh_; }

  // ...

  inline void is_distributed(const bool distributed__) {
    is_distributed_ = distributed__;
  }

  // mark the grid as orthorhombic, either because the caller says so or
  // because the lattice vectors turn out to be pairwise orthogonal
  inline void check_orthorhombicity(const bool ortho) {
    if (ortho) {
      orthorhombic_ = true;
      return;
    }
    double norm1, norm2, norm3;
    bool orthogonal[3] = {false, false, false};

    norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
    norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
    norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];

    norm1 = 1.0 / sqrt(norm1);
    norm2 = 1.0 / sqrt(norm2);
    norm3 = 1.0 / sqrt(norm3);

    // a pair of lattice vectors counts as orthogonal when the magnitude of
    // their normalized dot product falls below a small tolerance (the
    // numerical threshold is elided in this listing)
    orthogonal[0] =
        ((fabs(dh_[0] * dh_[6] + dh_[1] * dh_[7] + dh_[2] * dh_[8]) * norm1 *
          norm3) < /* tolerance */);
    orthogonal[1] =
        ((fabs(dh_[3] * dh_[6] + dh_[4] * dh_[7] + dh_[5] * dh_[8]) * norm2 *
          norm3) < /* tolerance */);
    orthogonal[2] =
        ((fabs(dh_[0] * dh_[3] + dh_[1] * dh_[4] + dh_[2] * dh_[5]) * norm1 *
          norm2) < /* tolerance */);

    orthorhombic_ = orthogonal[0] && orthogonal[1] && orthogonal[2];
  }
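  /* For an orthorhombic cell dh_ is diagonal, e.g. {a, 0, 0, 0, b, 0, 0, 0, c},
     so every mixed dot product in the test above is exactly zero and all three
     orthogonal[] flags are set. For a triclinic cell at least one pair of
     lattice vectors has a non-zero normalized dot product (the cosine of the
     angle between them) and orthorhombic_ remains false. */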
  // ...

  inline void associate(void *host_ptr__, void *device_ptr__,
                        const size_t size__) {
    grid_.associate(host_ptr__, device_ptr__, size__);
  }

  inline int full_size(const int i) { return full_size_[i]; }

  inline int local_size(const int i) { return local_size_[i]; }

  inline int lower_corner(const int i) { return lower_corner_[i]; }

  inline int border_width(const int i) { return border_width_[i]; }
private:
  void initialize(const int *const full_size__, const int *const local_size__,
                  const int *const roffset__,
                  const int *const border_width__) {
    // the three directions are stored in reversed order with respect to the
    // incoming arrays
    full_size_[2] = full_size__[0];
    full_size_[1] = full_size__[1];
    full_size_[0] = full_size__[2];

    local_size_[2] = local_size__[0];
    local_size_[1] = local_size__[1];
    local_size_[0] = local_size__[2];

    lower_corner_[0] = roffset__[2];
    lower_corner_[1] = roffset__[1];
    lower_corner_[2] = roffset__[0];

    // the grid is distributed whenever the local box does not cover the
    // full grid in every direction
    is_distributed_ = (full_size_[2] != local_size_[2]) ||
                      (full_size_[1] != local_size_[1]) ||
                      (full_size_[0] != local_size_[0]);

    border_width_[2] = border_width__[0];
    border_width_[1] = border_width__[1];
    border_width_[0] = border_width__[2];
    // ...
  }
};
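// Note on the index convention: initialize() stores the incoming arrays in
// reversed order, so index 0 of full_size_ / local_size_ / lower_corner_ /
// border_width_ corresponds to the last entry of the caller's array and
// index 2 to the first. Assuming the usual (x, y, z) ordering on the caller
// side, a hypothetical call such as
//
//   g.resize(npts_global, npts_local, shift_local, border_width);
//
// yields g.full_size(0) == npts_global[2] and g.full_size(2) == npts_global[0].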
// ...
//
//   double *ptr_dev[7] = {nullptr, nullptr, nullptr, nullptr,
//                         nullptr, nullptr, nullptr};
//
// ...

// Per-device bookkeeping for the HIP grid backend: device buffers, streams,
// basis-set tables and the per-level grid representations.
class context_info {
public:
  // ...
  std::vector<grid_info<double>> grid_;
  // ...
  context_info(const int device_id__) {
    device_id_ = device_id__;
    // ...
    hipSetDevice(device_id_);
    // ...
  }
  ~context_info() {
    // ...
    // release the per-kind device copies of the basis-set transformation
    // matrices
    for (auto &phi : sphi)
      offloadFree(phi);
    // ...
  }
  int lmax() const { return lmax_; }
  void initialize_basis_sets(const grid_basis_set **basis_sets,
                             const int nkinds__) {
    // ...
    if (nkinds__ > (int)sphi.size()) {
      // more atomic kinds than before: drop the previously uploaded
      // transformation matrices and grow the per-kind tables
      for (auto &phi : sphi)
        if (phi != nullptr) {
          // ...
        }
      // ...
      sphi.resize(nkinds__, nullptr);
      // ...
    }
    // ...

    for (int i = 0; i < nkinds__; i++) {
      const auto &basis_set = basis_sets[i];
      if (sphi_size[i] < basis_set->nsgf * basis_set->maxco) {
        offloadMalloc((void **)&sphi[i],
                      basis_set->nsgf * basis_set->maxco * sizeof(double));
        sphi_size[i] = basis_set->nsgf * basis_set->maxco;
      }
      // ...
      offloadMemcpyHtoD(sphi[i], basis_set->sphi,
                        basis_set->nsgf * basis_set->maxco * sizeof(double));
    }
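    // Each basis_set->sphi uploaded above is the (nsgf x maxco) transformation
    // matrix between the primitive Cartesian and the contracted spherical
    // representation of the basis set; it is copied once per atomic kind so
    // that the collocate and integrate kernels can reuse it from device memory.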
    // track the largest angular momentum over all sets of all kinds
    for (int ikind = 0; ikind < nkinds; ikind++) {
      for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
        lmax_ = std::max(lmax_, basis_sets[ikind]->lmax[iset]);
      }
    }
    // ...
  }
  // ... (stream creation elided in this listing)
  //   offloadStreamCreate(&stream);
  // ...

  void synchronize(offloadStream_t &stream) {
    offloadStreamSynchronize(stream);
  }

  // ... (a whole-device barrier elided in this listing)
  //   offloadDeviceSynchronize();
  // ...

  // ... (a structure-validity check elided in this listing)
  //   fprintf(stderr,
  //           "This object does not seem to have the right structure.\n"
  //           "A cast went wrong or the object is corrupted.\n");
  // ...

private:
  unsigned int compute_checksum_() { /* ... */ }
  // ...
};

#endif
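For orientation, a minimal, hypothetical driver sketch showing how the pieces
above fit together; the function name, the variables basis_sets, nkinds and
nlevels, and the direct use of ctx.main_stream are assumptions, the members
collocate_one_grid_level and main_stream are taken from the index below rather
than the listing, and the task-list upload that must precede collocation is
omitted:

inline void context_usage_sketch(const grid_basis_set **basis_sets,
                                 const int nkinds, const int nlevels) {
  context_info ctx(0);                           // bind to HIP device 0
  ctx.initialize_basis_sets(basis_sets, nkinds); // upload the sphi matrices
  int lp_diff = 0;
  for (int level = 0; level < nlevels; level++)
    ctx.collocate_one_grid_level(level, GRID_FUNC_AB, &lp_diff);
  ctx.synchronize(ctx.main_stream);              // wait for the kernels
}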
Members indexed by the documentation for this header:

context_info:
  context_info(const int device_id__)
  void initialize_basis_sets(const grid_basis_set **basis_sets, const int nkinds__)
  void collocate_one_grid_level(const int level, const enum grid_func func, int *lp_diff)
  void integrate_one_grid_level(const int level, int *lp_diff)
  void compute_hab_coefficients()
  void synchronize(offloadStream_t &stream)
  offloadStream_t main_stream
  std::vector<offloadStream_t> level_streams
  std::vector<int> first_task_per_level_
  std::vector<int> number_of_tasks_per_level_
  std::vector<double *> sphi
  std::vector<int> sphi_size
  gpu_vector<double *> sphi_dev
  gpu_vector<task_info> tasks_dev
  gpu_vector<int> block_offsets_dev
  gpu_vector<int> num_tasks_per_block_dev_
  gpu_vector<int> task_sorted_by_blocks_dev
  gpu_vector<int> sorted_blocks_offset_dev
  gpu_vector<double> coef_dev_
  gpu_vector<double> cab_dev_
  gpu_vector<double> pab_block_
  gpu_vector<double> hab_block_
  gpu_vector<double> forces_
  gpu_vector<double> virial_
  std::vector<grid_info<double>> grid_

gpu_vector<T>:
  gpu_vector(const size_t size__)
  gpu_vector(const size_t size__, const void *ptr__)
  void copy_to_gpu(const T *data__)
  void copy_to_gpu(const T *data__, offloadStream_t &stream__)
  void copy_to_gpu(const std::vector<T> &data__)
  void copy_to_gpu(offloadStream_t &stream__)
  void copy_from_gpu(T *data__, offloadStream_t &stream__)
  void copy_from_gpu(offloadStream_t &stream__)
  void zero(offloadStream_t &stream__)
  void resize(const size_t new_size_)
  void associate(void *host_ptr__, void *device_ptr__, const size_t size__)

grid_info<T>:
  grid_info(const int *full_size__, const int *local_size__, const int *border_width__)
  void resize(const int *full_size__, const int *local_size__, const int *const roffset__, const int *const border_width__)
  void set_lattice_vectors(const double *dh__, const double *dh_inv__)
  void check_orthorhombicity(const bool ortho)
  void is_distributed(const bool distributed__)
  void associate(void *host_ptr__, void *device_ptr__, const size_t size__)
  void zero(offloadStream_t &stream)
  void copy_to_gpu(const T *data, offloadStream_t &stream)
  void copy_to_gpu(offloadStream_t &stream)
  void copy_to_host(double *data__, offloadStream_t &stream)
  void copy_to_host(offloadStream_t &stream)
  int full_size(const int i)
  int local_size(const int i)
  int lower_corner(const int i)
  int border_width(const int i)
Related structures referenced by this header:
  grid_basis_set: internal representation of a basis set (see grid_basis_set.h)
  task_info: internal representation of a task
  parameters of the collocate kernel, including:
    int grid_lower_corner_[3]
    int grid_border_width_[3]
    int *task_sorted_by_blocks_dev
    int *sorted_blocks_offset_dev
    int *num_tasks_per_block_dev