14#ifndef GRID_GPU_CONTEXT_H
15#define GRID_GPU_CONTEXT_H
18#include <hip/hip_runtime_api.h>
20#include <cuda_runtime.h>
25#include "../common/grid_basis_set.h"
26#include "../common/grid_constants.h"
29#include "../../offload/offload_library.h"
30#include "../../offload/offload_runtime.h"
39 size_t allocated_size_{0};
40 size_t current_size_{0};
41 bool allocated_outside_{
false};
42 bool internal_allocation_ = {
false};
43 T *device_ptr_ =
nullptr;
44 T *host_ptr_ =
nullptr;
54 allocated_size_ = (size__ / 16 + 1) * 16;
56 current_size_ = size__;
57 internal_allocation_ =
true;
58#ifndef __OFFLOAD_UNIFIED_MEMORY
59 offloadMalloc((
void **)&device_ptr_,
sizeof(T) * allocated_size_);
61 hipMallocManaged((
void **)&device_ptr_,
sizeof(T) * allocated_size_);
66 allocated_size_ = size__;
67 current_size_ = size__;
68 allocated_outside_ =
true;
73 inline size_t size() {
return current_size_; }
76 offloadMemcpyHtoD(device_ptr_, data__,
sizeof(T) * current_size_);
79 inline void copy_to_gpu(
const T *data__, offloadStream_t &stream__) {
80 offloadMemcpyAsyncHtoD(device_ptr_, data__,
sizeof(T) * current_size_,
85 offloadMemcpyAsyncHtoD(device_ptr_, host_ptr_,
sizeof(T) * current_size_,
90 offloadMemcpyAsyncDtoH(data__, device_ptr_,
sizeof(T) * current_size_,
95 offloadMemcpyAsyncDtoH(host_ptr_, device_ptr_,
sizeof(T) * current_size_,
99 inline void zero(offloadStream_t &stream__) {
101 offloadMemsetAsync(device_ptr_, 0,
sizeof(T) * current_size_, stream__);
104 inline void associate(
void *host_ptr__,
void *device_ptr__,
105 const size_t size__) {
107 if (internal_allocation_) {
109 offloadFree(device_ptr_);
111 std::free(host_ptr_);
112 internal_allocation_ =
false;
115 allocated_outside_ =
true;
117 current_size_ = size__;
118 device_ptr_ =
static_cast<T *
>(device_ptr__);
119 host_ptr_ =
static_cast<T *
>(host_ptr__);
124 offloadMemset(device_ptr_, 0,
sizeof(T) * current_size_);
128 assert(data__.size() == current_size_);
134 offloadMemcpyHtoD(device_ptr_, data__.data(),
sizeof(T) * data__.size());
137 inline void resize(
const size_t new_size_) {
138 if (allocated_outside_) {
139 allocated_outside_ =
false;
141 device_ptr_ =
nullptr;
145 if (allocated_size_ < new_size_) {
146 if (device_ptr_ !=
nullptr)
147 offloadFree(device_ptr_);
148 allocated_size_ = (new_size_ / 16 + (new_size_ % 16 != 0)) * 16;
149 offloadMalloc((
void **)&device_ptr_,
sizeof(T) * allocated_size_);
150 internal_allocation_ =
true;
152 current_size_ = new_size_;
156 inline void clear() { current_size_ = 0; }
160 if (!allocated_outside_) {
161 if (device_ptr_ !=
nullptr)
162 offloadFree(device_ptr_);
164 if (host_ptr_ !=
nullptr)
165 std::free(device_ptr_);
170 device_ptr_ =
nullptr;
172 internal_allocation_ =
false;
175 inline T *
data() {
return device_ptr_; }
179 int full_size_[3] = {0, 0, 0};
180 int local_size_[3] = {0, 0, 0};
182 int lower_corner_[3] = {0, 0, 0};
183 int border_width_[3] = {0, 0, 0};
186 bool orthorhombic_{
false};
187 bool is_distributed_{
false};
193 grid_info(
const int *full_size__,
const int *local_size__,
194 const int *border_width__) {
195 initialize(full_size__, local_size__, border_width__);
216 inline void resize(
const int *full_size__,
const int *local_size__,
217 const int *
const roffset__,
218 const int *
const border_width__) {
219 initialize(full_size__, local_size__, roffset__, border_width__);
222 inline size_t size()
const {
return grid_.
size(); }
224 inline void zero(offloadStream_t &stream) { grid_.
zero(stream); }
227 memcpy(dh_, dh__,
sizeof(
double) * 9);
228 memcpy(dh_inv_, dh_inv__,
sizeof(
double) * 9);
231 inline T *
dh() {
return dh_; }
238 is_distributed_ = distributed__;
243 orthorhombic_ =
true;
246 double norm1, norm2, norm3;
247 bool orthogonal[3] = {
false,
false,
false};
248 norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
249 norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
250 norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];
252 norm1 = 1.0 / sqrt(norm1);
253 norm2 = 1.0 / sqrt(norm2);
254 norm3 = 1.0 / sqrt(norm3);
258 ((fabs(dh_[0] * dh_[6] + dh_[1] * dh_[7] + dh_[2] * dh_[8]) * norm1 *
262 ((fabs(dh_[3] * dh_[6] + dh_[4] * dh_[7] + dh_[5] * dh_[8]) * norm2 *
266 ((fabs(dh_[0] * dh_[3] + dh_[1] * dh_[4] + dh_[2] * dh_[5]) * norm1 *
269 orthorhombic_ = orthogonal[0] && orthogonal[1] && orthogonal[2];
280 inline void associate(
void *host_ptr__,
void *device_ptr__,
281 const size_t size__) {
282 grid_.
associate(host_ptr__, device_ptr__, size__);
288 return full_size_[
i];
293 return local_size_[
i];
298 return lower_corner_[
i];
303 return border_width_[
i];
307 void initialize(
const int *
const full_size__,
const int *
const local_size__,
308 const int *
const roffset__,
const int *
const border_width__) {
313 full_size_[2] = full_size__[0];
314 full_size_[1] = full_size__[1];
315 full_size_[0] = full_size__[2];
317 local_size_[2] = local_size__[0];
318 local_size_[1] = local_size__[1];
319 local_size_[0] = local_size__[2];
321 lower_corner_[0] = roffset__[2];
322 lower_corner_[1] = roffset__[1];
323 lower_corner_[2] = roffset__[0];
325 is_distributed_ = (full_size_[2] != local_size_[2]) ||
326 (full_size_[1] != local_size_[1]) ||
327 (full_size_[0] != local_size_[0]);
329 border_width_[2] = border_width__[0];
330 border_width_[1] = border_width__[1];
331 border_width_[0] = border_width__[2];
391 double *
ptr_dev[7] = {
nullptr,
nullptr,
nullptr,
nullptr,
392 nullptr,
nullptr,
nullptr};
428 std::vector<grid_info<double>>
grid_;
444 device_id_ = device_id__;
449 offload_set_chosen_device(device_id_);
450 offload_activate_chosen_device();
460 for (
auto &phi :
sphi)
478 int lmax()
const {
return lmax_; }
481 const int nkinds__) {
483 if (nkinds__ > (
int)
sphi.size()) {
484 for (
auto &phi :
sphi)
485 if (phi !=
nullptr) {
491 sphi.resize(nkinds__,
nullptr);
498 for (
int i = 0;
i < nkinds__;
i++) {
499 const auto &basis_set = basis_sets[
i];
500 if (
sphi_size[
i] < basis_set->nsgf * basis_set->maxco) {
501 offloadMalloc((
void **)&
sphi[
i],
502 basis_set->nsgf * basis_set->maxco *
sizeof(
double));
503 sphi_size[
i] = basis_set->nsgf * basis_set->maxco;
506 offloadMemcpyHtoD(
sphi[
i], basis_set->sphi,
507 basis_set->nsgf * basis_set->maxco *
sizeof(
double));
512 for (
int ikind = 0; ikind <
nkinds; ikind++) {
513 for (
int iset = 0; iset < basis_sets[ikind]->
nset; iset++) {
514 lmax_ = std::max(lmax_, basis_sets[ikind]->
lmax[iset]);
527 offloadStreamCreate(&stream);
533 offloadStreamSynchronize(stream);
538 offloadDeviceSynchronize();
542 offload_set_chosen_device(device_id_);
543 offload_activate_chosen_device();
555 fprintf(stderr,
"This object does not seem to have the right structure.\n"
556 "A casting went wrong or the object is corrupted\n");
564 unsigned int compute_checksum_() {
std::vector< int > first_task_per_level_
gpu_vector< double > virial_
gpu_vector< int > sorted_blocks_offset_dev
gpu_vector< double > coef_dev_
std::vector< int > sphi_size
gpu_vector< double > pab_block_
void collocate_one_grid_level(const int level, const enum grid_func func, int *lp_diff)
Launches the Cuda kernel that collocates all tasks of one grid level.
gpu_vector< int > block_offsets_dev
offloadStream_t main_stream
void synchronize(offloadStream_t &stream)
gpu_vector< double > cab_dev_
gpu_vector< task_info > tasks_dev
std::vector< grid_info< double > > grid_
gpu_vector< double > forces_
void compute_hab_coefficients()
gpu_vector< double > hab_block_
std::vector< double * > sphi
std::vector< offloadStream_t > level_streams
context_info(const int device_id__)
void initialize_basis_sets(const grid_basis_set **basis_sets, const int nkinds__)
std::vector< int > number_of_tasks_per_level_
gpu_vector< double * > sphi_dev
gpu_vector< int > num_tasks_per_block_dev_
void integrate_one_grid_level(const int level, int *lp_diff)
Launches the Cuda kernel that integrates all tasks of one grid level.
gpu_vector< int > task_sorted_by_blocks_dev
gpu_vector(const size_t size__, const void *ptr__)
void zero(offloadStream_t &stream__)
gpu_vector(const size_t size__)
void copy_to_gpu(offloadStream_t &stream__)
void copy_from_gpu(T *data__, offloadStream_t &stream__)
void associate(void *host_ptr__, void *device_ptr__, const size_t size__)
void copy_to_gpu(const std::vector< T > &data__)
void copy_to_gpu(const T *data__, offloadStream_t &stream__)
void copy_to_gpu(const T *data__)
void copy_from_gpu(offloadStream_t &stream__)
void resize(const size_t new_size_)
int full_size(const int i)
int local_size(const int i)
void zero(offloadStream_t &stream)
void copy_to_gpu(const T *data, offloadStream_t &stream)
void set_lattice_vectors(const double *dh__, const double *dh_inv__)
void copy_to_host(double *data__, offloadStream_t &stream)
int border_width(const int i)
void check_orthorhombicity(const bool ortho)
int lower_corner(const int i)
void copy_to_gpu(offloadStream_t &stream)
void copy_to_host(offloadStream_t &stream)
void resize(const int *full_size__, const int *local_size__, const int *const roffset__, const int *const border_width__)
void associate(void *host_ptr__, void *device_ptr__, const size_t size__)
grid_info(const int *full_size__, const int *local_size__, const int *border_width__)
void is_distributed(const bool distributed__)
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
Internal representation of a basis set.
Parameters of the collocate kernel.
int * task_sorted_by_blocks_dev
int grid_lower_corner_[3]
int * num_tasks_per_block_dev
int grid_border_width_[3]
int * sorted_blocks_offset_dev
Internal representation of a task.