grid_hip_context.h (git:b279b6b)
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>                */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

/*
 * Authors:
 *  - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
 *  - Advanced Micro Devices, Inc.
 */
#ifndef GRID_HIP_CONTEXT_H
#define GRID_HIP_CONTEXT_H

#include <hip/hip_runtime_api.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

extern "C" {
#include "../common/grid_basis_set.h"
#include "../common/grid_constants.h"
}

#include "../../offload/offload_library.h"
#include "../../offload/offload_runtime.h"

namespace rocm_backend {

class smem_parameters;

// A small helper class in the same spirit as std::vector. Something
// equivalent must already exist somewhere; it may also be possible to get the
// same behavior from std::vector with a dedicated allocator.
template <typename T> class gpu_vector {
  size_t allocated_size_{0};
  size_t current_size_{0};
  bool allocated_outside_{false};
  T *device_ptr_ = nullptr;
  T *host_ptr_ = nullptr;

public:
  gpu_vector() = default;

  // size__ is the number of elements, not the size in bytes
  gpu_vector(const size_t size__) {
    if (size__ < 16) {
      allocated_size_ = 16;
    } else {
      allocated_size_ = (size__ / 16 + 1) * 16;
    }
    current_size_ = size__;

#ifndef __OFFLOAD_UNIFIED_MEMORY
    offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
#else
    hipMallocManaged((void **)&device_ptr_, sizeof(T) * allocated_size_);
#endif
  }

  // wrap an existing device allocation; this class does not take ownership
  gpu_vector(const size_t size__, const void *ptr__) {
    allocated_size_ = size__;
    current_size_ = size__;
    allocated_outside_ = true;
    device_ptr_ = static_cast<T *>(const_cast<void *>(ptr__));
  }

  ~gpu_vector() { reset(); }

  inline size_t size() const { return current_size_; }

  inline void copy_to_gpu(const T *data__) {
    offloadMemcpyHtoD(device_ptr_, data__, sizeof(T) * current_size_);
  }

  inline void copy_to_gpu(const T *data__, offloadStream_t &stream__) {
    offloadMemcpyAsyncHtoD(device_ptr_, data__, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_to_gpu(offloadStream_t &stream__) {
    offloadMemcpyAsyncHtoD(device_ptr_, host_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_from_gpu(T *data__, offloadStream_t &stream__) {
    offloadMemcpyAsyncDtoH(data__, device_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_from_gpu(offloadStream_t &stream__) {
    offloadMemcpyAsyncDtoH(host_ptr_, device_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void zero(offloadStream_t &stream__) {
    // zero device grid buffers
    offloadMemsetAsync(device_ptr_, 0, sizeof(T) * current_size_, stream__);
  }

  inline void associate(void *host_ptr__, void *device_ptr__,
                        const size_t size__) {
    allocated_outside_ = true;
    // size__ is the number of elements, not the size of the memory block
    current_size_ = size__;
    device_ptr_ = static_cast<T *>(device_ptr__);
    host_ptr_ = static_cast<T *>(host_ptr__);
  }

  inline void zero() {
    // zero device grid buffers
    offloadMemset(device_ptr_, 0, sizeof(T) * current_size_);
  }

  inline void copy_to_gpu(const std::vector<T> &data__) {
    // If this assert fires, the vector on the GPU does not have the right
    // size. Either resize the GPU vector first, or the CPU vector and the GPU
    // vector do not represent the same quantity.
    assert(data__.size() == current_size_);
    offloadMemcpyHtoD(device_ptr_, data__.data(), sizeof(T) * data__.size());
  }

  inline void resize(const size_t new_size_) {
    if (allocated_outside_) {
      // the memory is not owned by this class, so simply detach from it
      allocated_outside_ = false;
      allocated_size_ = 0;
      device_ptr_ = nullptr;
      host_ptr_ = nullptr;
    }

    if (allocated_size_ < new_size_) {
      if (device_ptr_ != nullptr)
        offloadFree(device_ptr_);
      allocated_size_ = (new_size_ / 16 + (new_size_ % 16 != 0)) * 16;
      offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
    }
    current_size_ = new_size_;
  }

  // does not invalidate the pointer; the memory stays allocated
  inline void clear() { current_size_ = 0; }

  // reset the class and free the memory it owns
  inline void reset() {
    if (allocated_outside_) {
      return;
    }

    if (device_ptr_ != nullptr)
      offloadFree(device_ptr_);

    allocated_size_ = 0;
    current_size_ = 0;
    device_ptr_ = nullptr;
  }

  inline T *data() { return device_ptr_; }
};
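
// Illustrative usage sketch (not part of the original header; the function
// name is hypothetical): mirroring a host-side std::vector on the device with
// gpu_vector, assuming an offload stream has already been created elsewhere.
inline void example_upload_and_zero(const std::vector<double> &host_data,
                                    offloadStream_t &stream) {
  gpu_vector<double> dev_data(host_data.size()); // allocates device memory
  dev_data.copy_to_gpu(host_data);               // synchronous H2D copy
  dev_data.zero(stream);                         // asynchronous memset
  offloadStreamSynchronize(stream);              // wait for the memset
  dev_data.reset();                              // frees the device buffer
}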

template <typename T> class grid_info {
  int full_size_[3] = {0, 0, 0};
  int local_size_[3] = {0, 0, 0};
  // origin of the local part of the grid, in grid points
  int lower_corner_[3] = {0, 0, 0};
  int border_width_[3] = {0, 0, 0};
  double dh_[9];
  double dh_inv_[9];
  bool orthorhombic_{false};
  bool is_distributed_{false};
  gpu_vector<T> grid_;

public:
  grid_info() = default;

  grid_info(const int *full_size__, const int *local_size__,
            const int *border_width__) {
    // no offset given: assume the local grid starts at the origin
    const int roffset[3] = {0, 0, 0};
    initialize(full_size__, local_size__, roffset, border_width__);
  }

  ~grid_info() { grid_.reset(); }

  inline T *data() { return grid_.data(); }

  inline void copy_to_gpu(const T *data, offloadStream_t &stream) {
    grid_.copy_to_gpu(data, stream);
  }

  inline void copy_to_gpu(offloadStream_t &stream) {
    grid_.copy_to_gpu(stream);
  }

  inline void reset() { grid_.reset(); }

  inline void resize(const int *full_size__, const int *local_size__,
                     const int *const roffset__,
                     const int *const border_width__) {
    initialize(full_size__, local_size__, roffset__, border_width__);
  }

  inline size_t size() const { return grid_.size(); }

  inline void zero(offloadStream_t &stream) { grid_.zero(stream); }

  inline gpu_vector<T> &grid() { return grid_; }

  inline void set_lattice_vectors(const double *dh__, const double *dh_inv__) {
    memcpy(dh_, dh__, sizeof(double) * 9);
    memcpy(dh_inv_, dh_inv__, sizeof(double) * 9);
  }

  // the lattice vectors are always stored in double precision
  inline double *dh() { return dh_; }

  inline double *dh_inv() { return dh_inv_; }

  inline bool is_orthorhombic() { return orthorhombic_; }

  inline void is_distributed(const bool distributed__) {
    is_distributed_ = distributed__;
  }

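  // dh_ holds the three lattice vectors as consecutive triples, e.g.
  // h1 = (dh_[0], dh_[1], dh_[2]). The check below computes the normalized
  // dot product |h_i . h_j| / (|h_i| |h_j|) for the three pairs of distinct
  // vectors and declares the cell orthorhombic when all of them fall below
  // 1e-12.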
  void check_orthorhombicity(const bool ortho) {
    if (ortho) {
      orthorhombic_ = true;
      return;
    }
    double norm1, norm2, norm3;
    bool orthogonal[3] = {false, false, false};
    norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
    norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
    norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];

    norm1 = 1.0 / sqrt(norm1);
    norm2 = 1.0 / sqrt(norm2);
    norm3 = 1.0 / sqrt(norm3);

    /* x z */
    orthogonal[0] =
        ((fabs(dh_[0] * dh_[6] + dh_[1] * dh_[7] + dh_[2] * dh_[8]) * norm1 *
          norm3) < 1e-12);
    /* y z */
    orthogonal[1] =
        ((fabs(dh_[3] * dh_[6] + dh_[4] * dh_[7] + dh_[5] * dh_[8]) * norm2 *
          norm3) < 1e-12);
    /* x y */
    orthogonal[2] =
        ((fabs(dh_[0] * dh_[3] + dh_[1] * dh_[4] + dh_[2] * dh_[5]) * norm1 *
          norm2) < 1e-12);

    orthorhombic_ = orthogonal[0] && orthogonal[1] && orthogonal[2];
  }

  inline void copy_to_host(double *data__, offloadStream_t &stream) {
    grid_.copy_from_gpu(data__, stream);
  }

  inline void copy_to_host(offloadStream_t &stream) {
    grid_.copy_from_gpu(stream);
  }

  inline void associate(void *host_ptr__, void *device_ptr__,
                        const size_t size__) {
    grid_.associate(host_ptr__, device_ptr__, size__);
  }

  inline bool is_distributed() { return is_distributed_; }

  inline int full_size(const int i) {
    assert(i < 3);
    return full_size_[i];
  }

  inline int local_size(const int i) {
    assert(i < 3);
    return local_size_[i];
  }

  inline int lower_corner(const int i) {
    assert(i < 3);
    return lower_corner_[i];
  }

  inline int border_width(const int i) {
    assert(i < 3);
    return border_width_[i];
  }

private:
  void initialize(const int *const full_size__, const int *const local_size__,
                  const int *const roffset__,
                  const int *const border_width__) {
    // The calling code stores the cube as cube[z][y][x] (cube(x, y, z) in
    // Fortran), so all incoming sizes are given as [x, y, z]. Since we work
    // in C/C++ we permute the indices here to get the layout right.

    full_size_[2] = full_size__[0];
    full_size_[1] = full_size__[1];
    full_size_[0] = full_size__[2];

    local_size_[2] = local_size__[0];
    local_size_[1] = local_size__[1];
    local_size_[0] = local_size__[2];

    lower_corner_[0] = roffset__[2];
    lower_corner_[1] = roffset__[1];
    lower_corner_[2] = roffset__[0];

    is_distributed_ = (full_size_[2] != local_size_[2]) ||
                      (full_size_[1] != local_size_[1]) ||
                      (full_size_[0] != local_size_[0]);

    border_width_[2] = border_width__[0];
    border_width_[1] = border_width__[1];
    border_width_[0] = border_width__[2];

    grid_.resize(local_size_[0] * local_size_[1] * local_size_[2]);
  }
};
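
// Illustrative sketch (not part of the original header; the function name is
// hypothetical): building a grid_info for a non-distributed 20x18x16 grid.
// The sizes are passed in Fortran order [x, y, z]; initialize() permutes
// them, so index 0 of the accessors refers to the z extent.
inline void example_grid_layout() {
  const int full[3] = {20, 18, 16};  // [x, y, z] as provided by the caller
  const int border[3] = {0, 0, 0};
  grid_info<double> grid(full, full, border); // local grid == full grid
  assert(grid.full_size(0) == 16);            // z extent after the permutation
  assert(!grid.is_distributed());
}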

/*******************************************************************************
 * \brief Internal representation of a task.
 ******************************************************************************/
struct task_info {
  int level;
  int iatom;
  int jatom;
  int iset;
  int jset;
  int ipgf;
  int jpgf;
  int ikind, jkind;
  double radius;
  double ra[3], rb[3], rp[3];
  double rab2;
  double rab[3];
  int lp_max{0};
  size_t coef_offset{0};
  size_t cab_offset{0};
  double3 roffset;
  int3 cube_size;
  int3 lb_cube;
  bool skip_task;
  // ... (a few further members are elided in this extract)
};
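
// Each task_info entry describes the contribution of one pair of primitive
// Gaussian functions (iatom/iset/ipgf with jatom/jset/jpgf) on a given grid
// level; the task list itself is kept on the device as a
// gpu_vector<task_info>.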

/*******************************************************************************
 * \brief Parameters of the collocate kernel.
 ******************************************************************************/
struct kernel_params {
  int first_task{0};
  int grid_full_size_[3] = {0, 0, 0};
  int grid_local_size_[3] = {0, 0, 0};
  int grid_lower_corner_[3] = {0, 0, 0};
  int grid_border_width_[3] = {0, 0, 0};
  double dh_[9];
  double dh_inv_[9];
  int *block_offsets{nullptr};
  char la_min_diff{0};
  char lb_min_diff{0};
  char la_max_diff{0};
  char lb_max_diff{0};
  enum grid_func func;
  double *ptr_dev[7] = {nullptr, nullptr, nullptr, nullptr,
                        nullptr, nullptr, nullptr};
  double **sphi_dev{nullptr};
  int ntasks{0};
  int *num_tasks_per_block_dev{nullptr};
  // ... (further members elided in this extract)
};
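
// A kernel_params instance is produced per grid level by
// context_info::set_kernel_parameters() (declared below); its pointer members
// are expected to refer to device-side buffers.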

/* Regroup all information about the context. */
class context_info {
private:
  int device_id_{-1};
  int lmax_{0};
  unsigned int checksum_{0};

public:
  int ntasks{0};
  int nlevels{0};
  int natoms{0};
  int nkinds{0};
  int nblocks{0};
  std::vector<double *> sphi;
  std::vector<offloadStream_t> level_streams;
  offloadStream_t main_stream;
  int stats[2][20]; // [has_border_mask][lp]
  // All these tables live on the GPU; we can resize them, copy to them, and
  // copy from them.
  gpu_vector<task_info> tasks_dev;
  gpu_vector<int> block_offsets_dev;
  gpu_vector<int> sorted_blocks_offset_dev;
  gpu_vector<int> task_sorted_by_blocks_dev;
  gpu_vector<int> num_tasks_per_block_dev_;
  gpu_vector<double> coef_dev_;
  gpu_vector<double> cab_dev_;
  gpu_vector<double> pab_block_;
  gpu_vector<double> hab_block_;
  gpu_vector<double> forces_;
  gpu_vector<double> virial_;
  gpu_vector<double *> sphi_dev;
  std::vector<grid_info<double>> grid_;
  std::vector<int> number_of_tasks_per_level_;
  std::vector<int> first_task_per_level_;
  std::vector<int> sphi_size;
  bool calculate_forces{false};
  bool calculate_virial{false};
  bool compute_tau{false};
  bool apply_border_mask{false};

  context_info(const int device_id__) {
    if (device_id__ < 0)
      device_id_ = 0;
    else
      device_id_ = device_id__;
  }

  void clear() {
    hipSetDevice(device_id_);
    tasks_dev.reset();
    block_offsets_dev.reset();
    coef_dev_.reset();
    cab_dev_.reset();
    pab_block_.reset();
    hab_block_.reset();
    sphi_dev.reset();
    forces_.reset();
    virial_.reset();
    for (auto &phi : sphi)
      if (phi != nullptr)
        offloadFree(phi);
    sphi.clear();

    offloadStreamDestroy(main_stream);

    for (int i = 0; i < nlevels; i++) {
      offloadStreamDestroy(level_streams[i]);
    }
    level_streams.clear();

    for (auto &grid : grid_) {
      grid.reset();
    }
    grid_.clear();
  }

  int lmax() const { return lmax_; }

  void initialize_basis_sets(const grid_basis_set **basis_sets,
                             const int nkinds__) {
    nkinds = nkinds__;
    if (nkinds__ > (int)sphi.size()) {
      for (auto &phi : sphi)
        if (phi != nullptr) {
          offloadFree(phi);
          phi = nullptr;
        }

      sphi.resize(nkinds__, nullptr);
      sphi_size.clear();
      sphi_size.resize(nkinds__, 0);
      sphi_dev.resize(nkinds__);
    }

    // Upload the basis sets to the device.
    for (int i = 0; i < nkinds__; i++) {
      const auto &basis_set = basis_sets[i];
      if (sphi_size[i] < basis_set->nsgf * basis_set->maxco) {
        if (sphi[i] != nullptr)
          offloadFree(sphi[i]);
        offloadMalloc((void **)&sphi[i],
                      basis_set->nsgf * basis_set->maxco * sizeof(double));
        sphi_size[i] = basis_set->nsgf * basis_set->maxco;
      }
      offloadMemset(sphi[i], 0, sizeof(double) * sphi_size[i]);
      offloadMemcpyHtoD(sphi[i], basis_set->sphi,
                        basis_set->nsgf * basis_set->maxco * sizeof(double));
    }

    // Find the largest angular momentum.
    lmax_ = 0;
    for (int ikind = 0; ikind < nkinds; ikind++) {
      for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
        lmax_ = std::max(lmax_, basis_sets[ikind]->lmax[iset]);
      }
    }
  }

  void create_streams() {
    // allocate the main hip stream
    offloadStreamCreate(&main_stream);

    // allocate one hip stream per grid level
    if ((int)level_streams.size() < nlevels) {
      level_streams.resize(nlevels);
      for (auto &stream : level_streams) {
        offloadStreamCreate(&stream);
      }
    }
  }

  void synchronize(offloadStream_t &stream) {
    offloadStreamSynchronize(stream);
  }

  void synchronize() {
    // wait for all streams on the device to finish
    offloadDeviceSynchronize();
  }

  void set_device() { hipSetDevice(device_id_); }

  void collocate_one_grid_level(const int level, const enum grid_func func,
                                int *lp_diff);

  void integrate_one_grid_level(const int level, int *lp_diff);

  /* Basic checksum computation for a simple verification that the object is
   * sane. */
  void compute_checksum() { checksum_ = compute_checksum_(); }

  void verify_checksum() {
    if (checksum_ != compute_checksum_()) {
      fprintf(stderr,
              "This object does not seem to have the right structure.\n"
              "A casting went wrong or the object is corrupted.\n");
      abort();
    }
  }

private:
  kernel_params set_kernel_parameters(const int level,
                                      const smem_parameters &smem_params);

  unsigned int compute_checksum_() {
    return natoms ^ ntasks ^ nlevels ^ nkinds ^ nblocks ^ 0x4F2C5D1A;
  }
};
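
// Illustrative sketch (not part of the original header; the function name is
// hypothetical): the typical life cycle of a context_info as suggested by the
// interface above -- pick the device, create the streams, upload the basis
// sets, then tear everything down again.
inline void example_context_lifecycle(const grid_basis_set **basis_sets,
                                      const int nkinds__,
                                      const int nlevels__) {
  context_info ctx(0);      // use HIP device 0
  ctx.set_device();
  ctx.nlevels = nlevels__;  // one stream per grid level will be created
  ctx.create_streams();
  ctx.initialize_basis_sets(basis_sets, nkinds__);
  ctx.compute_checksum();   // stamp the object so later casts can be verified
  ctx.verify_checksum();    // would abort() if the object were corrupted
  ctx.clear();              // frees device buffers and destroys the streams
}
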
} // namespace rocm_backend
#endif // GRID_HIP_CONTEXT_H