grid_hip_context.h
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations           */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>                 */
/*                                                                              */
/* SPDX-License-Identifier: BSD-3-Clause                                        */
/*----------------------------------------------------------------------------*/

/*
 * Authors :
 *   - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
 *   - Advanced Micro Devices, Inc.
 */
#ifndef GRID_HIP_CONTEXT_H
#define GRID_HIP_CONTEXT_H

#include <hip/hip_runtime_api.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

extern "C" {
#include "../common/grid_basis_set.h"
#include "../common/grid_constants.h"
}

#include "../../offload/offload_library.h"
#include "../../offload/offload_runtime.h"

namespace rocm_backend {
// A small helper class in the same spirit as std::vector. Something similar
// certainly exists elsewhere; the same behavior could probably be obtained
// with std::vector and a dedicated allocator.

class smem_parameters;
template <typename T> class gpu_vector {
  size_t allocated_size_{0};
  size_t current_size_{0};
  bool allocated_outside_{false};
  T *device_ptr_ = nullptr;
  T *host_ptr_ = nullptr;

public:
  // trivially constructible; memory is acquired later through resize() or
  // associate()
  gpu_vector() {}

  // size__ is the number of elements, not the allocation size in bytes
  gpu_vector(const size_t size__) {
    if (size__ < 16) {
      allocated_size_ = 16;
    } else {
      allocated_size_ = (size__ / 16 + 1) * 16;
    }
    current_size_ = size__;

#ifndef __OFFLOAD_UNIFIED_MEMORY
    offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
#else
    hipMallocManaged((void **)&device_ptr_, sizeof(T) * allocated_size_);
#endif
  }

  // wrap an externally allocated device buffer; the class does not take
  // ownership of the memory
  gpu_vector(const size_t size__, const void *ptr__) {
    allocated_size_ = size__;
    current_size_ = size__;
    allocated_outside_ = true;
    device_ptr_ = static_cast<T *>(const_cast<void *>(ptr__));
  }

  inline size_t size() const { return current_size_; }

  inline void copy_to_gpu(const T *data__) {
    offloadMemcpyHtoD(device_ptr_, data__, sizeof(T) * current_size_);
  }

  inline void copy_to_gpu(const T *data__, offloadStream_t &stream__) {
    offloadMemcpyAsyncHtoD(device_ptr_, data__, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_to_gpu(offloadStream_t &stream__) {
    offloadMemcpyAsyncHtoD(device_ptr_, host_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_from_gpu(T *data__, offloadStream_t &stream__) {
    offloadMemcpyAsyncDtoH(data__, device_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_from_gpu(offloadStream_t &stream__) {
    offloadMemcpyAsyncDtoH(host_ptr_, device_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void zero(offloadStream_t &stream__) {
    // zero device grid buffers
    offloadMemsetAsync(device_ptr_, 0, sizeof(T) * current_size_, stream__);
  }

  inline void associate(void *host_ptr__, void *device_ptr__,
                        const size_t size__) {
    allocated_outside_ = true;
    // size__ is the number of elements, not the size of the memory block
    current_size_ = size__;
    device_ptr_ = static_cast<T *>(device_ptr__);
    host_ptr_ = static_cast<T *>(host_ptr__);
  }

  inline void zero() {
    // zero device grid buffers
    offloadMemset(device_ptr_, 0, sizeof(T) * current_size_);
  }

  inline void copy_to_gpu(const std::vector<T> &data__) {
    // If this assert fires, the GPU vector does not have the same size as the
    // host vector. Either resize the GPU vector first, or the two vectors do
    // not represent the same quantity.
    assert(data__.size() == current_size_);
    offloadMemcpyHtoD(device_ptr_, data__.data(), sizeof(T) * data__.size());
  }

  inline void resize(const size_t new_size_) {
    if (allocated_outside_) {
      // the buffer is not owned by this class; forget it and allocate our own
      allocated_outside_ = false;
      allocated_size_ = 0;
      device_ptr_ = nullptr;
      host_ptr_ = nullptr;
    }

    if (allocated_size_ < new_size_) {
      if (device_ptr_ != nullptr)
        offloadFree(device_ptr_);
      // round the capacity up to the next multiple of 16 elements
      allocated_size_ = (new_size_ / 16 + (new_size_ % 16 != 0)) * 16;
      offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
    }
    current_size_ = new_size_;
  }

  // does not invalidate the pointer; the memory stays allocated
  inline void clear() { current_size_ = 0; }

  // reset the class and free the device memory (unless it is owned elsewhere)
  inline void reset() {
    if (allocated_outside_) {
      return;
    }

    if (device_ptr_ != nullptr)
      offloadFree(device_ptr_);

    allocated_size_ = 0;
    current_size_ = 0;
    device_ptr_ = nullptr;
  }

  inline T *data() { return device_ptr_; }
};
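
// Illustrative usage sketch (not part of the interface): a typical round trip
// of a gpu_vector between host and device. The stream handling mirrors what
// context_info does further down in this file.
//
//   offloadStream_t stream;
//   offloadStreamCreate(&stream);
//
//   std::vector<double> host(256, 1.0);
//   gpu_vector<double> dev(host.size());    // capacity rounded up to 16 elements
//   dev.copy_to_gpu(host);                  // synchronous host -> device copy
//   dev.zero(stream);                       // asynchronous memset on the stream
//   dev.copy_from_gpu(host.data(), stream); // asynchronous device -> host copy
//   offloadStreamSynchronize(stream);
//
//   dev.reset();                            // frees the device buffer
//   offloadStreamDestroy(stream);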

template <typename T> class grid_info {
  int full_size_[3] = {0, 0, 0};
  int local_size_[3] = {0, 0, 0};
  // origin of the local part of the grid, in grid points
  int lower_corner_[3] = {0, 0, 0};
  int border_width_[3] = {0, 0, 0};
  double dh_[9];
  double dh_inv_[9];
  bool orthorhombic_{false};
  bool is_distributed_{false};
  gpu_vector<T> grid_;

public:
  grid_info() {}

  grid_info(const int *full_size__, const int *local_size__,
            const int *border_width__) {
    // no local offset is provided here; assume the local grid starts at the
    // origin of the full grid
    const int roffset[3] = {0, 0, 0};
    initialize(full_size__, local_size__, roffset, border_width__);
  }

  ~grid_info() { grid_.reset(); }

  inline T *data() { return grid_.data(); }

  inline void copy_to_gpu(const T *data, offloadStream_t &stream) {
    grid_.copy_to_gpu(data, stream);
  }

  inline void copy_to_gpu(offloadStream_t &stream) {
    grid_.copy_to_gpu(stream);
  }

  inline void reset() { grid_.reset(); }

  inline void resize(const int *full_size__, const int *local_size__,
                     const int *const roffset__,
                     const int *const border_width__) {
    initialize(full_size__, local_size__, roffset__, border_width__);
  }

  inline size_t size() const { return grid_.size(); }

  inline void zero(offloadStream_t &stream) { grid_.zero(stream); }
  inline gpu_vector<T> &grid() { return grid_; }
  inline void set_lattice_vectors(const double *dh__, const double *dh_inv__) {
    memcpy(dh_, dh__, sizeof(double) * 9);
    memcpy(dh_inv_, dh_inv__, sizeof(double) * 9);
  }

  inline double *dh() { return dh_; }

  inline double *dh_inv() { return dh_inv_; }

  inline bool is_orthorhombic() { return orthorhombic_; }

  inline void is_distributed(const bool distributed__) {
    is_distributed_ = distributed__;
  }

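  // The test below declares the lattice orthorhombic when every pair of
  // lattice vectors is numerically orthogonal, i.e. when
  //
  //   |a_i . a_j| / (|a_i| |a_j|) < 1e-12   for (i,j) in {(x,z), (y,z), (x,y)},
  //
  // where the rows of dh_ are taken as the lattice vectors a_x, a_y, a_z.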
  void check_orthorhombicity(const bool ortho) {
    if (ortho) {
      orthorhombic_ = true;
      return;
    }
    double norm1, norm2, norm3;
    bool orthogonal[3] = {false, false, false};
    norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
    norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
    norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];

    norm1 = 1.0 / sqrt(norm1);
    norm2 = 1.0 / sqrt(norm2);
    norm3 = 1.0 / sqrt(norm3);

    /* x z */
    orthogonal[0] =
        ((fabs(dh_[0] * dh_[6] + dh_[1] * dh_[7] + dh_[2] * dh_[8]) * norm1 *
          norm3) < 1e-12);
    /* y z */
    orthogonal[1] =
        ((fabs(dh_[3] * dh_[6] + dh_[4] * dh_[7] + dh_[5] * dh_[8]) * norm2 *
          norm3) < 1e-12);
    /* x y */
    orthogonal[2] =
        ((fabs(dh_[0] * dh_[3] + dh_[1] * dh_[4] + dh_[2] * dh_[5]) * norm1 *
          norm2) < 1e-12);

    orthorhombic_ = orthogonal[0] && orthogonal[1] && orthogonal[2];
  }

  inline void copy_to_host(double *data__, offloadStream_t &stream) {
    grid_.copy_from_gpu(data__, stream);
  }

  inline void copy_to_host(offloadStream_t &stream) {
    grid_.copy_from_gpu(stream);
  }

  inline void associate(void *host_ptr__, void *device_ptr__,
                        const size_t size__) {
    grid_.associate(host_ptr__, device_ptr__, size__);
  }
  inline bool is_distributed() { return is_distributed_; }

  inline int full_size(const int i) {
    assert(i < 3);
    return full_size_[i];
  }

  inline int local_size(const int i) {
    assert(i < 3);
    return local_size_[i];
  }

  inline int lower_corner(const int i) {
    assert(i < 3);
    return lower_corner_[i];
  }

  inline int border_width(const int i) {
    assert(i < 3);
    return border_width_[i];
  }

private:
  void initialize(const int *const full_size__, const int *const local_size__,
                  const int *const roffset__,
                  const int *const border_width__) {
    // The calling code stores the cube as cube[z][y][x] (cube(x, y, z) in
    // Fortran), so all incoming sizes are ordered [x, y, z]. Since we work in
    // C/C++, the indices have to be permuted to get the layout right.

    full_size_[2] = full_size__[0];
    full_size_[1] = full_size__[1];
    full_size_[0] = full_size__[2];

    local_size_[2] = local_size__[0];
    local_size_[1] = local_size__[1];
    local_size_[0] = local_size__[2];

    lower_corner_[0] = roffset__[2];
    lower_corner_[1] = roffset__[1];
    lower_corner_[2] = roffset__[0];

    is_distributed_ = (full_size_[2] != local_size_[2]) ||
                      (full_size_[1] != local_size_[1]) ||
                      (full_size_[0] != local_size_[0]);

    border_width_[2] = border_width__[0];
    border_width_[1] = border_width__[1];
    border_width_[0] = border_width__[2];

    grid_.resize(local_size_[0] * local_size_[1] * local_size_[2]);
  }
};
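
// Illustrative sketch (not part of the interface): setting up a grid_info for
// a non-distributed 64x64x64 grid. Sizes are passed in Fortran (x, y, z) order
// and permuted internally to the C [z][y][x] layout; dh and dh_inv are the
// 3x3 lattice matrices supplied by the calling code, only declared here.
//
//   const int npts[3] = {64, 64, 64};          // full grid, (x, y, z) order
//   const int border[3] = {0, 0, 0};
//   grid_info<double> rho(npts, npts, border); // local grid == full grid
//
//   double dh[9], dh_inv[9];                   // filled by the calling code
//   rho.set_lattice_vectors(dh, dh_inv);
//   rho.check_orthorhombicity(false);          // detect orthogonality numerically
//   rho.zero(stream);                          // stream as in the sketch above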

/*******************************************************************************
 * \brief Internal representation of a task.
 ******************************************************************************/
struct task_info;

/*******************************************************************************
 * \brief Parameters of the collocate kernel.
 ******************************************************************************/
struct kernel_params {
  int grid_full_size_[3] = {0, 0, 0};
  int grid_local_size_[3] = {0, 0, 0};
  int grid_lower_corner_[3] = {0, 0, 0};
  int grid_border_width_[3] = {0, 0, 0};
  double dh_[9];
  double dh_inv_[9];
  int *block_offsets{nullptr};
  char la_min_diff{0};
  char lb_min_diff{0};
  char la_max_diff{0};
  char lb_max_diff{0};
  double *ptr_dev[7] = {nullptr, nullptr, nullptr, nullptr,
                        nullptr, nullptr, nullptr};
  double **sphi_dev{nullptr};
  int ntasks{0};
};

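// How kernel_params relates to the classes above (illustrative only, based on
// the members visible in this header): set_kernel_parameters() in context_info
// below is expected to fill this POD per grid level, with grid a
// grid_info<double> for that level, so it can be passed by value to the HIP
// kernels, roughly along the lines of
//
//   kernel_params params;
//   for (int i = 0; i < 3; i++) {
//     params.grid_full_size_[i] = grid.full_size(i);
//     params.grid_local_size_[i] = grid.local_size(i);
//     params.grid_lower_corner_[i] = grid.lower_corner(i);
//     params.grid_border_width_[i] = grid.border_width(i);
//   }
//   memcpy(params.dh_, grid.dh(), 9 * sizeof(double));
//   memcpy(params.dh_inv_, grid.dh_inv(), 9 * sizeof(double));
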
/* Regroups all the information about the context. */
class context_info {
private:
  int device_id_{-1};
  int lmax_{0};
  unsigned int checksum_{0};

public:
  int ntasks{0};
  int nlevels{0};
  int natoms{0};
  int nkinds{0};
  int nblocks{0};
  std::vector<double *> sphi;
  std::vector<offloadStream_t> level_streams;
  offloadStream_t main_stream;
  int stats[2][20]; // [has_border_mask][lp]
  // all these tables live on the GPU; they can be resized, copied to and
  // copied from
  gpu_vector<task_info> tasks_dev;
  gpu_vector<int> block_offsets_dev;
  gpu_vector<int> sorted_blocks_offset_dev;
  gpu_vector<int> task_sorted_by_blocks_dev;
  gpu_vector<int> num_tasks_per_block_dev_;
  gpu_vector<double> coef_dev_;
  gpu_vector<double> cab_dev_;
  gpu_vector<double> pab_block_;
  gpu_vector<double> hab_block_;
  gpu_vector<double> forces_;
  gpu_vector<double> virial_;
  gpu_vector<double *> sphi_dev;
  std::vector<grid_info<double>> grid_;
  std::vector<int> number_of_tasks_per_level_;
  std::vector<int> first_task_per_level_;
  std::vector<int> sphi_size;
  bool calculate_forces{false};
  bool calculate_virial{false};
  bool compute_tau{false};
  bool apply_border_mask{false};

  context_info(const int device_id__) {
    if (device_id__ < 0)
      device_id_ = 0;
    else
      device_id_ = device_id__;
  }

  void clear() {
    hipSetDevice(device_id_);
    tasks_dev.reset();
    block_offsets_dev.reset();
    sorted_blocks_offset_dev.reset();
    task_sorted_by_blocks_dev.reset();
    num_tasks_per_block_dev_.reset();
    coef_dev_.reset();
    cab_dev_.reset();
    pab_block_.reset();
    hab_block_.reset();
    sphi_dev.reset();
    forces_.reset();
    virial_.reset();
    for (auto &phi : sphi)
      if (phi != nullptr)
        offloadFree(phi);
    sphi.clear();

    offloadStreamDestroy(main_stream);

    for (int i = 0; i < nlevels; i++) {
      offloadStreamDestroy(level_streams[i]);
    }
    level_streams.clear();

    for (auto &grid : grid_) {
      grid.reset();
    }
    grid_.clear();
  }

  int lmax() const { return lmax_; }

  void initialize_basis_sets(const grid_basis_set **basis_sets,
                             const int nkinds__) {
    nkinds = nkinds__;
    if (nkinds__ > (int)sphi.size()) {
      for (auto &phi : sphi)
        if (phi != nullptr) {
          offloadFree(phi);
          phi = nullptr;
        }

      sphi.resize(nkinds__, nullptr);
      sphi_size.clear();
      sphi_size.resize(nkinds__, 0);
      sphi_dev.resize(nkinds__);
    }

    // Upload basis sets to the device.
    for (int i = 0; i < nkinds__; i++) {
      const auto &basis_set = basis_sets[i];
      if (sphi_size[i] < basis_set->nsgf * basis_set->maxco) {
        // free the previous (smaller) buffer before reallocating
        if (sphi[i] != nullptr)
          offloadFree(sphi[i]);
        offloadMalloc((void **)&sphi[i],
                      basis_set->nsgf * basis_set->maxco * sizeof(double));
        sphi_size[i] = basis_set->nsgf * basis_set->maxco;
      }
      offloadMemset(sphi[i], 0, sizeof(double) * sphi_size[i]);
      offloadMemcpyHtoD(sphi[i], basis_set->sphi,
                        basis_set->nsgf * basis_set->maxco * sizeof(double));
    }

    // Find the largest angular momentum.
    lmax_ = 0;
    for (int ikind = 0; ikind < nkinds; ikind++) {
      for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
        lmax_ = std::max(lmax_, basis_sets[ikind]->lmax[iset]);
      }
    }
  }

  void create_streams() {
    // allocate the main hip stream
    offloadStreamCreate(&main_stream);

    // allocate one hip stream per grid level
    if ((int)level_streams.size() < nlevels) {
      level_streams.resize(nlevels);
      for (auto &stream : level_streams) {
        offloadStreamCreate(&stream);
      }
    }
  }

  void synchronize(offloadStream_t &stream) {
    offloadStreamSynchronize(stream);
  }

  void synchronize() {
    // wait for all streams to finish
    offloadDeviceSynchronize();
  }

  void set_device() { hipSetDevice(device_id_); }

  void collocate_one_grid_level(const int level, const enum grid_func func,
                                int *lp_diff);
  void integrate_one_grid_level(const int level, int *lp_diff);

  /* basic checksum computation for simple verification that the object is
   * sane */
  void compute_checksum() { checksum_ = compute_checksum_(); }

  void verify_checksum() {
    if (checksum_ != compute_checksum_()) {
      fprintf(stderr, "This object does not seem to have the right structure.\n"
                      "A cast went wrong or the object is corrupted.\n");
      abort();
    }
  }

private:
  kernel_params set_kernel_parameters(const int level,
                                      const smem_parameters &smem_params);
  unsigned int compute_checksum_() {
    return natoms ^ ntasks ^ nlevels ^ nkinds ^ nblocks ^ 0x4F2C5D1A;
  }
};
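
// Illustrative lifecycle sketch (not part of the interface; the task, block
// and grid uploads handled by the accompanying implementation files are
// omitted):
//
//   context_info ctx(device_id);
//   ctx.set_device();
//   ctx.initialize_basis_sets(basis_sets, nkinds);  // uploads sphi to the GPU
//   ctx.create_streams();                           // main + per-level streams
//   ctx.compute_checksum();
//
//   ctx.verify_checksum();                          // sanity check after the
//                                                   // cast from the C handle
//   int lp_diff;
//   ctx.collocate_one_grid_level(level, GRID_FUNC_AB, &lp_diff);
//   ctx.synchronize(ctx.level_streams[level]);
//
//   ctx.clear();                                    // frees buffers and streams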
} // namespace rocm_backend
#endif