grid_hip_context.h
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
/* */
/* SPDX-License-Identifier: BSD-3-Clause */
/*----------------------------------------------------------------------------*/

/*
 * Authors:
 *  - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
 *  - Advanced Micro Devices, Inc.
 */

#ifndef GRID_HIP_CONTEXT_H
#define GRID_HIP_CONTEXT_H

#include <hip/hip_runtime_api.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

extern "C" {
#include "../common/grid_basis_set.h"
#include "../common/grid_constants.h"
}

#include "../../offload/offload_library.h"
#include "../../offload/offload_runtime.h"

namespace rocm_backend {
// A small helper class in the same spirit as std::vector. Something
// equivalent probably exists somewhere; it may also be possible to get the
// same behavior from std::vector with a dedicated allocator.

class smem_parameters;
template <typename T> class gpu_vector {
  size_t allocated_size_{0};
  size_t current_size_{0};
  bool allocated_outside_{false};
  bool internal_allocation_{false};
  T *device_ptr_ = nullptr;
  T *host_ptr_ = nullptr;

public:
  // default constructor: empty vector, no device allocation
  gpu_vector() {}

  // size__ is the number of elements, not the memory size in bytes
  gpu_vector(const size_t size__) {
    if (size__ < 16) {
      allocated_size_ = 16;
    } else {
      allocated_size_ = (size__ / 16 + 1) * 16;
    }
    current_size_ = size__;
    internal_allocation_ = true;
#ifndef __OFFLOAD_UNIFIED_MEMORY
    offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
#else
    hipMallocManaged((void **)&device_ptr_, sizeof(T) * allocated_size_);
#endif
  }

  // wrap an externally allocated device buffer; the memory is not owned by
  // this class
  gpu_vector(const size_t size__, const void *ptr__) {
    allocated_size_ = size__;
    current_size_ = size__;
    allocated_outside_ = true;
    device_ptr_ = static_cast<T *>(const_cast<void *>(ptr__));
  }

  inline size_t size() const { return current_size_; }

  inline void copy_to_gpu(const T *data__) {
    offloadMemcpyHtoD(device_ptr_, data__, sizeof(T) * current_size_);
  }

  inline void copy_to_gpu(const T *data__, offloadStream_t &stream__) {
    offloadMemcpyAsyncHtoD(device_ptr_, data__, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_to_gpu(offloadStream_t &stream__) {
    offloadMemcpyAsyncHtoD(device_ptr_, host_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_from_gpu(T *data__, offloadStream_t &stream__) {
    offloadMemcpyAsyncDtoH(data__, device_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void copy_from_gpu(offloadStream_t &stream__) {
    offloadMemcpyAsyncDtoH(host_ptr_, device_ptr_, sizeof(T) * current_size_,
                           stream__);
  }

  inline void zero(offloadStream_t &stream__) {
    // zero device grid buffers
    offloadMemsetAsync(device_ptr_, 0, sizeof(T) * current_size_, stream__);
  }

  inline void associate(void *host_ptr__, void *device_ptr__,
                        const size_t size__) {
    if (internal_allocation_) {
      if (device_ptr_)
        offloadFree(device_ptr_);
      if (host_ptr_)
        std::free(host_ptr_);
      internal_allocation_ = false;
    }

    allocated_outside_ = true;
    // size__ is the number of elements not the size of the memory block
    current_size_ = size__;
    device_ptr_ = static_cast<T *>(device_ptr__);
    host_ptr_ = static_cast<T *>(host_ptr__);
  }

  inline void zero() {
    // zero device grid buffers
    offloadMemset(device_ptr_, 0, sizeof(T) * current_size_);
  }

  inline void copy_to_gpu(const std::vector<T> &data__) {
    // If this assert fires, the vector on the GPU does not have the right
    // size: either resize the GPU vector first, or the CPU and the GPU
    // vectors do not represent the same quantity.
    assert(data__.size() == current_size_);
    offloadMemcpyHtoD(device_ptr_, data__.data(), sizeof(T) * data__.size());
  }

  inline void resize(const size_t new_size_) {
    if (allocated_outside_) {
      allocated_outside_ = false;
      allocated_size_ = 0;
      device_ptr_ = nullptr;
      host_ptr_ = nullptr;
    }

    if (allocated_size_ < new_size_) {
      if (device_ptr_ != nullptr)
        offloadFree(device_ptr_);
      allocated_size_ = (new_size_ / 16 + (new_size_ % 16 != 0)) * 16;
      offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
      internal_allocation_ = true;
    }
    current_size_ = new_size_;
  }

  // does not invalidate the pointer; the memory is still allocated
  inline void clear() { current_size_ = 0; }

  // reset the class and free memory
  inline void reset() {
    if (!allocated_outside_) {
      if (device_ptr_ != nullptr)
        offloadFree(device_ptr_);

      if (host_ptr_ != nullptr)
        std::free(host_ptr_);
    }

    allocated_size_ = 0;
    current_size_ = 0;
    device_ptr_ = nullptr;
    host_ptr_ = nullptr;
    internal_allocation_ = false;
  }

  inline T *data() { return device_ptr_; }
};
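
// Illustrative usage sketch: not part of the original header, and the helper
// name gpu_vector_usage_example is hypothetical. It shows a typical
// host -> device -> host round trip with a gpu_vector and an offload stream,
// using only the members defined above.
inline void gpu_vector_usage_example() {
  std::vector<double> host(128, 1.0);

  offloadStream_t stream;
  offloadStreamCreate(&stream);

  gpu_vector<double> dev(host.size()); // allocation padded to a multiple of 16
  dev.copy_to_gpu(host);               // blocking copy; asserts that sizes match
  dev.zero(stream);                    // asynchronous memset on the stream
  dev.copy_from_gpu(host.data(), stream); // asynchronous copy back to the host
  offloadStreamSynchronize(stream);

  dev.reset(); // free the device buffer
  offloadStreamDestroy(stream);
}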

template <typename T> class grid_info {
  int full_size_[3] = {0, 0, 0};
  int local_size_[3] = {0, 0, 0};
  // origin of the local part of the grid, in grid points
  int lower_corner_[3] = {0, 0, 0};
  int border_width_[3] = {0, 0, 0};
  double dh_[9];
  double dh_inv_[9];
  bool orthorhombic_{false};
  bool is_distributed_{false};
  gpu_vector<T> grid_;

public:
  grid_info() {}

  grid_info(const int *full_size__, const int *local_size__,
            const int *border_width__) {
    // no local offset is given here, so assume the local part starts at the
    // origin of the grid
    const int roffset[3] = {0, 0, 0};
    initialize(full_size__, local_size__, roffset, border_width__);
  }

  ~grid_info() { grid_.reset(); }

  inline T *data() { return grid_.data(); }

  inline void copy_to_gpu(const T *data, offloadStream_t &stream) {
    grid_.copy_to_gpu(data, stream);
  }

  inline void copy_to_gpu(offloadStream_t &stream) {
    grid_.copy_to_gpu(stream);
  }

  inline void reset() { grid_.reset(); }

  /*
   * We do not allocate memory as the buffer is always coming from the outside
   * world. We only initialize the sizes, etc.
   */
  inline void resize(const int *full_size__, const int *local_size__,
                     const int *const roffset__,
                     const int *const border_width__) {
    initialize(full_size__, local_size__, roffset__, border_width__);
  }

  inline size_t size() const { return grid_.size(); }

  inline void zero(offloadStream_t &stream) { grid_.zero(stream); }
  inline gpu_vector<T> &grid() { return grid_; }
  inline void set_lattice_vectors(const double *dh__, const double *dh_inv__) {
    memcpy(dh_, dh__, sizeof(double) * 9);
    memcpy(dh_inv_, dh_inv__, sizeof(double) * 9);
  }

  inline T *dh() { return dh_; }

  inline T *dh_inv() { return dh_inv_; }

  inline bool is_orthorhombic() { return orthorhombic_; }

  inline void is_distributed(const bool distributed__) {
    is_distributed_ = distributed__;
  }

  void check_orthorhombicity(const bool ortho) {
    if (ortho) {
      orthorhombic_ = true;
      return;
    }
    double norm1, norm2, norm3;
    bool orthogonal[3] = {false, false, false};
    norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
    norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
    norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];

    norm1 = 1.0 / sqrt(norm1);
    norm2 = 1.0 / sqrt(norm2);
    norm3 = 1.0 / sqrt(norm3);

    /* x z */
    orthogonal[0] =
        ((fabs(dh_[0] * dh_[6] + dh_[1] * dh_[7] + dh_[2] * dh_[8]) * norm1 *
          norm3) < 1e-12);
    /* y z */
    orthogonal[1] =
        ((fabs(dh_[3] * dh_[6] + dh_[4] * dh_[7] + dh_[5] * dh_[8]) * norm2 *
          norm3) < 1e-12);
    /* x y */
    orthogonal[2] =
        ((fabs(dh_[0] * dh_[3] + dh_[1] * dh_[4] + dh_[2] * dh_[5]) * norm1 *
          norm2) < 1e-12);

    orthorhombic_ = orthogonal[0] && orthogonal[1] && orthogonal[2];
  }

  inline void copy_to_host(double *data__, offloadStream_t &stream) {
    grid_.copy_from_gpu(data__, stream);
  }

  inline void copy_to_host(offloadStream_t &stream) {
    grid_.copy_from_gpu(stream);
  }

  inline void associate(void *host_ptr__, void *device_ptr__,
                        const size_t size__) {
    grid_.associate(host_ptr__, device_ptr__, size__);
  }

  inline bool is_distributed() { return is_distributed_; }

  inline int full_size(const int i) {
    assert(i < 3);
    return full_size_[i];
  }

  inline int local_size(const int i) {
    assert(i < 3);
    return local_size_[i];
  }

  inline int lower_corner(const int i) {
    assert(i < 3);
    return lower_corner_[i];
  }

  inline int border_width(const int i) {
    assert(i < 3);
    return border_width_[i];
  }

private:
  void initialize(const int *const full_size__, const int *const local_size__,
                  const int *const roffset__,
                  const int *const border_width__) {
    // The calling code stores things as cube[z][y][x] (cube(x,y,z) in
    // fortran), so all sizes arrive as [x,y,z]. We work in C/C++, so the
    // indices have to be permuted to get this right.

    full_size_[2] = full_size__[0];
    full_size_[1] = full_size__[1];
    full_size_[0] = full_size__[2];

    local_size_[2] = local_size__[0];
    local_size_[1] = local_size__[1];
    local_size_[0] = local_size__[2];

    lower_corner_[0] = roffset__[2];
    lower_corner_[1] = roffset__[1];
    lower_corner_[2] = roffset__[0];

    is_distributed_ = (full_size_[2] != local_size_[2]) ||
                      (full_size_[1] != local_size_[1]) ||
                      (full_size_[0] != local_size_[0]);

    border_width_[2] = border_width__[0];
    border_width_[1] = border_width__[1];
    border_width_[0] = border_width__[2];
  }
};
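
// Illustrative sketch: not part of the original header, and the helper name
// grid_info_usage_example is hypothetical. It shows the Fortran-to-C index
// permutation done by initialize(): sizes are passed in Fortran order
// [x, y, z] and read back with index 0 being the z direction.
inline void grid_info_usage_example() {
  const int full_size[3] = {30, 20, 10}; // Fortran order: nx, ny, nz
  const int border[3] = {0, 0, 0};

  grid_info<double> grid(full_size, full_size, border);

  assert(grid.full_size(0) == 10); // z is the slowest C index
  assert(grid.full_size(2) == 30); // x is the fastest C index
  assert(!grid.is_distributed());  // full and local sizes are identical
}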

/*******************************************************************************
 * \brief Internal representation of a task.
 ******************************************************************************/
struct task_info; // full definition not shown in this listing

/*******************************************************************************
 * \brief Parameters of the collocate kernel.
 ******************************************************************************/
struct kernel_params {
  int grid_full_size_[3] = {0, 0, 0};
  int grid_local_size_[3] = {0, 0, 0};
  int grid_lower_corner_[3] = {0, 0, 0};
  int grid_border_width_[3] = {0, 0, 0};
  double dh_[9];
  double dh_inv_[9];
  int *block_offsets{nullptr};
  char la_min_diff{0};
  char lb_min_diff{0};
  char la_max_diff{0};
  char lb_max_diff{0};
  double *ptr_dev[7] = {nullptr, nullptr, nullptr, nullptr,
                        nullptr, nullptr, nullptr};
  double **sphi_dev{nullptr};
  int ntasks{0};
};

/* Regroups all information about the context. */
class context_info {
private:
  int device_id_{-1};
  int lmax_{0};
  unsigned int checksum_{0};

public:
  int ntasks{0};
  int nlevels{0};
  int natoms{0};
  int nkinds{0};
  int nblocks{0};
  std::vector<double *> sphi;
  std::vector<offloadStream_t> level_streams;
  offloadStream_t main_stream;
  int stats[2][20]; // [has_border_mask][lp]
  // all these tables are on the gpu; we can resize them, copy to them and
  // copy from them
  gpu_vector<task_info> tasks_dev;
  gpu_vector<int> block_offsets_dev;
  gpu_vector<int> sorted_blocks_offset_dev;
  gpu_vector<int> task_sorted_by_blocks_dev;
  gpu_vector<int> num_tasks_per_block_dev_;
  gpu_vector<double> coef_dev_;
  gpu_vector<double> cab_dev_;
  gpu_vector<double> pab_block_;
  gpu_vector<double> hab_block_;
  gpu_vector<double> forces_;
  gpu_vector<double> virial_;
  gpu_vector<double *> sphi_dev;
  std::vector<grid_info<double>> grid_;
  std::vector<int> first_task_per_level_;
  std::vector<int> number_of_tasks_per_level_;
  std::vector<int> sphi_size;
  bool calculate_forces{false};
  bool calculate_virial{false};
  bool compute_tau{false};
  bool apply_border_mask{false};

  context_info(const int device_id__) {
    if (device_id__ < 0)
      device_id_ = 0;
    else
      device_id_ = device_id__;
  }

  void clear() {
    hipSetDevice(device_id_);
    tasks_dev.reset();
    cab_dev_.reset();
    sphi_dev.reset();
    forces_.reset();
    virial_.reset();
    for (auto &phi : sphi)
      if (phi != nullptr)
        offloadFree(phi);
    sphi.clear();

    offloadStreamDestroy(main_stream);

    for (int i = 0; i < nlevels; i++) {
      offloadStreamDestroy(level_streams[i]);
    }
    level_streams.clear();

    for (auto &grid : grid_) {
      grid.reset();
    }
    grid_.clear();
  }

  int lmax() const { return lmax_; }

  void initialize_basis_sets(const grid_basis_set **basis_sets,
                             const int nkinds__) {
    nkinds = nkinds__;
    if (nkinds__ > (int)sphi.size()) {
      for (auto &phi : sphi)
        if (phi != nullptr) {
          offloadFree(phi);
          phi = nullptr;
        }

      sphi.resize(nkinds__, nullptr);
      sphi_size.clear();
      sphi_size.resize(nkinds__, 0);
      sphi_dev.resize(nkinds__);
    }

    // Upload basis sets to device.
    for (int i = 0; i < nkinds__; i++) {
      const auto &basis_set = basis_sets[i];
      if (sphi_size[i] < basis_set->nsgf * basis_set->maxco) {
        // the previous buffer (if any) is too small: reallocate it
        if (sphi[i] != nullptr)
          offloadFree(sphi[i]);
        offloadMalloc((void **)&sphi[i],
                      basis_set->nsgf * basis_set->maxco * sizeof(double));
        sphi_size[i] = basis_set->nsgf * basis_set->maxco;
      }
      offloadMemset(sphi[i], 0, sizeof(double) * sphi_size[i]);
      offloadMemcpyHtoD(sphi[i], basis_set->sphi,
                        basis_set->nsgf * basis_set->maxco * sizeof(double));
    }

    // Find the largest angular momentum.
    lmax_ = 0;
    for (int ikind = 0; ikind < nkinds; ikind++) {
      for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
        lmax_ = std::max(lmax_, basis_sets[ikind]->lmax[iset]);
      }
    }
  }

  // note: the name create_streams is assumed; the original declaration of
  // this method is not shown in this listing
  void create_streams() {
    // allocate main hip stream
    offloadStreamCreate(&main_stream);

    // allocate one hip stream per grid level
    if ((int)level_streams.size() < nlevels) {
      level_streams.resize(nlevels);
      for (auto &stream : level_streams) {
        offloadStreamCreate(&stream);
      }
    }
  }

  void synchronize(offloadStream_t &stream) {
    offloadStreamSynchronize(stream);
  }

  void synchronize() {
    // wait for all the streams to finish
    offloadDeviceSynchronize();
  }

  void set_device() { hipSetDevice(device_id_); }

  void collocate_one_grid_level(const int level, const enum grid_func func,
                                int *lp_diff);
  void integrate_one_grid_level(const int level, int *lp_diff);

  /* basic checksum computation for a simple verification that the object is
     sane */
  void compute_checksum() { checksum_ = compute_checksum_(); }

  // note: the name verify_checksum is assumed; the original declaration of
  // this method is not shown in this listing
  void verify_checksum() {
    if (checksum_ != compute_checksum_()) {
      fprintf(stderr, "This object does not seem to have the right structure.\n"
                      "A casting went wrong or the object is corrupted\n");
      abort();
    }
  }

private:
  kernel_params set_kernel_parameters(const int level,
                                      const smem_parameters &smem_params);

  unsigned int compute_checksum_() {
    return natoms ^ ntasks ^ nlevels ^ nkinds ^ nblocks ^ 0x4F2C5D1A;
  }
};
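
// Minimal lifecycle sketch: not part of the original header, and the helper
// name context_info_lifecycle_example is hypothetical. It only touches
// members defined above and skips the task-list and grid setup that normally
// happens in between; verify_checksum() is the reconstructed name used above.
inline void context_info_lifecycle_example() {
  context_info ctx(0);    // negative device ids fall back to device 0
  ctx.set_device();       // hipSetDevice on the stored device id
  ctx.compute_checksum(); // store the sanity checksum
  ctx.verify_checksum();  // aborts if the object no longer matches its checksum
  (void)ctx.lmax();       // 0 here, since no basis sets have been uploaded yet
}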

} // namespace rocm_backend
#endif