(git:6768d83)
Loading...
Searching...
No Matches
grid_gpu_context.h
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8/*
9 * Authors :
10 - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
11 - Advanced Micro Devices, Inc.
12*/
13
14#ifndef GRID_GPU_CONTEXT_H
15#define GRID_GPU_CONTEXT_H
16
17#ifdef __OFFLOAD_HIP
18#include <hip/hip_runtime_api.h>
19#else
20#include <cuda_runtime.h>
21#endif
22#include <vector>
23
24extern "C" {
25#include "../common/grid_basis_set.h"
26#include "../common/grid_constants.h"
27}
28
29#include "../../offload/offload_library.h"
30#include "../../offload/offload_runtime.h"
31
32namespace rocm_backend {
33// a little helper class in the same spirit than std::vector. it must exist
34// somewhere. Maybe possible to get the same thing with std::vector and
35// specific allocator.
36
37class smem_parameters;
38template <typename T> class gpu_vector {
39 size_t allocated_size_{0};
40 size_t current_size_{0};
41 bool allocated_outside_{false};
42 bool internal_allocation_ = {false};
43 T *device_ptr_ = nullptr;
44 T *host_ptr_ = nullptr;
45
46public:
48
49 // size is the number of elements not the memory size
50 gpu_vector(const size_t size__) {
51 if (size__ < 16) {
52 allocated_size_ = 16;
53 } else {
54 allocated_size_ = (size__ / 16 + 1) * 16;
55 }
56 current_size_ = size__;
57 internal_allocation_ = true;
58#ifndef __OFFLOAD_UNIFIED_MEMORY
59 offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
60#else
61 hipMallocManaged((void **)&device_ptr_, sizeof(T) * allocated_size_);
62#endif
63 }
64
65 gpu_vector(const size_t size__, const void *ptr__) {
66 allocated_size_ = size__;
67 current_size_ = size__;
68 allocated_outside_ = true;
69 device_ptr_ = ptr__;
70 }
72
73 inline size_t size() { return current_size_; }
74
75 inline void copy_to_gpu(const T *data__) {
76 offloadMemcpyHtoD(device_ptr_, data__, sizeof(T) * current_size_);
77 }
78
79 inline void copy_to_gpu(const T *data__, offloadStream_t &stream__) {
80 offloadMemcpyAsyncHtoD(device_ptr_, data__, sizeof(T) * current_size_,
81 stream__);
82 }
83
84 inline void copy_to_gpu(offloadStream_t &stream__) {
85 offloadMemcpyAsyncHtoD(device_ptr_, host_ptr_, sizeof(T) * current_size_,
86 stream__);
87 }
88
89 inline void copy_from_gpu(T *data__, offloadStream_t &stream__) {
90 offloadMemcpyAsyncDtoH(data__, device_ptr_, sizeof(T) * current_size_,
91 stream__);
92 }
93
94 inline void copy_from_gpu(offloadStream_t &stream__) {
95 offloadMemcpyAsyncDtoH(host_ptr_, device_ptr_, sizeof(T) * current_size_,
96 stream__);
97 }
98
99 inline void zero(offloadStream_t &stream__) {
100 // zero device grid buffers
101 offloadMemsetAsync(device_ptr_, 0, sizeof(T) * current_size_, stream__);
102 }
103
104 inline void associate(void *host_ptr__, void *device_ptr__,
105 const size_t size__) {
106
107 if (internal_allocation_) {
108 if (device_ptr_)
109 offloadFree(device_ptr_);
110 if (host_ptr_)
111 std::free(host_ptr_);
112 internal_allocation_ = false;
113 }
114
115 allocated_outside_ = true;
116 // size__ is the number of elements not the size of the memory block
117 current_size_ = size__;
118 device_ptr_ = static_cast<T *>(device_ptr__);
119 host_ptr_ = static_cast<T *>(host_ptr__);
120 }
121
122 inline void zero() {
123 // zero device grid buffers
124 offloadMemset(device_ptr_, 0, sizeof(T) * current_size_);
125 }
126
127 inline void copy_to_gpu(const std::vector<T> &data__) {
128 assert(data__.size() == current_size_);
129 // if it fails it means that the vector on the gpu does not have the right
130 // size. two option then
131 // - resize the gpu vector
132 // - or the cpu vector and gpu vector are not representing the quantity.
133
134 offloadMemcpyHtoD(device_ptr_, data__.data(), sizeof(T) * data__.size());
135 }
136
137 inline void resize(const size_t new_size_) {
138 if (allocated_outside_) {
139 allocated_outside_ = false;
140 allocated_size_ = 0;
141 device_ptr_ = nullptr;
142 host_ptr_ = nullptr;
143 }
144
145 if (allocated_size_ < new_size_) {
146 if (device_ptr_ != nullptr)
147 offloadFree(device_ptr_);
148 allocated_size_ = (new_size_ / 16 + (new_size_ % 16 != 0)) * 16;
149 offloadMalloc((void **)&device_ptr_, sizeof(T) * allocated_size_);
150 internal_allocation_ = true;
151 }
152 current_size_ = new_size_;
153 }
154
155 // does not invalidate the pointer. The memory is still allocated
156 inline void clear() { current_size_ = 0; }
157
158 // reset the class and free memory
159 inline void reset() {
160 if (!allocated_outside_) {
161 if (device_ptr_ != nullptr)
162 offloadFree(device_ptr_);
163
164 if (host_ptr_ != nullptr)
165 std::free(device_ptr_);
166 }
167
168 allocated_size_ = 0;
169 current_size_ = 0;
170 device_ptr_ = nullptr;
171 host_ptr_ = nullptr;
172 internal_allocation_ = false;
173 }
174
175 inline T *data() { return device_ptr_; }
176};
177
178template <typename T> class grid_info {
179 int full_size_[3] = {0, 0, 0};
180 int local_size_[3] = {0, 0, 0};
181 // origin of the local part of the grid in grid point
182 int lower_corner_[3] = {0, 0, 0};
183 int border_width_[3] = {0, 0, 0};
184 double dh_[9];
185 double dh_inv_[9];
186 bool orthorhombic_{false};
187 bool is_distributed_{false};
188 gpu_vector<T> grid_;
189
190public:
192
193 grid_info(const int *full_size__, const int *local_size__,
194 const int *border_width__) {
195 initialize(full_size__, local_size__, border_width__);
196 }
197
198 ~grid_info() { grid_.reset(); };
199
200 inline T *data() { return grid_.data(); }
201
202 inline void copy_to_gpu(const T *data, offloadStream_t &stream) {
203 grid_.copy_to_gpu(data, stream);
204 }
205
206 inline void copy_to_gpu(offloadStream_t &stream) {
207 grid_.copy_to_gpu(stream);
208 }
209
210 inline void reset() { grid_.reset(); }
211
212 /*
213 * We do not allocate memory as the buffer is always coming from the outside
214 * world. We only initialize the sizes, etc...
215 */
216 inline void resize(const int *full_size__, const int *local_size__,
217 const int *const roffset__,
218 const int *const border_width__) {
219 initialize(full_size__, local_size__, roffset__, border_width__);
220 }
221
222 inline size_t size() const { return grid_.size(); }
223
224 inline void zero(offloadStream_t &stream) { grid_.zero(stream); }
225 inline gpu_vector<T> &grid() { return grid_; }
226 inline void set_lattice_vectors(const double *dh__, const double *dh_inv__) {
227 memcpy(dh_, dh__, sizeof(double) * 9);
228 memcpy(dh_inv_, dh_inv__, sizeof(double) * 9);
229 }
230
231 inline T *dh() { return dh_; }
232
233 inline T *dh_inv() { return dh_inv_; }
234
235 inline bool is_orthorhombic() { return orthorhombic_; }
236
237 inline void is_distributed(const bool distributed__) {
238 is_distributed_ = distributed__;
239 }
240
241 void check_orthorhombicity(const bool ortho) {
242 if (ortho) {
243 orthorhombic_ = true;
244 return;
245 }
246 double norm1, norm2, norm3;
247 bool orthogonal[3] = {false, false, false};
248 norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
249 norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
250 norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];
251
252 norm1 = 1.0 / sqrt(norm1);
253 norm2 = 1.0 / sqrt(norm2);
254 norm3 = 1.0 / sqrt(norm3);
255
256 /* x z */
257 orthogonal[0] =
258 ((fabs(dh_[0] * dh_[6] + dh_[1] * dh_[7] + dh_[2] * dh_[8]) * norm1 *
259 norm3) < 1e-12);
260 /* y z */
261 orthogonal[1] =
262 ((fabs(dh_[3] * dh_[6] + dh_[4] * dh_[7] + dh_[5] * dh_[8]) * norm2 *
263 norm3) < 1e-12);
264 /* x y */
265 orthogonal[2] =
266 ((fabs(dh_[0] * dh_[3] + dh_[1] * dh_[4] + dh_[2] * dh_[5]) * norm1 *
267 norm2) < 1e-12);
268
269 orthorhombic_ = orthogonal[0] && orthogonal[1] && orthogonal[2];
270 }
271
272 inline void copy_to_host(double *data__, offloadStream_t &stream) {
273 grid_.copy_from_gpu(data__, stream);
274 }
275
276 inline void copy_to_host(offloadStream_t &stream) {
277 grid_.copy_from_gpu(stream);
278 }
279
280 inline void associate(void *host_ptr__, void *device_ptr__,
281 const size_t size__) {
282 grid_.associate(host_ptr__, device_ptr__, size__);
283 }
284 inline bool is_distributed() { return is_distributed_; }
285
286 inline int full_size(const int i) {
287 assert(i < 3);
288 return full_size_[i];
289 }
290
291 inline int local_size(const int i) {
292 assert(i < 3);
293 return local_size_[i];
294 }
295
296 inline int lower_corner(const int i) {
297 assert(i < 3);
298 return lower_corner_[i];
299 }
300
301 inline int border_width(const int i) {
302 assert(i < 3);
303 return border_width_[i];
304 }
305
306private:
307 void initialize(const int *const full_size__, const int *const local_size__,
308 const int *const roffset__, const int *const border_width__) {
309 // the calling code store things like this cube[z][y][x] (in fortran
310 // cube(x,y,z)) so all sizes are [x,y,z] while we are working in C/C++ so we
311 // have to permute the indices to get this right.
312
313 full_size_[2] = full_size__[0];
314 full_size_[1] = full_size__[1];
315 full_size_[0] = full_size__[2];
316
317 local_size_[2] = local_size__[0];
318 local_size_[1] = local_size__[1];
319 local_size_[0] = local_size__[2];
320
321 lower_corner_[0] = roffset__[2];
322 lower_corner_[1] = roffset__[1];
323 lower_corner_[2] = roffset__[0];
324
325 is_distributed_ = (full_size_[2] != local_size_[2]) ||
326 (full_size_[1] != local_size_[1]) ||
327 (full_size_[0] != local_size_[0]);
328
329 border_width_[2] = border_width__[0];
330 border_width_[1] = border_width__[1];
331 border_width_[0] = border_width__[2];
332 }
333};
334
335/*******************************************************************************
336 * \brief Internal representation of a task.
337 ******************************************************************************/
369
370/*******************************************************************************
371 * \brief Parameters of the collocate kernel.
372 ******************************************************************************/
373
378 int grid_full_size_[3] = {0, 0, 0};
379 int grid_local_size_[3] = {0, 0, 0};
380 int grid_lower_corner_[3] = {0, 0, 0};
381 int grid_border_width_[3] = {0, 0, 0};
382 double dh_[9];
383 double dh_inv_[9];
385 int *block_offsets{nullptr};
386 char la_min_diff{0};
387 char lb_min_diff{0};
388 char la_max_diff{0};
389 char lb_max_diff{0};
391 double *ptr_dev[7] = {nullptr, nullptr, nullptr, nullptr,
392 nullptr, nullptr, nullptr};
393 double **sphi_dev{nullptr};
394 int ntasks{0};
398};
399
400/* regroup all information about the context. */
402private:
403 int device_id_{-1};
404 int lmax_{0};
405 unsigned int checksum_{0};
406
407public:
408 int ntasks{0};
409 int nlevels{0};
410 int natoms{0};
411 int nkinds{0};
412 int nblocks{0};
413 std::vector<double *> sphi;
414 std::vector<offloadStream_t> level_streams;
415 offloadStream_t main_stream;
416 int stats[2][20]; // [has_border_mask][lp]
417 // all these tables are on the gpu. we can resize them copy to them and copy
418 // from them
428 std::vector<grid_info<double>> grid_;
430 std::vector<int> first_task_per_level_;
431 std::vector<int> sphi_size;
434 bool calculate_forces{false};
435 bool calculate_virial{false};
436 bool compute_tau{false};
437 bool apply_border_mask{false};
438
440 context_info(const int device_id__) {
441 if (device_id__ < 0)
442 device_id_ = 0;
443 else
444 device_id_ = device_id__;
445 }
447
448 void clear() {
449 offload_set_chosen_device(device_id_);
450 offload_activate_chosen_device();
451 tasks_dev.reset();
454 cab_dev_.reset();
457 sphi_dev.reset();
458 forces_.reset();
459 virial_.reset();
460 for (auto &phi : sphi)
461 if (phi != nullptr)
462 offloadFree(phi);
463 sphi.clear();
464
465 offloadStreamDestroy(main_stream);
466
467 for (int i = 0; i < nlevels; i++) {
468 offloadStreamDestroy(level_streams[i]);
469 }
470 level_streams.clear();
471
472 for (auto &grid : grid_) {
473 grid.reset();
474 }
475 grid_.clear();
476 }
477
478 int lmax() const { return lmax_; }
479
480 void initialize_basis_sets(const grid_basis_set **basis_sets,
481 const int nkinds__) {
482 nkinds = nkinds__;
483 if (nkinds__ > (int)sphi.size()) {
484 for (auto &phi : sphi)
485 if (phi != nullptr) {
486 offloadFree(phi);
487 }
488
489 sphi_dev.resize(nkinds__);
490
491 sphi.resize(nkinds__, nullptr);
492 sphi_size.clear();
493 sphi_size.resize(nkinds__, 0);
494 sphi_dev.resize(nkinds__);
495 }
496
497 // Upload basis sets to device.
498 for (int i = 0; i < nkinds__; i++) {
499 const auto &basis_set = basis_sets[i];
500 if (sphi_size[i] < basis_set->nsgf * basis_set->maxco) {
501 offloadMalloc((void **)&sphi[i],
502 basis_set->nsgf * basis_set->maxco * sizeof(double));
503 sphi_size[i] = basis_set->nsgf * basis_set->maxco;
504 }
505 offloadMemset(sphi[i], 0, sizeof(double) * sphi_size[i]);
506 offloadMemcpyHtoD(sphi[i], basis_set->sphi,
507 basis_set->nsgf * basis_set->maxco * sizeof(double));
508 }
510 // Find largest angular momentum.
511 lmax_ = 0;
512 for (int ikind = 0; ikind < nkinds; ikind++) {
513 for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
514 lmax_ = std::max(lmax_, basis_sets[ikind]->lmax[iset]);
515 }
516 }
517 }
518
520 // allocate main hip stream
521 offloadStreamCreate(&main_stream);
522
523 // allocate one hip stream per grid level
524 if ((int)level_streams.size() < nlevels) {
525 level_streams.resize(nlevels);
526 for (auto &stream : level_streams) {
527 offloadStreamCreate(&stream);
528 }
529 }
530 }
531
532 void synchronize(offloadStream_t &stream) {
533 offloadStreamSynchronize(stream);
534 }
535
536 void synchornize() {
537 // wait for all the streams to finish
538 offloadDeviceSynchronize();
539 }
540
541 void set_device() {
542 offload_set_chosen_device(device_id_);
543 offload_activate_chosen_device();
544 }
545
546 void collocate_one_grid_level(const int level, const enum grid_func func,
547 int *lp_diff);
548 void integrate_one_grid_level(const int level, int *lp_diff);
550 /* basic checksum computation for simple verification that the object is sane
551 */
552 void compute_checksum() { checksum_ = compute_checksum_(); }
554 if (checksum_ != compute_checksum_()) {
555 fprintf(stderr, "This object does not seem to have the right structure.\n"
556 "A casting went wrong or the object is corrupted\n");
557 abort();
558 }
559 }
560
561private:
562 kernel_params set_kernel_parameters(const int level,
563 const smem_parameters &smem_params);
564 unsigned int compute_checksum_() {
565 return natoms ^ ntasks ^ nlevels ^ nkinds ^ nblocks ^ 0x4F2C5D1A;
566 }
567};
568} // namespace rocm_backend
569#endif
std::vector< int > first_task_per_level_
gpu_vector< double > virial_
gpu_vector< int > sorted_blocks_offset_dev
gpu_vector< double > coef_dev_
gpu_vector< double > pab_block_
void collocate_one_grid_level(const int level, const enum grid_func func, int *lp_diff)
Launches the Cuda kernel that collocates all tasks of one grid level.
gpu_vector< int > block_offsets_dev
void synchronize(offloadStream_t &stream)
gpu_vector< double > cab_dev_
gpu_vector< task_info > tasks_dev
std::vector< grid_info< double > > grid_
gpu_vector< double > forces_
gpu_vector< double > hab_block_
std::vector< double * > sphi
std::vector< offloadStream_t > level_streams
context_info(const int device_id__)
void initialize_basis_sets(const grid_basis_set **basis_sets, const int nkinds__)
std::vector< int > number_of_tasks_per_level_
gpu_vector< double * > sphi_dev
gpu_vector< int > num_tasks_per_block_dev_
void integrate_one_grid_level(const int level, int *lp_diff)
Launches the Cuda kernel that integrates all tasks of one grid level.
gpu_vector< int > task_sorted_by_blocks_dev
gpu_vector(const size_t size__, const void *ptr__)
void zero(offloadStream_t &stream__)
gpu_vector(const size_t size__)
void copy_to_gpu(offloadStream_t &stream__)
void copy_from_gpu(T *data__, offloadStream_t &stream__)
void associate(void *host_ptr__, void *device_ptr__, const size_t size__)
void copy_to_gpu(const std::vector< T > &data__)
void copy_to_gpu(const T *data__, offloadStream_t &stream__)
void copy_to_gpu(const T *data__)
void copy_from_gpu(offloadStream_t &stream__)
void resize(const size_t new_size_)
int local_size(const int i)
void zero(offloadStream_t &stream)
void copy_to_gpu(const T *data, offloadStream_t &stream)
void set_lattice_vectors(const double *dh__, const double *dh_inv__)
void copy_to_host(double *data__, offloadStream_t &stream)
gpu_vector< T > & grid()
int border_width(const int i)
void check_orthorhombicity(const bool ortho)
int lower_corner(const int i)
void copy_to_gpu(offloadStream_t &stream)
void copy_to_host(offloadStream_t &stream)
void resize(const int *full_size__, const int *local_size__, const int *const roffset__, const int *const border_width__)
void associate(void *host_ptr__, void *device_ptr__, const size_t size__)
grid_info(const int *full_size__, const int *local_size__, const int *border_width__)
void is_distributed(const bool distributed__)
grid_func
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
Internal representation of a basis set.
Parameters of the collocate kernel.
Internal representation of a task.