grid_hip_collocate.cu
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
/* */
/* SPDX-License-Identifier: BSD-3-Clause */
/*----------------------------------------------------------------------------*/

/*
 * Authors :
 - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
 - Advanced Micro Devices, Inc.
*/

#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)

#include <algorithm>
#include <assert.h>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <hip/hip_runtime.h>

/* (project-local grid headers elided in this listing) */

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

namespace rocm_backend {
/*******************************************************************************
 * \brief Decontracts the subblock, going from spherical to cartesian harmonics.
 ******************************************************************************/
template <typename T, bool IS_FUNC_AB>
__device__ __inline__ void block_to_cab(const kernel_params &params,
                                        const smem_task<T> &task, T *cab) {

  // The spherical index runs over angular momentum and then over contractions.
  // The cartesian index runs over exponents and then over angular momentum.

  // This is a matrix product in precision T. Since the pab block can be quite
  // large, the two products are fused to conserve shared memory.
  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  for (int jco = task.first_cosetb + tid / 16; jco < task.ncosetb; jco += 4) {
    for (int ico = task.first_coseta + (tid % 16); ico < task.ncoseta;
         ico += 16) {
      T pab_val = 0.0;
      for (int i = 0; i < task.nsgf_setb; i++) {
        const T sphib = task.sphib[i * task.maxcob + jco];
        T tmp_val = 0.0;
        for (int j = 0; j < task.nsgf_seta; j++) {
          T block_val;
          if (task.block_transposed) {
            block_val =
                task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
          } else {
            block_val =
                task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
          }
          // const T sphia = task.sphia[j * task.maxcoa + ico];
          tmp_val += block_val * task.sphia[j * task.maxcoa + ico];
        }
        pab_val += tmp_val * sphib;
      }

      if (IS_FUNC_AB) {
        cab[jco * task.ncoseta + ico] += pab_val;
      } else {
        const auto a = coset_inv[ico];
        const auto b = coset_inv[jco];
        // The atomicAdd used in prepare_pab remains; is it actually needed?
        prepare_pab(params.func, a, b, task.zeta, task.zetb, pab_val, task.n1,
                    cab);
      }
    }
  }
}
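
// In matrix notation, the fused loops above compute
//   cab(jco, ico) += sum_{i,j} sphib(i, jco) * pab(i, j) * sphia(j, ico),
// i.e. cab = sphib^T * pab * sphia. Each thread owns one (jco, ico) element
// of a 16 x 4 tile: tid % 16 strides over ico, tid / 16 over jco.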

template <typename T, bool IS_FUNC_AB>
__global__
__launch_bounds__(64) void calculate_coefficients(const kernel_params dev_) {
  __shared__ smem_task<T> task;
  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  fill_smem_task_coef(dev_, dev_.first_task + blockIdx.x, task);
  extern __shared__ T shared_memory[];
  // T *smem_cab = &shared_memory[dev_.smem_cab_offset];
  T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
  T *coef_ =
      &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset];
  T *smem_cab =
      &dev_.ptr_dev[6][dev_.tasks[dev_.first_task + blockIdx.x].cab_offset];
  compute_alpha(task, smem_alpha);
  for (int z = tid; z < task.n1 * task.n2;
       z += blockDim.x * blockDim.y * blockDim.z)
    smem_cab[z] = 0.0;
  __syncthreads();
  block_to_cab<T, IS_FUNC_AB>(dev_, task, smem_cab);
  __syncthreads();
  cab_to_cxyz(task, smem_alpha, smem_cab, coef_);
}
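
// Note: despite its name, smem_cab points at a per-task scratch buffer in
// global memory (dev_.ptr_dev[6]); the commented-out line above shows it once
// lived in dynamic shared memory, which now holds only the alpha
// coefficients.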

/*
 \brief compute the real space representation of an operator expressed in the
 Gaussian basis

 this kernel performs the following operation

 \f[
 n_{ijk} = \sum_{p,\alpha,\beta,\gamma} C^p_{\alpha\beta\gamma} X_{\alpha,i}
 Y_{\beta,j} Z_{\gamma,k} \exp\left(-\eta (r_{ijk} - r_c)^2\right)
 \f]

 where $X_{\alpha,i}, Y_{\beta,j}, Z_{\gamma,k}$ are polynomials of degree
 $\alpha,\beta,\gamma$ and $r_{ijk}$ is a point (in cartesian coordinates) on
 a 3D grid. The $C^p_{\alpha\beta\gamma}$ are constrained such that
 $0 \le \alpha + \beta + \gamma \le l_{max}$. In practice this means we only
 need to store (lmax + 1) * (lmax + 2) * (lmax + 3) / 6 coefficients; all
 other coefficients are zero.

 to reduce computation, a spherical cutoff is applied such that all points
 $|r_{ijk} - r_c| > radius$ are skipped. The sum over p extends over all
 relevant pairs of gaussians (called tasks in the code).

 the kernel evaluates the polynomials and the gaussian, then accumulates the
 result on the grid.

 the coefficients $C^p_{\alpha\beta\gamma}$ are computed by
 calculate_coefficients. We only keep the nonzero elements to save memory.
*/
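
// For example, lmax = 6 gives (6 + 1) * (6 + 2) * (6 + 3) / 6 = 84 stored
// coefficients; accordingly, collocate_kernel caches coefs_[0..83] and only
// falls back to the generic coset_inv loop for task.lp >= 7.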

template <typename T, typename T3, bool distributed__, bool orthorhombic_>
__global__
__launch_bounds__(64) void collocate_kernel(const kernel_params dev_) {
  // Copy task from global to shared memory and precompute some stuff.
  __shared__ smem_task_reduced<T, T3> task;

  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;

  fill_smem_task_reduced(dev_, dev_.first_task + blockIdx.x, task);

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  // Alloc shared memory.
  extern __shared__ T coefs_[];

  T *coef_ =
      &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset];
  __shared__ T dh_[9], dh_inv_[9];

  if (tid < 9) {
    // matrix from lattice coordinates to cartesian coordinates
    dh_[tid] = dev_.dh_[tid];
    // matrix from cartesian coordinates to lattice coordinates.
    dh_inv_[tid] = dev_.dh_inv_[tid];
  }
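
  // dh_ is the 3x3 cell matrix stored row-major; for orthorhombic cells only
  // the diagonal entries dh_[0], dh_[4], dh_[8] (and dh_inv_[0], dh_inv_[4])
  // are used in the fast paths below.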

  for (int i = tid; i < ncoset(6); i += blockDim.x * blockDim.y * blockDim.z)
    coefs_[i] = coef_[i];

  if (tid == 0) {
    // the cube center is initially expressed in lattice coordinates, but we
    // always compute something like x = x + lower_corner + cube_center (+
    // roffset) - grid_lower_corner, so shift the cube center once here.
    task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
    task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
    task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];
    if (distributed__) {
      if (task.apply_border_mask) {
        compute_window_size(
            dev_.grid_local_size_,
            dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
            dev_.grid_border_width_, &task.window_size, &task.window_shift);
      }
    }
  }
  __syncthreads();

  for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
    int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];

    if (z2 < 0)
      z2 += dev_.grid_full_size_[0];

    if (distributed__) {
      // check if the point is within the window
      if (task.apply_border_mask) {
        // this test is only relevant when the grid is split over several mpi
        // ranks. in that case we take only the points contributing to the
        // local part of the grid.
        if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
          continue;
        }
      }
    }

    // compute the coordinates of the point in atomic coordinates
    T kremain;
    short int ymin = 0;
    short int ymax = task.cube_size.y - 1;
    if (orthorhombic_ && !task.apply_border_mask) {
      ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
      ymin *= ymin;
      kremain = task.discrete_radius * task.discrete_radius -
                ((T)ymin) * dh_[8] * dh_[8];
      ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
      ymax = 1 - ymin - task.lb_cube.y;
      ymin = ymin - task.lb_cube.y;
    }

    for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
      int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];
      if (y2 < 0)
        y2 += dev_.grid_full_size_[1];

      if (distributed__) {
        if (task.apply_border_mask) {
          // check if the point is within the window
          if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
            continue;
          }
        }
      }

      short int xmin = 0;
      short int xmax = task.cube_size.x - 1;
      if (orthorhombic_ && !task.apply_border_mask) {
        xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
        xmin *= xmin;
        xmin =
            ceil(-1.0e-8 - sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
                     dh_inv_[0]);
        xmax = 1 - xmin - task.lb_cube.x;
        xmin = xmin - task.lb_cube.x;
      }

      for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
        int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];

        if (x2 < 0)
          x2 += dev_.grid_full_size_[2];

        if (distributed__) {
          if (task.apply_border_mask) {
            // check if the point is within the window (the mask only matters
            // when the grid is split over several MPI ranks)
            if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
              continue;
            }
          }
        }

        // no distinction is made between the orthorhombic and
        // non-orthorhombic cases

        T3 r3;
        if (orthorhombic_) {
          r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
          r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
          r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
        } else {
          r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
                                   (y + task.lb_cube.y + task.roffset.y),
                                   (z + task.lb_cube.z + task.roffset.z));
        }

        const T r3x2 = r3.x * r3.x;
        const T r3y2 = r3.y * r3.y;
        const T r3z2 = r3.z * r3.z;

        if (distributed__) {
          // check whether the point is inside the sphere. The test does not
          // apply in the orthorhombic case when the full sphere is inside the
          // region of interest.
          if (((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)) &&
              (!orthorhombic_ || task.apply_border_mask))
            continue;
        } else {
          // we do not need this test in the orthorhombic case
          if ((!orthorhombic_) &&
              ((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)))
            continue;
        }

        // allow the address computation to proceed in parallel with the
        // start of the polynomial evaluation
        T res = coefs_[0];

        if (task.lp >= 1) {
          res += coefs_[1] * r3.x;
          res += coefs_[2] * r3.y;
          res += coefs_[3] * r3.z;
        }
        const T r3xy = r3.x * r3.y;
        const T r3xz = r3.x * r3.z;
        const T r3yz = r3.y * r3.z;

        if (task.lp >= 2) {
          res += coefs_[4] * r3x2;
          res += coefs_[5] * r3xy;
          res += coefs_[6] * r3xz;
          res += coefs_[7] * r3y2;
          res += coefs_[8] * r3yz;
          res += coefs_[9] * r3z2;
        }
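
        // coefs_ is stored in the coset ordering of cartesian monomials:
        // 1; x, y, z; x^2, xy, xz, y^2, yz, z^2; ... hence the index ranges
        // 0, 1..3, 4..9 above and 10..19 for the l = 3 terms below.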

        if (task.lp >= 3) {
          res += r3x2 *
                 (coefs_[10] * r3.x + coefs_[11] * r3.y + coefs_[12] * r3.z);
          res += r3.x * (coefs_[13] * r3y2 + coefs_[15] * r3z2);
          res += coefs_[14] * r3xy * r3.z;
          res += r3y2 * (coefs_[16] * r3.y + coefs_[17] * r3.z);
          res += r3z2 * (coefs_[18] * r3.y + coefs_[19] * r3.z);
        }

        if (task.lp >= 4) {
          res += r3x2 *
                 (coefs_[20] * r3x2 + coefs_[21] * r3xy + coefs_[22] * r3xz +
                  coefs_[23] * r3y2 + coefs_[24] * r3yz + coefs_[25] * r3z2);
          res += r3y2 *
                 (coefs_[26] * r3xy + coefs_[27] * r3xz + coefs_[30] * r3y2 +
                  coefs_[31] * r3yz + coefs_[32] * r3z2);
          res += r3z2 * (coefs_[28] * r3xy + coefs_[29] * r3xz +
                         coefs_[33] * r3yz + coefs_[34] * r3z2);
        }

        if (task.lp >= 5) {
          const T r3x4 = r3x2 * r3x2;
          const T r3y4 = r3y2 * r3y2;
          const T r3z4 = r3z2 * r3z2;

          res += r3x4 * (coefs_[35] * r3.x + // x^5
                         coefs_[36] * r3.y + // x^4 y
                         coefs_[37] * r3.z); // x^4 z
          res += r3x2 * (r3.x * (coefs_[38] * r3y2 +  // x^3 y^2
                                 coefs_[39] * r3yz +  // x^3 y z
                                 coefs_[40] * r3z2) + // x^3 z^2
                         r3y2 * (coefs_[41] * r3.y +  // x^2 y^3
                                 coefs_[42] * r3.z) + // x^2 y^2 z
                         r3z2 * (coefs_[43] * r3.y +  // x^2 y z^2
                                 coefs_[44] * r3.z)); // x^2 z^3
          res += r3.x * (coefs_[45] * r3y4 +          // x y^4
                         r3y2 * (coefs_[46] * r3yz +  // x y^3 z
                                 coefs_[47] * r3z2) + // x y^2 z^2
                         r3z2 * (coefs_[48] * r3yz +  // x y z^3
                                 coefs_[49] * r3z2)); // x z^4
          res += r3y2 * (r3y2 * (coefs_[50] * r3.y +  // y^5
                                 coefs_[51] * r3.z) + // y^4 z
                         r3z2 * (coefs_[52] * r3.y +  // y^3 z^2
                                 coefs_[53] * r3.z)); // y^2 z^3
          res += r3z4 * (coefs_[54] * r3.y + // y z^4
                         coefs_[55] * r3.z); // z^5

          if (task.lp >= 6) {
            res += r3x4 * (coefs_[56] * r3x2 + // x^6
                           coefs_[57] * r3xy + // x^5 y
                           coefs_[58] * r3xz + // x^5 z
                           coefs_[59] * r3y2 + // x^4 y^2
                           coefs_[60] * r3yz + // x^4 y z
                           coefs_[61] * r3z2); // x^4 z^2

            res += r3x2 * (coefs_[62] * r3y2 * r3xy + // x^3 y^3
                           coefs_[63] * r3y2 * r3xz + // x^3 y^2 z
                           coefs_[64] * r3xy * r3z2 + // x^3 y z^2
                           coefs_[65] * r3z2 * r3xz + // x^3 z^3
                           coefs_[66] * r3y4 +        // x^2 y^4
                           coefs_[67] * r3y2 * r3yz + // x^2 y^3 z
                           coefs_[68] * r3y2 * r3z2 + // x^2 y^2 z^2
                           coefs_[69] * r3yz * r3z2 + // x^2 y z^3
                           coefs_[70] * r3z4);        // x^2 z^4
            res += r3y2 * r3z2 *
                   (coefs_[73] * r3xy + // x y^3 z^2
                    coefs_[74] * r3xz + // x y^2 z^3
                    coefs_[80] * r3yz + // y^3 z^3
                    coefs_[81] * r3z2); // y^2 z^4
            res += r3y4 * (coefs_[71] * r3xy + // x y^5
                           coefs_[72] * r3xz + // x y^4 z
                           coefs_[77] * r3y2 + // y^6
                           coefs_[78] * r3yz + // y^5 z
                           coefs_[79] * r3z2); // y^4 z^2
            res += r3z4 * (coefs_[75] * r3xy + // x y z^4
                           coefs_[76] * r3xz + // x z^5
                           coefs_[82] * r3yz + // y z^5
                           coefs_[83] * r3z2); // z^6
          }

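          // Generic fallback for high angular momenta: build each monomial
          // x^lx y^ly z^lz from the exponents stored in coset_inv, squaring
          // via r3x2/r3y2/r3z2 and multiplying in one extra factor of x, y
          // or z when the exponent is odd.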
          if (task.lp >= 7) {
            for (int ic = ncoset(6); ic < ncoset(task.lp); ic++) {
              T tmp1 = coef_[ic];
              auto &co = coset_inv[ic];
              T tmp = 1.0;
              for (int po = 0; po < (co.l[2] >> 1); po++)
                tmp *= r3z2;
              if (co.l[2] & 0x1)
                tmp *= r3.z;
              for (int po = 0; po < (co.l[1] >> 1); po++)
                tmp *= r3y2;
              if (co.l[1] & 0x1)
                tmp *= r3.y;
              for (int po = 0; po < (co.l[0] >> 1); po++)
                tmp *= r3x2;
              if (co.l[0] & 0x1)
                tmp *= r3.x;
              res += tmp * tmp1;
            }
          }
        }

        res *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
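
        // Accumulate onto the local grid; the flattened index is
        // (z2 * ny + y2) * nx + x2 with ny = grid_local_size_[1] and
        // nx = grid_local_size_[2]. atomicAdd is required because cubes from
        // different tasks (thread blocks) overlap on the grid.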
        atomicAdd(dev_.ptr_dev[1] +
                      (z2 * dev_.grid_local_size_[1] + y2) *
                          dev_.grid_local_size_[2] +
                      x2,
                  res);
      }
    }
  }
}
/*******************************************************************************
 * \brief Launches the HIP kernels that collocate all tasks of one grid level.
 ******************************************************************************/
void context_info::collocate_one_grid_level(const int level,
                                            const enum grid_func func,
                                            int *lp_diff) {

  if (number_of_tasks_per_level_[level] == 0)
    return;

  // Compute max angular momentum.
  const ldiffs_value ldiffs = prepare_get_ldiffs(func);
  smem_parameters smem_params(ldiffs, lmax());

  *lp_diff = smem_params.lp_diff();

  // kernel parameters
  kernel_params params = set_kernel_parameters(level, smem_params);
  params.func = func;

  // Launch!
  const dim3 threads_per_block(4, 4, 4);
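
  // One thread block per task on the level's stream; 4 x 4 x 4 = 64 threads
  // matches the __launch_bounds__(64) on both kernels. calculate_coefficients
  // sizes its dynamic shared memory from smem_params; collocate_kernel only
  // needs room to cache the first ncoset(6) coefficients.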

  if (func == GRID_FUNC_AB) {
    calculate_coefficients<double, true>
        <<<number_of_tasks_per_level_[level], threads_per_block,
           smem_params.smem_per_block(), level_streams[level]>>>(params);
  } else {
    calculate_coefficients<double, false>
        <<<number_of_tasks_per_level_[level], threads_per_block,
           smem_params.smem_per_block(), level_streams[level]>>>(params);
  }

  if (grid_[level].is_distributed()) {
    if (grid_[level].is_orthorhombic())
      collocate_kernel<double, double3, true, true>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(6) * sizeof(double), level_streams[level]>>>(params);
    else
      collocate_kernel<double, double3, true, false>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(6) * sizeof(double), level_streams[level]>>>(params);
  } else {
    if (grid_[level].is_orthorhombic())
      collocate_kernel<double, double3, false, true>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(6) * sizeof(double), level_streams[level]>>>(params);
    else
      collocate_kernel<double, double3, false, false>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(6) * sizeof(double), level_streams[level]>>>(params);
  }
}
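
// Typical host-side use, sketched; `ctx` and `level` are illustrative names
// not defined in this file:
//   int lp_diff = 0;
//   ctx->collocate_one_grid_level(level, GRID_FUNC_AB, &lp_diff);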
} // namespace rocm_backend
#endif // defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)