(git:0d88fc2)
Loading...
Searching...
No Matches
grid_gpu_collocate.cu
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8/*
9 * Authors :
10 - Mathieu Taillefumier (ETH Zurich / CSCS)
11 - Advanced Micro Devices, Inc.
12 - Ole Schuett
13*/
14
15#include <algorithm>
16#include <assert.h>
17#include <climits>
18#include <cmath>
19#include <cstdio>
20#include <cstdlib>
21#include <cstring>
22
25
26#if defined(_OMP_H)
27#error "OpenMP should not be used in .cu files to accommodate HIP."
28#endif
29
30namespace rocm_backend {
31/*******************************************************************************
32 * \brief Decontracts the subblock, going from spherical to cartesian harmonics.
33 ******************************************************************************/
34template <typename T, bool IS_FUNC_AB>
35__device__ __inline__ void block_to_cab(const kernel_params &params,
36 const smem_task<T> &task, T *cab) {
37
38 // The spherical index runs over angular momentum and then over contractions.
39 // The cartesian index runs over exponents and then over angular momentum.
40
41 // This is a T matrix product. Since the pab block can be quite large the
42 // two products are fused to conserve shared memory.
43 const int tid = thread_global_index();
44
45 for (int jco = task.first_cosetb + tid / 16; jco < task.ncosetb; jco += 4) {
46 for (int ico = task.first_coseta + (tid % 16); ico < task.ncoseta;
47 ico += 16) {
48 T pab_val = 0.0;
49 for (int i = 0; i < task.nsgf_setb; i++) {
50 const T sphib = task.sphib[i * task.maxcob + jco];
51 T tmp_val = 0.0;
52 for (int j = 0; j < task.nsgf_seta; j++) {
53 T block_val;
54 if (task.block_transposed) {
55 block_val =
56 task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
57 } else {
58 block_val =
59 task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
60 }
61 tmp_val += block_val * task.sphia[j * task.maxcoa + ico];
62 }
63 pab_val += tmp_val * sphib;
64 }
65
66 if (IS_FUNC_AB) {
67 cab[jco * task.ncoseta + ico] += pab_val;
68 } else {
69 const auto a = coset_inv[ico];
70 const auto b = coset_inv[jco];
71 // remains the atomicAdd used in prepare_pab. Are they needed ?
72 prepare_pab(params.func, a, b, task.zeta, task.zetb, pab_val, task.n1,
73 cab);
74 }
75 }
76 }
77}
78
79template <typename T, bool IS_FUNC_AB>
80__global__
81__launch_bounds__(64) void calculate_coefficients(const kernel_params dev_) {
82 // Copy task from global to shared memory and precompute some stuff.
83 extern __shared__ T shared_memory[];
84 const int number_of_tasks = dev_.num_tasks_per_block_dev[block_index()];
85
86 if (number_of_tasks == 0)
87 return;
88
89 T *smem_alpha = &shared_memory[0];
90 const int tid = thread_global_index();
91 const int offset = dev_.sorted_blocks_offset_dev[block_index()];
92 T *smem_cab = reinterpret_cast<double *>(
93 __builtin_assume_aligned(allocate_workspace<T>(dev_), 32));
94
95 for (int tk = 0; tk < number_of_tasks; tk++) {
96 __shared__ smem_task<T> task;
97 const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
98 if (dev_.tasks[task_id].skip_task)
99 continue;
100 fill_smem_task_coef(dev_, task_id, task);
101
102 T *__restrict__ coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
103
104 compute_alpha(task, smem_alpha);
105
106 for (int z = tid; z < task.n1 * task.n2;
107 z += blockDim.x * blockDim.y * blockDim.z)
108 smem_cab[z] = 0.0;
109 __syncthreads();
110
111 block_to_cab<T, IS_FUNC_AB>(dev_, task, smem_cab);
112 __syncthreads();
113
114 cab_to_cxyz(task, smem_alpha, smem_cab, coef_);
115 __syncthreads();
116 }
117}
118
119/*
120 \brief compute the real space representation of an operator expressed in the
121 gaussian basis
122
123 this kernel does the following operation
124
125 n_{ijk} = \f[
126 \sum_{p,\alpha,\beta,\gamma} C^p_{\alpha\beta\gamma} X_{\alpha,i} Y_{\beta, j}
127 Z_{\gamma, k} \exp\left(- \eta (r_{ijk} - r_c) ^ 2\right)
128 ]
129
130 where $X_{\alpha,i}, Y_{\beta, j}, Z_{\gamma, k}$ are polynomials of degree
131 $\alpha,\beta,\gamma$ and $r_{ijk}% a point (in cartesian coordinates) on a 3D
132 grid. C^p_{\alpha\beta\gamma} are also constrained such that 0 <= \alpha +
133 \beta + \gamma <= lmax. It means in practice that we need store (lmax + 1) *
134 (lamax + 2) * (lmax + 3) / 6 coefficients all the other coefficients are zero
135
136 to reduce computation, a spherical cutoff is applied such that all points
137 $|r_{ijk} - r_c| > radius$ are not computed. The sum over p extends over all
138 relevant pairs of gaussians (which are called task in the code).
139
140 the kernel computes the polynomials and the gaussian then sums the result
141 back to the grid.
142
143 the coefficients $C^p_{\alpha\beta\gamma}$ are computed by
144 calculate_coefficients. We only keep the non zero elements to same memory.
145*/
146
147template <typename T, typename T3, bool distributed__, bool orthogonal_>
148__global__
149__launch_bounds__(64) void collocate_kernel(const kernel_params dev_) {
150 // Copy task from global to shared memory and precompute some stuff.
151 __shared__ smem_task_reduced<T, T3> task;
152
153 if (dev_.tasks[dev_.first_task + block_index()].skip_task)
154 return;
155
156 fill_smem_task_reduced(dev_, dev_.first_task + block_index(), task);
157
158 const int tid = thread_global_index();
159
160 // Alloc shared memory.
161 extern __shared__ T coefs_[];
162
163 T *coef_ =
164 &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + block_index()].coef_offset];
165 __shared__ T dh_[9];
166
167 if (tid < 9) {
168 // matrix from lattice coordinates to cartesian coordinates
169 dh_[tid] = dev_.dh_[tid];
170 }
171
172 for (int i = tid; i < ncoset(6); i += blockDim.x * blockDim.y * blockDim.z)
173 coefs_[i] = coef_[i];
174
175 if (tid == 0) {
176 // the cube center is initialy expressed in lattice coordinates but we
177 // always do something like this. x = x + lower_corner + cube_center (+
178 // roffset) - grid_lower_corner so shift the cube center already
179 setup_task_cube_center<T, T3, distributed__>(dev_, task);
180 }
181 __syncthreads();
182
183 for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
184 int z2 = wrap_grid_index(z + task.cube_center.z, dev_.grid_full_size_.z);
185
186 if (distributed__) {
187 // check if the point is within the window
188 if (task.apply_border_mask) {
189 // this test is only relevant when the grid is split over several mpi
190 // ranks. in that case we take only the points contributing to local
191 // part of the grid.
192 if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
193 continue;
194 }
195 }
196 }
197
198 // compute the coordinates of the point in atomic coordinates
199 T kremain;
200 int ymin = 0;
201 int ymax = task.cube_size.y - 1;
202
203 if (orthogonal_ && !task.apply_border_mask) {
204 kremain = calculate_ymix_ymax_boundaries(task, z, ymin, ymax);
205 }
206
207 for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
208 int y2 = wrap_grid_index(y + task.cube_center.y, dev_.grid_full_size_.y);
209
210 if (distributed__) {
211 if (task.apply_border_mask) {
212 // check if the point is within the window
213 if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
214 continue;
215 }
216 }
217 }
218
219 int xmin = 0;
220 int xmax = task.cube_size.x - 1;
221 if (orthogonal_ && !task.apply_border_mask) {
222 calculate_xmin_xmax_boundaries<T, T3>(task, y, kremain, xmin, xmax);
223 }
224
225 for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
226 int x2 =
227 wrap_grid_index(x + task.cube_center.x, dev_.grid_full_size_.x);
228
229 if (distributed__) {
230 if (task.apply_border_mask) {
231 // check if the point is within the window (only true or false
232 // when using mpi) otherwise MPI=1 always true
233 if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
234 continue;
235 }
236 }
237 }
238
239 // I make no distinction between orthogonal and non orthogonal
240 // cases
241
242 T3 r3;
243 r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
244 (y + task.lb_cube.y + task.roffset.y),
245 (z + task.lb_cube.z + task.roffset.z));
246
247 const T r3x2 = r3.x * r3.x;
248 const T r3y2 = r3.y * r3.y;
249 const T r3z2 = r3.z * r3.z;
250
251 if (distributed__) {
252 // check if the point is inside the sphere or not. Note that it does
253 // not apply for the orthorhombic case when the full sphere is inside
254 // the region of interest.
255
256 if (((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)) &&
257 (!orthogonal_ || task.apply_border_mask))
258 continue;
259 } else {
260 // we do not need to do this test for the orthorhombic case
261 if ((!orthogonal_) &&
262 ((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)))
263 continue;
264 }
265
266 // allow computation of the address in parallel to starting the
267 // computations
268 T res = coefs_[0];
269
270 if (task.lp >= 1) {
271 res += coefs_[1] * r3.x;
272 res += coefs_[2] * r3.y;
273 res += coefs_[3] * r3.z;
274 }
275 const T r3xy = r3.x * r3.y;
276 const T r3xz = r3.x * r3.z;
277 const T r3yz = r3.y * r3.z;
278
279 if (task.lp >= 2) {
280 res += coefs_[4] * r3x2;
281 res += coefs_[5] * r3xy;
282 res += coefs_[6] * r3xz;
283 res += coefs_[7] * r3y2;
284 res += coefs_[8] * r3yz;
285 res += coefs_[9] * r3z2;
286 }
287
288 if (task.lp >= 3) {
289 res += r3x2 *
290 (coefs_[10] * r3.x + coefs_[11] * r3.y + coefs_[12] * r3.z);
291 res += r3.x * (coefs_[13] * r3y2 + coefs_[15] * r3z2);
292 res += coefs_[14] * r3xy * r3.z;
293 res += r3y2 * (coefs_[16] * r3.y + coefs_[17] * r3.z);
294 res += r3z2 * (coefs_[18] * r3.y + coefs_[19] * r3.z);
295 }
296
297 if (task.lp >= 4) {
298 res += r3x2 *
299 (coefs_[20] * r3x2 + coefs_[21] * r3xy + coefs_[22] * r3xz +
300 coefs_[23] * r3y2 + coefs_[24] * r3yz + coefs_[25] * r3z2);
301 res += r3y2 *
302 (coefs_[26] * r3xy + coefs_[27] * r3xz + coefs_[30] * r3y2 +
303 coefs_[31] * r3yz + coefs_[32] * r3z2);
304 res += r3z2 * (coefs_[28] * r3xy + coefs_[29] * r3xz +
305 coefs_[33] * r3yz + coefs_[34] * r3z2);
306 }
307
308 if (task.lp >= 5) {
309 const T r3x4 = r3x2 * r3x2;
310 const T r3y4 = r3y2 * r3y2;
311 const T r3z4 = r3z2 * r3z2;
312
313 res += r3x4 * (coefs_[35] * r3.x + // x^5
314 coefs_[36] * r3.y + // x^4 y
315 coefs_[37] * r3.z); // x^4 z
316 res += r3x2 * (r3.x * (coefs_[38] * r3y2 + // x^3 y^2
317 coefs_[39] * r3yz + // x^3 y z
318 coefs_[40] * r3z2) + // x^3 z^2
319 r3y2 * (coefs_[41] * r3.y + // x^2 y^3
320 coefs_[42] * r3.z) + // x^2 y^2 z
321 r3z2 * (coefs_[43] * r3.y + // x^2 y z^2
322 coefs_[44] * r3.z)); // x^2 z^2 z
323 res += r3.x * (coefs_[45] * r3y4 + // x y^4
324 r3y2 * (coefs_[46] * r3yz + // x y^3 z
325 coefs_[47] * r3z2) + // x y^2 z^2
326 r3z2 * (coefs_[48] * r3yz + // x y z^3
327 coefs_[49] * r3z2)); // x z^4
328 res += r3y2 * (r3y2 * (coefs_[50] * r3.y + // y^5
329 coefs_[51] * r3.z) + // y^4 z
330 r3z2 * (coefs_[52] * r3.y + // y^3 z^2
331 coefs_[53] * r3.z)); // y^2 z^3
332 res += r3z4 * (coefs_[54] * r3.y + // y z^4
333 coefs_[55] * r3.z); // z^5
334
335 if (task.lp >= 6) {
336 res += r3x4 * (coefs_[56] * r3x2 + // x^6
337 coefs_[57] * r3xy + // x^ 5 y
338 coefs_[58] * r3xz + // x^ 5 z
339 coefs_[59] * r3y2 + // x^4 y^2
340 coefs_[60] * r3yz + // x^4 yz
341 coefs_[61] * r3z2); // x^4 z^2
342
343 res += r3x2 * (coefs_[62] * r3y2 * r3xy + // x^3 y^3
344 coefs_[63] * r3y2 * r3xz + // x^3 y^2 z
345 coefs_[64] * r3xy * r3z2 + // x^3 y z^2
346 coefs_[65] * r3z2 * r3xz + // x^3 z^3
347 coefs_[66] * r3y4 + // x^2 y^4
348 coefs_[67] * r3y2 * r3yz + // x^2 y^3 z
349 coefs_[68] * r3y2 * r3z2 + // x^2 y^2 z^2
350 coefs_[69] * r3yz * r3z2 + // x^2 y z^3
351 coefs_[70] * r3z4); // x^2 z^4
352 res += r3y2 * r3z2 *
353 (coefs_[73] * r3xy + // x y^3 z^2
354 coefs_[74] * r3xz + // x y^2 z^3
355 coefs_[80] * r3yz + // y^3 z^3
356 coefs_[81] * r3z2); // y^2 z^4
357 res += r3y4 * (coefs_[71] * r3xy + // x y^5
358 coefs_[72] * r3xz + // x y^4 z
359 coefs_[77] * r3y2 + // y^6
360 coefs_[78] * r3yz + // y^5 z
361 coefs_[79] * r3z2); // y^4 z^2
362 res += r3z4 * (coefs_[75] * r3xy + // x y z^4
363 coefs_[76] * r3xz + // x z^5
364 coefs_[82] * r3yz + // y z^5
365 coefs_[83] * r3z2); // z^6
366 }
367
368 if (task.lp >= 7) {
369 for (int ic = ncoset(6); ic < ncoset(task.lp); ic++) {
370 auto &co = coset_inv[ic];
371 T tmp = 1.0;
372 for (int po = 0; po < (co.l[2] >> 1); po++)
373 tmp *= r3z2;
374 if (co.l[2] & 0x1)
375 tmp *= r3.z;
376 for (int po = 0; po < (co.l[1] >> 1); po++)
377 tmp *= r3y2;
378 if (co.l[1] & 0x1)
379 tmp *= r3.y;
380 for (int po = 0; po < (co.l[0] >> 1); po++)
381 tmp *= r3x2;
382 if (co.l[0] & 0x1)
383 tmp *= r3.x;
384 res += tmp * coef_[ic];
385 }
386 }
387 }
388
389 res *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
390 atomicAdd(dev_.ptr_dev[1] +
391 (z2 * dev_.grid_local_size_.y + y2) *
392 dev_.grid_local_size_.x +
393 x2,
394 res);
395 }
396 }
397 }
398}
399
401 int *lp_diff) {
402 // Compute max angular momentum.
403 const ldiffs_value ldiffs = prepare_get_ldiffs(func);
404 smem_parameters smem_params(ldiffs, lmax());
405
406 *lp_diff = smem_params.lp_diff();
408
409 // kernel parameters
410 kernel_params params = set_kernel_parameters(-1, smem_params);
411 params.func = func;
412
413 // Launch !
414 const dim3 threads_per_block(4, 4, 4);
415
416 if (func == GRID_FUNC_AB) {
417 calculate_coefficients<double, true>
418 <<<nblocks, threads_per_block, smem_params.smem_per_block(),
419 main_stream>>>(params);
420 } else {
421 calculate_coefficients<double, false>
422 <<<nblocks, threads_per_block, smem_params.smem_per_block(),
423 main_stream>>>(params);
424 }
425}
426/*******************************************************************************
427 * \brief Launches the Cuda kernel that collocates all tasks of one grid level.
428 ******************************************************************************/
430 const enum grid_func func,
431 int *lp_diff) {
432
433 if (number_of_tasks_per_level_[level] == 0)
434 return;
435
436 // Compute max angular momentum.
437 const ldiffs_value ldiffs = prepare_get_ldiffs(func);
438 smem_parameters smem_params(ldiffs, lmax());
439
440 *lp_diff = smem_params.lp_diff();
442
443 // kernel parameters
444 kernel_params params = set_kernel_parameters(level, smem_params);
445 params.func = func;
446
447 // Launch !
448 const dim3 threads_per_block(4, 4, 4);
449
450 if (grid_[level].is_distributed()) {
451 if (grid_[level].is_orthogonal())
452 collocate_kernel<double, double3, true, true>
453 <<<number_of_tasks_per_level_[level], threads_per_block,
454 ncoset(6) * sizeof(double), level_streams[level]>>>(params);
455 else
456 collocate_kernel<double, double3, true, false>
457 <<<number_of_tasks_per_level_[level], threads_per_block,
458 ncoset(6) * sizeof(double), level_streams[level]>>>(params);
459 } else {
460 if (grid_[level].is_orthogonal())
461 collocate_kernel<double, double3, false, true>
462 <<<number_of_tasks_per_level_[level], threads_per_block,
463 ncoset(6) * sizeof(double), level_streams[level]>>>(params);
464 else
465 collocate_kernel<double, double3, false, false>
466 <<<number_of_tasks_per_level_[level], threads_per_block,
467 ncoset(6) * sizeof(double), level_streams[level]>>>(params);
468 }
469}
470} // namespace rocm_backend
void calculate_all_coefficients(const enum grid_func func, int *lp_diff)
void collocate_one_grid_level(const int level, const enum grid_func func, int *lp_diff)
Launches the Cuda kernel that collocates all tasks of one grid level.
std::vector< grid_info< double > > grid_
std::vector< offloadStream_t > level_streams
std::vector< int > number_of_tasks_per_level_
grid_func
@ GRID_FUNC_AB
static void const int const int i
__device__ static __inline__ void cab_to_cxyz(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cab, T *__restrict__ cxyz)
Transforms coefficients C_ab into C_xyz.
__inline__ __device__ unsigned int block_index()
__device__ __inline__ void fill_smem_task_coef(const kernel_params &dev, const int task_id, smem_task< T > &task)
Copies a task from global to shared memory and does precomputations.
ldiffs_value prepare_get_ldiffs(const enum grid_func func)
Returns difference in angular momentum range for given func.
__inline__ __device__ unsigned int thread_global_index()
static void init_constant_memory()
Initializes the device's constant memory.
__constant__ orbital coset_inv[1330]
__inline__ __device__ int wrap_grid_index(const int idx, const int full_size)
Wraps a cube-local coordinate into the periodic full grid along one axis. Single point of entry share...
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
__device__ __inline__ void fill_smem_task_reduced(const kernel_params &dev, const int task_id, smem_task_reduced< T, T3 > &task)
Copies a task from global to shared memory.
__inline__ __device__ double3 compute_coordinates(const double *__restrict__ dh_, const double x, const double y, const double z)
__global__ __launch_bounds__(64) void calculate_coefficients(const kernel_params dev_)
__device__ __inline__ void prepare_pab(const enum grid_func func, const orbital a, const orbital b, const T zeta, const T zetb, const T pab_val, const int n, T *cab)
Transforms a given element of the density matrix according to func.
__device__ __inline__ T calculate_ymix_ymax_boundaries(smem_task_reduced< T, T3 > &task, const int z, int &ymin, int &ymax)
__device__ __inline__ void block_to_cab(const kernel_params &params, const smem_task< T > &task, T *cab)
Decontracts the subblock, going from spherical to cartesian harmonics.
Parameters of the collocate kernel.
Differences in angular momentum.
data needed for collocate and integrate kernels
data needed for calculating the coefficients, forces, and stress