grid_hip_collocate.cu (git:ccc2433)
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
/* */
/* SPDX-License-Identifier: BSD-3-Clause */
/*----------------------------------------------------------------------------*/

/*
 * Authors :
 - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
 - Advanced Micro Devices, Inc.
*/

#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)

#include <algorithm>
#include <assert.h>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <hip/hip_runtime.h>

#include "grid_hip_prepare_pab.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

namespace rocm_backend {
/*******************************************************************************
 * \brief Decontracts the subblock, going from spherical to cartesian harmonics.
 ******************************************************************************/
template <typename T, bool IS_FUNC_AB>
__device__ __inline__ void block_to_cab(const kernel_params &params,
                                        const smem_task<T> &task, T *cab) {

  // The spherical index runs over angular momentum and then over contractions.
  // The cartesian index runs over exponents and then over angular momentum.

  // These are two matrix products in the scalar type T. Since the pab block
  // can be quite large, the two products are fused to conserve shared memory.
  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
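  // The launch uses 64 threads per block (4x4x4); below they are used as an
  // 8x8 tile: tid / 8 strides over the cartesian index of b (jco) and
  // tid % 8 over the cartesian index of a (ico).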
  for (int i = 0; i < task.nsgf_setb; i++) {
    for (int j = 0; j < task.nsgf_seta; j++) {
      T block_val;
      if (task.block_transposed) {
        block_val = task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
      } else {
        block_val = task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
      }

      // fast path for common case
      // const int jco_start = task.first_cosetb;
      for (int jco = task.first_cosetb + tid / 8; jco < task.ncosetb;
           jco += 8) {
        const T sphib = task.sphib[i * task.maxcob + jco];
        for (int ico = task.first_coseta + (tid % 8); ico < task.ncoseta;
             ico += 8) {
          const T sphia = task.sphia[j * task.maxcoa + ico];
          const T pab_val = block_val * sphia * sphib;
          if (IS_FUNC_AB) {
            cab[jco * task.ncoseta + ico] += pab_val;
          } else {
            const auto a = coset_inv[ico];
            const auto b = coset_inv[jco];
            prepare_pab(params.func, a, b, task.zeta, task.zetb, pab_val,
                        task.n1, cab);
          }
        }
      }
      __syncthreads();
    }
  }
}
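/*******************************************************************************
 * \brief Serial host-side reference for block_to_cab (illustration only).
 *
 * A minimal sketch, not used by the backend, of what the IS_FUNC_AB branch
 * above accumulates: cab(jco, ico) += off_diag_twice *
 * sum_{i,j} pab(i, j) * sphia(j, ico) * sphib(i, jco). The function name and
 * the flat row-major layouts are hypothetical; the device code fuses the two
 * matrix products and distributes the (ico, jco) pairs over an 8x8 thread
 * tile instead.
 ******************************************************************************/
template <typename T>
void reference_block_to_cab(const T *pab /* nsgfb x nsgfa */,
                            const T *sphia /* nsgfa x ncoseta */,
                            const T *sphib /* nsgfb x ncosetb */,
                            const int nsgfa, const int nsgfb,
                            const int ncoseta, const int ncosetb,
                            const T off_diag_twice, T *cab) {
  for (int jco = 0; jco < ncosetb; jco++) {
    for (int ico = 0; ico < ncoseta; ico++) {
      T acc = T(0);
      for (int i = 0; i < nsgfb; i++) {
        for (int j = 0; j < nsgfa; j++) {
          acc += pab[i * nsgfa + j] * sphia[j * ncoseta + ico] *
                 sphib[i * ncosetb + jco];
        }
      }
      // cab is ncosetb x ncoseta, matching the layout used by the kernel
      cab[jco * ncoseta + ico] += off_diag_twice * acc;
    }
  }
}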

template <typename T, bool IS_FUNC_AB>
__global__ void calculate_coefficients(const kernel_params dev_) {
  __shared__ smem_task<T> task;
  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  fill_smem_task_coef(dev_, dev_.first_task + blockIdx.x, task);
  extern __shared__ T shared_memory[];
  // T *smem_cab = &shared_memory[dev_.smem_cab_offset];
  T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
  T *coef_ =
      &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset];
  T *smem_cab =
      &dev_.ptr_dev[6][dev_.tasks[dev_.first_task + blockIdx.x].cab_offset];
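  // NOTE: despite the smem_ prefix, the cab scratch buffer points into a
  // per-task slice of global memory (ptr_dev[6]); only the alpha coefficients
  // live in the dynamic shared memory of this kernel.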
  for (int z = tid; z < task.n1 * task.n2;
       z += blockDim.x * blockDim.y * blockDim.z)
    smem_cab[z] = 0.0;

  __syncthreads();
  block_to_cab<T, IS_FUNC_AB>(dev_, task, smem_cab);
  __syncthreads();
  compute_alpha(task, smem_alpha);
  __syncthreads();
  cab_to_cxyz(task, smem_alpha, smem_cab, coef_);
}

/*
 \brief compute the real space representation of an operator expressed in the
 Gaussian basis

 this kernel does the following operation

 \f[
 n_{ijk} = \sum_{p,\alpha,\beta,\gamma} C^p_{\alpha\beta\gamma} X_{\alpha,i}
 Y_{\beta,j} Z_{\gamma,k} \exp\left(- \eta (r_{ijk} - r_c)^2\right)
 \f]

 where $X_{\alpha,i}, Y_{\beta,j}, Z_{\gamma,k}$ are polynomials of degree
 $\alpha,\beta,\gamma$ and $r_{ijk}$ is a point (in cartesian coordinates) on
 a 3D grid. The coefficients $C^p_{\alpha\beta\gamma}$ are constrained such
 that 0 <= \alpha + \beta + \gamma <= lmax. In practice this means we only
 need to store (lmax + 1) * (lmax + 2) * (lmax + 3) / 6 coefficients; all
 other coefficients are zero.

 To reduce computation, a spherical cutoff is applied: points with
 $|r_{ijk} - r_c| > radius$ are skipped. The sum over p extends over all
 relevant pairs of Gaussians (which are called tasks in the code).

 The kernel computes the polynomials and the Gaussian, then accumulates the
 result on the grid.

 The coefficients $C^p_{\alpha\beta\gamma}$ are computed by
 calculate_coefficients. We only keep the non-zero elements to save memory.
*/

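// Illustration only (not part of the backend): the coefficient count quoted in
// the comment above. The helper name is hypothetical; the backend's own
// ncoset() returns the same value.
constexpr int illustrative_coef_count(const int lmax) {
  return (lmax + 1) * (lmax + 2) * (lmax + 3) / 6;
}
// For lmax = 4 this gives 35 coefficients, matching the terms unrolled in
// collocate_kernel and the ncoset(4) doubles cached in shared memory below.
static_assert(illustrative_coef_count(4) == 35,
              "lmax = 4 keeps 35 cartesian coefficients");
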
template <typename T, typename T3, bool distributed__, bool orthorhombic_>
__global__
__launch_bounds__(64) void collocate_kernel(const kernel_params dev_) {
  // Copy task from global to shared memory and precompute some stuff.
  __shared__ smem_task_reduced<T, T3> task;

  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;

  fill_smem_task_reduced(dev_, dev_.first_task + blockIdx.x, task);

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  // Alloc shared memory.
  extern __shared__ T coefs_[];

  T *coef_ =
      &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset];
  __shared__ T dh_[9], dh_inv_[9];

  // matrix from lattice coordinates to cartesian coordinates
  for (int i = tid; i < 9; i += blockDim.x * blockDim.y * blockDim.z) {
    dh_[i] = dev_.dh_[i];
  }

  // matrix from cartesian coordinates to lattice coordinates.
  for (int i = tid; i < 9; i += blockDim.x * blockDim.y * blockDim.z) {
    dh_inv_[i] = dev_.dh_inv_[i];
  }

  __syncthreads();
  if (tid < ncoset(4))
    coefs_[tid] = coef_[tid];
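  // Only the first ncoset(4) = 35 coefficients are cached in shared memory;
  // for tasks with lp > 4 the remaining coefficients are read directly from
  // the global coef_ array further down.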

  if (tid == 0) {
    // The cube center is initially expressed in lattice coordinates, but we
    // always compute something like
    //   x = x + lower_corner + cube_center (+ roffset) - grid_lower_corner,
    // so apply the shift to the cube center once here.
    task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
    task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
    task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];

    if (distributed__) {
      if (task.apply_border_mask) {
        compute_window_size(
            dev_.grid_local_size_,
            dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
            dev_.grid_border_width_, &task.window_size, &task.window_shift);
      }
    }
  }
  __syncthreads();

  for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
    int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];

    if (z2 < 0)
      z2 += dev_.grid_full_size_[0];

    if (distributed__) {
      // check if the point is within the window
      if (task.apply_border_mask) {
        // This test is only relevant when the grid is split over several MPI
        // ranks; in that case we only take the points contributing to the
        // local part of the grid.
        if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
          continue;
        }
      }
    }

    // compute the coordinates of the point in atomic coordinates
    T kremain;
    short int ymin = 0;
    short int ymax = task.cube_size.y - 1;

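    // For orthorhombic grids without a border mask, the loop bounds below are
    // tightened using the spherical cutoff: kremain is the squared radius left
    // in this z-plane, and [ymin, ymax] (and later [xmin, xmax]) are shifted
    // into cube indices so that only points inside the sphere are visited.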
    if (orthorhombic_ && !task.apply_border_mask) {
      ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
      ymin *= ymin;
      kremain = task.discrete_radius * task.discrete_radius -
                ((T)ymin) * dh_[8] * dh_[8];
      ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
      ymax = 1 - ymin - task.lb_cube.y;
      ymin = ymin - task.lb_cube.y;
    }

    for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
      int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];
      if (y2 < 0)
        y2 += dev_.grid_full_size_[1];

      if (distributed__) {
        if (task.apply_border_mask) {
          // check if the point is within the window
          if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
            continue;
          }
        }
      }

      short int xmin = 0;
      short int xmax = task.cube_size.x - 1;
      if (orthorhombic_ && !task.apply_border_mask) {
        xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
        xmin *= xmin;
        xmin =
            ceil(-1.0e-8 - sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
                 dh_inv_[0]);
        xmax = 1 - xmin - task.lb_cube.x;
        xmin = xmin - task.lb_cube.x;
      }

      for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
        int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];

        if (x2 < 0)
          x2 += dev_.grid_full_size_[2];

        if (distributed__) {
          if (task.apply_border_mask) {
            // check if the point is within the window; this is only
            // restrictive when the grid is distributed over several MPI
            // ranks, otherwise the test always passes.
            if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
              continue;
            }
          }
        }

        // I make no distinction between the orthorhombic and
        // non-orthorhombic cases

        T3 r3;
        if (orthorhombic_) {
          r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
          r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
          r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
        } else {
          r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
                                   (y + task.lb_cube.y + task.roffset.y),
                                   (z + task.lb_cube.z + task.roffset.z));
        }

        if (distributed__) {
          // check if the point is inside the sphere or not. Note that this
          // does not apply to the orthorhombic case when the full sphere is
          // inside the region of interest.
          if (((task.radius * task.radius) <=
               (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)) &&
              (!orthorhombic_ || task.apply_border_mask))
            continue;
        } else {
          // we do not need to do this test for the orthorhombic case
          if ((!orthorhombic_) && ((task.radius * task.radius) <=
                                   (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)))
            continue;
        }

        // allow computation of the address in parallel with starting the
        // computations
        // T *grid_elem =
        //     dev_.ptr_dev[1] +
        //     (z2 * dev_.grid_local_size_[1] + y2) * dev_.grid_local_size_[2] +
        //     x2;

        T res = coefs_[0];

        if (task.lp >= 1) {
          res += coefs_[1] * r3.x;
          res += coefs_[2] * r3.y;
          res += coefs_[3] * r3.z;
        }
        const T r3xy = r3.x * r3.y;
        const T r3xz = r3.x * r3.z;
        const T r3yz = r3.y * r3.z;
        const T r3x2 = r3.x * r3.x;
        const T r3y2 = r3.y * r3.y;
        const T r3z2 = r3.z * r3.z;

        if (task.lp >= 2) {
          res += coefs_[4] * r3x2;
          res += coefs_[5] * r3xy;
          res += coefs_[6] * r3xz;
          res += coefs_[7] * r3y2;
          res += coefs_[8] * r3yz;
          res += coefs_[9] * r3z2;
        }

        if (task.lp >= 3) {
          res += coefs_[10] * r3x2 * r3.x;
          res += coefs_[11] * r3x2 * r3.y;
          res += coefs_[12] * r3x2 * r3.z;
          res += coefs_[13] * r3.x * r3y2;
          res += coefs_[14] * r3xy * r3.z;
          res += coefs_[15] * r3.x * r3z2;
          res += coefs_[16] * r3y2 * r3.y;
          res += coefs_[17] * r3y2 * r3.z;
          res += coefs_[18] * r3.y * r3z2;
          res += coefs_[19] * r3z2 * r3.z;
        }

        if (task.lp >= 4) {
          res += coefs_[20] * r3x2 * r3x2;
          res += coefs_[21] * r3x2 * r3xy;
          res += coefs_[22] * r3x2 * r3xz;
          res += coefs_[23] * r3x2 * r3y2;
          res += coefs_[24] * r3x2 * r3yz;
          res += coefs_[25] * r3x2 * r3z2;
          res += coefs_[26] * r3xy * r3y2;
          res += coefs_[27] * r3xz * r3y2;
          res += coefs_[28] * r3xy * r3z2;
          res += coefs_[29] * r3xz * r3z2;
          res += coefs_[30] * r3y2 * r3y2;
          res += coefs_[31] * r3y2 * r3yz;
          res += coefs_[32] * r3y2 * r3z2;
          res += coefs_[33] * r3yz * r3z2;
          res += coefs_[34] * r3z2 * r3z2;
        }

        // beware it is coef_ (global memory) here not coefs_ (shared memory)
        if (task.lp > 4) {
          for (int ic = 35; ic < ncoset(task.lp); ic++) {
            auto &co = coset_inv[ic];
            T tmp = coef_[ic];
            for (int po = 0; po < co.l[2]; po++)
              tmp *= r3.z;
            for (int po = 0; po < co.l[1]; po++)
              tmp *= r3.y;
            for (int po = 0; po < co.l[0]; po++)
              tmp *= r3.x;
            res += tmp;
          }
        }

        atomicAdd(
            dev_.ptr_dev[1] +
                (z2 * dev_.grid_local_size_[1] + y2) * dev_.grid_local_size_[2] +
                x2,
            res * exp(-(r3.x * r3.x + r3.y * r3.y + r3.z * r3.z) * task.zetp));
      }
    }
  }
  __syncthreads();
}
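/*******************************************************************************
 * \brief Host-side sketch of the per-point work in collocate_kernel above
 *        (illustration only, not used by the backend).
 *
 * A minimal sketch assuming a plain list of (a, b, g, coefficient) terms
 * instead of the coset ordering used by the kernel: each grid point receives
 * exp(-zetp * |r|^2) * sum C_{abg} x^a y^b z^g. The struct and function names
 * are hypothetical.
 ******************************************************************************/
template <typename T> struct illustrative_poly_term {
  int a, b, g;
  T coef;
};

template <typename T>
T illustrative_collocate_point(const illustrative_poly_term<T> *terms,
                               const int nterms, const T x, const T y,
                               const T z, const T zetp) {
  T res = T(0);
  for (int i = 0; i < nterms; i++) {
    // build the monomial coef * x^a * y^b * z^g by repeated multiplication,
    // mirroring the lp > 4 loop of the kernel
    T monomial = terms[i].coef;
    for (int p = 0; p < terms[i].a; p++)
      monomial *= x;
    for (int p = 0; p < terms[i].b; p++)
      monomial *= y;
    for (int p = 0; p < terms[i].g; p++)
      monomial *= z;
    res += monomial;
  }
  return res * std::exp(-(x * x + y * y + z * z) * zetp);
}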
/*******************************************************************************
 * \brief Launches the HIP kernel that collocates all tasks of one grid level.
 ******************************************************************************/
void context_info::collocate_one_grid_level(const int level,
                                            const enum grid_func func,
                                            int *lp_diff) {

  if (number_of_tasks_per_level_[level] == 0)
    return;

  // Compute max angular momentum.
  const ldiffs_value ldiffs = prepare_get_ldiffs(func);
  smem_parameters smem_params(ldiffs, lmax());

  *lp_diff = smem_params.lp_diff();
  init_constant_memory();

  // kernel parameters
  kernel_params params = set_kernel_parameters(level, smem_params);
  params.func = func;

  // Launch !
  const dim3 threads_per_block(4, 4, 4);

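  // One thread block per task. The 4x4x4 = 64 threads match the
  // __launch_bounds__(64) of collocate_kernel, and the collocation launches
  // request ncoset(4) doubles of dynamic shared memory for the cached
  // coefficients.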
  if (func == GRID_FUNC_AB) {
    calculate_coefficients<double, true>
        <<<number_of_tasks_per_level_[level], threads_per_block,
           smem_params.smem_per_block(), level_streams[level]>>>(params);
  } else {
    calculate_coefficients<double, false>
        <<<number_of_tasks_per_level_[level], threads_per_block,
           smem_params.smem_per_block(), level_streams[level]>>>(params);
  }

  if (grid_[level].is_distributed()) {
    if (grid_[level].is_orthorhombic())
      collocate_kernel<double, double3, true, true>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(4) * sizeof(double), level_streams[level]>>>(params);
    else
      collocate_kernel<double, double3, true, false>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(4) * sizeof(double), level_streams[level]>>>(params);
  } else {
    if (grid_[level].is_orthorhombic())
      collocate_kernel<double, double3, false, true>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(4) * sizeof(double), level_streams[level]>>>(params);
    else
      collocate_kernel<double, double3, false, false>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(4) * sizeof(double), level_streams[level]>>>(params);
  }
}
} // namespace rocm_backend
#endif // defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)