grid_hip_internal_header.h
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: BSD-3-Clause */
6 /*----------------------------------------------------------------------------*/
7 
8 /*
9  * Authors :
10  - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
11  - Advanced Micro Devices, Inc.
12 */
13 
14 #ifndef GRID_HIP_INTERNAL_HEADER_H
15 #define GRID_HIP_INTERNAL_HEADER_H
16 
17 #include <algorithm>
18 #include <assert.h>
19 #include <hip/hip_runtime.h>
20 #include <limits.h>
21 #include <math.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 
26 #define GRID_DEVICE __device__
27 extern "C" {
28 #include "../common/grid_basis_set.h"
29 #include "../common/grid_constants.h"
30 }
31 
32 #include "grid_hip_context.h"
33 
34 namespace rocm_backend {
35 
36 #if defined(__HIP_PLATFORM_NVIDIA__)
37 #if __CUDA_ARCH__ < 600
38 __device__ __inline__ double atomicAdd(double *address, double val) {
39  unsigned long long int *address_as_ull = (unsigned long long int *)address;
40  unsigned long long int old = *address_as_ull, assumed;
41 
42  do {
43  assumed = old;
44  old = atomicCAS(address_as_ull, assumed,
45  __double_as_longlong(val + __longlong_as_double(assumed)));
46 
47  // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
48  // NaN)
49  } while (assumed != old);
50 
51  return __longlong_as_double(old);
52 }
53 #endif
54 #endif
55 
56 /*******************************************************************************
57  * \brief Orbital angular momentum.
58  ******************************************************************************/
59 struct orbital {
60  int l[3];
61 };
62 
63 __constant__ orbital coset_inv[1330];
64 __constant__ int binomial_coef[19][19];
65 
66 /*******************************************************************************
67  * \brief Differences in angular momentum.
68  ******************************************************************************/
69 struct ldiffs_value {
70  int la_max_diff{0};
71  int la_min_diff{0};
72  int lb_max_diff{0};
73  int lb_min_diff{0};
74 };
75 
76 /*******************************************************************************
77  * \brief data needed for calculating the coefficients, forces, and stress
78  ******************************************************************************/
79 template <typename T> struct smem_task {
80  bool block_transposed;
81  T ra[3];
82  T rb[3];
83  T rp[3];
84  T rab[3];
85  T zeta;
86  T zetb;
87  T zetp;
88  T prefactor;
89  T off_diag_twice;
90  char la_max;
91  char lb_max;
92  char la_min;
93  char lb_min;
94  char lp;
95  // size of the cab matrix
96  unsigned short int n1;
97  unsigned short int n2;
98  // size of entire spherical basis
99  unsigned short int nsgfa;
100  unsigned short int nsgfb;
101  // size of spherical set
102  unsigned short int nsgf_seta;
103  unsigned short int nsgf_setb;
104  // start of decontracted set, ie. pab and hab
105  unsigned short int first_coseta;
106  unsigned short int first_cosetb;
107  // size of decontracted set, ie. pab and hab
108  int ncoseta;
109  int ncosetb;
110  // strides of the sphi transformation matrices
111  int maxcoa;
112  int maxcob;
113  // pointers to matrices
114  T *pab_block;
115  T *sphia;
116  T *sphib;
117  // integrate
118  T *hab_block;
119  T *forces_a;
120  T *forces_b;
121 };
122 
123 /*******************************************************************************
124  * \brief data needed for collocate and integrate kernels
125  ******************************************************************************/
126 template <typename T, typename T3> struct smem_task_reduced {
127  T radius, discrete_radius;
128  T3 roffset;
129  int3 cube_center, cube_size, lb_cube;
130  T zetp;
131  char lp;
132  bool apply_border_mask;
133 };
134 
135 /*******************************************************************************
136  * \brief Factorial function, e.g. fac(5) = 5! = 120.
137  * \author Ole Schuett
138  ******************************************************************************/
139 __device__ __inline__ double fac(const int i) {
140  static const double table[] = {
141  0.10000000000000000000E+01, 0.10000000000000000000E+01,
142  0.20000000000000000000E+01, 0.60000000000000000000E+01,
143  0.24000000000000000000E+02, 0.12000000000000000000E+03,
144  0.72000000000000000000E+03, 0.50400000000000000000E+04,
145  0.40320000000000000000E+05, 0.36288000000000000000E+06,
146  0.36288000000000000000E+07, 0.39916800000000000000E+08,
147  0.47900160000000000000E+09, 0.62270208000000000000E+10,
148  0.87178291200000000000E+11, 0.13076743680000000000E+13,
149  0.20922789888000000000E+14, 0.35568742809600000000E+15,
150  0.64023737057280000000E+16, 0.12164510040883200000E+18,
151  0.24329020081766400000E+19, 0.51090942171709440000E+20,
152  0.11240007277776076800E+22, 0.25852016738884976640E+23,
153  0.62044840173323943936E+24, 0.15511210043330985984E+26,
154  0.40329146112660563558E+27, 0.10888869450418352161E+29,
155  0.30488834461171386050E+30, 0.88417619937397019545E+31,
156  0.26525285981219105864E+33};
157  return table[i];
158 }
159 
160 /*******************************************************************************
161  * \brief Number of Cartesian orbitals up to given angular momentum quantum.
162  * \author Ole Schuett
163  ******************************************************************************/
164 __host__ __device__ __inline__ int ncoset(const int l) {
165  static const int table[] = {1, // l=0
166  4, // l=1
167  10, // l=2 ...
168  20, 35, 56, 84, 120, 165, 220, 286,
169  364, 455, 560, 680, 816, 969, 1140, 1330};
170  return table[l];
171 }
172 
173 /*******************************************************************************
174  * \brief Maps three angular momentum components to a single zero based index.
175  ******************************************************************************/
176 __host__ __device__ __inline__ int coset(int lx, int ly, int lz) {
177  const int l = lx + ly + lz;
178  if (l == 0) {
179  return 0;
180  } else {
181  return ncoset(l - 1) + ((l - lx) * (l - lx + 1)) / 2 + lz;
182  }
183 }
184 
185 /*******************************************************************************
186  * \brief Increase i'th component of given orbital angular momentum.
187  ******************************************************************************/
188 __device__ __inline__ orbital up(const int i, const orbital &a) {
189  orbital b = a;
190  b.l[i] += 1;
191  return b;
192 }
193 
194 /*******************************************************************************
195  * \brief Decrease i'th component of given orbital angular momentum.
196  ******************************************************************************/
197 __inline__ __device__ orbital down(const int i, const orbital &a) {
198  orbital b = a;
199  b.l[i] = max(0, a.l[i] - 1);
200  return b;
201 }
202 
203 /*******************************************************************************
204  * \brief Return coset index of given orbital angular momentum.
205  ******************************************************************************/
206 __inline__ __device__ int idx(const orbital a) {
207  return coset(a.l[0], a.l[1], a.l[2]);
208 }
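
/* Example (illustrative only): with the ordering implied by coset(), the l=1
 * shell occupies indices 1..3, i.e. coset(1,0,0)=1, coset(0,1,0)=2,
 * coset(0,0,1)=3, and coset_inv (filled in init_constant_memory below) holds
 * the inverse map from index back to (lx,ly,lz). A host-side sanity check,
 * assuming this header is included:
 *
 *   assert(ncoset(1) == 4);            // 1 s function + 3 p functions
 *   assert(coset(0, 0, 0) == 0);
 *   assert(coset(1, 0, 0) == 1 && coset(0, 1, 0) == 2 && coset(0, 0, 1) == 3);
 */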
209 
210 __device__ __inline__ double power(const double x, const int expo) {
211  double tmp = 1.0;
212  for (int i = 1; i <= expo; i++)
213  tmp *= x;
214  return tmp;
215 }
216 
217 /*******************************************************************************
218  * \brief Adds given value to matrix element cab[idx(b)][idx(a)].
219  ******************************************************************************/
220 template <typename T = double>
221 __device__ __inline__ void prep_term(const orbital a, const orbital b,
222  const T value, const int n, T *cab) {
223  atomicAdd(&cab[idx(b) * n + idx(a)], value);
224 }
225 
226 /*******************************************************************************
227  * \brief Initializes the device's constant memory.
228  * \author Ole Schuett
229  ******************************************************************************/
230 inline static void init_constant_memory() {
231  static bool initialized = false;
232  if (initialized) {
233  return; // constant memory has to be initialized only once
234  }
235 
236  // Inverse coset mapping
237  orbital coset_inv_host[1330];
238  for (int lx = 0; lx <= 18; lx++) {
239  for (int ly = 0; ly <= 18 - lx; ly++) {
240  for (int lz = 0; lz <= 18 - lx - ly; lz++) {
241  const int i = coset(lx, ly, lz);
242  coset_inv_host[i] = {{lx, ly, lz}};
243  }
244  }
245  }
246  hipError_t error =
247  hipMemcpyToSymbol(coset_inv, &coset_inv_host, sizeof(coset_inv_host), 0,
248  hipMemcpyHostToDevice);
249  assert(error == hipSuccess);
250 
251  // Binomial coefficient
252  int binomial_coef_host[19][19] = {
253  {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
254  {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
255  {1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
256  {1, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
257  {1, 4, 6, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
258  {1, 5, 10, 10, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
259  {1, 6, 15, 20, 15, 6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
260  {1, 7, 21, 35, 35, 21, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
261  {1, 8, 28, 56, 70, 56, 28, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
262  {1, 9, 36, 84, 126, 126, 84, 36, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
263  {1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0},
264  {1, 11, 55, 165, 330, 462, 462, 330, 165, 55, 11, 1, 0, 0, 0, 0, 0, 0, 0},
265  {1, 12, 66, 220, 495, 792, 924, 792, 495, 220, 66, 12, 1, 0, 0, 0, 0, 0,
266  0},
267  {1, 13, 78, 286, 715, 1287, 1716, 1716, 1287, 715, 286, 78, 13, 1, 0, 0,
268  0, 0, 0},
269  {1, 14, 91, 364, 1001, 2002, 3003, 3432, 3003, 2002, 1001, 364, 91, 14, 1,
270  0, 0, 0, 0},
271  {1, 15, 105, 455, 1365, 3003, 5005, 6435, 6435, 5005, 3003, 1365, 455,
272  105, 15, 1, 0, 0, 0},
273  {1, 16, 120, 560, 1820, 4368, 8008, 11440, 12870, 11440, 8008, 4368, 1820,
274  560, 120, 16, 1, 0, 0},
275  {1, 17, 136, 680, 2380, 6188, 12376, 19448, 24310, 24310, 19448, 12376,
276  6188, 2380, 680, 136, 17, 1, 0},
277  {1, 18, 153, 816, 3060, 8568, 18564, 31824, 43758, 48620, 43758, 31824,
278  18564, 8568, 3060, 816, 153, 18, 1}};
279  error =
280  hipMemcpyToSymbol(binomial_coef, &binomial_coef_host[0][0],
281  sizeof(binomial_coef_host), 0, hipMemcpyHostToDevice);
282  assert(error == hipSuccess);
283 
284  initialized = true;
285 }
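
/* Usage sketch (illustrative only): the constant tables have to be uploaded
 * before any kernel that reads coset_inv or binomial_coef is launched, e.g.
 * during host-side context creation:
 *
 *   rocm_backend::init_constant_memory(); // later calls return immediately
 */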
286 
287 __inline__ __device__ double3
288 compute_coordinates(const double *__restrict__ dh_, const double x,
289  const double y, const double z) {
290 
291  double3 r3;
292  // no distinction is made between the orthorhombic and non-orthorhombic
293  // cases
294 
295  r3.x = z * dh_[6] + y * dh_[3] + x * dh_[0];
296  r3.y = z * dh_[7] + y * dh_[4] + x * dh_[1];
297  r3.z = z * dh_[8] + y * dh_[5] + x * dh_[2];
298  return r3;
299 }
300 
301 __inline__ __device__ float3 compute_coordinates(const float *__restrict__ dh_,
302  const float x, const float y,
303  const float z) {
304 
305  float3 r3;
306  // no distinction is made between the orthorhombic and non-orthorhombic
307  // cases
308 
309  r3.x = z * dh_[6] + y * dh_[3] + x * dh_[0];
310  r3.y = z * dh_[7] + y * dh_[4] + x * dh_[1];
311  r3.z = z * dh_[8] + y * dh_[5] + x * dh_[2];
312  return r3;
313 }
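
/* Example (illustrative only): dh_ holds the three grid step vectors as the
 * rows of a 3x3 row-major matrix, so the result is
 * r = x * row0 + y * row1 + z * row2. For an orthorhombic grid with steps
 * hx, hy, hz this reduces to
 *
 *   const double dh[9] = {hx, 0, 0, 0, hy, 0, 0, 0, hz};
 *   // compute_coordinates(dh, i, j, k) -> (i * hx, j * hy, k * hz)
 */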
314 
315 /*******************************************************************************
316  * \brief Computes the polynomial expansion coefficients:
317  * (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
318  ******************************************************************************/
319 template <typename T>
320 __inline__ __device__ void compute_alpha(const smem_task<T> &task,
321  T *__restrict__ alpha) {
322  // strides for accessing alpha
323  const int s3 = (task.lp + 1);
324  const int s2 = (task.la_max + 1) * s3;
325  const int s1 = (task.lb_max + 1) * s2;
326  const int tid =
327  threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
328  for (int i = tid; i < 3 * s1; i += blockDim.x * blockDim.y * blockDim.z)
329  alpha[i] = 0.0;
330 
331  __syncthreads();
332 
333  for (int idir = threadIdx.z; idir < 3; idir += blockDim.z) {
334  const T drpa = task.rp[idir] - task.ra[idir];
335  const T drpb = task.rp[idir] - task.rb[idir];
336  for (int la = threadIdx.y; la <= task.la_max; la += blockDim.y) {
337  for (int lb = threadIdx.x; lb <= task.lb_max; lb += blockDim.x) {
338  T a = 1.0;
339  for (int k = 0; k <= la; k++) {
340  T b = 1.0;
341  const int base = idir * s1 + lb * s2 + la * s3;
342  for (int l = 0; l <= lb; l++) {
343  alpha[base + la - l + lb - k] +=
344  a * b * binomial_coef[la][k] * binomial_coef[lb][l];
345  b *= drpb;
346  }
347  a *= drpa;
348  }
349  }
350  }
351  }
352  __syncthreads(); // because of concurrent writes to alpha
353 }
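
/* For one Cartesian direction the loops above implement the binomial
 * expansion around the product-Gaussian center p:
 *
 *   (x-a)^la (x-b)^lb = sum_{k=0..la} sum_{l=0..lb}
 *       binomial(la,k) * binomial(lb,l) * (p-a)^k * (p-b)^l * (x-p)^(la+lb-k-l)
 *
 * so alpha(idir, lb, la, ls) with ls = la+lb-k-l accumulates the coefficient
 * of (x-p)^ls, which is what cab_to_cxyz / cxyz_to_cab contract against. */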
354 
355 __host__ __inline__ __device__ void
356 convert_to_lattice_coordinates(const double *dh_inv_,
357  const double3 *__restrict__ const rp,
358  double3 *__restrict__ rp_c) {
359  rp_c->x = dh_inv_[0] * rp->x + dh_inv_[3] * rp->y + dh_inv_[6] * rp->z;
360  rp_c->y = dh_inv_[1] * rp->x + dh_inv_[4] * rp->y + dh_inv_[7] * rp->z;
361  rp_c->z = dh_inv_[2] * rp->x + dh_inv_[5] * rp->y + dh_inv_[8] * rp->z;
362 }
363 
364 __host__ __inline__ __device__ void
365 convert_from_lattice_coordinates_to_cartesian(
366  const double *__restrict__ dh_, const double3 *__restrict__ const rp,
367  double3 *__restrict__ rp_c) {
368  rp_c->x = dh_[0] * rp->x + dh_[3] * rp->y + dh_[6] * rp->z;
369  rp_c->y = dh_[1] * rp->x + dh_[4] * rp->y + dh_[7] * rp->z;
370  rp_c->z = dh_[2] * rp->x + dh_[5] * rp->y + dh_[8] * rp->z;
371 }
372 
373 __host__ __inline__ __device__ void
374 convert_to_lattice_coordinates(const float *dh_inv_,
375  const float3 *__restrict__ const rp,
376  float3 *__restrict__ rp_c) {
377  rp_c->x = dh_inv_[0] * rp->x + dh_inv_[3] * rp->y + dh_inv_[6] * rp->z;
378  rp_c->y = dh_inv_[1] * rp->x + dh_inv_[4] * rp->y + dh_inv_[7] * rp->z;
379  rp_c->z = dh_inv_[2] * rp->x + dh_inv_[5] * rp->y + dh_inv_[8] * rp->z;
380 }
381 
382 __host__ __inline__ __device__ void
383 convert_from_lattice_coordinates_to_cartesian(
384  const float *__restrict__ dh_, const float3 *__restrict__ const rp,
385  float3 *__restrict__ rp_c) {
386  rp_c->x = dh_[0] * rp->x + dh_[3] * rp->y + dh_[6] * rp->z;
387  rp_c->y = dh_[1] * rp->x + dh_[4] * rp->y + dh_[7] * rp->z;
388  rp_c->z = dh_[2] * rp->x + dh_[5] * rp->y + dh_[8] * rp->z;
389 }
390 
391 template <typename T, typename T3, bool orthorhombic_>
392 __inline__ T compute_cube_properties(
393  const T radius, const T *const __restrict__ dh_,
394  const T *const __restrict__ dh_inv_, const T3 *__restrict__ rp,
395  T3 *__restrict__ roffset, int3 *__restrict__ cubecenter,
396  int3 *__restrict__ lb_cube, int3 *__restrict__ cube_size) {
397 
398  /* center of the gaussian in the lattice coordinates */
399  T3 rp1, rp2, rp3;
400 
401  /* it is in the lattice vector frame */
402  convert_to_lattice_coordinates(dh_inv_, rp, &rp1);
403 
404  /* compute the grid point that is the closest to the sphere center. */
405  cubecenter->x = std::floor(rp1.x);
406  cubecenter->y = std::floor(rp1.y);
407  cubecenter->z = std::floor(rp1.z);
408 
409  /* setting up the cube parameters */
410 
411  if (orthorhombic_) {
412 
413  // the cube is actually slightly bigger than the sphere of radius r; that
414  // is why we discretize it to get the cube size "right".
415 
416  // disr_radius >= radius always. Even though the host side is compiled with
417  // a C++ compiler, fmin has to be used instead of std::min, since std::min
418  // is not allowed in device code.
419 
420  // We assume no specific form for the orthogonal matrix (it can be diagonal
421  // or completely full); the only constraint is that the three lattice
422  // vectors are orthogonal.
423 
424  T norm1, norm2, norm3;
425  norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
426  norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
427  norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];
428 
429  norm1 = std::sqrt(norm1);
430  norm2 = std::sqrt(norm2);
431  norm3 = std::sqrt(norm3);
432 
433  const T disr_radius =
434  std::min(norm1, std::min(norm2, norm3)) *
435  std::max(1,
436  (int)ceil(radius / std::min(norm1, std::min(norm2, norm3))));
437 
438  rp2.x = cubecenter->x;
439  rp2.y = cubecenter->y;
440  rp2.z = cubecenter->z;
441 
442  /* convert the cube center from grid point coordinates to cartesian */
443  convert_from_lattice_coordinates_to_cartesian(dh_, &rp2, roffset);
444  /* cube center */
445  roffset->x -= rp->x;
446  roffset->y -= rp->y;
447  roffset->z -= rp->z;
448 
449  rp2.x = disr_radius;
450  rp2.y = disr_radius;
451  rp2.z = disr_radius;
452 
453  /* it is in the lattice vector frame */
454  convert_to_lattice_coordinates(dh_inv_, &rp2, &rp3);
455  /* lower and upper bounds */
456  lb_cube->x = std::ceil(-1e-8 - rp3.x);
457  lb_cube->y = std::ceil(-1e-8 - rp3.y);
458  lb_cube->z = std::ceil(-1e-8 - rp3.z);
459 
460  /* it is in the lattice vector frame */
461  convert_to_lattice_coordinates(dh_inv_, roffset, &rp2);
462 
463  /* Express the offset in lattice coordinates */
464  roffset->x = rp2.x;
465  roffset->y = rp2.y;
466  roffset->z = rp2.z;
467 
468  /* compute the cube size ignoring periodicity */
469  /* the interval is not symmetric: it goes from [-L..L+1] so that the
470  * number of points is a multiple of two */
471  cube_size->x = 2 - 2 * lb_cube->x;
472  cube_size->y = 2 - 2 * lb_cube->y;
473  cube_size->z = 2 - 2 * lb_cube->z;
474  return disr_radius;
475  } else {
476  int3 ub_cube;
477 
478  lb_cube->x = INT_MAX;
479  ub_cube.x = INT_MIN;
480  lb_cube->y = INT_MAX;
481  ub_cube.y = INT_MIN;
482  lb_cube->z = INT_MAX;
483  ub_cube.z = INT_MIN;
484 
485  for (int i = -1; i <= 1; i++) {
486  for (int j = -1; j <= 1; j++) {
487  for (int k = -1; k <= 1; k++) {
488  T3 r;
489  r.x = rp->x + ((T)i) * radius;
490  r.y = rp->y + ((T)j) * radius;
491  r.z = rp->z + ((T)k) * radius;
492  convert_to_lattice_coordinates(dh_inv_, &r, roffset);
493 
494  lb_cube->x = std::min(lb_cube->x, (int)std::floor(roffset->x));
495  ub_cube.x = std::max(ub_cube.x, (int)std::ceil(roffset->x));
496 
497  lb_cube->y = std::min(lb_cube->y, (int)std::floor(roffset->y));
498  ub_cube.y = std::max(ub_cube.y, (int)std::ceil(roffset->y));
499 
500  lb_cube->z = std::min(lb_cube->z, (int)std::floor(roffset->z));
501  ub_cube.z = std::max(ub_cube.z, (int)std::ceil(roffset->z));
502  }
503  }
504  }
505  /* compute the cube size ignoring periodicity */
506  cube_size->x = ub_cube.x - lb_cube->x;
507  cube_size->y = ub_cube.y - lb_cube->y;
508  cube_size->z = ub_cube.z - lb_cube->z;
509 
510  /* compute the offset in lattice coordinates */
511 
512  roffset->x = cubecenter->x - rp1.x;
513  roffset->y = cubecenter->y - rp1.y;
514  roffset->z = cubecenter->z - rp1.z;
515 
516  // shift the boundaries compared to the cube center so that the
517  // specialization ortho / non ortho is minimal
518  lb_cube->x -= cubecenter->x;
519  lb_cube->y -= cubecenter->y;
520  lb_cube->z -= cubecenter->z;
521 
522  return radius;
523  }
524 }
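
/* Usage sketch (illustrative only; the local variable names are
 * hypothetical): during task-list preparation this helper is evaluated once
 * per Gaussian pair and its outputs are stored in the task, which is what
 * fill_smem_task_reduced later copies to shared memory:
 *
 *   double3 roffset;
 *   int3 cube_center, lb_cube, cube_size;
 *   const double disr_radius = compute_cube_properties<double, double3, true>(
 *       radius, dh, dh_inv, &rp, &roffset, &cube_center, &lb_cube, &cube_size);
 */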
525 
526 __inline__ __device__ void compute_window_size(const int *const grid_size,
527  const int border_mask,
528  const int *border_width,
529  int3 *const window_size,
530  int3 *const window_shift) {
531  window_shift->x = 0;
532  window_shift->y = 0;
533  window_shift->z = 0;
534 
535  window_size->x = grid_size[2] - 1;
536  window_size->y = grid_size[1] - 1;
537  window_size->z = grid_size[0] - 1;
538 
539  if (border_mask & (1 << 0))
540  window_shift->x += border_width[2];
541  if (border_mask & (1 << 1))
542  window_size->x -= border_width[2];
543  if (border_mask & (1 << 2))
544  window_shift->y += border_width[1];
545  if (border_mask & (1 << 3))
546  window_size->y -= border_width[1];
547  if (border_mask & (1 << 4))
548  window_shift->z += border_width[0];
549  if (border_mask & (1 << 5))
550  window_size->z -= border_width[0];
551 }
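
/* Example (illustrative only): the six low bits of border_mask select which
 * faces of the local grid are trimmed by border_width; bits 0/2/4 shift the
 * window start in x/y/z and bits 1/3/5 shrink the window end. For instance,
 * border_mask = (1 << 0) | (1 << 3) moves the x start by border_width[2] and
 * lowers the y end by border_width[1], while border_mask = 0 keeps the full
 * local grid as the window. */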
552 
553 /*******************************************************************************
554  * \brief Transforms coefficients C_ab into C_xyz.
555  ******************************************************************************/
556 template <typename T>
557 __device__ __inline__ static void
558 cab_to_cxyz(const smem_task<T> &task, const T *__restrict__ alpha,
559  const T *__restrict__ cab, T *__restrict__ cxyz) {
560 
561  // *** initialise the coefficient matrix, we transform the sum
562  //
563  // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
564  // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-b_y)**lyb
565  // (z-b_z)**lzb
566  //
567  // into
568  //
569  // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
570  //
571  // where p is the center of the product Gaussian, and lp = la_max + lb_max
572  // (current implementation is l**7)
573 
574  // strides for accessing alpha
575  const int s3 = (task.lp + 1);
576  const int s2 = (task.la_max + 1) * s3;
577  const int s1 = (task.lb_max + 1) * s2;
578 
579  // TODO: Maybe we can transpose alpha to index it directly with ico and jco.
580  const int tid =
581  threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
582 
583  for (int i = tid; i < ncoset(task.lp);
584  i += blockDim.x * blockDim.y * blockDim.z) {
585  auto &co = coset_inv[i];
586  T reg = 0.0; // accumulate into a register
587  for (int jco = 0; jco < ncoset(task.lb_max); jco++) {
588  const auto &b = coset_inv[jco];
589  for (int ico = 0; ico < ncoset(task.la_max); ico++) {
590  const auto &a = coset_inv[ico];
591  const T p = task.prefactor *
592  alpha[0 * s1 + b.l[0] * s2 + a.l[0] * s3 + co.l[0]] *
593  alpha[1 * s1 + b.l[1] * s2 + a.l[1] * s3 + co.l[1]] *
594  alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + co.l[2]];
595  reg += p * cab[jco * task.n1 + ico]; // collocate
596  }
597  }
598 
599  cxyz[i] = reg;
600  }
601  __syncthreads(); // because of concurrent writes to cxyz / cab
602 }
603 
604 /*******************************************************************************
605  * \brief Transforms coefficients C_xyz into C_ab.
606  ******************************************************************************/
607 template <typename T>
608 __device__ __inline__ static void
609 cxyz_to_cab(const smem_task<T> &task, const T *__restrict__ alpha,
610  const T *__restrict__ cxyz, T *__restrict__ cab) {
611 
612  // *** initialise the coefficient matrix, we transform the sum
613  //
614  // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
615  // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-b_y)**lyb
616  // (z-b_z)**lzb
617  //
618  // into
619  //
620  // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
621  //
622  // where p is the center of the product Gaussian, and lp = la_max + lb_max
623  // (current implementation is l**7)
624 
625  // strides for accessing alpha
626  const int s3 = (task.lp + 1);
627  const int s2 = (task.la_max + 1) * s3;
628  const int s1 = (task.lb_max + 1) * s2;
629 
630  const int tid =
631  threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
632  for (int jco = tid / 8; jco < ncoset(task.lb_max); jco += 8) {
633  const orbital b = coset_inv[jco];
634  for (int ico = tid % 8; ico < ncoset(task.la_max); ico += 8) {
635  const auto &a = coset_inv[ico];
636  T reg = 0.0; // accumulate into a register
637  for (int ic = 0; ic < ncoset(task.lp); ic++) {
638  const auto &co = coset_inv[ic];
639  const T p = task.prefactor *
640  alpha[b.l[0] * s2 + a.l[0] * s3 + co.l[0]] *
641  alpha[s1 + b.l[1] * s2 + a.l[1] * s3 + co.l[1]] *
642  alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + co.l[2]];
643 
644  reg += p * cxyz[ic]; // integrate
645  }
646  cab[jco * task.n1 + ico] = reg; // partial loop coverage -> zero it
647  }
648  }
649 }
650 
651 /*******************************************************************************
652  * \brief Copies a task from global to shared memory.
653  ******************************************************************************/
654 
655 /* Collocate and integrate do not need the sphi matrices, and computing the
656  * coefficients does not need the exponentials themselves (their parameters
657  * are still needed), so the stored information is reduced to save shared
658  * memory (crucial on AMD GPUs to max out occupancy). */
659 template <typename T, typename T3>
660 __device__ __inline__ void
661 fill_smem_task_reduced(const kernel_params &dev, const int task_id,
662  smem_task_reduced<T, T3> &task) {
663  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
664  const auto &glb_task = dev.tasks[task_id];
665  task.zetp = glb_task.zeta + glb_task.zetb;
666  task.radius = glb_task.radius;
667  task.discrete_radius = glb_task.discrete_radius;
668 
669  // angular momentum range for the actual collocate/integrate operation.
670  task.lp =
671  glb_task.la_max + dev.la_max_diff + glb_task.lb_max + dev.lb_max_diff;
672 
673  task.cube_size.x = glb_task.cube_size.x;
674  task.cube_size.y = glb_task.cube_size.y;
675  task.cube_size.z = glb_task.cube_size.z;
676 
677  task.cube_center.x = glb_task.cube_center.x;
678  task.cube_center.y = glb_task.cube_center.y;
679  task.cube_center.z = glb_task.cube_center.z;
680 
681  task.roffset.x = glb_task.roffset.x;
682  task.roffset.y = glb_task.roffset.y;
683  task.roffset.z = glb_task.roffset.z;
684 
685  task.lb_cube.x = glb_task.lb_cube.x;
686  task.lb_cube.y = glb_task.lb_cube.y;
687  task.lb_cube.z = glb_task.lb_cube.z;
688 
689  task.apply_border_mask = glb_task.apply_border_mask;
690  }
691  __syncthreads();
692 }
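
/* Note (illustrative): fill_smem_task_reduced ends with __syncthreads(), so
 * it must be called collectively by all threads of the block: thread (0,0,0)
 * fills the shared struct and the barrier then makes it visible to the whole
 * block. The same holds for fill_smem_task_coef below. */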
693 
694 /*******************************************************************************
695  * \brief Copies a task from global to shared memory and does precomputations.
696  ******************************************************************************/
697 
698 /* Computing the coefficients does not need many of the task parameters
699  * (exponentials, etc.), so only the required fields are copied. */
700 template <typename T>
701 __device__ __inline__ void fill_smem_task_coef(const kernel_params &dev,
702  const int task_id,
703  smem_task<T> &task) {
704  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
705  const auto &glb_task = dev.tasks[task_id];
706  const int iatom = glb_task.iatom;
707  const int jatom = glb_task.jatom;
708  task.zeta = glb_task.zeta;
709  task.zetb = glb_task.zetb;
710 
711  for (int i = 0; i < 3; i++) {
712  task.rab[i] = glb_task.rab[i];
713  task.ra[i] = glb_task.ra[i];
714  task.rb[i] = task.ra[i] + task.rab[i];
715  task.rp[i] =
716  task.ra[i] + task.rab[i] * task.zetb / (task.zeta + task.zetb);
717  }
718 
719  task.prefactor = glb_task.prefactor;
720 
721  // task.radius = glb_task.radius;
722  task.off_diag_twice = glb_task.off_diag_twice;
723 
724  // angular momentum range of basis set
725  const int la_max_basis = glb_task.la_max;
726  const int lb_max_basis = glb_task.lb_max;
727  const int la_min_basis = glb_task.la_min;
728  const int lb_min_basis = glb_task.lb_min;
729 
730  // angular momentum range for the actual collocate/integrate operation.
731  task.la_max = la_max_basis + dev.la_max_diff;
732  task.lb_max = lb_max_basis + dev.lb_max_diff;
733  task.la_min = max(la_min_basis + (int)dev.la_min_diff, 0);
734  task.lb_min = max(lb_min_basis + (int)dev.lb_min_diff, 0);
735  task.lp = task.la_max + task.lb_max;
736 
737  // start of decontracted set, ie. pab and hab
738  task.first_coseta = (la_min_basis > 0) ? ncoset(la_min_basis - 1) : 0;
739  task.first_cosetb = (lb_min_basis > 0) ? ncoset(lb_min_basis - 1) : 0;
740 
741  // size of decontracted set, ie. pab and hab
742  task.ncoseta = ncoset(la_max_basis);
743  task.ncosetb = ncoset(lb_max_basis);
744  // size of the cab matrix
745  task.n1 = ncoset(task.la_max);
746  task.n2 = ncoset(task.lb_max);
747 
748  // size of entire spherical basis
749  task.nsgfa = glb_task.nsgfa; // ibasis.nsgf;
750  task.nsgfb = glb_task.nsgfb;
751 
752  // size of spherical set
753  task.nsgf_seta = glb_task.nsgf_seta;
754  task.nsgf_setb = glb_task.nsgf_setb;
755 
756  // strides of the sphi transformation matrices
757  task.maxcoa = glb_task.maxcoa;
758  task.maxcob = glb_task.maxcob;
759 
760  // transformations from contracted spherical to primitive cartesian basis
761  task.sphia = &dev.sphi_dev[glb_task.ikind][glb_task.sgfa * task.maxcoa +
762  glb_task.ipgf * task.ncoseta];
763  task.sphib = &dev.sphi_dev[glb_task.jkind][glb_task.sgfb * task.maxcob +
764  glb_task.jpgf * task.ncosetb];
765 
766  // Locate current matrix block within the buffer.
767  const int block_offset = dev.block_offsets[glb_task.block_num];
768  task.block_transposed = glb_task.block_transposed;
769  task.pab_block = dev.ptr_dev[0] + block_offset + glb_task.subblock_offset;
770 
771  if (dev.ptr_dev[3] != nullptr) {
772  task.hab_block = dev.ptr_dev[3] + block_offset + glb_task.subblock_offset;
773  if (dev.ptr_dev[4] != nullptr) {
774  task.forces_a = &dev.ptr_dev[4][3 * iatom];
775  task.forces_b = &dev.ptr_dev[4][3 * jatom];
776  }
777  }
778  }
779  __syncthreads();
780 }
781 
782 class smem_parameters {
783 private:
784  int la_max_{-1};
785  int lb_max_{-1};
786  int smem_per_block_{0};
787  int alpha_len_{-1};
788  int cab_len_{-1};
789  int lp_max_{-1};
790  ldiffs_value ldiffs_;
791  int lp_diff_{-1};
792 
793 public:
794  smem_parameters(const ldiffs_value ldiffs, const int lmax) {
795  ldiffs_ = ldiffs;
796  lp_diff_ = ldiffs.la_max_diff + ldiffs.lb_max_diff;
797  la_max_ = lmax + ldiffs.la_max_diff;
798  lb_max_ = lmax + ldiffs.lb_max_diff;
799  lp_max_ = la_max_ + lb_max_;
800 
801  cab_len_ = ncoset(lb_max_) * ncoset(la_max_);
802  alpha_len_ = 3 * (lb_max_ + 1) * (la_max_ + 1) * (lp_max_ + 1);
803  smem_per_block_ = std::max(alpha_len_, 64) * sizeof(double);
804 
805  if (smem_per_block_ > 64 * 1024) {
806  fprintf(stderr,
807  "ERROR: Not enough shared memory in grid_gpu_collocate.\n");
808  fprintf(stderr, "cab_len: %i, ", cab_len_);
809  fprintf(stderr, "alpha_len: %i, ", alpha_len_);
810  fprintf(stderr, "total smem_per_block: %f kb\n\n",
811  smem_per_block_ / 1024.0);
812  abort();
813  }
814  }
815 
817 
818  // copy and move are trivial
819 
820  inline int smem_alpha_offset() const { return 0; }
821 
822  inline int smem_cab_offset() const { return alpha_len_; }
823 
824  inline int smem_cxyz_offset() const { return alpha_len_ + cab_len_; }
825 
826  inline int smem_per_block() const { return smem_per_block_; }
827 
828  inline int lp_diff() const { return lp_diff_; }
829 
830  inline ldiffs_value ldiffs() const { return ldiffs_; }
831 
832  inline int lp_max() const { return lp_max_; }
833 
834  inline int cxyz_len() const { return ncoset(lp_max_); }
835 };
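
/* Usage sketch (illustrative only; kernel and launch-parameter names are
 * hypothetical): smem_parameters is evaluated on the host to size the dynamic
 * shared memory of a collocate/integrate launch, e.g.
 *
 *   const ldiffs_value ldiffs{};  // no extra angular momentum needed
 *   const smem_parameters params(ldiffs, lmax);
 *   hipLaunchKernelGGL(collocate_kernel, grid, block,
 *                      params.smem_per_block(), stream, kernel_parameters);
 *
 * while inside the kernel the shared buffer is partitioned with
 * smem_alpha_offset(), smem_cab_offset() and smem_cxyz_offset(). */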
836 
837 } // namespace rocm_backend
838 #endif