(git:374b731)
Loading...
Searching...
No Matches
grid_hip_internal_header.h
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8/*
9 * Authors :
10 - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
11 - Advanced Micro Devices, Inc.
12*/
13
14#ifndef GRID_HIP_INTERNAL_HEADER_H
15#define GRID_HIP_INTERNAL_HEADER_H
16
17#include <algorithm>
18#include <assert.h>
19#include <hip/hip_runtime.h>
20#include <limits.h>
21#include <math.h>
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25
26#define GRID_DEVICE __device__
27extern "C" {
28#include "../common/grid_basis_set.h"
29#include "../common/grid_constants.h"
30}
31
32#include "grid_hip_context.h"
33
34namespace rocm_backend {
35
36#if defined(__HIP_PLATFORM_NVIDIA__)
37#if __CUDA_ARCH__ < 600
38__device__ __inline__ double atomicAdd(double *address, double val) {
39 unsigned long long int *address_as_ull = (unsigned long long int *)address;
40 unsigned long long int old = *address_as_ull, assumed;
41
42 do {
43 assumed = old;
44 old = atomicCAS(address_as_ull, assumed,
45 __double_as_longlong(val + __longlong_as_double(assumed)));
46
47 // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
48 // NaN)
49 } while (assumed != old);
50
51 return __longlong_as_double(old);
52}
53#endif
54#endif
55
56/*******************************************************************************
57 * \brief Orbital angular momentum.
58 ******************************************************************************/
59struct orbital {
60 int l[3];
61};
62
63__constant__ orbital coset_inv[1330];
64__constant__ int binomial_coef[19][19];
65
66/*******************************************************************************
67 * \brief Differences in angular momentum.
68 ******************************************************************************/
74};
75
76/*******************************************************************************
77 * \brief data needed for calculating the coefficients, forces, and stress
78 ******************************************************************************/
79template <typename T> struct smem_task {
81 T ra[3];
82 T rb[3];
83 T rp[3];
84 T rab[3];
90 char la_max;
91 char lb_max;
92 char la_min;
93 char lb_min;
94 char lp;
95 // size of the cab matrix
96 unsigned short int n1;
97 unsigned short int n2;
98 // size of entire spherical basis
99 unsigned short int nsgfa;
100 unsigned short int nsgfb;
101 // size of spherical set
102 unsigned short int nsgf_seta;
103 unsigned short int nsgf_setb;
104 // start of decontracted set, ie. pab and hab
107 // size of decontracted set, ie. pab and hab
110 // strides of the sphi transformation matrices
113 // pointers matrices
117 // integrate
121};
122
123/*******************************************************************************
124 * \brief data needed for collocate and integrate kernels
125 ******************************************************************************/
134
135/*******************************************************************************
136 * \brief Factorial function, e.g. fac(5) = 5! = 120.
137 * \author Ole Schuett
138 ******************************************************************************/
139__device__ __inline__ double fac(const int i) {
140 static const double table[] = {
141 0.10000000000000000000E+01, 0.10000000000000000000E+01,
142 0.20000000000000000000E+01, 0.60000000000000000000E+01,
143 0.24000000000000000000E+02, 0.12000000000000000000E+03,
144 0.72000000000000000000E+03, 0.50400000000000000000E+04,
145 0.40320000000000000000E+05, 0.36288000000000000000E+06,
146 0.36288000000000000000E+07, 0.39916800000000000000E+08,
147 0.47900160000000000000E+09, 0.62270208000000000000E+10,
148 0.87178291200000000000E+11, 0.13076743680000000000E+13,
149 0.20922789888000000000E+14, 0.35568742809600000000E+15,
150 0.64023737057280000000E+16, 0.12164510040883200000E+18,
151 0.24329020081766400000E+19, 0.51090942171709440000E+20,
152 0.11240007277776076800E+22, 0.25852016738884976640E+23,
153 0.62044840173323943936E+24, 0.15511210043330985984E+26,
154 0.40329146112660563558E+27, 0.10888869450418352161E+29,
155 0.30488834461171386050E+30, 0.88417619937397019545E+31,
156 0.26525285981219105864E+33};
157 return table[i];
158}
159
160/*******************************************************************************
161 * \brief Number of Cartesian orbitals up to given angular momentum quantum.
162 * \author Ole Schuett
163 ******************************************************************************/
164__host__ __device__ __inline__ int ncoset(const int l) {
165 static const int table[] = {1, // l=0
166 4, // l=1
167 10, // l=2 ...
168 20, 35, 56, 84, 120, 165, 220, 286,
169 364, 455, 560, 680, 816, 969, 1140, 1330};
170 return table[l];
171}
172
173/*******************************************************************************
174 * \brief Maps three angular momentum components to a single zero based index.
175 ******************************************************************************/
176__host__ __device__ __inline__ int coset(int lx, int ly, int lz) {
177 const int l = lx + ly + lz;
178 if (l == 0) {
179 return 0;
180 } else {
181 return ncoset(l - 1) + ((l - lx) * (l - lx + 1)) / 2 + lz;
182 }
183}
184
185/*******************************************************************************
186 * \brief Increase i'th component of given orbital angular momentum.
187 ******************************************************************************/
188__device__ __inline__ orbital up(const int i, const orbital &a) {
189 orbital b = a;
190 b.l[i] += 1;
191 return b;
192}
193
194/*******************************************************************************
195 * \brief Decrease i'th component of given orbital angular momentum.
196 ******************************************************************************/
197__inline__ __device__ orbital down(const int i, const orbital &a) {
198 orbital b = a;
199 b.l[i] = max(0, a.l[i] - 1);
200 return b;
201}
202
203/*******************************************************************************
204 * \brief Return coset index of given orbital angular momentum.
205 ******************************************************************************/
206__inline__ __device__ int idx(const orbital a) {
207 return coset(a.l[0], a.l[1], a.l[2]);
208}
209
210__device__ __inline__ double power(const double x, const int expo) {
211 double tmp = 1.0;
212 for (int i = 1; i <= expo; i++)
213 tmp *= x;
214 return tmp;
215}
216
217/*******************************************************************************
218 * \brief Adds given value to matrix element cab[idx(b)][idx(a)].
219 ******************************************************************************/
220template <typename T = double>
221__device__ __inline__ void prep_term(const orbital a, const orbital b,
222 const T value, const int n, T *cab) {
223 atomicAdd(&cab[idx(b) * n + idx(a)], value);
224}
225
226/*******************************************************************************
227 * \brief Initializes the device's constant memory.
228 * \author Ole Schuett
229 ******************************************************************************/
230inline static void init_constant_memory() {
231 static bool initialized = false;
232 if (initialized) {
233 return; // constant memory has to be initialized only once
234 }
235
236 // Inverse coset mapping
237 orbital coset_inv_host[1330];
238 for (int lx = 0; lx <= 18; lx++) {
239 for (int ly = 0; ly <= 18 - lx; ly++) {
240 for (int lz = 0; lz <= 18 - lx - ly; lz++) {
241 const int i = coset(lx, ly, lz);
242 coset_inv_host[i] = {{lx, ly, lz}};
243 }
244 }
245 }
246 hipError_t error =
247 hipMemcpyToSymbol(coset_inv, &coset_inv_host, sizeof(coset_inv_host), 0,
248 hipMemcpyHostToDevice);
249 assert(error == hipSuccess);
250
251 // Binomial coefficient
252 int binomial_coef_host[19][19] = {
253 {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
254 {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
255 {1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
256 {1, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
257 {1, 4, 6, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
258 {1, 5, 10, 10, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
259 {1, 6, 15, 20, 15, 6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
260 {1, 7, 21, 35, 35, 21, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
261 {1, 8, 28, 56, 70, 56, 28, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
262 {1, 9, 36, 84, 126, 126, 84, 36, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
263 {1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0},
264 {1, 11, 55, 165, 330, 462, 462, 330, 165, 55, 11, 1, 0, 0, 0, 0, 0, 0, 0},
265 {1, 12, 66, 220, 495, 792, 924, 792, 495, 220, 66, 12, 1, 0, 0, 0, 0, 0,
266 0},
267 {1, 13, 78, 286, 715, 1287, 1716, 1716, 1287, 715, 286, 78, 13, 1, 0, 0,
268 0, 0, 0},
269 {1, 14, 91, 364, 1001, 2002, 3003, 3432, 3003, 2002, 1001, 364, 91, 14, 1,
270 0, 0, 0, 0},
271 {1, 15, 105, 455, 1365, 3003, 5005, 6435, 6435, 5005, 3003, 1365, 455,
272 105, 15, 1, 0, 0, 0},
273 {1, 16, 120, 560, 1820, 4368, 8008, 11440, 12870, 11440, 8008, 4368, 1820,
274 560, 120, 16, 1, 0, 0},
275 {1, 17, 136, 680, 2380, 6188, 12376, 19448, 24310, 24310, 19448, 12376,
276 6188, 2380, 680, 136, 17, 1, 0},
277 {1, 18, 153, 816, 3060, 8568, 18564, 31824, 43758, 48620, 43758, 31824,
278 18564, 8568, 3060, 816, 153, 18, 1}};
279 error =
280 hipMemcpyToSymbol(binomial_coef, &binomial_coef_host[0][0],
281 sizeof(binomial_coef_host), 0, hipMemcpyHostToDevice);
282 assert(error == hipSuccess);
283
284 initialized = true;
285}
286
287__inline__ __device__ double3
288compute_coordinates(const double *__restrict__ dh_, const double x,
289 const double y, const double z) {
290
291 double3 r3;
292 // I make no distinction between orthorhombic and non orthorhombic
293 // cases
294
295 r3.x = z * dh_[6] + y * dh_[3] + x * dh_[0];
296 r3.y = z * dh_[7] + y * dh_[4] + x * dh_[1];
297 r3.z = z * dh_[8] + y * dh_[5] + x * dh_[2];
298 return r3;
299}
300
301__inline__ __device__ float3 compute_coordinates(const float *__restrict__ dh_,
302 const float x, const float y,
303 const float z) {
304
305 float3 r3;
306 // I make no distinction between orthorhombic and non orthorhombic
307 // cases
308
309 r3.x = z * dh_[6] + y * dh_[3] + x * dh_[0];
310 r3.y = z * dh_[7] + y * dh_[4] + x * dh_[1];
311 r3.z = z * dh_[8] + y * dh_[5] + x * dh_[2];
312 return r3;
313}
314
315/*******************************************************************************
316 * \brief Computes the polynomial expansion coefficients:
317 * (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
318 ******************************************************************************/
319template <typename T>
320__inline__ __device__ void compute_alpha(const smem_task<T> &task,
321 T *__restrict__ alpha) {
322 // strides for accessing alpha
323 const int s3 = (task.lp + 1);
324 const int s2 = (task.la_max + 1) * s3;
325 const int s1 = (task.lb_max + 1) * s2;
326 const int tid =
327 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
328 for (int i = tid; i < 3 * s1; i += blockDim.x * blockDim.y * blockDim.z)
329 alpha[i] = 0.0;
330
331 __syncthreads();
332
333 for (int idir = threadIdx.z; idir < 3; idir += blockDim.z) {
334 const T drpa = task.rp[idir] - task.ra[idir];
335 const T drpb = task.rp[idir] - task.rb[idir];
336 for (int la = threadIdx.y; la <= task.la_max; la += blockDim.y) {
337 for (int lb = threadIdx.x; lb <= task.lb_max; lb += blockDim.x) {
338 T a = 1.0;
339 for (int k = 0; k <= la; k++) {
340 T b = 1.0;
341 const int base = idir * s1 + lb * s2 + la * s3;
342 for (int l = 0; l <= lb; l++) {
343 alpha[base + la - l + lb - k] +=
344 a * b * binomial_coef[la][k] * binomial_coef[lb][l];
345 b *= drpb;
346 }
347 a *= drpa;
348 }
349 }
350 }
351 }
352 __syncthreads(); // because of concurrent writes to alpha
353}
354
355__host__ __inline__ __device__ void
356convert_to_lattice_coordinates(const double *dh_inv_,
357 const double3 *__restrict__ const rp,
358 double3 *__restrict__ rp_c) {
359 rp_c->x = dh_inv_[0] * rp->x + dh_inv_[3] * rp->y + dh_inv_[6] * rp->z;
360 rp_c->y = dh_inv_[1] * rp->x + dh_inv_[4] * rp->y + dh_inv_[7] * rp->z;
361 rp_c->z = dh_inv_[2] * rp->x + dh_inv_[5] * rp->y + dh_inv_[8] * rp->z;
362}
363
364__host__ __inline__ __device__ void
366 const double *__restrict__ dh_, const double3 *__restrict__ const rp,
367 double3 *__restrict__ rp_c) {
368 rp_c->x = dh_[0] * rp->x + dh_[3] * rp->y + dh_[6] * rp->z;
369 rp_c->y = dh_[1] * rp->x + dh_[4] * rp->y + dh_[7] * rp->z;
370 rp_c->z = dh_[2] * rp->x + dh_[5] * rp->y + dh_[8] * rp->z;
371}
372
373__host__ __inline__ __device__ void
375 const float3 *__restrict__ const rp,
376 float3 *__restrict__ rp_c) {
377 rp_c->x = dh_inv_[0] * rp->x + dh_inv_[3] * rp->y + dh_inv_[6] * rp->z;
378 rp_c->y = dh_inv_[1] * rp->x + dh_inv_[4] * rp->y + dh_inv_[7] * rp->z;
379 rp_c->z = dh_inv_[2] * rp->x + dh_inv_[5] * rp->y + dh_inv_[8] * rp->z;
380}
381
382__host__ __inline__ __device__ void
384 const float *__restrict__ dh_, const float3 *__restrict__ const rp,
385 float3 *__restrict__ rp_c) {
386 rp_c->x = dh_[0] * rp->x + dh_[3] * rp->y + dh_[6] * rp->z;
387 rp_c->y = dh_[1] * rp->x + dh_[4] * rp->y + dh_[7] * rp->z;
388 rp_c->z = dh_[2] * rp->x + dh_[5] * rp->y + dh_[8] * rp->z;
389}
390
391template <typename T, typename T3, bool orthorhombic_>
393 const T radius, const T *const __restrict__ dh_,
394 const T *const __restrict__ dh_inv_, const T3 *__restrict__ rp,
395 T3 *__restrict__ roffset, int3 *__restrict__ cubecenter,
396 int3 *__restrict__ lb_cube, int3 *__restrict__ cube_size) {
397
398 /* center of the gaussian in the lattice coordinates */
399 T3 rp1, rp2, rp3;
400
401 /* it is in the lattice vector frame */
402 convert_to_lattice_coordinates(dh_inv_, rp, &rp1);
403
404 /* compute the grid point that is the closest to the sphere center. */
405 cubecenter->x = std::floor(rp1.x);
406 cubecenter->y = std::floor(rp1.y);
407 cubecenter->z = std::floor(rp1.z);
408
409 /* seting up the cube parameters */
410
411 if (orthorhombic_) {
412
413 // the cube is actually slightly bigger than the sphere of radius r. that's
414 // why we need to discretize it to get the cube size "right".
415
416 // disc_radius >= radius always. somehow despite the fact that we compile
417 // things with a c++ compiler on the host side, we need to use fmin instead
418 // of std::min since std::min is not allowed on the device side
419
420 // We assume no specific form for the orthogonal matrix. (can be diaognal or
421 // completely full, the only constraint is that the tree vectors are
422 // orthogonal)
423
424 T norm1, norm2, norm3;
425 norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
426 norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
427 norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];
428
429 norm1 = std::sqrt(norm1);
430 norm2 = std::sqrt(norm2);
431 norm3 = std::sqrt(norm3);
432
433 const T disr_radius =
434 std::min(norm1, std::min(norm2, norm3)) *
435 std::max(1,
436 (int)ceil(radius / std::min(norm1, std::min(norm2, norm3))));
437
438 rp2.x = cubecenter->x;
439 rp2.y = cubecenter->y;
440 rp2.z = cubecenter->z;
441
442 /* convert the cube center from grid points coordinates to cartesian */
444 /* cube center */
445 roffset->x -= rp->x;
446 roffset->y -= rp->y;
447 roffset->z -= rp->z;
448
449 rp2.x = disr_radius;
450 rp2.y = disr_radius;
451 rp2.z = disr_radius;
452
453 /* it is in the lattice vector frame */
454 convert_to_lattice_coordinates(dh_inv_, &rp2, &rp3);
455 /* lower and upper bounds */
456 lb_cube->x = std::ceil(-1e-8 - rp3.x);
457 lb_cube->y = std::ceil(-1e-8 - rp3.y);
458 lb_cube->z = std::ceil(-1e-8 - rp3.z);
459
460 /* it is in the lattice vector frame */
461 convert_to_lattice_coordinates(dh_inv_, roffset, &rp2);
462
463 /* Express the offset in lattice coordinates */
464 roffset->x = rp2.x;
465 roffset->y = rp2.y;
466 roffset->z = rp2.z;
467
468 /* compute the cube size ignoring periodicity */
469 /* the interval is not symmetrical for some curious reasons. it should go
470 * from [-L..L+1] so the number of points is multiple of two */
471 cube_size->x = 2 - 2 * lb_cube->x;
472 cube_size->y = 2 - 2 * lb_cube->y;
473 cube_size->z = 2 - 2 * lb_cube->z;
474 return disr_radius;
475 } else {
476 int3 ub_cube;
477
478 lb_cube->x = INT_MAX;
479 ub_cube.x = INT_MIN;
480 lb_cube->y = INT_MAX;
481 ub_cube.y = INT_MIN;
482 lb_cube->z = INT_MAX;
483 ub_cube.z = INT_MIN;
484
485 for (int i = -1; i <= 1; i++) {
486 for (int j = -1; j <= 1; j++) {
487 for (int k = -1; k <= 1; k++) {
488 T3 r;
489 r.x = rp->x + ((T)i) * radius;
490 r.y = rp->y + ((T)j) * radius;
491 r.z = rp->z + ((T)k) * radius;
492 convert_to_lattice_coordinates(dh_inv_, &r, roffset);
493
494 lb_cube->x = std::min(lb_cube->x, (int)std::floor(roffset->x));
495 ub_cube.x = std::max(ub_cube.x, (int)std::ceil(roffset->x));
496
497 lb_cube->y = std::min(lb_cube->y, (int)std::floor(roffset->y));
498 ub_cube.y = std::max(ub_cube.y, (int)std::ceil(roffset->y));
499
500 lb_cube->z = std::min(lb_cube->z, (int)std::floor(roffset->z));
501 ub_cube.z = std::max(ub_cube.z, (int)std::ceil(roffset->z));
502 }
503 }
504 }
505 /* compute the cube size ignoring periodicity */
506 cube_size->x = ub_cube.x - lb_cube->x;
507 cube_size->y = ub_cube.y - lb_cube->y;
508 cube_size->z = ub_cube.z - lb_cube->z;
509
510 /* compute the offset in lattice coordinates */
511
512 roffset->x = cubecenter->x - rp1.x;
513 roffset->y = cubecenter->y - rp1.y;
514 roffset->z = cubecenter->z - rp1.z;
515
516 // shift the boundaries compared to the cube center so that the
517 // specialization ortho / non ortho is minimal
518 lb_cube->x -= cubecenter->x;
519 lb_cube->y -= cubecenter->y;
520 lb_cube->z -= cubecenter->z;
521
522 return radius;
523 }
524}
525
526__inline__ __device__ void compute_window_size(const int *const grid_size,
527 const int border_mask,
528 const int *border_width,
529 int3 *const window_size,
530 int3 *const window_shift) {
531 window_shift->x = 0;
532 window_shift->y = 0;
533 window_shift->z = 0;
534
535 window_size->x = grid_size[2] - 1;
536 window_size->y = grid_size[1] - 1;
537 window_size->z = grid_size[0] - 1;
538
539 if (border_mask & (1 << 0))
540 window_shift->x += border_width[2];
541 if (border_mask & (1 << 1))
542 window_size->x -= border_width[2];
543 if (border_mask & (1 << 2))
544 window_shift->y += border_width[1];
545 if (border_mask & (1 << 3))
546 window_size->y -= border_width[1];
547 if (border_mask & (1 << 4))
548 window_shift->z += border_width[0];
549 if (border_mask & (1 << 5))
550 window_size->z -= border_width[0];
551}
552
553/*******************************************************************************
554 * \brief Transforms coefficients C_ab into C_xyz.
555 ******************************************************************************/
556template <typename T>
557__device__ __inline__ static void
558cab_to_cxyz(const smem_task<T> &task, const T *__restrict__ alpha,
559 const T *__restrict__ cab, T *__restrict__ cxyz) {
560
561 // *** initialise the coefficient matrix, we transform the sum
562 //
563 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
564 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
565 // (z-a_z)**lza
566 //
567 // into
568 //
569 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
570 //
571 // where p is center of the product gaussian, and lp = la_max + lb_max
572 // (current implementation is l**7)
573
574 // strides for accessing alpha
575 const int s3 = (task.lp + 1);
576 const int s2 = (task.la_max + 1) * s3;
577 const int s1 = (task.lb_max + 1) * s2;
578
579 // TODO: Maybe we can transpose alpha to index it directly with ico and jco.
580 const int tid =
581 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
582
583 for (int i = tid; i < ncoset(task.lp);
584 i += blockDim.x * blockDim.y * blockDim.z) {
585 auto &co = coset_inv[i];
586 T reg = 0.0; // accumulate into a register
587 for (int jco = 0; jco < ncoset(task.lb_max); jco++) {
588 const auto &b = coset_inv[jco];
589 for (int ico = 0; ico < ncoset(task.la_max); ico++) {
590 const auto &a = coset_inv[ico];
591 const T p = task.prefactor *
592 alpha[0 * s1 + b.l[0] * s2 + a.l[0] * s3 + co.l[0]] *
593 alpha[1 * s1 + b.l[1] * s2 + a.l[1] * s3 + co.l[1]] *
594 alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + co.l[2]];
595 reg += p * cab[jco * task.n1 + ico]; // collocate
596 }
597 }
598
599 cxyz[i] = reg;
600 }
601 __syncthreads(); // because of concurrent writes to cxyz / cab
602}
603
604/*******************************************************************************
605 * \brief Transforms coefficients C_xyz into C_ab.
606 ******************************************************************************/
607template <typename T>
608__device__ __inline__ static void
609cxyz_to_cab(const smem_task<T> &task, const T *__restrict__ alpha,
610 const T *__restrict__ cxyz, T *__restrict__ cab) {
611
612 // *** initialise the coefficient matrix, we transform the sum
613 //
614 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
615 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
616 // (z-a_z)**lza
617 //
618 // into
619 //
620 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
621 //
622 // where p is center of the product gaussian, and lp = la_max + lb_max
623 // (current implementation is l**7)
624
625 // strides for accessing alpha
626 const int s3 = (task.lp + 1);
627 const int s2 = (task.la_max + 1) * s3;
628 const int s1 = (task.lb_max + 1) * s2;
629
630 const int tid =
631 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
632 for (int jco = tid / 8; jco < ncoset(task.lb_max); jco += 8) {
633 const orbital b = coset_inv[jco];
634 for (int ico = tid % 8; ico < ncoset(task.la_max); ico += 8) {
635 const auto &a = coset_inv[ico];
636 T reg = 0.0; // accumulate into a register
637 for (int ic = 0; ic < ncoset(task.lp); ic++) {
638 const auto &co = coset_inv[ic];
639 const T p = task.prefactor *
640 alpha[b.l[0] * s2 + a.l[0] * s3 + co.l[0]] *
641 alpha[s1 + b.l[1] * s2 + a.l[1] * s3 + co.l[1]] *
642 alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + co.l[2]];
643
644 reg += p * cxyz[ic]; // integrate
645 }
646 cab[jco * task.n1 + ico] = reg; // partial loop coverage -> zero it
647 }
648 }
649}
650
651/*******************************************************************************
652 * \brief Copies a task from global to shared memory.
653 ******************************************************************************/
654
655/* Collocate and integrate do not care about the sphi etc... computing the
656 * coefficients do not care about the exponentials (the parameters are still
657 * needed), so simplify the amount of information to save shared memory (it is
658 * crucial on AMD GPU to max out occupancy) */
659template <typename T, typename T3>
660__device__ __inline__ void
661fill_smem_task_reduced(const kernel_params &dev, const int task_id,
663 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
664 const auto &glb_task = dev.tasks[task_id];
665 task.zetp = glb_task.zeta + glb_task.zetb;
666 task.radius = glb_task.radius;
667 task.discrete_radius = glb_task.discrete_radius;
668
669 // angular momentum range for the actual collocate/integrate operation.
670 task.lp =
671 glb_task.la_max + dev.la_max_diff + glb_task.lb_max + dev.lb_max_diff;
672
673 task.cube_size.x = glb_task.cube_size.x;
674 task.cube_size.y = glb_task.cube_size.y;
675 task.cube_size.z = glb_task.cube_size.z;
676
677 task.cube_center.x = glb_task.cube_center.x;
678 task.cube_center.y = glb_task.cube_center.y;
679 task.cube_center.z = glb_task.cube_center.z;
680
681 task.roffset.x = glb_task.roffset.x;
682 task.roffset.y = glb_task.roffset.y;
683 task.roffset.z = glb_task.roffset.z;
684
685 task.lb_cube.x = glb_task.lb_cube.x;
686 task.lb_cube.y = glb_task.lb_cube.y;
687 task.lb_cube.z = glb_task.lb_cube.z;
688
689 task.apply_border_mask = glb_task.apply_border_mask;
690 }
691 __syncthreads();
692}
693
694/*******************************************************************************
695 * \brief Copies a task from global to shared memory and does precomputations.
696 ******************************************************************************/
697
698/* computing the coefficients do not care about many of the exponentials
699 * parameters, etc.. so */
700template <typename T>
701__device__ __inline__ void fill_smem_task_coef(const kernel_params &dev,
702 const int task_id,
703 smem_task<T> &task) {
704 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
705 const auto &glb_task = dev.tasks[task_id];
706 const int iatom = glb_task.iatom;
707 const int jatom = glb_task.jatom;
708 task.zeta = glb_task.zeta;
709 task.zetb = glb_task.zetb;
710
711 for (int i = 0; i < 3; i++) {
712 task.rab[i] = glb_task.rab[i];
713 task.ra[i] = glb_task.ra[i];
714 task.rb[i] = task.ra[i] + task.rab[i];
715 task.rp[i] =
716 task.ra[i] + task.rab[i] * task.zetb / (task.zeta + task.zetb);
717 }
718
719 task.prefactor = glb_task.prefactor;
720
721 // task.radius = glb_task.radius;
722 task.off_diag_twice = glb_task.off_diag_twice;
723
724 // angular momentum range of basis set
725 const int la_max_basis = glb_task.la_max;
726 const int lb_max_basis = glb_task.lb_max;
727 const int la_min_basis = glb_task.la_min;
728 const int lb_min_basis = glb_task.lb_min;
729
730 // angular momentum range for the actual collocate/integrate opteration.
731 task.la_max = la_max_basis + dev.la_max_diff;
732 task.lb_max = lb_max_basis + dev.lb_max_diff;
733 task.la_min = max(la_min_basis + (int)dev.la_min_diff, 0);
734 task.lb_min = max(lb_min_basis + (int)dev.lb_min_diff, 0);
735 task.lp = task.la_max + task.lb_max;
736
737 // start of decontracted set, ie. pab and hab
738 task.first_coseta = (la_min_basis > 0) ? ncoset(la_min_basis - 1) : 0;
739 task.first_cosetb = (lb_min_basis > 0) ? ncoset(lb_min_basis - 1) : 0;
740
741 // size of decontracted set, ie. pab and hab
742 task.ncoseta = ncoset(la_max_basis);
743 task.ncosetb = ncoset(lb_max_basis);
744 // size of the cab matrix
745 task.n1 = ncoset(task.la_max);
746 task.n2 = ncoset(task.lb_max);
747
748 // size of entire spherical basis
749 task.nsgfa = glb_task.nsgfa; // ibasis.nsgf;
750 task.nsgfb = glb_task.nsgfb;
751
752 // size of spherical set
753 task.nsgf_seta = glb_task.nsgf_seta;
754 task.nsgf_setb = glb_task.nsgf_setb;
755
756 // strides of the sphi transformation matrices
757 task.maxcoa = glb_task.maxcoa;
758 task.maxcob = glb_task.maxcob;
759
760 // transformations from contracted spherical to primitive cartesian basis
761 task.sphia = &dev.sphi_dev[glb_task.ikind][glb_task.sgfa * task.maxcoa +
762 glb_task.ipgf * task.ncoseta];
763 task.sphib = &dev.sphi_dev[glb_task.jkind][glb_task.sgfb * task.maxcob +
764 glb_task.jpgf * task.ncosetb];
765
766 // Locate current matrix block within the buffer.
767 const int block_offset = dev.block_offsets[glb_task.block_num];
768 task.block_transposed = glb_task.block_transposed;
769 task.pab_block = dev.ptr_dev[0] + block_offset + glb_task.subblock_offset;
770
771 if (dev.ptr_dev[3] != nullptr) {
772 task.hab_block = dev.ptr_dev[3] + block_offset + glb_task.subblock_offset;
773 if (dev.ptr_dev[4] != nullptr) {
774 task.forces_a = &dev.ptr_dev[4][3 * iatom];
775 task.forces_b = &dev.ptr_dev[4][3 * jatom];
776 }
777 }
778 }
779 __syncthreads();
780}
781
783private:
784 int la_max_{-1};
785 int lb_max_{-1};
786 int smem_per_block_{0};
787 int alpha_len_{-1};
788 int cab_len_{-1};
789 int lp_max_{-1};
790 ldiffs_value ldiffs_;
791 int lp_diff_{-1};
792
793public:
794 smem_parameters(const ldiffs_value ldiffs, const int lmax) {
795 ldiffs_ = ldiffs;
797 la_max_ = lmax + ldiffs.la_max_diff;
798 lb_max_ = lmax + ldiffs.lb_max_diff;
799 lp_max_ = la_max_ + lb_max_;
800
801 cab_len_ = ncoset(lb_max_) * ncoset(la_max_);
802 alpha_len_ = 3 * (lb_max_ + 1) * (la_max_ + 1) * (lp_max_ + 1);
803 smem_per_block_ = std::max(alpha_len_, 64) * sizeof(double);
804
805 if (smem_per_block_ > 64 * 1024) {
806 fprintf(stderr,
807 "ERROR: Not enough shared memory in grid_gpu_collocate.\n");
808 fprintf(stderr, "cab_len: %i, ", cab_len_);
809 fprintf(stderr, "alpha_len: %i, ", alpha_len_);
810 fprintf(stderr, "total smem_per_block: %f kb\n\n",
811 smem_per_block_ / 1024.0);
812 abort();
813 }
814 }
815
817
818 // copy and move are trivial
819
820 inline int smem_alpha_offset() const { return 0; }
821
822 inline int smem_cab_offset() const { return alpha_len_; }
823
824 inline int smem_cxyz_offset() const { return alpha_len_ + cab_len_; }
825
826 inline int smem_per_block() const { return smem_per_block_; }
827
828 inline int lp_diff() const { return lp_diff_; }
829
830 inline ldiffs_value ldiffs() const { return ldiffs_; }
831
832 inline int lp_max() const { return lp_max_; }
833
834 inline int cxyz_len() const { return ncoset(lp_max_); }
835};
836
837} // namespace rocm_backend
838#endif
smem_parameters(const ldiffs_value ldiffs, const int lmax)
static void const int const int i
__inline__ __device__ void compute_window_size(const int *const grid_size, const int border_mask, const int *border_width, int3 *const window_size, int3 *const window_shift)
__host__ __inline__ __device__ void convert_from_lattice_coordinates_to_cartesian(const double *__restrict__ dh_, const double3 *__restrict__ const rp, double3 *__restrict__ rp_c)
__device__ static __inline__ void cab_to_cxyz(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cab, T *__restrict__ cxyz)
Transforms coefficients C_ab into C_xyz.
__device__ __inline__ void prep_term(const orbital a, const orbital b, const T value, const int n, T *cab)
Adds given value to matrix element cab[idx(b)][idx(a)].
__device__ __inline__ void fill_smem_task_coef(const kernel_params &dev, const int task_id, smem_task< T > &task)
Copies a task from global to shared memory and does precomputations.
__device__ __inline__ double power(const double x, const int expo)
__inline__ __device__ orbital down(const int i, const orbital &a)
Decrease i'th component of given orbital angular momentum.
__host__ __inline__ __device__ void convert_to_lattice_coordinates(const double *dh_inv_, const double3 *__restrict__ const rp, double3 *__restrict__ rp_c)
static void init_constant_memory()
Initializes the device's constant memory.
__device__ static __inline__ void cxyz_to_cab(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cxyz, T *__restrict__ cab)
Transforms coefficients C_xyz into C_ab.
__inline__ __device__ int idx(const orbital a)
Return coset index of given orbital angular momentum.
__constant__ orbital coset_inv[1330]
__device__ __inline__ orbital up(const int i, const orbital &a)
Increase i'th component of given orbital angular momentum.
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
__device__ __inline__ void fill_smem_task_reduced(const kernel_params &dev, const int task_id, smem_task_reduced< T, T3 > &task)
Copies a task from global to shared memory.
__inline__ __device__ double3 compute_coordinates(const double *__restrict__ dh_, const double x, const double y, const double z)
__constant__ int binomial_coef[19][19]
__inline__ T compute_cube_properties(const T radius, const T *const __restrict__ dh_, const T *const __restrict__ dh_inv_, const T3 *__restrict__ rp, T3 *__restrict__ roffset, int3 *__restrict__ cubecenter, int3 *__restrict__ lb_cube, int3 *__restrict__ cube_size)
Parameters of the collocate kernel.
Differences in angular momentum.
Orbital angular momentum.
data needed for collocate and integrate kernels
data needed for calculating the coefficients, forces, and stress