(git:6768d83)
Loading...
Searching...
No Matches
grid_gpu_internal_header.h
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8/*
9 * Authors :
10 - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
11 - Advanced Micro Devices, Inc.
12*/
13
14#ifndef GRID_GPU_INTERNAL_HEADER_H
15#define GRID_GPU_INTERNAL_HEADER_H
16
17#include <algorithm>
18#include <assert.h>
19#include <limits.h>
20#include <math.h>
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24
25#define GRID_DEVICE __device__
26extern "C" {
27#include "../common/grid_basis_set.h"
28#include "../common/grid_constants.h"
29}
30
31#include "grid_gpu_context.h"
32
33namespace rocm_backend {
34
35#if defined(__HIP_PLATFORM_NVIDIA__)
36#if __CUDA_ARCH__ < 600
37__device__ __inline__ double atomicAdd(double *address, double val) {
38 unsigned long long int *address_as_ull = (unsigned long long int *)address;
39 unsigned long long int old = *address_as_ull, assumed;
40
41 do {
42 assumed = old;
43 old = atomicCAS(address_as_ull, assumed,
44 __double_as_longlong(val + __longlong_as_double(assumed)));
45
46 // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
47 // NaN)
48 } while (assumed != old);
49
50 return __longlong_as_double(old);
51}
52#endif
53#endif
54
55/*******************************************************************************
56 * \brief Orbital angular momentum.
57 ******************************************************************************/
58struct orbital {
59 int l[3];
60};
61
62__constant__ orbital coset_inv[1330];
63__constant__ int binomial_coef[19][19];
64
65/*******************************************************************************
66 * \brief Differences in angular momentum.
67 ******************************************************************************/
73};
74
75/*******************************************************************************
76 * \brief data needed for calculating the coefficients, forces, and stress
77 ******************************************************************************/
78template <typename T> struct smem_task {
80 T ra[3];
81 T rb[3];
82 T rp[3];
83 T rab[3];
89 char la_max;
90 char lb_max;
91 char la_min;
92 char lb_min;
93 char lp;
94 // size of the cab matrix
95 unsigned short int n1;
96 unsigned short int n2;
97 // size of entire spherical basis
98 unsigned short int nsgfa;
99 unsigned short int nsgfb;
100 // size of spherical set
101 unsigned short int nsgf_seta;
102 unsigned short int nsgf_setb;
103 // start of decontracted set, ie. pab and hab
106 // size of decontracted set, ie. pab and hab
109 // strides of the sphi transformation matrices
112 // pointers matrices
116 // integrate
120};
121
122/*******************************************************************************
123 * \brief data needed for collocate and integrate kernels
124 ******************************************************************************/
133
134/*******************************************************************************
135 * \brief Factorial function, e.g. fac(5) = 5! = 120.
136 * \author Ole Schuett
137 ******************************************************************************/
138__device__ __inline__ double fac(const int i) {
139 static const double table[] = {
140 0.10000000000000000000E+01, 0.10000000000000000000E+01,
141 0.20000000000000000000E+01, 0.60000000000000000000E+01,
142 0.24000000000000000000E+02, 0.12000000000000000000E+03,
143 0.72000000000000000000E+03, 0.50400000000000000000E+04,
144 0.40320000000000000000E+05, 0.36288000000000000000E+06,
145 0.36288000000000000000E+07, 0.39916800000000000000E+08,
146 0.47900160000000000000E+09, 0.62270208000000000000E+10,
147 0.87178291200000000000E+11, 0.13076743680000000000E+13,
148 0.20922789888000000000E+14, 0.35568742809600000000E+15,
149 0.64023737057280000000E+16, 0.12164510040883200000E+18,
150 0.24329020081766400000E+19, 0.51090942171709440000E+20,
151 0.11240007277776076800E+22, 0.25852016738884976640E+23,
152 0.62044840173323943936E+24, 0.15511210043330985984E+26,
153 0.40329146112660563558E+27, 0.10888869450418352161E+29,
154 0.30488834461171386050E+30, 0.88417619937397019545E+31,
155 0.26525285981219105864E+33};
156 return table[i];
157}
158
159/*******************************************************************************
160 * \brief Number of Cartesian orbitals up to given angular momentum quantum.
161 * \author Ole Schuett
162 ******************************************************************************/
163__host__ __device__ __inline__ int ncoset(const int l) {
164 static const int table[] = {1, // l=0
165 4, // l=1
166 10, // l=2 ...
167 20, 35, 56, 84, 120, 165, 220, 286,
168 364, 455, 560, 680, 816, 969, 1140, 1330};
169 return table[l];
170}
171
172/*******************************************************************************
173 * \brief Maps three angular momentum components to a single zero based index.
174 ******************************************************************************/
175__host__ __device__ __inline__ int coset(int lx, int ly, int lz) {
176 const int l = lx + ly + lz;
177 if (l == 0) {
178 return 0;
179 } else {
180 return ncoset(l - 1) + ((l - lx) * (l - lx + 1)) / 2 + lz;
181 }
182}
183
184/*******************************************************************************
185 * \brief Increase i'th component of given orbital angular momentum.
186 ******************************************************************************/
187__device__ __inline__ orbital up(const int i, const orbital &a) {
188 orbital b = a;
189 b.l[i] += 1;
190 return b;
191}
192
193/*******************************************************************************
194 * \brief Decrease i'th component of given orbital angular momentum.
195 ******************************************************************************/
196__inline__ __device__ orbital down(const int i, const orbital &a) {
197 orbital b = a;
198 b.l[i] = max(0, a.l[i] - 1);
199 return b;
200}
201
202/*******************************************************************************
203 * \brief Return coset index of given orbital angular momentum.
204 ******************************************************************************/
205__inline__ __device__ int idx(const orbital a) {
206 return coset(a.l[0], a.l[1], a.l[2]);
207}
208
209__device__ __inline__ double power(const double x, const int expo) {
210 double tmp = 1.0;
211 for (int i = 1; i <= expo; i++)
212 tmp *= x;
213 return tmp;
214}
215
216/*******************************************************************************
217 * \brief Adds given value to matrix element cab[idx(b)][idx(a)].
218 ******************************************************************************/
219template <typename T = double>
220__device__ __inline__ void prep_term(const orbital a, const orbital b,
221 const T value, const int n, T *cab) {
222 atomicAdd(&cab[idx(b) * n + idx(a)], value);
223}
224
225/*******************************************************************************
226 * \brief Initializes the device's constant memory.
227 * \author Ole Schuett
228 ******************************************************************************/
229inline static void init_constant_memory() {
230 static bool initialized = false;
231 if (initialized) {
232 return; // constant memory has to be initialized only once
233 }
234
235 // Inverse coset mapping
236 orbital coset_inv_host[1330];
237 for (int lx = 0; lx <= 18; lx++) {
238 for (int ly = 0; ly <= 18 - lx; ly++) {
239 for (int lz = 0; lz <= 18 - lx - ly; lz++) {
240 const int i = coset(lx, ly, lz);
241 coset_inv_host[i] = {{lx, ly, lz}};
242 }
243 }
244 }
245
246 offloadMemcpyToSymbol(coset_inv, &coset_inv_host, sizeof(coset_inv_host));
247
248 // Binomial coefficient
249 int binomial_coef_host[19][19] = {
250 {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
251 {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
252 {1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
253 {1, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
254 {1, 4, 6, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
255 {1, 5, 10, 10, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
256 {1, 6, 15, 20, 15, 6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
257 {1, 7, 21, 35, 35, 21, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
258 {1, 8, 28, 56, 70, 56, 28, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
259 {1, 9, 36, 84, 126, 126, 84, 36, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
260 {1, 10, 45, 120, 210, 252, 210, 120, 45, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0},
261 {1, 11, 55, 165, 330, 462, 462, 330, 165, 55, 11, 1, 0, 0, 0, 0, 0, 0, 0},
262 {1, 12, 66, 220, 495, 792, 924, 792, 495, 220, 66, 12, 1, 0, 0, 0, 0, 0,
263 0},
264 {1, 13, 78, 286, 715, 1287, 1716, 1716, 1287, 715, 286, 78, 13, 1, 0, 0,
265 0, 0, 0},
266 {1, 14, 91, 364, 1001, 2002, 3003, 3432, 3003, 2002, 1001, 364, 91, 14, 1,
267 0, 0, 0, 0},
268 {1, 15, 105, 455, 1365, 3003, 5005, 6435, 6435, 5005, 3003, 1365, 455,
269 105, 15, 1, 0, 0, 0},
270 {1, 16, 120, 560, 1820, 4368, 8008, 11440, 12870, 11440, 8008, 4368, 1820,
271 560, 120, 16, 1, 0, 0},
272 {1, 17, 136, 680, 2380, 6188, 12376, 19448, 24310, 24310, 19448, 12376,
273 6188, 2380, 680, 136, 17, 1, 0},
274 {1, 18, 153, 816, 3060, 8568, 18564, 31824, 43758, 48620, 43758, 31824,
275 18564, 8568, 3060, 816, 153, 18, 1}};
276 offloadMemcpyToSymbol(binomial_coef, &binomial_coef_host[0][0],
277 sizeof(binomial_coef_host));
278 initialized = true;
279}
280
281__inline__ __device__ double3
282compute_coordinates(const double *__restrict__ dh_, const double x,
283 const double y, const double z) {
284
285 double3 r3;
286 // I make no distinction between orthorhombic and non orthorhombic
287 // cases
288
289 r3.x = z * dh_[6] + y * dh_[3] + x * dh_[0];
290 r3.y = z * dh_[7] + y * dh_[4] + x * dh_[1];
291 r3.z = z * dh_[8] + y * dh_[5] + x * dh_[2];
292 return r3;
293}
294
295__inline__ __device__ float3 compute_coordinates(const float *__restrict__ dh_,
296 const float x, const float y,
297 const float z) {
298
299 float3 r3;
300 // I make no distinction between orthorhombic and non orthorhombic
301 // cases
302
303 r3.x = z * dh_[6] + y * dh_[3] + x * dh_[0];
304 r3.y = z * dh_[7] + y * dh_[4] + x * dh_[1];
305 r3.z = z * dh_[8] + y * dh_[5] + x * dh_[2];
306 return r3;
307}
308
309/*******************************************************************************
310 * \brief Computes the polynomial expansion coefficients:
311 * (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
312 ******************************************************************************/
313template <typename T>
314__inline__ __device__ void compute_alpha(const smem_task<T> &task,
315 T *__restrict__ alpha) {
316 // strides for accessing alpha
317 const int s3 = (task.lp + 1);
318 const int s2 = (task.la_max + 1) * s3;
319 const int s1 = (task.lb_max + 1) * s2;
320 const int tid =
321 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
322 for (int i = tid; i < 3 * s1; i += blockDim.x * blockDim.y * blockDim.z)
323 alpha[i] = 0.0;
324
325 __syncthreads();
326
327 for (int idir = threadIdx.z; idir < 3; idir += blockDim.z) {
328 const T drpa = task.rp[idir] - task.ra[idir];
329 const T drpb = task.rp[idir] - task.rb[idir];
330 for (int la = threadIdx.y; la <= task.la_max; la += blockDim.y) {
331 for (int lb = threadIdx.x; lb <= task.lb_max; lb += blockDim.x) {
332 T a = 1.0;
333 for (int k = 0; k <= la; k++) {
334 T b = 1.0;
335 const int base = idir * s1 + lb * s2 + la * s3;
336 for (int l = 0; l <= lb; l++) {
337 alpha[base + la - l + lb - k] +=
338 a * b * binomial_coef[la][k] * binomial_coef[lb][l];
339 b *= drpb;
340 }
341 a *= drpa;
342 }
343 }
344 }
345 }
346 __syncthreads(); // because of concurrent writes to alpha
347}
348
349__host__ __inline__ __device__ void
350convert_to_lattice_coordinates(const double *dh_inv_,
351 const double3 *__restrict__ const rp,
352 double3 *__restrict__ rp_c) {
353 rp_c->x = dh_inv_[0] * rp->x + dh_inv_[3] * rp->y + dh_inv_[6] * rp->z;
354 rp_c->y = dh_inv_[1] * rp->x + dh_inv_[4] * rp->y + dh_inv_[7] * rp->z;
355 rp_c->z = dh_inv_[2] * rp->x + dh_inv_[5] * rp->y + dh_inv_[8] * rp->z;
356}
357
358__host__ __inline__ __device__ void
360 const double *__restrict__ dh_, const double3 *__restrict__ const rp,
361 double3 *__restrict__ rp_c) {
362 rp_c->x = dh_[0] * rp->x + dh_[3] * rp->y + dh_[6] * rp->z;
363 rp_c->y = dh_[1] * rp->x + dh_[4] * rp->y + dh_[7] * rp->z;
364 rp_c->z = dh_[2] * rp->x + dh_[5] * rp->y + dh_[8] * rp->z;
365}
366
367__host__ __inline__ __device__ void
369 const float3 *__restrict__ const rp,
370 float3 *__restrict__ rp_c) {
371 rp_c->x = dh_inv_[0] * rp->x + dh_inv_[3] * rp->y + dh_inv_[6] * rp->z;
372 rp_c->y = dh_inv_[1] * rp->x + dh_inv_[4] * rp->y + dh_inv_[7] * rp->z;
373 rp_c->z = dh_inv_[2] * rp->x + dh_inv_[5] * rp->y + dh_inv_[8] * rp->z;
374}
375
376__host__ __inline__ __device__ void
378 const float *__restrict__ dh_, const float3 *__restrict__ const rp,
379 float3 *__restrict__ rp_c) {
380 rp_c->x = dh_[0] * rp->x + dh_[3] * rp->y + dh_[6] * rp->z;
381 rp_c->y = dh_[1] * rp->x + dh_[4] * rp->y + dh_[7] * rp->z;
382 rp_c->z = dh_[2] * rp->x + dh_[5] * rp->y + dh_[8] * rp->z;
383}
384
385template <typename T, typename T3, bool orthorhombic_>
387 const T radius, const T *const __restrict__ dh_,
388 const T *const __restrict__ dh_inv_, const T3 *__restrict__ rp,
389 T3 *__restrict__ roffset, int3 *__restrict__ cubecenter,
390 int3 *__restrict__ lb_cube, int3 *__restrict__ cube_size) {
391
392 /* center of the gaussian in the lattice coordinates */
393 T3 rp1, rp2, rp3;
394
395 /* it is in the lattice vector frame */
396 convert_to_lattice_coordinates(dh_inv_, rp, &rp1);
397
398 /* compute the grid point that is the closest to the sphere center. */
399 cubecenter->x = std::floor(rp1.x);
400 cubecenter->y = std::floor(rp1.y);
401 cubecenter->z = std::floor(rp1.z);
402
403 /* seting up the cube parameters */
404
405 if (orthorhombic_) {
406
407 // the cube is actually slightly bigger than the sphere of radius r. that's
408 // why we need to discretize it to get the cube size "right".
409
410 // disc_radius >= radius always. somehow despite the fact that we compile
411 // things with a c++ compiler on the host side, we need to use fmin instead
412 // of std::min since std::min is not allowed on the device side
413
414 // We assume no specific form for the orthogonal matrix. (can be diaognal or
415 // completely full, the only constraint is that the tree vectors are
416 // orthogonal)
417
418 T norm1, norm2, norm3;
419 norm1 = dh_[0] * dh_[0] + dh_[1] * dh_[1] + dh_[2] * dh_[2];
420 norm2 = dh_[3] * dh_[3] + dh_[4] * dh_[4] + dh_[5] * dh_[5];
421 norm3 = dh_[6] * dh_[6] + dh_[7] * dh_[7] + dh_[8] * dh_[8];
422
423 norm1 = std::sqrt(norm1);
424 norm2 = std::sqrt(norm2);
425 norm3 = std::sqrt(norm3);
426
427 const T disr_radius =
428 std::min(norm1, std::min(norm2, norm3)) *
429 std::max(1,
430 (int)ceil(radius / std::min(norm1, std::min(norm2, norm3))));
431
432 rp2.x = cubecenter->x;
433 rp2.y = cubecenter->y;
434 rp2.z = cubecenter->z;
435
436 /* convert the cube center from grid points coordinates to cartesian */
438 /* cube center */
439 roffset->x -= rp->x;
440 roffset->y -= rp->y;
441 roffset->z -= rp->z;
442
443 rp2.x = disr_radius;
444 rp2.y = disr_radius;
445 rp2.z = disr_radius;
446
447 /* it is in the lattice vector frame */
448 convert_to_lattice_coordinates(dh_inv_, &rp2, &rp3);
449 /* lower and upper bounds */
450 lb_cube->x = std::ceil(-1e-8 - rp3.x);
451 lb_cube->y = std::ceil(-1e-8 - rp3.y);
452 lb_cube->z = std::ceil(-1e-8 - rp3.z);
453
454 /* it is in the lattice vector frame */
455 convert_to_lattice_coordinates(dh_inv_, roffset, &rp2);
456
457 /* Express the offset in lattice coordinates */
458 roffset->x = rp2.x;
459 roffset->y = rp2.y;
460 roffset->z = rp2.z;
461
462 /* compute the cube size ignoring periodicity */
463 /* the interval is not symmetrical for some curious reasons. it should go
464 * from [-L..L+1] so the number of points is multiple of two */
465 cube_size->x = 2 - 2 * lb_cube->x;
466 cube_size->y = 2 - 2 * lb_cube->y;
467 cube_size->z = 2 - 2 * lb_cube->z;
468 return disr_radius;
469 } else {
470 int3 ub_cube;
471
472 lb_cube->x = INT_MAX;
473 ub_cube.x = INT_MIN;
474 lb_cube->y = INT_MAX;
475 ub_cube.y = INT_MIN;
476 lb_cube->z = INT_MAX;
477 ub_cube.z = INT_MIN;
478
479 for (int i = -1; i <= 1; i++) {
480 for (int j = -1; j <= 1; j++) {
481 for (int k = -1; k <= 1; k++) {
482 T3 r;
483 r.x = rp->x + ((T)i) * radius;
484 r.y = rp->y + ((T)j) * radius;
485 r.z = rp->z + ((T)k) * radius;
486 convert_to_lattice_coordinates(dh_inv_, &r, roffset);
487
488 lb_cube->x = std::min(lb_cube->x, (int)std::floor(roffset->x));
489 ub_cube.x = std::max(ub_cube.x, (int)std::ceil(roffset->x));
490
491 lb_cube->y = std::min(lb_cube->y, (int)std::floor(roffset->y));
492 ub_cube.y = std::max(ub_cube.y, (int)std::ceil(roffset->y));
493
494 lb_cube->z = std::min(lb_cube->z, (int)std::floor(roffset->z));
495 ub_cube.z = std::max(ub_cube.z, (int)std::ceil(roffset->z));
496 }
497 }
498 }
499 /* compute the cube size ignoring periodicity */
500 cube_size->x = ub_cube.x - lb_cube->x;
501 cube_size->y = ub_cube.y - lb_cube->y;
502 cube_size->z = ub_cube.z - lb_cube->z;
503
504 /* compute the offset in lattice coordinates */
505
506 roffset->x = cubecenter->x - rp1.x;
507 roffset->y = cubecenter->y - rp1.y;
508 roffset->z = cubecenter->z - rp1.z;
509
510 // shift the boundaries compared to the cube center so that the
511 // specialization ortho / non ortho is minimal
512 lb_cube->x -= cubecenter->x;
513 lb_cube->y -= cubecenter->y;
514 lb_cube->z -= cubecenter->z;
515
516 return radius;
517 }
518}
519
520__inline__ __device__ void compute_window_size(const int *const grid_size,
521 const int border_mask,
522 const int *border_width,
523 int3 *const window_size,
524 int3 *const window_shift) {
525 window_shift->x = 0;
526 window_shift->y = 0;
527 window_shift->z = 0;
528
529 window_size->x = grid_size[2] - 1;
530 window_size->y = grid_size[1] - 1;
531 window_size->z = grid_size[0] - 1;
532
533 if (border_mask & (1 << 0))
534 window_shift->x += border_width[2];
535 if (border_mask & (1 << 1))
536 window_size->x -= border_width[2];
537 if (border_mask & (1 << 2))
538 window_shift->y += border_width[1];
539 if (border_mask & (1 << 3))
540 window_size->y -= border_width[1];
541 if (border_mask & (1 << 4))
542 window_shift->z += border_width[0];
543 if (border_mask & (1 << 5))
544 window_size->z -= border_width[0];
545}
546
547/*******************************************************************************
548 * \brief Transforms coefficients C_ab into C_xyz.
549 ******************************************************************************/
550template <typename T>
551__device__ __inline__ static void
552cab_to_cxyz(const smem_task<T> &task, const T *__restrict__ alpha,
553 const T *__restrict__ cab, T *__restrict__ cxyz) {
554
555 // *** initialise the coefficient matrix, we transform the sum
556 //
557 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
558 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
559 // (z-a_z)**lza
560 //
561 // into
562 //
563 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
564 //
565 // where p is center of the product gaussian, and lp = la_max + lb_max
566 // (current implementation is l**7)
567
568 // strides for accessing alpha
569 const int s3 = (task.lp + 1);
570 const int s2 = (task.la_max + 1) * s3;
571 const int s1 = (task.lb_max + 1) * s2;
572
573 // TODO: Maybe we can transpose alpha to index it directly with ico and jco.
574 const int tid =
575 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
576
577 for (int i = tid; i < ncoset(task.lp);
578 i += blockDim.x * blockDim.y * blockDim.z) {
579 auto &co = coset_inv[i];
580 T reg = 0.0; // accumulate into a register
581 for (int jco = 0; jco < ncoset(task.lb_max); jco++) {
582 const auto &b = coset_inv[jco];
583 for (int ico = 0; ico < ncoset(task.la_max); ico++) {
584 const auto &a = coset_inv[ico];
585 const T p = task.prefactor *
586 alpha[0 * s1 + b.l[0] * s2 + a.l[0] * s3 + co.l[0]] *
587 alpha[1 * s1 + b.l[1] * s2 + a.l[1] * s3 + co.l[1]] *
588 alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + co.l[2]];
589 reg += p * cab[jco * task.n1 + ico]; // collocate
590 }
591 }
592
593 cxyz[i] = reg;
594 }
595 __syncthreads(); // because of concurrent writes to cxyz / cab
596}
597
598/*******************************************************************************
599 * \brief Transforms coefficients C_xyz into C_ab.
600 ******************************************************************************/
601template <typename T>
602__device__ __inline__ static void
603cxyz_to_cab(const smem_task<T> &task, const T *__restrict__ alpha,
604 const T *__restrict__ cxyz, T *__restrict__ cab) {
605
606 // *** initialise the coefficient matrix, we transform the sum
607 //
608 // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
609 // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
610 // (z-a_z)**lza
611 //
612 // into
613 //
614 // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
615 //
616 // where p is center of the product gaussian, and lp = la_max + lb_max
617 // (current implementation is l**7)
618
619 // strides for accessing alpha
620 const int s3 = (task.lp + 1);
621 const int s2 = (task.la_max + 1) * s3;
622 const int s1 = (task.lb_max + 1) * s2;
623
624 const int tid =
625 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
626 for (int jco = tid / 8; jco < ncoset(task.lb_max); jco += 8) {
627 const orbital b = coset_inv[jco];
628 for (int ico = tid % 8; ico < ncoset(task.la_max); ico += 8) {
629 const auto &a = coset_inv[ico];
630 T reg = 0.0; // accumulate into a register
631 for (int ic = 0; ic < ncoset(task.lp); ic++) {
632 const auto &co = coset_inv[ic];
633 const T p = task.prefactor *
634 alpha[b.l[0] * s2 + a.l[0] * s3 + co.l[0]] *
635 alpha[s1 + b.l[1] * s2 + a.l[1] * s3 + co.l[1]] *
636 alpha[2 * s1 + b.l[2] * s2 + a.l[2] * s3 + co.l[2]];
637
638 reg += p * cxyz[ic]; // integrate
639 }
640 cab[jco * task.n1 + ico] = reg; // partial loop coverage -> zero it
641 }
642 }
643}
644
645/*******************************************************************************
646 * \brief Copies a task from global to shared memory.
647 ******************************************************************************/
648
649/* Collocate and integrate do not care about the sphi etc... computing the
650 * coefficients do not care about the exponentials (the parameters are still
651 * needed), so simplify the amount of information to save shared memory (it is
652 * crucial on AMD GPU to max out occupancy) */
653template <typename T, typename T3>
654__device__ __inline__ void
655fill_smem_task_reduced(const kernel_params &dev, const int task_id,
657 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
658 const auto &glb_task = dev.tasks[task_id];
659 task.zetp = glb_task.zeta + glb_task.zetb;
660 task.radius = glb_task.radius;
661 task.discrete_radius = glb_task.discrete_radius;
662
663 // angular momentum range for the actual collocate/integrate operation.
664 task.lp =
665 glb_task.la_max + dev.la_max_diff + glb_task.lb_max + dev.lb_max_diff;
666
667 task.cube_size.x = glb_task.cube_size.x;
668 task.cube_size.y = glb_task.cube_size.y;
669 task.cube_size.z = glb_task.cube_size.z;
670
671 task.cube_center.x = glb_task.cube_center.x;
672 task.cube_center.y = glb_task.cube_center.y;
673 task.cube_center.z = glb_task.cube_center.z;
674
675 task.roffset.x = glb_task.roffset.x;
676 task.roffset.y = glb_task.roffset.y;
677 task.roffset.z = glb_task.roffset.z;
678
679 task.lb_cube.x = glb_task.lb_cube.x;
680 task.lb_cube.y = glb_task.lb_cube.y;
681 task.lb_cube.z = glb_task.lb_cube.z;
682
683 task.apply_border_mask = glb_task.apply_border_mask;
684 }
685 __syncthreads();
686}
687
688/*******************************************************************************
689 * \brief Copies a task from global to shared memory and does precomputations.
690 ******************************************************************************/
691
692/* computing the coefficients do not care about many of the exponentials
693 * parameters, etc.. so */
694template <typename T>
695__device__ __inline__ void fill_smem_task_coef(const kernel_params &dev,
696 const int task_id,
697 smem_task<T> &task) {
698 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
699 const auto &glb_task = dev.tasks[task_id];
700 const int iatom = glb_task.iatom;
701 const int jatom = glb_task.jatom;
702 task.zeta = glb_task.zeta;
703 task.zetb = glb_task.zetb;
704
705 for (int i = 0; i < 3; i++) {
706 task.rab[i] = glb_task.rab[i];
707 task.ra[i] = glb_task.ra[i];
708 task.rb[i] = task.ra[i] + task.rab[i];
709 task.rp[i] =
710 task.ra[i] + task.rab[i] * task.zetb / (task.zeta + task.zetb);
711 }
712
713 task.prefactor = glb_task.prefactor;
714
715 // task.radius = glb_task.radius;
716 task.off_diag_twice = glb_task.off_diag_twice;
717
718 // angular momentum range of basis set
719 const int la_max_basis = glb_task.la_max;
720 const int lb_max_basis = glb_task.lb_max;
721 const int la_min_basis = glb_task.la_min;
722 const int lb_min_basis = glb_task.lb_min;
723
724 // angular momentum range for the actual collocate/integrate opteration.
725 task.la_max = la_max_basis + dev.la_max_diff;
726 task.lb_max = lb_max_basis + dev.lb_max_diff;
727 task.la_min = max(la_min_basis + (int)dev.la_min_diff, 0);
728 task.lb_min = max(lb_min_basis + (int)dev.lb_min_diff, 0);
729 task.lp = task.la_max + task.lb_max;
730
731 // start of decontracted set, ie. pab and hab
732 task.first_coseta = (la_min_basis > 0) ? ncoset(la_min_basis - 1) : 0;
733 task.first_cosetb = (lb_min_basis > 0) ? ncoset(lb_min_basis - 1) : 0;
734
735 // size of decontracted set, ie. pab and hab
736 task.ncoseta = ncoset(la_max_basis);
737 task.ncosetb = ncoset(lb_max_basis);
738 // size of the cab matrix
739 task.n1 = ncoset(task.la_max);
740 task.n2 = ncoset(task.lb_max);
741
742 // size of entire spherical basis
743 task.nsgfa = glb_task.nsgfa; // ibasis.nsgf;
744 task.nsgfb = glb_task.nsgfb;
745
746 // size of spherical set
747 task.nsgf_seta = glb_task.nsgf_seta;
748 task.nsgf_setb = glb_task.nsgf_setb;
749
750 // strides of the sphi transformation matrices
751 task.maxcoa = glb_task.maxcoa;
752 task.maxcob = glb_task.maxcob;
753
754 // transformations from contracted spherical to primitive cartesian basis
755 task.sphia = &dev.sphi_dev[glb_task.ikind][glb_task.sgfa * task.maxcoa +
756 glb_task.ipgf * task.ncoseta];
757 task.sphib = &dev.sphi_dev[glb_task.jkind][glb_task.sgfb * task.maxcob +
758 glb_task.jpgf * task.ncosetb];
759
760 // Locate current matrix block within the buffer.
761 const int block_offset = dev.block_offsets[glb_task.block_num];
762 task.block_transposed = glb_task.block_transposed;
763 task.pab_block = dev.ptr_dev[0] + block_offset + glb_task.subblock_offset;
764
765 if (dev.ptr_dev[3] != nullptr) {
766 task.hab_block = dev.ptr_dev[3] + block_offset + glb_task.subblock_offset;
767 if (dev.ptr_dev[4] != nullptr) {
768 task.forces_a = &dev.ptr_dev[4][3 * iatom];
769 task.forces_b = &dev.ptr_dev[4][3 * jatom];
770 }
771 }
772 }
773 __syncthreads();
774}
775
777private:
778 int la_max_{-1};
779 int lb_max_{-1};
780 int smem_per_block_{0};
781 int alpha_len_{-1};
782 int cab_len_{-1};
783 int lp_max_{-1};
784 ldiffs_value ldiffs_;
785 int lp_diff_{-1};
786
787public:
788 smem_parameters(const ldiffs_value ldiffs, const int lmax) {
789 ldiffs_ = ldiffs;
791 la_max_ = lmax + ldiffs.la_max_diff;
792 lb_max_ = lmax + ldiffs.lb_max_diff;
793 lp_max_ = la_max_ + lb_max_;
794
795 cab_len_ = ncoset(lb_max_) * ncoset(la_max_);
796 alpha_len_ = 3 * (lb_max_ + 1) * (la_max_ + 1) * (lp_max_ + 1);
797 smem_per_block_ = std::max(alpha_len_, 64) * sizeof(double);
798
799 if (smem_per_block_ > 64 * 1024) {
800 fprintf(stderr,
801 "ERROR: Not enough shared memory in grid_gpu_collocate.\n");
802 fprintf(stderr, "cab_len: %i, ", cab_len_);
803 fprintf(stderr, "alpha_len: %i, ", alpha_len_);
804 fprintf(stderr, "total smem_per_block: %f kb\n\n",
805 smem_per_block_ / 1024.0);
806 abort();
807 }
808 }
809
811
812 // copy and move are trivial
813
814 inline int smem_alpha_offset() const { return 0; }
815
816 inline int smem_cab_offset() const { return alpha_len_; }
817
818 inline int smem_cxyz_offset() const { return alpha_len_ + cab_len_; }
819
820 inline int smem_per_block() const { return smem_per_block_; }
821
822 inline int lp_diff() const { return lp_diff_; }
823
824 inline ldiffs_value ldiffs() const { return ldiffs_; }
825
826 inline int lp_max() const { return lp_max_; }
827
828 inline int cxyz_len() const { return ncoset(lp_max_); }
829};
830
831} // namespace rocm_backend
832#endif
smem_parameters(const ldiffs_value ldiffs, const int lmax)
static void const int const int i
__inline__ __device__ void compute_window_size(const int *const grid_size, const int border_mask, const int *border_width, int3 *const window_size, int3 *const window_shift)
__host__ __inline__ __device__ void convert_from_lattice_coordinates_to_cartesian(const double *__restrict__ dh_, const double3 *__restrict__ const rp, double3 *__restrict__ rp_c)
__device__ static __inline__ void cab_to_cxyz(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cab, T *__restrict__ cxyz)
Transforms coefficients C_ab into C_xyz.
__device__ __inline__ void prep_term(const orbital a, const orbital b, const T value, const int n, T *cab)
Adds given value to matrix element cab[idx(b)][idx(a)].
__device__ __inline__ void fill_smem_task_coef(const kernel_params &dev, const int task_id, smem_task< T > &task)
Copies a task from global to shared memory and does precomputations.
__device__ __inline__ double power(const double x, const int expo)
__inline__ __device__ orbital down(const int i, const orbital &a)
Decrease i'th component of given orbital angular momentum.
__host__ __inline__ __device__ void convert_to_lattice_coordinates(const double *dh_inv_, const double3 *__restrict__ const rp, double3 *__restrict__ rp_c)
static void init_constant_memory()
Initializes the device's constant memory.
__device__ static __inline__ void cxyz_to_cab(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cxyz, T *__restrict__ cab)
Transforms coefficients C_xyz into C_ab.
__inline__ __device__ int idx(const orbital a)
Return coset index of given orbital angular momentum.
__constant__ orbital coset_inv[1330]
__device__ __inline__ orbital up(const int i, const orbital &a)
Increase i'th component of given orbital angular momentum.
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
__device__ __inline__ void fill_smem_task_reduced(const kernel_params &dev, const int task_id, smem_task_reduced< T, T3 > &task)
Copies a task from global to shared memory.
__inline__ __device__ double3 compute_coordinates(const double *__restrict__ dh_, const double x, const double y, const double z)
__constant__ int binomial_coef[19][19]
__inline__ T compute_cube_properties(const T radius, const T *const __restrict__ dh_, const T *const __restrict__ dh_inv_, const T3 *__restrict__ rp, T3 *__restrict__ roffset, int3 *__restrict__ cubecenter, int3 *__restrict__ lb_cube, int3 *__restrict__ cube_size)
Parameters of the collocate kernel.
Differences in angular momentum.
Orbital angular momentum.
data needed for collocate and integrate kernels
data needed for calculating the coefficients, forces, and stress