8#include "../../offload/offload_runtime.h"
9#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
19#define GRID_DO_COLLOCATE 1
20#include "../common/grid_common.h"
25#include "../common/grid_prepare_pab.h"
28#error "OpenMP should not be used in .cu files to accommodate HIP."
35__device__
static void cxyz_to_gridpoint(
const double dx,
const double dy,
36 const double dz,
const double zetp,
37 const int lp,
const cxyz_store *cxyz,
41 const double r2 = dx * dx + dy * dy + dz * dz;
42 const double gaussian = exp(-zetp * r2);
45 double gridpoint_reg = 0.0;
49 gridpoint_reg += cxyz[0];
52 gridpoint_reg += cxyz[1] * dx;
53 gridpoint_reg += cxyz[2] * dy;
54 gridpoint_reg += cxyz[3] * dz;
56 const double dx2 = dx * dx;
57 const double dy2 = dy * dy;
58 const double dz2 = dz * dz;
59 gridpoint_reg += cxyz[4] * dx2;
60 gridpoint_reg += cxyz[5] * dx * dy;
61 gridpoint_reg += cxyz[6] * dx * dz;
62 gridpoint_reg += cxyz[7] * dy2;
63 gridpoint_reg += cxyz[8] * dy * dz;
64 gridpoint_reg += cxyz[9] * dz2;
66 const double dx3 = dx2 * dx;
67 const double dy3 = dy2 * dy;
68 const double dz3 = dz2 * dz;
69 gridpoint_reg += cxyz[10] * dx3;
70 gridpoint_reg += cxyz[11] * dx2 * dy;
71 gridpoint_reg += cxyz[12] * dx2 * dz;
72 gridpoint_reg += cxyz[13] * dx * dy2;
73 gridpoint_reg += cxyz[14] * dx * dy * dz;
74 gridpoint_reg += cxyz[15] * dx * dz2;
75 gridpoint_reg += cxyz[16] * dy3;
76 gridpoint_reg += cxyz[17] * dy2 * dz;
77 gridpoint_reg += cxyz[18] * dy * dz2;
78 gridpoint_reg += cxyz[19] * dz3;
80 const double dx4 = dx3 * dx;
81 const double dy4 = dy3 * dy;
82 const double dz4 = dz3 * dz;
83 gridpoint_reg += cxyz[20] * dx4;
84 gridpoint_reg += cxyz[21] * dx3 * dy;
85 gridpoint_reg += cxyz[22] * dx3 * dz;
86 gridpoint_reg += cxyz[23] * dx2 * dy2;
87 gridpoint_reg += cxyz[24] * dx2 * dy * dz;
88 gridpoint_reg += cxyz[25] * dx2 * dz2;
89 gridpoint_reg += cxyz[26] * dx * dy3;
90 gridpoint_reg += cxyz[27] * dx * dy2 * dz;
91 gridpoint_reg += cxyz[28] * dx * dy * dz2;
92 gridpoint_reg += cxyz[29] * dx * dz3;
93 gridpoint_reg += cxyz[30] * dy4;
94 gridpoint_reg += cxyz[31] * dy3 * dz;
95 gridpoint_reg += cxyz[32] * dy2 * dz2;
96 gridpoint_reg += cxyz[33] * dy * dz3;
97 gridpoint_reg += cxyz[34] * dz4;
99 const double dx5 = dx4 * dx;
100 const double dy5 = dy4 * dy;
101 const double dz5 = dz4 * dz;
102 gridpoint_reg += cxyz[35] * dx5;
103 gridpoint_reg += cxyz[36] * dx4 * dy;
104 gridpoint_reg += cxyz[37] * dx4 * dz;
105 gridpoint_reg += cxyz[38] * dx3 * dy2;
106 gridpoint_reg += cxyz[39] * dx3 * dy * dz;
107 gridpoint_reg += cxyz[40] * dx3 * dz2;
108 gridpoint_reg += cxyz[41] * dx2 * dy3;
109 gridpoint_reg += cxyz[42] * dx2 * dy2 * dz;
110 gridpoint_reg += cxyz[43] * dx2 * dy * dz2;
111 gridpoint_reg += cxyz[44] * dx2 * dz3;
112 gridpoint_reg += cxyz[45] * dx * dy4;
113 gridpoint_reg += cxyz[46] * dx * dy3 * dz;
114 gridpoint_reg += cxyz[47] * dx * dy2 * dz2;
115 gridpoint_reg += cxyz[48] * dx * dy * dz3;
116 gridpoint_reg += cxyz[49] * dx * dz4;
117 gridpoint_reg += cxyz[50] * dy5;
118 gridpoint_reg += cxyz[51] * dy4 * dz;
119 gridpoint_reg += cxyz[52] * dy3 * dz2;
120 gridpoint_reg += cxyz[53] * dy2 * dz3;
121 gridpoint_reg += cxyz[54] * dy * dz4;
122 gridpoint_reg += cxyz[55] * dz5;
124 const double dx6 = dx5 * dx;
125 const double dy6 = dy5 * dy;
126 const double dz6 = dz5 * dz;
127 gridpoint_reg += cxyz[56] * dx6;
128 gridpoint_reg += cxyz[57] * dx5 * dy;
129 gridpoint_reg += cxyz[58] * dx5 * dz;
130 gridpoint_reg += cxyz[59] * dx4 * dy2;
131 gridpoint_reg += cxyz[60] * dx4 * dy * dz;
132 gridpoint_reg += cxyz[61] * dx4 * dz2;
133 gridpoint_reg += cxyz[62] * dx3 * dy3;
134 gridpoint_reg += cxyz[63] * dx3 * dy2 * dz;
135 gridpoint_reg += cxyz[64] * dx3 * dy * dz2;
136 gridpoint_reg += cxyz[65] * dx3 * dz3;
137 gridpoint_reg += cxyz[66] * dx2 * dy4;
138 gridpoint_reg += cxyz[67] * dx2 * dy3 * dz;
139 gridpoint_reg += cxyz[68] * dx2 * dy2 * dz2;
140 gridpoint_reg += cxyz[69] * dx2 * dy * dz3;
141 gridpoint_reg += cxyz[70] * dx2 * dz4;
142 gridpoint_reg += cxyz[71] * dx * dy5;
143 gridpoint_reg += cxyz[72] * dx * dy4 * dz;
144 gridpoint_reg += cxyz[73] * dx * dy3 * dz2;
145 gridpoint_reg += cxyz[74] * dx * dy2 * dz3;
146 gridpoint_reg += cxyz[75] * dx * dy * dz4;
147 gridpoint_reg += cxyz[76] * dx * dz5;
148 gridpoint_reg += cxyz[77] * dy6;
149 gridpoint_reg += cxyz[78] * dy5 * dz;
150 gridpoint_reg += cxyz[79] * dy4 * dz2;
151 gridpoint_reg += cxyz[80] * dy3 * dz3;
152 gridpoint_reg += cxyz[81] * dy2 * dz4;
153 gridpoint_reg += cxyz[82] * dy * dz5;
154 gridpoint_reg += cxyz[83] * dz6;
165 double val = cxyz[
i];
167 for (
int j = 0; j <
a.l[0]; j++) {
170 for (
int j = 0; j <
a.l[1]; j++) {
173 for (
int j = 0; j <
a.l[2]; j++) {
176 gridpoint_reg += val;
180 atomicAddDouble(gridpoint, gridpoint_reg * gaussian);
187__device__
static void cxyz_to_grid(
const kernel_params *params,
188 const smem_task *task,
const double *cxyz,
191 if (task->use_orthorhombic_kernel) {
202template <
bool IS_FUNC_AB>
203__device__
static void block_to_cab(
const kernel_params *params,
215 if (threadIdx.z == 0) {
216 const int jco_start =
ncoset(task->lb_min_basis - 1) + threadIdx.y;
217 const int jco_end =
ncoset(task->lb_max_basis);
218 for (
int jco = jco_start; jco < jco_end; jco += blockDim.y) {
220 const int ico_start =
ncoset(task->la_min_basis - 1) + threadIdx.x;
221 const int ico_end =
ncoset(task->la_max_basis);
222 for (
int ico = ico_start; ico < ico_end; ico += blockDim.x) {
224 double pab_val = 0.0;
225 for (
int i = 0;
i < task->nsgf_setb;
i++) {
226 const double sphib = task->sphib[
i * task->maxcob +
idx(b)];
227 for (
int j = 0; j < task->nsgf_seta; j++) {
229 if (task->block_transposed) {
231 task->pab_block[j * task->nsgfb +
i] * task->off_diag_twice;
234 task->pab_block[
i * task->nsgfa + j] * task->off_diag_twice;
236 const double sphia = task->sphia[j * task->maxcoa +
idx(a)];
237 pab_val += block_val * sphia * sphib;
245 prepare_pab(params->func, a, b, task->zeta, task->zetb, pab_val, cab);
257template <
bool IS_FUNC_AB>
258__device__
static void collocate_kernel(
const kernel_params *params) {
261 __shared__ smem_task task;
262 load_task(params, &task);
265 if (2.0 * task.radius < task.dh_max) {
270 extern __shared__
double shared_memory[];
271 double *smem_cab = &shared_memory[params->smem_cab_offset];
272 double *smem_alpha = &shared_memory[params->smem_alpha_offset];
273 double *smem_cxyz = &shared_memory[params->smem_cxyz_offset];
277 if (params->smem_cab_length < task.n1 * task.n2) {
278 cab.
data = malloc_cab(&task);
283 zero_cab(&cab, task.n1 * task.n2);
286 block_to_cab<IS_FUNC_AB>(params, &task, &cab);
290 if (params->smem_cab_length < task.n1 * task.n2) {
299__global__
static void collocate_kernel_density(
const kernel_params params) {
300 collocate_kernel<true>(¶ms);
307__global__
static void collocate_kernel_anyfunc(
const kernel_params params) {
308 collocate_kernel<false>(¶ms);
315void grid_gpu_collocate_one_grid_level(
316 const grid_gpu_task_list *task_list,
const int first_task,
317 const int last_task,
const enum grid_func func,
318 const grid_gpu_layout *layout,
const offloadStream_t stream,
319 const double *pab_blocks_dev,
double *grid_dev,
int *lp_diff) {
324 const int la_max = task_list->lmax + ldiffs.
la_max_diff;
325 const int lb_max = task_list->lmax + ldiffs.
lb_max_diff;
326 const int lp_max = la_max + lb_max;
328 const int ntasks = last_task - first_task + 1;
333 init_constant_memory();
339 const int alpha_len = 3 * (lb_max + 1) * (la_max + 1) * (lp_max + 1);
340 const int cxyz_len =
ncoset(lp_max);
342 const size_t smem_per_block =
343 (alpha_len + cxyz_len + cab_len) *
sizeof(
double);
346 kernel_params params;
347 params.smem_cab_length = cab_len;
348 params.smem_cab_offset = 0;
349 params.smem_alpha_offset = params.smem_cab_offset + cab_len;
350 params.smem_cxyz_offset = params.smem_alpha_offset + alpha_len;
351 params.first_task = first_task;
353 params.grid = grid_dev;
358 params.tasks = task_list->tasks_dev;
359 params.pab_blocks = pab_blocks_dev;
360 memcpy(params.dh, layout->dh, 9 *
sizeof(
double));
361 memcpy(params.dh_inv, layout->dh_inv, 9 *
sizeof(
double));
362 memcpy(params.npts_global, layout->npts_global, 3 *
sizeof(
int));
363 memcpy(params.npts_local, layout->npts_local, 3 *
sizeof(
int));
364 memcpy(params.shift_local, layout->shift_local, 3 *
sizeof(
int));
367 const int nblocks = ntasks;
368 const dim3 threads_per_block(4, 4, 4);
371 collocate_kernel_density<<<nblocks, threads_per_block, smem_per_block,
374 collocate_kernel_anyfunc<<<nblocks, threads_per_block, smem_per_block,
377 OFFLOAD_CHECK(offloadGetLastError());
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
static GRID_HOST_DEVICE int idx(const orbital a)
Return coset index of given orbital angular momentum.
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static GRID_DEVICE void prepare_pab(const enum grid_func func, const orbital a, const orbital b, const double zeta, const double zetb, const double pab_val, cab_store *cab)
Transforms a given element of the density matrix according to func.
static prepare_ldiffs prepare_get_ldiffs(const enum grid_func func)
Returns difference in angular momentum range for given func.
static void cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid.
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
integer, dimension(:), allocatable, public ncoset
integer, parameter, public gaussian
__constant__ orbital coset_inv[1330]
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
Cab matrix container to be passed through get_force/virial to cab_get.
Orbital angular momentum.
Differences in angular momentum.