#include "../../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)

#include <math.h>
#include <string.h>

#define GRID_DO_COLLOCATE 1
#include "../common/grid_common.h"
#include "../common/grid_prepare_pab.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif
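
/*******************************************************************************
 * \brief Adds the contribution of a single grid point at distance (dx,dy,dz)
 *        from the Gaussian center: exp(-zetp*r2) times the polynomial in cxyz.
 ******************************************************************************/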
__device__ static void cxyz_to_gridpoint(const double dx, const double dy,
                                         const double dz, const double zetp,
                                         const int lp, const cxyz_store *cxyz,
                                         double *gridpoint) {
  // Squared distance from the center and the resulting Gaussian prefactor.
  const double r2 = dx * dx + dy * dy + dz * dz;
  const double gaussian = exp(-zetp * r2);

  // Accumulate the polynomial into a register, apply one atomic update at the end.
  double gridpoint_reg = 0.0;
  // Manually unrolled polynomial for lp <= 6 (ncoset(6) = 84 coefficients).
  // Terms of a given total order are only accumulated when lp reaches that
  // order, since cxyz holds only ncoset(lp) coefficients.
  gridpoint_reg += cxyz[0];

  if (lp >= 1) {
    gridpoint_reg += cxyz[1] * dx;
    gridpoint_reg += cxyz[2] * dy;
    gridpoint_reg += cxyz[3] * dz;
    if (lp >= 2) {
      const double dx2 = dx * dx;
      const double dy2 = dy * dy;
      const double dz2 = dz * dz;
      gridpoint_reg += cxyz[4] * dx2;
      gridpoint_reg += cxyz[5] * dx * dy;
      gridpoint_reg += cxyz[6] * dx * dz;
      gridpoint_reg += cxyz[7] * dy2;
      gridpoint_reg += cxyz[8] * dy * dz;
      gridpoint_reg += cxyz[9] * dz2;
      if (lp >= 3) {
        const double dx3 = dx2 * dx;
        const double dy3 = dy2 * dy;
        const double dz3 = dz2 * dz;
        gridpoint_reg += cxyz[10] * dx3;
        gridpoint_reg += cxyz[11] * dx2 * dy;
        gridpoint_reg += cxyz[12] * dx2 * dz;
        gridpoint_reg += cxyz[13] * dx * dy2;
        gridpoint_reg += cxyz[14] * dx * dy * dz;
        gridpoint_reg += cxyz[15] * dx * dz2;
        gridpoint_reg += cxyz[16] * dy3;
        gridpoint_reg += cxyz[17] * dy2 * dz;
        gridpoint_reg += cxyz[18] * dy * dz2;
        gridpoint_reg += cxyz[19] * dz3;
        if (lp >= 4) {
          const double dx4 = dx3 * dx;
          const double dy4 = dy3 * dy;
          const double dz4 = dz3 * dz;
          gridpoint_reg += cxyz[20] * dx4;
          gridpoint_reg += cxyz[21] * dx3 * dy;
          gridpoint_reg += cxyz[22] * dx3 * dz;
          gridpoint_reg += cxyz[23] * dx2 * dy2;
          gridpoint_reg += cxyz[24] * dx2 * dy * dz;
          gridpoint_reg += cxyz[25] * dx2 * dz2;
          gridpoint_reg += cxyz[26] * dx * dy3;
          gridpoint_reg += cxyz[27] * dx * dy2 * dz;
          gridpoint_reg += cxyz[28] * dx * dy * dz2;
          gridpoint_reg += cxyz[29] * dx * dz3;
          gridpoint_reg += cxyz[30] * dy4;
          gridpoint_reg += cxyz[31] * dy3 * dz;
          gridpoint_reg += cxyz[32] * dy2 * dz2;
          gridpoint_reg += cxyz[33] * dy * dz3;
          gridpoint_reg += cxyz[34] * dz4;
          if (lp >= 5) {
            const double dx5 = dx4 * dx;
            const double dy5 = dy4 * dy;
            const double dz5 = dz4 * dz;
            gridpoint_reg += cxyz[35] * dx5;
            gridpoint_reg += cxyz[36] * dx4 * dy;
            gridpoint_reg += cxyz[37] * dx4 * dz;
            gridpoint_reg += cxyz[38] * dx3 * dy2;
            gridpoint_reg += cxyz[39] * dx3 * dy * dz;
            gridpoint_reg += cxyz[40] * dx3 * dz2;
            gridpoint_reg += cxyz[41] * dx2 * dy3;
            gridpoint_reg += cxyz[42] * dx2 * dy2 * dz;
            gridpoint_reg += cxyz[43] * dx2 * dy * dz2;
            gridpoint_reg += cxyz[44] * dx2 * dz3;
            gridpoint_reg += cxyz[45] * dx * dy4;
            gridpoint_reg += cxyz[46] * dx * dy3 * dz;
            gridpoint_reg += cxyz[47] * dx * dy2 * dz2;
            gridpoint_reg += cxyz[48] * dx * dy * dz3;
            gridpoint_reg += cxyz[49] * dx * dz4;
            gridpoint_reg += cxyz[50] * dy5;
            gridpoint_reg += cxyz[51] * dy4 * dz;
            gridpoint_reg += cxyz[52] * dy3 * dz2;
            gridpoint_reg += cxyz[53] * dy2 * dz3;
            gridpoint_reg += cxyz[54] * dy * dz4;
            gridpoint_reg += cxyz[55] * dz5;
            if (lp >= 6) {
              const double dx6 = dx5 * dx;
              const double dy6 = dy5 * dy;
              const double dz6 = dz5 * dz;
              gridpoint_reg += cxyz[56] * dx6;
              gridpoint_reg += cxyz[57] * dx5 * dy;
              gridpoint_reg += cxyz[58] * dx5 * dz;
              gridpoint_reg += cxyz[59] * dx4 * dy2;
              gridpoint_reg += cxyz[60] * dx4 * dy * dz;
              gridpoint_reg += cxyz[61] * dx4 * dz2;
              gridpoint_reg += cxyz[62] * dx3 * dy3;
              gridpoint_reg += cxyz[63] * dx3 * dy2 * dz;
              gridpoint_reg += cxyz[64] * dx3 * dy * dz2;
              gridpoint_reg += cxyz[65] * dx3 * dz3;
              gridpoint_reg += cxyz[66] * dx2 * dy4;
              gridpoint_reg += cxyz[67] * dx2 * dy3 * dz;
              gridpoint_reg += cxyz[68] * dx2 * dy2 * dz2;
              gridpoint_reg += cxyz[69] * dx2 * dy * dz3;
              gridpoint_reg += cxyz[70] * dx2 * dz4;
              gridpoint_reg += cxyz[71] * dx * dy5;
              gridpoint_reg += cxyz[72] * dx * dy4 * dz;
              gridpoint_reg += cxyz[73] * dx * dy3 * dz2;
              gridpoint_reg += cxyz[74] * dx * dy2 * dz3;
              gridpoint_reg += cxyz[75] * dx * dy * dz4;
              gridpoint_reg += cxyz[76] * dx * dz5;
              gridpoint_reg += cxyz[77] * dy6;
              gridpoint_reg += cxyz[78] * dy5 * dz;
              gridpoint_reg += cxyz[79] * dy4 * dz2;
              gridpoint_reg += cxyz[80] * dy3 * dz3;
              gridpoint_reg += cxyz[81] * dy2 * dz4;
              gridpoint_reg += cxyz[82] * dy * dz5;
              gridpoint_reg += cxyz[83] * dz6;
            }
          }
        }
      }
    }
  }

  // Generic handling of the remaining coefficients for lp > 6.
  for (int i = 84; i < ncoset(lp); i++) {
    const orbital a = coset_inv[i];
    double val = cxyz[i];
    for (int j = 0; j < a.l[0]; j++) {
      val *= dx;
    }
    for (int j = 0; j < a.l[1]; j++) {
      val *= dy;
    }
    for (int j = 0; j < a.l[2]; j++) {
      val *= dz;
    }
    gridpoint_reg += val;
  }

  atomicAddDouble(gridpoint, gridpoint_reg * gaussian);
}
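
/*******************************************************************************
 * \brief Collocates the coefficients C_xyz onto the grid, dispatching to the
 *        orthorhombic or the general kernel.
 ******************************************************************************/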
__device__ static void cxyz_to_grid(const kernel_params *params,
                                    const smem_task *task, const double *cxyz,
                                    double *grid) {
  // Dispatch to the specialized orthorhombic kernel or the general one
  // (assumed: both helpers take the same arguments as this wrapper).
  if (task->use_orthorhombic_kernel) {
    ortho_cxyz_to_grid(params, task, cxyz, grid);
  } else {
    general_cxyz_to_grid(params, task, cxyz, grid);
  }
}
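
/*******************************************************************************
 * \brief Decontracts the density-matrix block into the Cartesian matrix cab,
 *        routing each element through prepare_pab when a non-trivial
 *        grid_func is requested.
 ******************************************************************************/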
template <bool IS_FUNC_AB>
__device__ static void block_to_cab(const kernel_params *params,
                                    const smem_task *task, cab_store *cab) {

  // i and j run over the spherical contracted functions of sets b and a,
  // ico and jco over the corresponding Cartesian (coset) indices.
  for (int i = threadIdx.z; i < task->nsgf_setb; i += blockDim.z) {
    for (int j = threadIdx.y; j < task->nsgf_seta; j += blockDim.y) {
      double block_val;
      if (task->block_transposed) {
        block_val = task->pab_block[j * task->nsgfb + i] * task->off_diag_twice;
      } else {
        block_val = task->pab_block[i * task->nsgfa + j] * task->off_diag_twice;
      }

      if (IS_FUNC_AB) {
        // Fast path for the common case func == GRID_FUNC_AB.
        const int jco_start = ncoset(task->lb_min_basis - 1) + threadIdx.x;
        const int jco_end = ncoset(task->lb_max_basis);
        for (int jco = jco_start; jco < jco_end; jco += blockDim.x) {
          const double sphib = task->sphib[i * task->maxcob + jco];
          const int ico_start = ncoset(task->la_min_basis - 1);
          const int ico_end = ncoset(task->la_max_basis);
          for (int ico = ico_start; ico < ico_end; ico++) {
            const double sphia = task->sphia[j * task->maxcoa + ico];
            const double pab_val = block_val * sphia * sphib;
            // (accumulation of pab_val into the cab matrix not shown here)
          }
        }
      } else {
        // General case: transform each element via prepare_pab.
        const int jco_start = ncoset(task->lb_min_basis - 1) + threadIdx.x;
        const int jco_end = ncoset(task->lb_max_basis);
        for (int jco = jco_start; jco < jco_end; jco += blockDim.x) {
          const orbital b = coset_inv[jco];
          const int ico_start = ncoset(task->la_min_basis - 1);
          const int ico_end = ncoset(task->la_max_basis);
          for (int ico = ico_start; ico < ico_end; ico++) {
            const orbital a = coset_inv[ico];
            const double sphia = task->sphia[j * task->maxcoa + idx(a)];
            const double sphib = task->sphib[i * task->maxcob + idx(b)];
            const double pab_val = block_val * sphia * sphib;
            prepare_pab(params->func, a, b, task->zeta, task->zetb, pab_val,
                        cab);
          }
        }
      }
    }
  }
}
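
/*******************************************************************************
 * \brief Device routine that collocates a single task; the task is loaded
 *        into shared memory and processed by the whole thread block.
 ******************************************************************************/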
template <bool IS_FUNC_AB>
__device__ static void collocate_kernel(const kernel_params *params) {

  // Copy the task into shared memory.
  __shared__ smem_task task;
  load_task(params, &task);

  // Check if the radius is below the resolution of the grid.
  if (2.0 * task.radius < task.dh_max) {
    return; // nothing to do
  }

  // Lay out the dynamic shared memory.
  extern __shared__ double shared_memory[];
  double *smem_cab = &shared_memory[params->smem_cab_offset];
  double *smem_alpha = &shared_memory[params->smem_alpha_offset];
  double *smem_cxyz = &shared_memory[params->smem_cxyz_offset];

  // Fall back to a global-memory buffer when cab does not fit into shared mem.
  cab_store cab;
  if (params->smem_cab_length < task.n1 * task.n2) {
    cab.data = malloc_cab(&task);
  } else {
    cab.data = smem_cab;
  }
  zero_cab(&cab, task.n1 * task.n2);

  block_to_cab<IS_FUNC_AB>(params, &task, &cab);
  // ... (the cab_to_cxyz and cxyz_to_grid stages, which use smem_alpha and
  //      smem_cxyz, are not shown here)

  // Release the global-memory buffer again if one was allocated above.
  if (params->smem_cab_length < task.n1 * task.n2) {
    // (deallocation call, the counterpart of malloc_cab, not shown here)
  }
}
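
/*******************************************************************************
 * \brief Kernel entry point for the density case (func == GRID_FUNC_AB).
 ******************************************************************************/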
__global__ static void collocate_kernel_density(const kernel_params params) {
  collocate_kernel<true>(&params);
}
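
/*******************************************************************************
 * \brief Kernel entry point for all other grid_func values.
 ******************************************************************************/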
__global__ static void collocate_kernel_anyfunc(const kernel_params params) {
  collocate_kernel<false>(&params);
}
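
/*******************************************************************************
 * \brief Launches the collocation of all tasks of one grid level on the GPU,
 *        using one thread block per task.
 ******************************************************************************/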
void grid_gpu_collocate_one_grid_level(
    const grid_gpu_task_list *task_list, const int first_task,
    const int last_task, const enum grid_func func,
    const grid_gpu_layout *layout, const offloadStream_t stream,
    const double *pab_blocks_dev, double *grid_dev, int *lp_diff) {

  // Compute the angular momentum ranges after applying func.
  const prepare_ldiffs ldiffs = prepare_get_ldiffs(func);
  const int la_max = task_list->lmax + ldiffs.la_max_diff;
  const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
  const int lp_max = la_max + lb_max;
  *lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // report increase of lp

  const int ntasks = last_task - first_task + 1;

  init_constant_memory();

  // Compute the required dynamic shared memory per thread block.
  const int cab_len = ncoset(lb_max) * ncoset(la_max);
  const int alpha_len = 3 * (lb_max + 1) * (la_max + 1) * (lp_max + 1);
  const int cxyz_len = ncoset(lp_max);
  const size_t smem_per_block =
      (alpha_len + cxyz_len + cab_len) * sizeof(double);

  kernel_params params;
  params.smem_cab_length = cab_len;
  params.smem_cab_offset = 0;
  params.smem_alpha_offset = params.smem_cab_offset + cab_len;
  params.smem_cxyz_offset = params.smem_alpha_offset + alpha_len;
  params.first_task = first_task;
  params.func = func;
  params.grid = grid_dev;
  // ... (further fields, e.g. the angular momentum diffs from ldiffs, not
  //      shown here)
  params.tasks = task_list->tasks_dev;
  params.pab_blocks = pab_blocks_dev;
  memcpy(params.dh, layout->dh, 9 * sizeof(double));
  memcpy(params.dh_inv, layout->dh_inv, 9 * sizeof(double));
  memcpy(params.npts_global, layout->npts_global, 3 * sizeof(int));
  memcpy(params.npts_local, layout->npts_local, 3 * sizeof(int));
  memcpy(params.shift_local, layout->shift_local, 3 * sizeof(int));

  // Launch the kernel: one thread block per task.
  const int nblocks = ntasks;
  const dim3 threads_per_block(4, 4, 4);

  if (func == GRID_FUNC_AB) {
    collocate_kernel_density<<<nblocks, threads_per_block, smem_per_block,
                               stream>>>(params);
  } else {
    collocate_kernel_anyfunc<<<nblocks, threads_per_block, smem_per_block,
                               stream>>>(params);
  }
  OFFLOAD_CHECK(offloadGetLastError());
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)