#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)
#include <hip/hip_runtime.h>

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

namespace rocm_backend {
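/*******************************************************************************
 * \brief Decontracts the density subblock, transforming pab from the
 *        spherical (sgf) to the Cartesian (cab) representation for one task.
 ******************************************************************************/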
template <typename T, bool IS_FUNC_AB>
__device__ __inline__ void block_to_cab(const kernel_params &params,
                                        const smem_task<T> &task, T *cab) {
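  // The spherical index runs over angular momentum and then over
  // contractions; the Cartesian index runs over exponents and then over
  // angular momentum. Since the pab block can be quite large, the two
  // matrix products are fused to conserve shared memory.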
  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  // The 64 threads of the block are mapped onto a 16x4 tile of (ico, jco)
  // index pairs.
  for (int jco = task.first_cosetb + tid / 16; jco < task.ncosetb; jco += 4) {
    const orbital &b = coset_inv[jco];
    for (int ico = task.first_coseta + (tid % 16); ico < task.ncoseta;
         ico += 16) {
      const orbital &a = coset_inv[ico];
      T pab_val = 0.0;
      for (int i = 0; i < task.nsgf_setb; i++) {
        const T sphib = task.sphib[i * task.maxcob + jco];
        T tmp_val = 0.0;
        for (int j = 0; j < task.nsgf_seta; j++) {
          T block_val;
          if (task.block_transposed) {
            block_val =
                task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
          } else {
            block_val =
                task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
          }
          tmp_val += block_val * task.sphia[j * task.maxcoa + ico];
        }
        pab_val += tmp_val * sphib;
      }

      if (IS_FUNC_AB) {
        cab[jco * task.ncoseta + ico] += pab_val;
      } else {
        prepare_pab(params.func, a, b, task.zeta, task.zetb, pab_val, task.n1,
                    cab);
      }
    }
  }
  __syncthreads(); // wait for all writes to cab
}
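/*******************************************************************************
 * \brief Computes the Cartesian polynomial coefficients of one task per
 *        thread block: cab is built from the density block and then expanded
 *        around the Gaussian product center.
 ******************************************************************************/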
template <typename T, bool IS_FUNC_AB>
__global__
__launch_bounds__(64) void calculate_coefficients(const kernel_params dev_) {
  __shared__ smem_task<T> task;
  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;
  // Copy the task from global to shared memory and do precomputations.
  fill_smem_task_coef(dev_, dev_.first_task + blockIdx.x, task);

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  extern __shared__ T shared_memory[];
  T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
  T *coef_ =
      &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset];
  T *smem_cab =
      &dev_.ptr_dev[6][dev_.tasks[dev_.first_task + blockIdx.x].cab_offset];
  // Zero the cab buffer before block_to_cab accumulates into it.
  for (int z = tid; z < task.n1 * task.n2;
       z += blockDim.x * blockDim.y * blockDim.z)
    smem_cab[z] = 0.0;
  __syncthreads();

  block_to_cab<T, IS_FUNC_AB>(dev_, task, smem_cab);
  __syncthreads();

  compute_alpha(task, smem_alpha);
  __syncthreads();
  cab_to_cxyz(task, smem_alpha, smem_cab, coef_);
}
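/*******************************************************************************
 * \brief Collocates a single product of Gaussians onto the local grid, one
 *        task per thread block. The polynomial is evaluated per grid point
 *        with unrolled terms up to l = 6 and a generic loop beyond.
 ******************************************************************************/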
template <typename T, typename T3, bool distributed__, bool orthorhombic_>
__global__
__launch_bounds__(64) void collocate_kernel(const kernel_params dev_) {
  // Copy the task from global to shared memory and do precomputations.
  __shared__ smem_task_reduced<T, T3> task;

  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;

  fill_smem_task_reduced(dev_, dev_.first_task + blockIdx.x, task);

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  extern __shared__ T coefs_[];
  T *coef_ =
      &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset];
  __shared__ T dh_[9], dh_inv_[9];

  if (tid < 9) {
    // matrix from lattice to Cartesian coordinates
    dh_[tid] = dev_.dh_[tid];
    // matrix from Cartesian to lattice coordinates
    dh_inv_[tid] = dev_.dh_inv_[tid];
  }
  // Stage the polynomial coefficients (up to l = 6) in shared memory.
  for (int i = tid; i < ncoset(6); i += blockDim.x * blockDim.y * blockDim.z)
    coefs_[i] = coef_[i];
  __syncthreads();
  // Shift the cube center to the local grid's lower corner. The grid arrays
  // are stored z-major, so index 0 is z and index 2 is x.
  task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
  task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
  task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];

  if (task.apply_border_mask) {
    compute_window_size(dev_.grid_local_size_,
                        dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
                        dev_.grid_border_width_, &task.window_size,
                        &task.window_shift);
  }
  __syncthreads();
  for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
    int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];

    if (z2 < 0)
      z2 += dev_.grid_full_size_[0];

    if (task.apply_border_mask) {
      // Skip grid points outside this rank's window.
      if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
        continue;
      }
    }
    short int ymin = 0;
    short int ymax = task.cube_size.y - 1;
    T kremain = 0.0;

    // In the orthorhombic case, restrict the y range to the sphere of
    // radius discrete_radius around the cube center.
    if (orthorhombic_ && !task.apply_border_mask) {
      ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
      ymin *= ymin;
      kremain = task.discrete_radius * task.discrete_radius -
                ((T)ymin) * dh_[8] * dh_[8];
      ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
      ymax = 1 - ymin - task.lb_cube.y;
      ymin = ymin - task.lb_cube.y;
    }
    for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
      int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];

      if (y2 < 0)
        y2 += dev_.grid_full_size_[1];

      if (task.apply_border_mask) {
        if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
          continue;
        }
      }
      short int xmin = 0;
      short int xmax = task.cube_size.x - 1;

      if (orthorhombic_ && !task.apply_border_mask) {
        xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
        xmin *= xmin;
        xmin =
            ceil(-1.0e-8 - sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
                 dh_inv_[0]);
        xmax = 1 - xmin - task.lb_cube.x;
        xmin = xmin - task.lb_cube.x;
      }
      for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
        int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];

        if (x2 < 0)
          x2 += dev_.grid_full_size_[2];

        if (task.apply_border_mask) {
          if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
            continue;
          }
        }
        // Coordinates of the grid point relative to the Gaussian center.
        T3 r3;
        if (orthorhombic_) {
          r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
          r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
          r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
        } else {
          r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
                                   (y + task.lb_cube.y + task.roffset.y),
                                   (z + task.lb_cube.z + task.roffset.z));
        }
        const T r3x2 = r3.x * r3.x;
        const T r3y2 = r3.y * r3.y;
        const T r3z2 = r3.z * r3.z;

        // Drop grid points outside the Gaussian's radius.
        if (distributed__) {
          if (((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)) &&
              (!orthorhombic_ || task.apply_border_mask))
            continue;
        } else {
          if ((!orthorhombic_) &&
              ((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)))
            continue;
        }
        // Evaluate the polynomial term by term, in lexicographic order of
        // the Cartesian monomials.
        T res = coefs_[0];

        if (task.lp >= 1) {
          res += coefs_[1] * r3.x;
          res += coefs_[2] * r3.y;
          res += coefs_[3] * r3.z;
        }

        const T r3xy = r3.x * r3.y;
        const T r3xz = r3.x * r3.z;
        const T r3yz = r3.y * r3.z;

        if (task.lp >= 2) {
          res += coefs_[4] * r3x2;
          res += coefs_[5] * r3xy;
          res += coefs_[6] * r3xz;
          res += coefs_[7] * r3y2;
          res += coefs_[8] * r3yz;
          res += coefs_[9] * r3z2;
        }
        if (task.lp >= 3) {
          res += r3x2 *
                 (coefs_[10] * r3.x + coefs_[11] * r3.y + coefs_[12] * r3.z);
          res += r3.x * (coefs_[13] * r3y2 + coefs_[15] * r3z2);
          res += coefs_[14] * r3xy * r3.z;
          res += r3y2 * (coefs_[16] * r3.y + coefs_[17] * r3.z);
          res += r3z2 * (coefs_[18] * r3.y + coefs_[19] * r3.z);
        }
        if (task.lp >= 4) {
          res += r3x2 *
                 (coefs_[20] * r3x2 + coefs_[21] * r3xy + coefs_[22] * r3xz +
                  coefs_[23] * r3y2 + coefs_[24] * r3yz + coefs_[25] * r3z2);
          res += r3y2 *
                 (coefs_[26] * r3xy + coefs_[27] * r3xz + coefs_[30] * r3y2 +
                  coefs_[31] * r3yz + coefs_[32] * r3z2);
          res += r3z2 * (coefs_[28] * r3xy + coefs_[29] * r3xz +
                         coefs_[33] * r3yz + coefs_[34] * r3z2);
        }
        const T r3x4 = r3x2 * r3x2;
        const T r3y4 = r3y2 * r3y2;
        const T r3z4 = r3z2 * r3z2;

        if (task.lp >= 5) {
          res += r3x4 *
                 (coefs_[35] * r3.x + coefs_[36] * r3.y + coefs_[37] * r3.z);
          res += r3x2 * (r3.x * (coefs_[38] * r3y2 + coefs_[39] * r3yz +
                                 coefs_[40] * r3z2) +
                         r3y2 * (coefs_[41] * r3.y + coefs_[42] * r3.z) +
                         r3z2 * (coefs_[43] * r3.y + coefs_[44] * r3.z));
          res += r3.x * (coefs_[45] * r3y4 +
                         r3y2 * (coefs_[46] * r3yz + coefs_[47] * r3z2) +
                         r3z2 * (coefs_[48] * r3yz + coefs_[49] * r3z2));
          res += r3y2 * (r3y2 * (coefs_[50] * r3.y + coefs_[51] * r3.z) +
                         r3z2 * (coefs_[52] * r3.y + coefs_[53] * r3.z));
          res += r3z4 * (coefs_[54] * r3.y + coefs_[55] * r3.z);
        }
        if (task.lp >= 6) {
          res += r3x4 * (coefs_[56] * r3x2 + coefs_[57] * r3xy +
                         coefs_[58] * r3xz + coefs_[59] * r3y2 +
                         coefs_[60] * r3yz + coefs_[61] * r3z2);
          res += r3x2 * (coefs_[62] * r3y2 * r3xy +
                         coefs_[63] * r3y2 * r3xz +
                         coefs_[64] * r3xy * r3z2 +
                         coefs_[65] * r3z2 * r3xz +
                         coefs_[66] * r3y4 +
                         coefs_[67] * r3y2 * r3yz +
                         coefs_[68] * r3y2 * r3z2 +
                         coefs_[69] * r3yz * r3z2 +
                         coefs_[70] * r3z4);
          res += r3y4 * (coefs_[71] * r3xy + coefs_[72] * r3xz +
                         coefs_[77] * r3y2 + coefs_[78] * r3yz +
                         coefs_[79] * r3z2);
          res += r3y2 * r3z2 * (coefs_[73] * r3xy + coefs_[74] * r3xz +
                                coefs_[80] * r3yz);
          res += r3z4 * (coefs_[75] * r3xy + coefs_[76] * r3xz +
                         coefs_[81] * r3y2 + coefs_[82] * r3yz +
                         coefs_[83] * r3z2);
        }
        // Generic fallback for lp > 6: multiply each remaining coefficient
        // by x^l[0] * y^l[1] * z^l[2], using the precomputed squares for the
        // even powers and a single linear factor for the odd remainder.
        // Coefficients beyond ncoset(6) were not staged in shared memory and
        // are read from global memory.
        if (task.lp > 6) {
          for (int ic = 84; ic < ncoset(task.lp); ic++) {
            const orbital &co = coset_inv[ic];
            T tmp = coef_[ic];
            for (int po = 0; po < (co.l[2] >> 1); po++)
              tmp *= r3z2;
            if (co.l[2] & 0x1)
              tmp *= r3.z;
            for (int po = 0; po < (co.l[1] >> 1); po++)
              tmp *= r3y2;
            if (co.l[1] & 0x1)
              tmp *= r3.y;
            for (int po = 0; po < (co.l[0] >> 1); po++)
              tmp *= r3x2;
            if (co.l[0] & 0x1)
              tmp *= r3.x;
            res += tmp;
          }
        }
        // Multiply by the Gaussian and accumulate onto the local grid.
        res *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);

        atomicAdd(dev_.ptr_dev[1] +
                      (z2 * dev_.grid_local_size_[1] + y2) *
                          dev_.grid_local_size_[2] +
                      x2,
                  res);
      }
    }
  }
}
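/*******************************************************************************
 * \brief Host-side launcher: computes the Cartesian coefficients and
 *        collocates all tasks of one grid level on that level's stream.
 ******************************************************************************/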
void collocate_one_grid_level(const int level, const enum grid_func func,
                              int *lp_diff) {
  // Determine the angular momentum ranges and shared memory requirements.
  const ldiffs_value ldiffs = prepare_get_ldiffs(func);
  smem_parameters smem_params(ldiffs, lmax());
  *lp_diff = smem_params.lp_diff();

  init_constant_memory();
  kernel_params params = set_kernel_parameters(level, smem_params);

  const dim3 threads_per_block(4, 4, 4);
  // IS_FUNC_AB is a template parameter, so this run-time branch selects the
  // matching specialization.
  if (func == GRID_FUNC_AB) {
    calculate_coefficients<double, true>
        <<<number_of_tasks_per_level_[level], threads_per_block,
           smem_params.smem_per_block(), level_streams[level]>>>(params);
  } else {
    calculate_coefficients<double, false>
        <<<number_of_tasks_per_level_[level], threads_per_block,
           smem_params.smem_per_block(), level_streams[level]>>>(params);
  }
  // Dispatch the collocation kernel specialization matching the grid layout.
  if (grid_[level].is_distributed()) {
    if (grid_[level].is_orthorhombic())
      collocate_kernel<double, double3, true, true>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(6) * sizeof(double), level_streams[level]>>>(params);
    else
      collocate_kernel<double, double3, true, false>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(6) * sizeof(double), level_streams[level]>>>(params);
  } else {
    if (grid_[level].is_orthorhombic())
      collocate_kernel<double, double3, false, true>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(6) * sizeof(double), level_streams[level]>>>(params);
    else
      collocate_kernel<double, double3, false, false>
          <<<number_of_tasks_per_level_[level], threads_per_block,
             ncoset(6) * sizeof(double), level_streams[level]>>>(params);
  }
}

} // namespace rocm_backend

#endif // defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)