14 #if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)
23 #include <hip/hip_runtime.h>
29 #error "OpenMP should not be used in .cu files to accommodate HIP."
// NOTE(review): extraction fragment — the leading numbers (36, 37, ...) are line
// numbers baked in from the original file; intermediate lines are missing here.
36 template <
typename T,
bool IS_FUNC_AB>
// Device helper: decontracts one task's density-matrix block pab into the
// Cartesian representation cab, i.e. cab += sphia^T * pab * sphib, with the
// (tid/8, tid%8) split distributing the (jco, ico) coset pairs over threads.
37 __device__ __inline__
void block_to_cab(
// FIX(review): original text read "¶ms" — a mis-encoded "&params"
// (cf. the "const kernel_params &dev" signatures in the reference listing).
const kernel_params &params,
38 const smem_task<T> &task, T *cab) {
// Flat thread id within the 3-D thread block.
46 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
// Loop over contracted spherical functions of set b (i) and set a (j).
47 for (
int i = 0;
i < task.nsgf_setb;
i++) {
48 for (
int j = 0; j < task.nsgf_seta; j++) {
// pab_block may be stored transposed; off-diagonal blocks are counted twice
// (off_diag_twice), presumably to fold in the symmetric counterpart — TODO confirm.
50 if (task.block_transposed) {
51 block_val = task.pab_block[j * task.nsgfb +
i] * task.off_diag_twice;
53 block_val = task.pab_block[
i * task.nsgfa + j] * task.off_diag_twice;
// Distribute coset pairs: 8 threads along ico, blocksize/8 along jco.
58 for (
int jco = task.first_cosetb + tid / 8; jco < task.ncosetb;
60 const T sphib = task.sphib[
i * task.maxcob + jco];
61 for (
int ico = task.first_coseta + (tid % 8); ico < task.ncoseta;
63 const T sphia = task.sphia[j * task.maxcoa + ico];
64 const T pab_val = block_val * sphia * sphib;
// Accumulate into the shared cab matrix (ncoseta-major).
66 cab[jco * task.ncoseta + ico] += pab_val;
// NOTE(review): extraction fragment — numeric prefixes are original line
// numbers; several intermediate lines are missing from this view.
80 template <
typename T,
bool IS_FUNC_AB>
// Kernel: per-block (one task per blockIdx.x) computation of the coefficients
// used by the collocation kernel; stages data in shared memory and calls
// block_to_cab to build the cab matrix.
81 __global__
void calculate_coefficients(
const kernel_params dev_) {
// Task metadata staged in shared memory (filled elsewhere, presumably via
// fill_smem_task_coef — TODO confirm against the missing lines).
82 __shared__ smem_task<T> task;
// Early exit for tasks flagged to be skipped.
83 if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
// Flat thread id within the 3-D thread block.
87 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
// Dynamic shared memory, partitioned by offsets carried in dev_.
90 extern __shared__ T shared_memory[];
92 T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
// Global pointers for this task's coefficient and cab buffers
// (ptr_dev[2] / ptr_dev[6] slots — meaning defined elsewhere).
94 &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset];
96 &dev_.ptr_dev[6][dev_.tasks[dev_.first_task + blockIdx.x].cab_offset];
// Block-stride loop over the n1 x n2 elements (e.g. zero/init of a buffer —
// body not visible in this fragment).
97 for (
int z = tid; z < task.n1 * task.n2;
98 z += blockDim.x * blockDim.y * blockDim.z)
// Decontract the density block into cab (shared memory).
102 block_to_cab<T, IS_FUNC_AB>(dev_, task, smem_cab);
// NOTE(review): extraction fragment — numeric prefixes are original line
// numbers; many intermediate lines (closing braces, else-branches, launch
// bodies) are missing from this view.
137 template <
typename T,
typename T3,
bool distributed__,
bool orthorhombic_>
// Collocation kernel: one task per blockIdx.x; evaluates the polynomial
// expansion (coefs_ in the coset ordering) times a Gaussian at every grid
// point inside the task's cube and accumulates onto the local grid.
// Capped at 64 threads per block via __launch_bounds__.
139 __launch_bounds__(64) void collocate_kernel(const kernel_params dev_) {
// Reduced task metadata staged in shared memory.
141 __shared__ smem_task_reduced<T, T3> task;
143 if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
// Flat thread id within the 3-D thread block.
149 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
// Dynamic shared memory: this task's expansion coefficients.
152 extern __shared__ T coefs_[];
155 &dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset];
// Shared copies of the 9-element cell matrix dh_ and its inverse
// (indexed 0/4/8 below, i.e. treated as a 3x3 — diagonal elements).
156 __shared__ T dh_[9], dh_inv_[9];
// Cooperative copy of dh_ and dh_inv_ into shared memory.
159 for (
int i = tid;
i < 9;
i += blockDim.x * blockDim.y * blockDim.z) {
160 dh_[
i] = dev_.dh_[
i];
164 for (
int i = tid;
i < 9;
i += blockDim.x * blockDim.y * blockDim.z) {
165 dh_inv_[
i] = dev_.dh_inv_[
i];
// Copy coefficients (guard/size handling not visible in this fragment).
170 coefs_[tid] = coef_[tid];
// Shift cube center into local-grid index space; note the z/y/x component
// order pairs with grid_lower_corner_[0/1/2] respectively.
176 task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
177 task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
178 task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];
// Distributed grids: restrict writes to the window owned by this rank.
181 if (task.apply_border_mask) {
183 dev_.grid_local_size_,
184 dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
185 dev_.grid_border_width_, &task.window_size, &task.window_shift);
// z-loop over the cube, strided by blockDim.z; wrap into the periodic grid.
191 for (
int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
192 int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];
// C's % can be negative — fold back into [0, grid_full_size).
195 z2 += dev_.grid_full_size_[0];
199 if (task.apply_border_mask) {
// Skip points outside this rank's window.
203 if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
212 short int ymax = task.cube_size.y - 1;
// Orthorhombic fast path: tighten the y-range to the sphere's cross-section
// at this z (kremain = remaining squared radius after the z contribution).
214 if (orthorhombic_ && !task.apply_border_mask) {
215 ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
217 kremain = task.discrete_radius * task.discrete_radius -
218 ((T)ymin) * dh_[8] * dh_[8];
// NOTE(review): double literals (1.0e-8, 0.0) and sqrt/fmax/ceil on a
// template T — fine for T=double (the only instantiation visible below).
219 ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
220 ymax = 1 - ymin - task.lb_cube.y;
221 ymin = ymin - task.lb_cube.y;
// y-loop, strided by blockDim.y, with the same wrap + window checks.
224 for (
int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
225 int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];
227 y2 += dev_.grid_full_size_[1];
230 if (task.apply_border_mask) {
232 if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
239 short int xmax = task.cube_size.x - 1;
// Orthorhombic fast path: tighten the x-range at this (z, y).
240 if (orthorhombic_ && !task.apply_border_mask) {
241 xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
244 ceil(-1.0e-8 - sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
246 xmax = 1 - xmin - task.lb_cube.x;
247 xmin = xmin - task.lb_cube.x;
// x-loop, strided by blockDim.x (innermost → coalesced grid writes).
250 for (
int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
251 int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];
254 x2 += dev_.grid_full_size_[2];
257 if (task.apply_border_mask) {
260 if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
// Distance from the Gaussian center: orthorhombic case scales each
// component by the corresponding diagonal cell element ...
271 r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
272 r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
273 r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
// ... general (triclinic) case goes through compute_coordinates
// (call site truncated in this fragment).
276 (y + task.lb_cube.y + task.roffset.y),
277 (z + task.lb_cube.z + task.roffset.z));
// Skip points outside the task's radius (branch bodies not visible).
285 if (((task.radius * task.radius) <=
286 (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)) &&
287 (!orthorhombic_ || task.apply_border_mask))
291 if ((!orthorhombic_) && ((task.radius * task.radius) <=
292 (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)))
// Horner-free unrolled polynomial evaluation in coset ordering:
// l=1 terms (coefs_[1..3]) ...
306 res += coefs_[1] * r3.x;
307 res += coefs_[2] * r3.y;
308 res += coefs_[3] * r3.z;
// Precompute the degree-2 monomials reused by the l=2..4 terms.
310 const T r3xy = r3.x * r3.y;
311 const T r3xz = r3.x * r3.z;
312 const T r3yz = r3.y * r3.z;
313 const T r3x2 = r3.x * r3.x;
314 const T r3y2 = r3.y * r3.y;
315 const T r3z2 = r3.z * r3.z;
// ... l=2 terms (coefs_[4..9]) ...
318 res += coefs_[4] * r3x2;
319 res += coefs_[5] * r3xy;
320 res += coefs_[6] * r3xz;
321 res += coefs_[7] * r3y2;
322 res += coefs_[8] * r3yz;
323 res += coefs_[9] * r3z2;
// ... l=3 terms (coefs_[10..19]) ...
327 res += coefs_[10] * r3x2 * r3.x;
328 res += coefs_[11] * r3x2 * r3.y;
329 res += coefs_[12] * r3x2 * r3.z;
330 res += coefs_[13] * r3.x * r3y2;
331 res += coefs_[14] * r3xy * r3.z;
332 res += coefs_[15] * r3.x * r3z2;
333 res += coefs_[16] * r3y2 * r3.y;
334 res += coefs_[17] * r3y2 * r3.z;
335 res += coefs_[18] * r3.y * r3z2;
336 res += coefs_[19] * r3z2 * r3.z;
// ... l=4 terms (coefs_[20..34]).
340 res += coefs_[20] * r3x2 * r3x2;
341 res += coefs_[21] * r3x2 * r3xy;
342 res += coefs_[22] * r3x2 * r3xz;
343 res += coefs_[23] * r3x2 * r3y2;
344 res += coefs_[24] * r3x2 * r3yz;
345 res += coefs_[25] * r3x2 * r3z2;
346 res += coefs_[26] * r3xy * r3y2;
347 res += coefs_[27] * r3xz * r3y2;
348 res += coefs_[28] * r3xy * r3z2;
349 res += coefs_[29] * r3xz * r3z2;
350 res += coefs_[30] * r3y2 * r3y2;
351 res += coefs_[31] * r3y2 * r3yz;
352 res += coefs_[32] * r3y2 * r3z2;
353 res += coefs_[33] * r3yz * r3z2;
354 res += coefs_[34] * r3z2 * r3z2;
// Generic fallback for l > 4: per-coefficient monomial built from the
// exponents in coset_inv (loop bodies truncated in this fragment).
359 for (
int ic = 35; ic <
ncoset(task.lp); ic++) {
362 for (
int po = 0; po <
co.l[2]; po++)
364 for (
int po = 0; po <
co.l[1]; po++)
366 for (
int po = 0; po <
co.l[0]; po++)
// Accumulate res * gaussian onto the local grid at (z2, y2, x2);
// presumably via atomicAdd since tasks can overlap — TODO confirm
// against the truncated line.
374 (z2 * dev_.grid_local_size_[1] + y2) *
375 dev_.grid_local_size_[2] +
377 res * exp(-(r3.x * r3.x + r3.y * r3.y + r3.z * r3.z) * task.zetp));
// NOTE(review): extraction fragment of the host-side launcher
// collocate_one_grid_level (see reference listing below); launch-config
// arguments (grid dims) are truncated out of this view.
// Compute shared-memory layout from the angular-momentum ranges.
395 smem_parameters smem_params(ldiffs,
lmax());
397 *lp_diff = smem_params.lp_diff();
401 kernel_params params = set_kernel_parameters(level, smem_params);
// 4x4x4 = 64 threads, matching collocate_kernel's __launch_bounds__(64).
405 const dim3 threads_per_block(4, 4, 4);
// Coefficient kernel, specialized on whether func needs the AB product
// (branch condition truncated); launched on the per-level stream with
// dynamic shared memory sized by smem_params.
408 calculate_coefficients<double, true>
410 smem_params.smem_per_block(),
level_streams[level]>>>(params);
412 calculate_coefficients<double, false>
414 smem_params.smem_per_block(),
level_streams[level]>>>(params);
// Collocation kernel, specialized at compile time on
// <distributed, orthorhombic> to avoid runtime branching in the hot loops.
417 if (
grid_[level].is_distributed()) {
418 if (
grid_[level].is_orthorhombic())
419 collocate_kernel<double, double3, true, true>
423 collocate_kernel<double, double3, true, false>
427 if (
grid_[level].is_orthorhombic())
428 collocate_kernel<double, double3, false, true>
432 collocate_kernel<double, double3, false, false>
void collocate_one_grid_level(const int level, const enum grid_func func, int *lp_diff)
std::vector< grid_info< double > > grid_
std::vector< offloadStream_t > level_streams
std::vector< int > number_of_tasks_per_level_
static void const int const int i
for(int lxp=0;lxp<=lp;lxp++)
integer, dimension(:, :, :), allocatable, public co
__inline__ __device__ void compute_window_size(const int *const grid_size, const int border_mask, const int *border_width, int3 *const window_size, int3 *const window_shift)
__device__ static __inline__ void cab_to_cxyz(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cab, T *__restrict__ cxyz)
Transforms coefficients C_ab into C_xyz.
__host__ __device__ __inline__ int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
__device__ __inline__ void fill_smem_task_coef(const kernel_params &dev, const int task_id, smem_task< T > &task)
Copies a task from global to shared memory and does precomputations.
ldiffs_value prepare_get_ldiffs(const enum grid_func func)
Returns difference in angular momentum range for given func.
static void init_constant_memory()
Initializes the device's constant memory.
__constant__ orbital coset_inv[1330]
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
__device__ __inline__ void fill_smem_task_reduced(const kernel_params &dev, const int task_id, smem_task_reduced< T, T3 > &task)
Copies a task from global to shared memory.
__inline__ __device__ double3 compute_coordinates(const double *__restrict__ dh_, const double x, const double y, const double z)
__device__ __inline__ void prepare_pab(const enum grid_func func, const orbital a, const orbital b, const T zeta, const T zetb, const T pab_val, const int n, T *cab)
Transforms a given element of the density matrix according to func.