#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)

#include <hip/hip_runtime.h>

#ifdef __HIP_PLATFORM_NVIDIA__
#include <cooperative_groups.h>
#if CUDA_VERSION >= 11000
#include <cooperative_groups/reduce.h>
#endif
namespace cg = cooperative_groups;
#endif

#ifdef _OPENMP
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif
template <typename T>
__device__ __inline__ T warp_reduce(T *table, const int tid) {
  // Unrolled tree reduction over 64 entries; the strided reads past index 63
  // touch scratch space, so table needs headroom (the spill never reaches the
  // first two entries, which hold the result).
  table[tid] += table[tid + 32];
  __syncthreads();
  table[tid] += table[tid + 16];
  __syncthreads();
  table[tid] += table[tid + 8];
  __syncthreads();
  table[tid] += table[tid + 4];
  __syncthreads();
  table[tid] += table[tid + 2];
  __syncthreads();
  return (table[0] + table[1]);
}
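
// Illustration (not part of the original file): a minimal sketch, assuming a
// one-dimensional 64-thread block, of how warp_reduce() is used. The kernel
// name and arguments are hypothetical.
__global__ void reduce_demo(const double *in, double *out) {
  // 96 entries: warp_reduce's first strided pass reads up to index 95.
  __shared__ double table[96];
  const int tid = threadIdx.x; // launch as reduce_demo<<<1, 64>>>(in, out)
  table[tid] = in[tid];
  if (tid < 32)
    table[64 + tid] = 0.0; // keep the headroom well-defined
  __syncthreads();
  const double total = warp_reduce<double>(table, tid);
  if (tid == 0)
    *out = total; // one thread publishes the block-wide sum
}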
template <typename T, typename T3, bool COMPUTE_TAU, bool CALCULATE_FORCES>
__global__ __launch_bounds__(64) void compute_hab_v2(const kernel_params dev_) {
  extern __shared__ T shared_memory[];
  T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  const int offset = dev_.sorted_blocks_offset_dev[blockIdx.x];
  const int number_of_tasks = dev_.num_tasks_per_block_dev[blockIdx.x];

  if (number_of_tasks == 0)
    return;

  // Per-thread accumulators (declarations reconstructed; further setup in the
  // original file is elided from this extract).
  T fa[3] = {0.0, 0.0, 0.0};
  T fb[3] = {0.0, 0.0, 0.0};
  T virial[9] = {0.0};

  for (int tk = 0; tk < number_of_tasks; tk++) {
    __shared__ smem_task<T> task;
    const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
    if (dev_.tasks[task_id].skip_task)
      continue;
    // Reconstructed from the documented helper: stage the task in shared
    // memory and do the precomputations.
    fill_smem_task_coef(dev_, task_id, task);
    __syncthreads();

    T *coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
    T *smem_cab = &dev_.ptr_dev[6][dev_.tasks[task_id].cab_offset];

    // Reconstructed from the documented helpers: expand the polynomial
    // coefficients and transform C_xyz into C_ab.
    compute_alpha(task, smem_alpha);
    __syncthreads();
    cxyz_to_cab(task, smem_alpha, coef_, smem_cab);
    __syncthreads();

    // The 64 threads cover the (i, j) pairs of spherical functions as an
    // 8 x 8 tile.
    for (int i = tid / 8; i < task.nsgf_setb; i += 8) {
      for (int j = tid % 8; j < task.nsgf_seta; j += 8) {
        T block_val = 0.0;
        T block_val1 = 0.0;
        if (CALCULATE_FORCES) {
          if (task.block_transposed) {
            block_val1 =
                task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
          } else {
            block_val1 =
                task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
          }
        }

        for (int jco = task.first_cosetb; jco < task.ncosetb; jco++) {
          const orbital b = coset_inv[jco]; // reconstructed lookup
          const T sphib = task.sphib[i * task.maxcob + jco];
          for (int ico = task.first_coseta; ico < task.ncoseta; ico++) {
            const orbital a = coset_inv[ico]; // reconstructed lookup
            const T hab = get_hab<COMPUTE_TAU, T>(a, b, task.zeta, task.zetb,
                                                  task.n1, smem_cab);
            const T sphia = task.sphia[j * task.maxcoa + ico];
            block_val += hab * sphia * sphib;

            if (CALCULATE_FORCES) {
              for (int k = 0; k < 3; k++) {
                fa[k] += block_val1 * sphia * sphib *
                         get_force_a<COMPUTE_TAU, T>(a, b, k, task.zeta,
                                                     task.zetb, task.n1,
                                                     smem_cab);
                fb[k] += block_val1 * sphia * sphib *
                         get_force_b<COMPUTE_TAU, T>(a, b, k, task.zeta,
                                                     task.zetb, task.rab,
                                                     task.n1, smem_cab);
              }

              if (dev_.ptr_dev[5] != nullptr) {
                for (int k = 0; k < 3; k++) {
                  for (int l = 0; l < 3; l++) {
                    virial[3 * k + l] +=
                        (get_virial_a<COMPUTE_TAU, T>(a, b, k, l, task.zeta,
                                                      task.zetb, task.n1,
                                                      smem_cab) +
                         get_virial_b<COMPUTE_TAU, T>(a, b, k, l, task.zeta,
                                                      task.zetb, task.rab,
                                                      task.n1, smem_cab)) *
                        block_val1 * sphia * sphib;
                  }
                }
              }
            }
          }
        }

        // Simplified: any further processing of block_val in the elided
        // lines of the original is omitted here.
        const T res = block_val;
        if (task.block_transposed) {
          atomicAdd(task.hab_block + j * task.nsgfb + i, res);
        } else {
          atomicAdd(task.hab_block + i * task.nsgfa + j, res);
        }
      }
    }
  }

  if (CALCULATE_FORCES) {
    const int task_id = dev_.task_sorted_by_blocks_dev[offset];
    const auto &glb_task = dev_.tasks[task_id];
    const int iatom = glb_task.iatom;
    const int jatom = glb_task.jatom;
    T *forces_a = &dev_.ptr_dev[4][3 * iatom];
    T *forces_b = &dev_.ptr_dev[4][3 * jatom];

    T *sum = (T *)shared_memory;
    if (dev_.ptr_dev[5] != nullptr) {
      for (int i = 0; i < 9; i++) {
        sum[tid] = virial[i];
        __syncthreads();
        virial[i] = warp_reduce<T>(sum, tid);
        // Only one thread per block commits the reduced value.
        if (tid == 0)
          atomicAdd(dev_.ptr_dev[5] + i, virial[i]);
        __syncthreads();
      }
    }

    for (int i = 0; i < 3; i++) {
      sum[tid] = fa[i];
      __syncthreads();
      fa[i] = warp_reduce<T>(sum, tid);
      if (tid == 0)
        atomicAdd(forces_a + i, fa[i]);
      __syncthreads();

      sum[tid] = fb[i];
      __syncthreads();
      fb[i] = warp_reduce<T>(sum, tid);
      if (tid == 0)
        atomicAdd(forces_b + i, fb[i]);
      __syncthreads();
    }
  }
}
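
// Illustration (not part of the original file): compute_hab_v2 maps its 64
// threads onto an 8 x 8 tile of (i, j) pairs via tid / 8 and tid % 8, so both
// loops advance with stride 8 and every pair is visited exactly once. A
// hypothetical standalone sketch of the same pattern:
__global__ void tile_demo(double *block, const int rows, const int cols) {
  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  for (int i = tid / 8; i < rows; i += 8)   // which of the 8 thread-rows
    for (int j = tid % 8; j < cols; j += 8) // which of the 8 thread-columns
      block[i * cols + j] += 1.0;           // each element touched once
}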
template <typename T, typename T3, bool distributed__, bool orthorhombic_,
          int lbatch>
__global__ __launch_bounds__(64) void
integrate_kernel(const kernel_params dev_) {
  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  // Stage the 3x3 grid matrices in shared memory.
  __shared__ T dh_inv_[9];
  __shared__ T dh_[9];

  for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
    dh_inv_[d] = dev_.dh_inv_[d];

  for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
    dh_[d] = dev_.dh_[d];

  __shared__ smem_task_reduced<T, T3> task;
  // Reconstructed from the documented helper: copy the reduced task to
  // shared memory.
  fill_smem_task_reduced(dev_, dev_.first_task + blockIdx.x, task);
  __syncthreads();

  task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
  task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
  task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];

  if (task.apply_border_mask) {
    compute_window_size(dev_.grid_local_size_,
                        dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
                        dev_.grid_border_width_, &task.window_size,
                        &task.window_shift);
  }

  __shared__ T accumulator[lbatch][64];
  __shared__ T sum[lbatch];

  // Process the Cartesian coefficients lbatch at a time; size_loop rounds
  // ncoset(task.lp) up to a multiple of lbatch.
  const short int size_loop =
      (ncoset(task.lp) / lbatch + ((ncoset(task.lp) % lbatch) != 0)) * lbatch;
  const short int length = ncoset(task.lp);
  for (int ico = 0; ico < size_loop; ico += lbatch) {
#pragma unroll lbatch
    for (int i = 0; i < lbatch; i++)
      accumulator[i][tid] = 0.0;

    __syncthreads();

    for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
      int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];
      if (z2 < 0)
        z2 += dev_.grid_full_size_[0];

      if (task.apply_border_mask) {
        // Skip grid points outside the local window.
        if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
          continue;
        }
      }

      int ymin = 0;
      int ymax = task.cube_size.y - 1;
      T kremain = 0.0;

      if (orthorhombic_ && !task.apply_border_mask) {
        ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
        ymin *= ymin; // reconstructed: kremain needs the squared index
        kremain = task.discrete_radius * task.discrete_radius -
                  ymin * dh_[8] * dh_[8];
        ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
        ymax = 1 - ymin - task.lb_cube.y;
        ymin = ymin - task.lb_cube.y;
      }

      for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
        int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];
        if (y2 < 0)
          y2 += dev_.grid_full_size_[1];

        if (task.apply_border_mask) {
          if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
            continue;
          }
        }

        int xmin = 0;
        int xmax = task.cube_size.x - 1;

        if (orthorhombic_ && !task.apply_border_mask) {
          xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
          xmin *= xmin; // reconstructed, as for ymin above
          xmin = ceil(-1.0e-8 -
                      sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
                          dh_inv_[0]);
          xmax = 1 - xmin - task.lb_cube.x;
          xmin -= task.lb_cube.x;
        }

        for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
          int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];
          if (x2 < 0)
            x2 += dev_.grid_full_size_[2];

          if (task.apply_border_mask) {
            if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
              continue;
            }
          }

          // Distance from the grid point to the Gaussian center.
          T3 r3;
          if (orthorhombic_) {
            r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
            r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
            r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
          } else {
            r3 = compute_coordinates(dh_,
                                     (x + task.lb_cube.x + task.roffset.x),
                                     (y + task.lb_cube.y + task.roffset.y),
                                     (z + task.lb_cube.z + task.roffset.z));
          }

          // Sphere test (branch structure reconstructed around the two
          // conditions present in the source).
          if (distributed__) {
            if (((task.radius * task.radius) <=
                 (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)) &&
                (!orthorhombic_ || task.apply_border_mask))
              continue;
          } else {
            if (!orthorhombic_) {
              if ((task.radius * task.radius) <=
                  (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z))
                continue;
            }
          }

          T grid_value =
              __ldg(&dev_.ptr_dev[1][(z2 * dev_.grid_local_size_[1] + y2) *
                                         dev_.grid_local_size_[2] +
                                     x2]);

          const T r3x2 = r3.x * r3.x;
          const T r3y2 = r3.y * r3.y;
          const T r3z2 = r3.z * r3.z;

          const T r3xy = r3.x * r3.y;
          const T r3xz = r3.x * r3.z;
          const T r3yz = r3.y * r3.z;

          grid_value *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
          switch (ico / lbatch) {
          case 0: {
            // coefficients 0..9: l = 0, 1, 2
            accumulator[0][tid] += grid_value;
            accumulator[1][tid] += grid_value * r3.x;
            accumulator[2][tid] += grid_value * r3.y;
            accumulator[3][tid] += grid_value * r3.z;
            accumulator[4][tid] += grid_value * r3x2;
            accumulator[5][tid] += grid_value * r3xy;
            accumulator[6][tid] += grid_value * r3xz;
            accumulator[7][tid] += grid_value * r3y2;
            accumulator[8][tid] += grid_value * r3yz;
            accumulator[9][tid] += grid_value * r3z2;
          } break;
          case 1: {
            // coefficients 10..19: l = 3
            accumulator[0][tid] += grid_value * r3x2 * r3.x;
            accumulator[1][tid] += grid_value * r3x2 * r3.y;
            accumulator[2][tid] += grid_value * r3x2 * r3.z;
            accumulator[3][tid] += grid_value * r3.x * r3y2;
            accumulator[4][tid] += grid_value * r3xy * r3.z;
            accumulator[5][tid] += grid_value * r3.x * r3z2;
            accumulator[6][tid] += grid_value * r3y2 * r3.y;
            accumulator[7][tid] += grid_value * r3y2 * r3.z;
            accumulator[8][tid] += grid_value * r3.y * r3z2;
            accumulator[9][tid] += grid_value * r3z2 * r3.z;
          } break;
          case 2: {
            // coefficients 20..29: first part of l = 4
            accumulator[0][tid] += grid_value * r3x2 * r3x2;
            accumulator[1][tid] += grid_value * r3x2 * r3xy;
            accumulator[2][tid] += grid_value * r3x2 * r3xz;
            accumulator[3][tid] += grid_value * r3x2 * r3y2;
            accumulator[4][tid] += grid_value * r3x2 * r3yz;
            accumulator[5][tid] += grid_value * r3x2 * r3z2;
            accumulator[6][tid] += grid_value * r3xy * r3y2;
            accumulator[7][tid] += grid_value * r3xz * r3y2;
            accumulator[8][tid] += grid_value * r3xy * r3z2;
            accumulator[9][tid] += grid_value * r3xz * r3z2;
          } break;
          case 3: {
            // coefficients 30..39: rest of l = 4, start of l = 5
            accumulator[0][tid] += grid_value * r3y2 * r3y2;
            accumulator[1][tid] += grid_value * r3y2 * r3yz;
            accumulator[2][tid] += grid_value * r3y2 * r3z2;
            accumulator[3][tid] += grid_value * r3yz * r3z2;
            accumulator[4][tid] += grid_value * r3z2 * r3z2;
            accumulator[5][tid] += grid_value * r3x2 * r3x2 * r3.x;
            accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3.y;
            accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3.z;
            accumulator[8][tid] += grid_value * r3x2 * r3.x * r3y2;
            accumulator[9][tid] += grid_value * r3x2 * r3xy * r3.z;
          } break;
          case 4: {
            // coefficients 40..49: l = 5 continued
            accumulator[0][tid] += grid_value * r3x2 * r3.x * r3z2;
            accumulator[1][tid] += grid_value * r3x2 * r3y2 * r3.y;
            accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3.z;
            accumulator[3][tid] += grid_value * r3x2 * r3.y * r3z2;
            accumulator[4][tid] += grid_value * r3x2 * r3z2 * r3.z;
            accumulator[5][tid] += grid_value * r3.x * r3y2 * r3y2;
            accumulator[6][tid] += grid_value * r3.x * r3y2 * r3yz;
            accumulator[7][tid] += grid_value * r3.x * r3y2 * r3z2;
            accumulator[8][tid] += grid_value * r3xy * r3z2 * r3.z;
            accumulator[9][tid] += grid_value * r3.x * r3z2 * r3z2;
          } break;
          case 5: {
            // coefficients 50..59: rest of l = 5, start of l = 6
            accumulator[0][tid] += grid_value * r3y2 * r3y2 * r3.y;
            accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3.z;
            accumulator[2][tid] += grid_value * r3y2 * r3.y * r3z2;
            accumulator[3][tid] += grid_value * r3y2 * r3z2 * r3.z;
            accumulator[4][tid] += grid_value * r3.y * r3z2 * r3z2;
            accumulator[5][tid] += grid_value * r3z2 * r3z2 * r3.z;
            accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3x2;
            accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3xy;
            accumulator[8][tid] += grid_value * r3x2 * r3x2 * r3xz;
            accumulator[9][tid] += grid_value * r3x2 * r3x2 * r3y2;
          } break;
          case 6: {
            // coefficients 60..69: l = 6 continued
            accumulator[0][tid] += grid_value * r3x2 * r3x2 * r3yz;
            accumulator[1][tid] += grid_value * r3x2 * r3x2 * r3z2;
            accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3xy;
            accumulator[3][tid] += grid_value * r3x2 * r3y2 * r3xz;
            accumulator[4][tid] += grid_value * r3x2 * r3xy * r3z2;
            accumulator[5][tid] += grid_value * r3x2 * r3z2 * r3xz;
            accumulator[6][tid] += grid_value * r3x2 * r3y2 * r3y2;
            accumulator[7][tid] += grid_value * r3x2 * r3y2 * r3yz;
            accumulator[8][tid] += grid_value * r3x2 * r3y2 * r3z2;
            accumulator[9][tid] += grid_value * r3x2 * r3z2 * r3yz;
          } break;
          case 7: {
            // coefficients 70..79: l = 6 continued
            accumulator[0][tid] += grid_value * r3x2 * r3z2 * r3z2;
            accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3xy;
            accumulator[2][tid] += grid_value * r3y2 * r3y2 * r3xz;
            accumulator[3][tid] += grid_value * r3y2 * r3xy * r3z2;
            accumulator[4][tid] += grid_value * r3y2 * r3z2 * r3xz;
            accumulator[5][tid] += grid_value * r3xy * r3z2 * r3z2;
            accumulator[6][tid] += grid_value * r3z2 * r3z2 * r3xz;
            accumulator[7][tid] += grid_value * r3y2 * r3y2 * r3y2;
            accumulator[8][tid] += grid_value * r3y2 * r3y2 * r3yz;
            accumulator[9][tid] += grid_value * r3y2 * r3y2 * r3z2;
          } break;
          default: {
            // General case: build x^lx y^ly z^lz from the exponent table.
            for (int ic = 0; (ic < lbatch) && ((ic + ico) < length); ic++) {
              const orbital co = coset_inv[ico + ic]; // reconstructed lookup
              T tmp = 1.0;
              for (int po = 0; po < co.l[2]; po++)
                tmp *= r3.z;
              for (int po = 0; po < co.l[1]; po++)
                tmp *= r3.y;
              for (int po = 0; po < co.l[0]; po++)
                tmp *= r3.x;
              accumulator[ic][tid] += tmp * grid_value;
            }
          } break;
          }
        }
      }
    }

    __syncthreads();

    // Tree reduction of each accumulator row over the 64 threads; the guards
    // (reconstructed) keep the strided reads inside the row.
    for (int i = 0; i < min(length - ico, lbatch); i++) {
      if (tid < 32)
        accumulator[i][tid] += accumulator[i][tid + 32];
      __syncthreads();
      if (tid < 16)
        accumulator[i][tid] += accumulator[i][tid + 16];
      __syncthreads();
      if (tid < 8)
        accumulator[i][tid] += accumulator[i][tid + 8];
      __syncthreads();
      if (tid < 4)
        accumulator[i][tid] += accumulator[i][tid + 4];
      __syncthreads();
      if (tid < 2)
        accumulator[i][tid] += accumulator[i][tid + 2];
      __syncthreads();
      if (tid == 0)
        sum[i] = accumulator[i][0] + accumulator[i][1];
    }
    __syncthreads();

    for (int i = tid; i < min(length - ico, lbatch); i += lbatch)
      dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset +
                      i + ico] = sum[i];
    __syncthreads();
  }
}
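
// Illustration (not part of the original file): the case labels above batch
// the Cartesian monomials x^a y^b z^c in coset order, lbatch (= 10) at a time.
// ncoset(l) counts all monomials of total degree <= l; a minimal sketch of the
// closed form (ncoset_demo is a hypothetical stand-in for ncoset):
__host__ __device__ constexpr int ncoset_demo(const int l) {
  return (l + 1) * (l + 2) * (l + 3) / 6; // binomial(l + 3, 3)
}
// For example ncoset_demo(2) == 10, so case 0 covers exactly l <= 2, and
// ncoset_demo(4) == 35 explains why the l = 4 monomials straddle cases 2 and 3.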
kernel_params
context_info::set_kernel_parameters(const int level,
                                    const smem_parameters &smem_params) {
  kernel_params params;
  params.smem_cab_offset = smem_params.smem_cab_offset();
  params.smem_alpha_offset = smem_params.smem_alpha_offset();
  params.first_task = 0;

  params.la_min_diff = smem_params.ldiffs().la_min_diff;
  params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
  params.la_max_diff = smem_params.ldiffs().la_max_diff;
  params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
  params.first_task = 0;
  // ... (the source repeats the four ldiffs assignments around here,
  // presumably in a branch elided from this extract)
  params.la_min_diff = smem_params.ldiffs().la_min_diff;
  params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
  params.la_max_diff = smem_params.ldiffs().la_max_diff;
  params.lb_max_diff = smem_params.ldiffs().lb_max_diff;

  // Reconstructed guard: compute_hab_coefficients() calls this with
  // level == -1, so the grid-dependent fields are only valid for level >= 0.
  if (level >= 0) {
    params.ptr_dev[1] = grid_[level].data();
    // dh_ and dh_inv_ are loop-invariant, so copy them once (the source
    // issues these memcpy calls inside the loop below).
    memcpy(params.dh_, grid_[level].dh(), 9 * sizeof(double));
    memcpy(params.dh_inv_, grid_[level].dh_inv(), 9 * sizeof(double));
    for (int i = 0; i < 3; i++) {
      params.grid_full_size_[i] = grid_[level].full_size(i);
      params.grid_local_size_[i] = grid_[level].local_size(i);
      params.grid_lower_corner_[i] = grid_[level].lower_corner(i);
      params.grid_border_width_[i] = grid_[level].border_width(i);
    }
  }
  // ...
  return params;
}
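
// Illustration (not part of the original file): dh_ and dh_inv_ are 3x3
// matrices flattened into 9 doubles, row-major, which is why the orthorhombic
// fast paths in integrate_kernel read the diagonal as dh_[0], dh_[4], dh_[8].
// A hypothetical helper making the indexing convention explicit:
__host__ __device__ inline double dh_element(const double *dh_, const int row,
                                             const int col) {
  return dh_[3 * row + col]; // e.g. dh_element(dh_, 2, 2) == dh_[8]
}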
// Function headers and launch configurations below are reconstructed from the
// class declaration and the surrounding member lists.
void context_info::integrate_one_grid_level(const int level, int *lp_diff) {
  const ldiffs_value ldiffs =
      process_get_ldiffs(/* calculate_forces, calculate_virial, compute_tau:
                            arguments elided in this extract */);

  smem_parameters smem_params(ldiffs, lmax());

  *lp_diff = smem_params.lp_diff();
  // ...
  kernel_params params = set_kernel_parameters(level, smem_params);
  // ...
  const dim3 threads_per_block(4, 4, 4);

  // One block per task on this level, launched on the level's stream.
  if (grid_[level].is_distributed()) {
    if (grid_[level].is_orthorhombic()) {
      integrate_kernel<double, double3, true, true, 10>
          <<<number_of_tasks_per_level_[level], threads_per_block, 0,
             level_streams[level]>>>(params);
    } else {
      integrate_kernel<double, double3, true, false, 10>
          <<<number_of_tasks_per_level_[level], threads_per_block, 0,
             level_streams[level]>>>(params);
    }
  } else {
    if (grid_[level].is_orthorhombic()) {
      integrate_kernel<double, double3, false, true, 10>
          <<<number_of_tasks_per_level_[level], threads_per_block, 0,
             level_streams[level]>>>(params);
    } else {
      integrate_kernel<double, double3, false, false, 10>
          <<<number_of_tasks_per_level_[level], threads_per_block, 0,
             level_streams[level]>>>(params);
    }
  }
}

void context_info::compute_hab_coefficients() {
  const ldiffs_value ldiffs =
      process_get_ldiffs(/* arguments elided in this extract */);

  smem_parameters smem_params(ldiffs, lmax());
  // ...
  kernel_params params = set_kernel_parameters(-1, smem_params);
  // ...
  const dim3 threads_per_block(4, 4, 4);

  // One of four instantiations is launched, selecting COMPUTE_TAU and
  // CALCULATE_FORCES at compile time (the selection logic is elided here).
  compute_hab_v2<double, double3, false, false>
      <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
         this->main_stream>>>(params);
  // ...
  compute_hab_v2<double, double3, false, true>
      <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
         this->main_stream>>>(params);
  // ...
  compute_hab_v2<double, double3, true, true>
      <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
         this->main_stream>>>(params);
  // ...
  compute_hab_v2<double, double3, true, false>
      <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
         this->main_stream>>>(params);
}
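
// Illustration (not part of the original file): the branching above turns
// runtime flags into compile-time template arguments, so each kernel variant
// compiles without the dead branches. A hypothetical two-flag dispatcher
// showing the same pattern:
template <bool A, bool B> __global__ void demo_kernel() { /* ... */ }

inline void launch_demo(const bool a, const bool b, hipStream_t stream) {
  if (a) {
    if (b)
      demo_kernel<true, true><<<1, 64, 0, stream>>>();
    else
      demo_kernel<true, false><<<1, 64, 0, stream>>>();
  } else {
    if (b)
      demo_kernel<false, true><<<1, 64, 0, stream>>>();
    else
      demo_kernel<false, false><<<1, 64, 0, stream>>>();
  }
}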
// Declarations of the helpers referenced above (from the surrounding headers):

__inline__ __device__ void compute_window_size(const int *const grid_size,
                                               const int border_mask,
                                               const int *border_width,
                                               int3 *const window_size,
                                               int3 *const window_shift);

// Returns difference in angular momentum range for given flags.
ldiffs_value process_get_ldiffs(bool calculate_forces, bool calculate_virial,
                                bool compute_tau);

// Number of Cartesian orbitals up to given angular momentum quantum.
__host__ __device__ __inline__ int ncoset(const int l);

// Copies a task from global to shared memory and does precomputations.
template <typename T>
__device__ __inline__ void
fill_smem_task_coef(const kernel_params &dev, const int task_id,
                    smem_task<T> &task);

// Initializes the device's constant memory.
static void init_constant_memory();

// Transforms coefficients C_xyz into C_ab.
template <typename T>
__device__ static __inline__ void
cxyz_to_cab(const smem_task<T> &task, const T *__restrict__ alpha,
            const T *__restrict__ cxyz, T *__restrict__ cab);

__constant__ orbital coset_inv[1330];

// Computes the polynomial expansion coefficients:
// (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
template <typename T>
__inline__ __device__ void compute_alpha(const smem_task<T> &task,
                                         T *__restrict__ alpha);

// Copies a task from global to shared memory.
template <typename T, typename T3>
__device__ __inline__ void
fill_smem_task_reduced(const kernel_params &dev, const int task_id,
                       smem_task_reduced<T, T3> &task);

__inline__ __device__ double3
compute_coordinates(const double *__restrict__ dh_, const double x,
                    const double y, const double z);
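
// Illustration (not part of the original file): a minimal sketch of what
// compute_coordinates is expected to evaluate for the general triclinic case,
// assuming dh_ stores the 3x3 cell matrix row-major; the orthorhombic fast
// path above (dh_[0], dh_[4], dh_[8]) is then the special case where the
// off-diagonal terms vanish.
__inline__ __device__ double3 compute_coordinates_sketch(
    const double *__restrict__ dh_, const double x, const double y,
    const double z) {
  double3 r;
  r.x = x * dh_[0] + y * dh_[3] + z * dh_[6];
  r.y = x * dh_[1] + y * dh_[4] + z * dh_[7];
  r.z = x * dh_[2] + y * dh_[5] + z * dh_[8];
  return r;
}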