#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)

#include <hip/hip_runtime.h>

#ifdef __HIP_PLATFORM_NVIDIA__
#include <cooperative_groups.h>
#if CUDA_VERSION >= 11000
#include <cooperative_groups/reduce.h>
#endif
namespace cg = cooperative_groups;
#endif

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif
template <typename T>
__device__ __inline__ T warp_reduce(T *table, const int tid) {
  // Tree reduction of a 64-entry shared-memory buffer (blocks of 64 threads).
  if (tid < 32) table[tid] += table[tid + 32];
  __syncthreads();
  if (tid < 16) table[tid] += table[tid + 16];
  __syncthreads();
  if (tid < 8) table[tid] += table[tid + 8];
  __syncthreads();
  if (tid < 4) table[tid] += table[tid + 4];
  __syncthreads();
  if (tid < 2) table[tid] += table[tid + 2];
  __syncthreads();
  return (table[0] + table[1]);
}
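/*
 * Illustrative sketch (not part of the original file): driving warp_reduce
 * from a 64-thread block.  The kernel and buffer names below are
 * hypothetical.
 *
 *   __global__ void block_sum_example(const double *in, double *out) {
 *     __shared__ double table[64];
 *     const int tid = threadIdx.x;           // launched with blockDim = 64
 *     table[tid] = in[blockIdx.x * 64 + tid];
 *     __syncthreads();
 *     const double total = warp_reduce<double>(table, tid);
 *     if (tid == 0)
 *       out[blockIdx.x] = total;             // one partial sum per block
 *   }
 */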
template <typename T, typename T3, bool COMPUTE_TAU, bool CALCULATE_FORCES>
__global__ __launch_bounds__(64) void compute_hab_v2(const kernel_params dev_) {
  // Dynamic shared memory is partitioned through offsets stored in dev_.
  extern __shared__ T shared_memory[];
  T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  const int offset = dev_.sorted_blocks_offset_dev[blockIdx.x];
  const int number_of_tasks = dev_.num_tasks_per_block_dev[blockIdx.x];

  if (number_of_tasks == 0)
    return;

  // Per-thread accumulators for forces and virial, zero-initialized here.
  T fa[3] = {0.0}, fb[3] = {0.0};
  T virial[9] = {0.0};
  // Loop over all tasks that contribute to this block's hab matrix.
  for (int tk = 0; tk < number_of_tasks; tk++) {
    __shared__ smem_task<T> task;
    const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
    if (dev_.tasks[task_id].skip_task)
      continue;
    fill_smem_task_coef(dev_, task_id, task);
    T *coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
    T *smem_cab = &dev_.ptr_dev[6][dev_.tasks[task_id].cab_offset];
    compute_alpha(task, smem_alpha);
    __syncthreads();
    cxyz_to_cab(task, smem_alpha, coef_, smem_cab);
    __syncthreads();
    for (int jco = task.first_cosetb; jco < task.ncosetb; jco++) {
      const auto &b = coset_inv[jco];
      for (int ico = task.first_coseta; ico < task.ncoseta; ico++) {
        const auto &a = coset_inv[ico];
        const T hab = get_hab<COMPUTE_TAU, T>(a, b, task.zeta, task.zetb,
                                              task.n1, smem_cab);
        __shared__ T shared_forces_a[3];
        __shared__ T shared_forces_b[3];
        __shared__ T shared_virial[9];
        if (CALCULATE_FORCES) {
          if (tid < 3) {
            shared_forces_a[tid] = get_force_a<COMPUTE_TAU, T>(
                a, b, tid, task.zeta, task.zetb, task.n1, smem_cab);
            shared_forces_b[tid] = get_force_b<COMPUTE_TAU, T>(
                a, b, tid, task.zeta, task.zetb, task.rab, task.n1, smem_cab);
          }
          if ((tid < 9) && (dev_.ptr_dev[5] != nullptr)) {
            shared_virial[tid] =
                get_virial_a<COMPUTE_TAU, T>(a, b, tid / 3, tid % 3, task.zeta,
                                             task.zetb, task.n1, smem_cab) +
                get_virial_b<COMPUTE_TAU, T>(a, b, tid / 3, tid % 3, task.zeta,
                                             task.zetb, task.rab, task.n1,
                                             smem_cab);
          }
        }
        __syncthreads();
        for (int i = tid / 8; i < task.nsgf_setb; i += 8) {
          const T sphib = task.sphib[i * task.maxcob + jco];
          for (int j = tid % 8; j < task.nsgf_seta; j += 8) {
            const T sphia_times_sphib =
                task.sphia[j * task.maxcoa + ico] * sphib;
            if (CALCULATE_FORCES) {
              T block_val1;
              if (task.block_transposed) {
                block_val1 = task.pab_block[j * task.nsgfb + i] *
                             task.off_diag_twice;
              } else {
                block_val1 = task.pab_block[i * task.nsgfa + j] *
                             task.off_diag_twice;
              }

              for (int k = 0; k < 3; k++) {
                fa[k] += block_val1 * sphia_times_sphib * shared_forces_a[k];
                fb[k] += block_val1 * sphia_times_sphib * shared_forces_b[k];
              }

              if (dev_.ptr_dev[5] != nullptr) {
                for (int k = 0; k < 3; k++) {
                  for (int l = 0; l < 3; l++) {
                    virial[3 * k + l] += shared_virial[3 * k + l] *
                                         block_val1 * sphia_times_sphib;
                  }
                }
              }
            }

            // Update the hab block, honouring its storage layout.
            if (task.block_transposed) {
              task.hab_block[j * task.nsgfb + i] += hab * sphia_times_sphib;
            } else {
              task.hab_block[i * task.nsgfa + j] += hab * sphia_times_sphib;
            }
          } // loop over j
        }   // loop over i
        __syncthreads();
      } // loop over ico
    }   // loop over jco
  }     // loop over tasks
  // Final reduction: sum the per-thread force and virial contributions and
  // add them atomically to the global arrays.
  if (CALCULATE_FORCES) {
    const int task_id = dev_.task_sorted_by_blocks_dev[offset];
    const auto &glb_task = dev_.tasks[task_id];
    const int iatom = glb_task.iatom;
    const int jatom = glb_task.jatom;
    T *forces_a = &dev_.ptr_dev[4][3 * iatom];
    T *forces_b = &dev_.ptr_dev[4][3 * jatom];

    T *sum = (T *)shared_memory;

    if (dev_.ptr_dev[5] != nullptr) {
      for (int i = 0; i < 9; i++) {
        sum[tid] = virial[i];
        __syncthreads();
        virial[i] = warp_reduce<T>(sum, tid);
        __syncthreads();
        if (tid == 0)
          atomicAdd(dev_.ptr_dev[5] + i, virial[i]);
      }
    }

    for (int i = 0; i < 3; i++) {
      sum[tid] = fa[i];
      __syncthreads();
      fa[i] = warp_reduce<T>(sum, tid);
      __syncthreads();
      if (tid == 0)
        atomicAdd(forces_a + i, fa[i]);

      sum[tid] = fb[i];
      __syncthreads();
      fb[i] = warp_reduce<T>(sum, tid);
      __syncthreads();
      if (tid == 0)
        atomicAdd(forces_b + i, fb[i]);
    }
  }
}
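/*
 * Note on the thread layout used above (illustration, not original code):
 * the 64 threads of a block cover the (i, j) plane of spherical indices as
 * an 8 x 8 tile, i.e.
 *
 *   for (int i = tid / 8; i < nsgf_setb; i += 8)
 *     for (int j = tid % 8; j < nsgf_seta; j += 8)
 *       ... // thread `tid` owns element (i, j)
 *
 * so thread 13 (row 1, column 5) visits (1, 5), (1, 13), ..., (9, 5), ...
 * as long as the indices stay below nsgf_setb and nsgf_seta.
 */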
template <typename T, typename T3, bool distributed__, bool orthorhombic_,
          int lbatch = 10>
__global__
__launch_bounds__(64) void integrate_kernel(const kernel_params dev_) {
  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  // Stage the grid matrix and its inverse in shared memory.
  __shared__ T dh_inv_[9];
  __shared__ T dh_[9];

  for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
    dh_inv_[d] = dev_.dh_inv_[d];

  for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
    dh_[d] = dev_.dh_[d];
  __shared__ smem_task_reduced<T, T3> task;
  fill_smem_task_reduced(dev_, dev_.first_task + blockIdx.x, task);
  if (tid == 0) {
    // Shift the cube onto the local portion of the grid.
    task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
    task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
    task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];
    if (task.apply_border_mask) {
      compute_window_size(dev_.grid_local_size_,
                          dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
                          dev_.grid_border_width_, &task.window_size,
                          &task.window_shift);
    }
  }
  __syncthreads();
  __shared__ T accumulator[lbatch][64];
  __shared__ T sum[lbatch];

  const short int size_loop =
      (ncoset(task.lp) / lbatch + ((ncoset(task.lp) % lbatch) != 0)) * lbatch;
  const short int length = ncoset(task.lp);
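  /*
   * For reference: ncoset(l) counts all Cartesian monomials x^a y^b z^c with
   * a + b + c <= l, i.e. ncoset(l) = (l + 1)(l + 2)(l + 3) / 6, so
   * ncoset(0) = 1, ncoset(1) = 4, ncoset(2) = 10, ncoset(3) = 20.  size_loop
   * rounds this count up to the next multiple of lbatch, so the loop below
   * always works on full batches of coefficients.
   */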
  for (int ico = 0; ico < size_loop; ico += lbatch) {
    // Reset the per-batch accumulators (one column per thread).
    for (int i = 0; i < lbatch; i++)
      accumulator[i][tid] = 0.0;
    for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
      int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];
      if (z2 < 0)
        z2 += dev_.grid_full_size_[0];

      // Skip grid slices masked out by the local window.
      if (task.apply_border_mask) {
        if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
          continue;
        }
      }
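      /*
       * Example of the wrap above (hypothetical numbers): with
       * grid_full_size_[0] = 90 and task.cube_center.z = -3, the slice z = 1
       * gives (1 - 3) % 90 = -2, which the correction maps to 88, i.e. the
       * point re-enters the periodic grid from the opposite side.
       */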
      // Restrict the y range when the sphere test can be done analytically
      // (orthorhombic cell, no border mask).
      int ymin = 0;
      int ymax = task.cube_size.y - 1;
      T kremain = 0.0;
      if (orthorhombic_ && !task.apply_border_mask) {
        ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
        ymin *= ymin;
        kremain = task.discrete_radius * task.discrete_radius -
                  ymin * dh_[8] * dh_[8];
        ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
        ymax = 1 - ymin - task.lb_cube.y;
        ymin = ymin - task.lb_cube.y;
      }
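      /*
       * Sketch of the screening above, assuming a diagonal (orthorhombic)
       * dh_: a point contributes only while
       *   (x' * dh_[0])^2 + (y' * dh_[4])^2 + (z' * dh_[8])^2 <= R^2
       * with R = task.discrete_radius and primed indices measured from the
       * cube centre.  For a fixed z-slice the leftover budget is
       * kremain = R^2 - (z' * dh_[8])^2, so the admissible y indices obey
       * |y'| <= sqrt(kremain) * dh_inv_[4]; ymin/ymax translate that window
       * back into cube coordinates via lb_cube.y.
       */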
      for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
        int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];
        if (y2 < 0)
          y2 += dev_.grid_full_size_[1];

        if (task.apply_border_mask) {
          if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
            continue;
          }
        }

        // Same analytic restriction along x.
        int xmin = 0;
        int xmax = task.cube_size.x - 1;
        if (orthorhombic_ && !task.apply_border_mask) {
          xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
          xmin *= xmin;
          xmin =
              ceil(-1.0e-8 - sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
                   dh_inv_[0]);
          xmax = 1 - xmin - task.lb_cube.x;
          xmin -= task.lb_cube.x;
        }
        for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
          int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];
          if (x2 < 0)
            x2 += dev_.grid_full_size_[2];

          if (task.apply_border_mask) {
            if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
              continue;
            }
          }

          // Cartesian offset of the grid point from the Gaussian centre.
          T3 r3;
          if (orthorhombic_) {
            r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
            r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
            r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
          } else {
            r3 = compute_coordinates(dh_,
                                     (x + task.lb_cube.x + task.roffset.x),
                                     (y + task.lb_cube.y + task.roffset.y),
                                     (z + task.lb_cube.z + task.roffset.z));
          }
          // Screen out points beyond the Gaussian's radius.
          if (((task.radius * task.radius) <=
               (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)) &&
              (!orthorhombic_ || task.apply_border_mask))
            continue;

          if (!orthorhombic_) {
            if ((task.radius * task.radius) <=
                (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z))
              continue;
          }

          // Read the grid value through the read-only cache.
          T grid_value =
              __ldg(&dev_.ptr_dev[1][(z2 * dev_.grid_local_size_[1] + y2) *
                                         dev_.grid_local_size_[2] +
                                     x2]);
          const T r3x2 = r3.x * r3.x;
          const T r3y2 = r3.y * r3.y;
          const T r3z2 = r3.z * r3.z;

          const T r3xy = r3.x * r3.y;
          const T r3xz = r3.x * r3.z;
          const T r3yz = r3.y * r3.z;

          grid_value *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
          // Accumulate the moments of the current coefficient batch.
          switch (ico / lbatch) {
          case 0: {
            accumulator[0][tid] += grid_value;

            accumulator[1][tid] += grid_value * r3.x;
            accumulator[2][tid] += grid_value * r3.y;
            accumulator[3][tid] += grid_value * r3.z;

            accumulator[4][tid] += grid_value * r3x2;
            accumulator[5][tid] += grid_value * r3xy;
            accumulator[6][tid] += grid_value * r3xz;
            accumulator[7][tid] += grid_value * r3y2;
            accumulator[8][tid] += grid_value * r3yz;
            accumulator[9][tid] += grid_value * r3z2;
          } break;
          case 1: {
            accumulator[0][tid] += grid_value * r3x2 * r3.x;
            accumulator[1][tid] += grid_value * r3x2 * r3.y;
            accumulator[2][tid] += grid_value * r3x2 * r3.z;
            accumulator[3][tid] += grid_value * r3.x * r3y2;
            accumulator[4][tid] += grid_value * r3xy * r3.z;
            accumulator[5][tid] += grid_value * r3.x * r3z2;
            accumulator[6][tid] += grid_value * r3y2 * r3.y;
            accumulator[7][tid] += grid_value * r3y2 * r3.z;
            accumulator[8][tid] += grid_value * r3.y * r3z2;
            accumulator[9][tid] += grid_value * r3z2 * r3.z;
          } break;
          case 2: {
            accumulator[0][tid] += grid_value * r3x2 * r3x2;
            accumulator[1][tid] += grid_value * r3x2 * r3xy;
            accumulator[2][tid] += grid_value * r3x2 * r3xz;
            accumulator[3][tid] += grid_value * r3x2 * r3y2;
            accumulator[4][tid] += grid_value * r3x2 * r3yz;
            accumulator[5][tid] += grid_value * r3x2 * r3z2;
            accumulator[6][tid] += grid_value * r3xy * r3y2;
            accumulator[7][tid] += grid_value * r3xz * r3y2;
            accumulator[8][tid] += grid_value * r3xy * r3z2;
            accumulator[9][tid] += grid_value * r3xz * r3z2;
          } break;
          case 3: {
            accumulator[0][tid] += grid_value * r3y2 * r3y2;
            accumulator[1][tid] += grid_value * r3y2 * r3yz;
            accumulator[2][tid] += grid_value * r3y2 * r3z2;
            accumulator[3][tid] += grid_value * r3yz * r3z2;
            accumulator[4][tid] += grid_value * r3z2 * r3z2;

            accumulator[5][tid] += grid_value * r3x2 * r3x2 * r3.x;
            accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3.y;
            accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3.z;
            accumulator[8][tid] += grid_value * r3x2 * r3.x * r3y2;
            accumulator[9][tid] += grid_value * r3x2 * r3xy * r3.z;
          } break;
          case 4: {
            accumulator[0][tid] += grid_value * r3x2 * r3.x * r3z2;
            accumulator[1][tid] += grid_value * r3x2 * r3y2 * r3.y;
            accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3.z;
            accumulator[3][tid] += grid_value * r3x2 * r3.y * r3z2;
            accumulator[4][tid] += grid_value * r3x2 * r3z2 * r3.z;
            accumulator[5][tid] += grid_value * r3.x * r3y2 * r3y2;
            accumulator[6][tid] += grid_value * r3.x * r3y2 * r3yz;
            accumulator[7][tid] += grid_value * r3.x * r3y2 * r3z2;
            accumulator[8][tid] += grid_value * r3xy * r3z2 * r3.z;
            accumulator[9][tid] += grid_value * r3.x * r3z2 * r3z2;
          } break;
          case 5: {
            accumulator[0][tid] += grid_value * r3y2 * r3y2 * r3.y;
            accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3.z;
            accumulator[2][tid] += grid_value * r3y2 * r3.y * r3z2;
            accumulator[3][tid] += grid_value * r3y2 * r3z2 * r3.z;
            accumulator[4][tid] += grid_value * r3.y * r3z2 * r3z2;
            accumulator[5][tid] += grid_value * r3z2 * r3z2 * r3.z;

            accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3x2;
            accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3xy;
            accumulator[8][tid] += grid_value * r3x2 * r3x2 * r3xz;
            accumulator[9][tid] += grid_value * r3x2 * r3x2 * r3y2;
          } break;
          case 6: {
            accumulator[0][tid] += grid_value * r3x2 * r3x2 * r3yz;
            accumulator[1][tid] += grid_value * r3x2 * r3x2 * r3z2;
            accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3xy;
            accumulator[3][tid] += grid_value * r3x2 * r3y2 * r3xz;
            accumulator[4][tid] += grid_value * r3x2 * r3xy * r3z2;
            accumulator[5][tid] += grid_value * r3x2 * r3z2 * r3xz;
            accumulator[6][tid] += grid_value * r3x2 * r3y2 * r3y2;
            accumulator[7][tid] += grid_value * r3x2 * r3y2 * r3yz;
            accumulator[8][tid] += grid_value * r3x2 * r3y2 * r3z2;
            accumulator[9][tid] += grid_value * r3x2 * r3z2 * r3yz;
          } break;
          case 7: {
            accumulator[0][tid] += grid_value * r3x2 * r3z2 * r3z2;
            accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3xy;
            accumulator[2][tid] += grid_value * r3y2 * r3y2 * r3xz;
            accumulator[3][tid] += grid_value * r3y2 * r3xy * r3z2;
            accumulator[4][tid] += grid_value * r3y2 * r3z2 * r3xz;
            accumulator[5][tid] += grid_value * r3xy * r3z2 * r3z2;
            accumulator[6][tid] += grid_value * r3z2 * r3z2 * r3xz;
            accumulator[7][tid] += grid_value * r3y2 * r3y2 * r3y2;
            accumulator[8][tid] += grid_value * r3y2 * r3y2 * r3yz;
            accumulator[9][tid] += grid_value * r3y2 * r3y2 * r3z2;
          } break;
          default: {
            // Generic branch: build x^lx * y^ly * z^lz from the squares and
            // patch in one extra factor for odd exponents.
            for (int ic = 0; (ic < lbatch) && ((ic + ico) < length); ic++) {
              const auto &co = coset_inv[ic + ico];
              T tmp = 1.0;
              for (int po = 0; po < (co.l[2] >> 1); po++)
                tmp *= r3z2;
              if (co.l[2] & 0x1)
                tmp *= r3.z;
              for (int po = 0; po < (co.l[1] >> 1); po++)
                tmp *= r3y2;
              if (co.l[1] & 0x1)
                tmp *= r3.y;
              for (int po = 0; po < (co.l[0] >> 1); po++)
                tmp *= r3x2;
              if (co.l[0] & 0x1)
                tmp *= r3.x;
              accumulator[ic][tid] += tmp * grid_value;
            }
          } break;
          }
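          /*
           * Worked example for the generic branch (assuming the odd-power
           * fix-ups sketched above): for exponents co.l = (2, 1, 0), i.e.
           * the monomial x^2 * y, the loops yield tmp = r3x2 * r3.y, so the
           * accumulator gathers grid_value * x^2 * y, consistent with the
           * explicit low-order cases of the switch.
           */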
        } // loop over x
      }   // loop over y
    }     // loop over z
    __syncthreads();

    // Reduce the 64 partial sums of each batch entry (same scheme as
    // warp_reduce above) and store the result in sum[].
    for (int i = 0; i < min(length - ico, lbatch); i++) {
      if (tid < 32)
        accumulator[i][tid] += accumulator[i][tid + 32];
      __syncthreads();
      if (tid < 16)
        accumulator[i][tid] += accumulator[i][tid + 16];
      __syncthreads();
      if (tid < 8)
        accumulator[i][tid] += accumulator[i][tid + 8];
      __syncthreads();
      if (tid < 4)
        accumulator[i][tid] += accumulator[i][tid + 4];
      __syncthreads();
      if (tid < 2)
        accumulator[i][tid] += accumulator[i][tid + 2];
      __syncthreads();
      if (tid == 0)
        sum[i] = accumulator[i][0] + accumulator[i][1];
    }
    __syncthreads();

    // Only the first lbatch threads write this batch of coefficients out.
    for (int i = tid; i < min(length - ico, lbatch); i += lbatch)
      dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset +
                      i + ico] = sum[i];
    __syncthreads();
  } // loop over ico
}
kernel_params
context_info::set_kernel_parameters(const int level,
                                    const smem_parameters &smem_params) {
  kernel_params params;
  params.smem_cab_offset = smem_params.smem_cab_offset();
  params.smem_alpha_offset = smem_params.smem_alpha_offset();
  params.first_task = 0;

  params.la_min_diff = smem_params.ldiffs().la_min_diff;
  params.lb_min_diff = smem_params.ldiffs().lb_min_diff;

  params.la_max_diff = smem_params.ldiffs().la_max_diff;
  params.lb_max_diff = smem_params.ldiffs().lb_max_diff;

  params.first_task = 0;

  params.la_min_diff = smem_params.ldiffs().la_min_diff;
  params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
  params.la_max_diff = smem_params.ldiffs().la_max_diff;
  params.lb_max_diff = smem_params.ldiffs().lb_max_diff;

  // Grid-specific fields; level is -1 when the parameters are built for the
  // hab pass, which does not touch the grid itself.
  if (level >= 0) {
    params.ptr_dev[1] = grid_[level].data();
    for (int i = 0; i < 3; i++) {
      memcpy(params.dh_, grid_[level].dh(), 9 * sizeof(double));
      memcpy(params.dh_inv_, grid_[level].dh_inv(), 9 * sizeof(double));
      params.grid_full_size_[i] = grid_[level].full_size(i);
      params.grid_local_size_[i] = grid_[level].local_size(i);
      params.grid_lower_corner_[i] = grid_[level].lower_corner(i);
      params.grid_border_width_[i] = grid_[level].border_width(i);
    }
  }
  return params;
}
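/*
 * Layout note: dh_ and dh_inv_ are flattened 3x3 matrices, so their diagonal
 * entries sit at indices 0, 4 and 8.  This is why the orthorhombic fast path
 * in integrate_kernel only touches dh_[0], dh_[4], dh_[8] (grid spacings)
 * and dh_inv_[0], dh_inv_[4] (inverse spacings along x and y).
 */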
void context_info::integrate_one_grid_level(const int level, int *lp_diff) {
  const ldiffs_value ldiffs =
      process_get_ldiffs(/* forces / virial / tau flags omitted here */);

  smem_parameters smem_params(ldiffs, lmax());

  *lp_diff = smem_params.lp_diff();

  kernel_params params = set_kernel_parameters(level, smem_params);

  // One 4 x 4 x 4 = 64 thread block per task.
  const dim3 threads_per_block(4, 4, 4);

  if (grid_[level].is_distributed()) {
    if (grid_[level].is_orthorhombic()) {
      integrate_kernel<double, double3, true, true, 10>
          <<</* launch configuration omitted here */>>>(params);
    } else {
      integrate_kernel<double, double3, true, false, 10>
          <<</* launch configuration omitted here */>>>(params);
    }
  } else {
    if (grid_[level].is_orthorhombic()) {
      integrate_kernel<double, double3, false, true, 10>
          <<</* launch configuration omitted here */>>>(params);
    } else {
      integrate_kernel<double, double3, false, false, 10>
          <<</* launch configuration omitted here */>>>(params);
    }
  }
}
void context_info::compute_hab_coefficients() {
  const ldiffs_value ldiffs =
      process_get_ldiffs(/* forces / virial / tau flags omitted here */);

  smem_parameters smem_params(ldiffs, lmax());

  kernel_params params = set_kernel_parameters(-1, smem_params);

  const dim3 threads_per_block(4, 4, 4);

  // One instantiation per (COMPUTE_TAU, CALCULATE_FORCES) combination; the
  // selection between them, based on the corresponding flags, is omitted
  // here.
  compute_hab_v2<double, double3, false, false>
      <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
         /* stream omitted here */>>>(params);

  compute_hab_v2<double, double3, false, true>
      <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
         /* stream omitted here */>>>(params);

  compute_hab_v2<double, double3, true, true>
      <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
         /* stream omitted here */>>>(params);

  compute_hab_v2<double, double3, true, false>
      <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
         /* stream omitted here */>>>(params);
}
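/*
 * Illustrative sketch (hypothetical stream name): how one of the launches
 * above looks once a stream is supplied, together with a basic launch-error
 * check.
 *
 *   compute_hab_v2<double, double3, false, true>
 *       <<<nblocks, threads_per_block, smem_params.smem_per_block(),
 *          stream>>>(params);
 *   if (hipGetLastError() != hipSuccess) {
 *     // handle or report the failed kernel launch
 *   }
 */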
/*
 * Helpers referenced above (declared elsewhere in the grid HIP backend):
 *   compute_window_size(grid_size, border_mask, border_width, window_size,
 *                       window_shift) -- local window a masked task may touch
 *   process_get_ldiffs(calculate_forces, calculate_virial, compute_tau)
 *       -- difference in angular momentum range for the given flags
 *   fill_smem_task_coef(dev, task_id, task) -- copy a task from global to
 *       shared memory and do precomputations
 *   fill_smem_task_reduced(dev, task_id, task) -- copy a task from global to
 *       shared memory
 *   init_constant_memory() -- initialize the device's constant memory
 *   compute_alpha(task, alpha) -- polynomial expansion coefficients:
 *       (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls, ...)
 *   cxyz_to_cab(task, alpha, cxyz, cab) -- transform coefficients C_xyz into C_ab
 *   compute_coordinates(dh, x, y, z) -- grid indices to Cartesian coordinates
 *   coset_inv[1330] -- constant-memory map from a linear coset index to its orbital
 */