db/d67/grid__gpu__integrate_8cu_source.html

 /*----------------------------------------------------------------------------*/

 /*  CP2K: A general program to perform molecular dynamics simulations         */

 /*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */

 /*                                                                            */

 /*  SPDX-License-Identifier: BSD-3-Clause                                     */

 /*----------------------------------------------------------------------------*/


 #include "../../offload/offload_runtime.h"

 #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)


 #include <algorithm>

 #include <assert.h>

 #include <limits.h>

 #include <math.h>

 #include <stdio.h>

 #include <stdlib.h>

 #include <string.h>


 #define GRID_DO_COLLOCATE 0

 #include "../common/grid_common.h"

 #include "grid_gpu_collint.h"

 #include "grid_gpu_integrate.h"


 // This has to be included after grid_gpu_collint.h

 #include "../common/grid_process_vab.h"


 #if defined(_OMP_H)

 #error "OpenMP should not be used in .cu files to accommodate HIP."

 #endif


 // Ten registers are sufficient to integrate lp <= 2 with a single grid sweep.

 #define GRID_N_CXYZ_REGISTERS 10


 /*******************************************************************************
  * \brief Adds value to store->regs[index] for index = 0..9.
  *        The slot is picked by comparing index against constant loop counters
  *        instead of indexing with it directly. Dynamic indexing would put the
  *        array into local memory, which is slower.
  *        Any other value of index leaves the store untouched.
  * https://developer.nvidia.com/blog/fast-dynamic-indexing-private-arrays-cuda
  * \author Ole Schuett
  ******************************************************************************/
 __device__ static inline void
 add_to_register(const double value, const int index, cxyz_store *store) {
 #pragma unroll // after unrolling every regs[slot] has a constant index
   for (int slot = 0; slot < 10; slot++) {
     if (slot == index) {
       store->regs[slot] += value;
     }
   }
 }


 /*******************************************************************************
  * \brief Integrate a single grid point with distance d{xyz} from center.
  *        The grid value is weighted with exp(-zetp * r^2) and multiplied by
  *        the monomials dx^i * dy^j * dz^k that belong to the ten coset entries
  *        starting at store->offset. The products are accumulated into
  *        store->regs[0..9].
  * \author Ole Schuett
  ******************************************************************************/
 __device__ static void gridpoint_to_cxyz(const double dx, const double dy,
                                          const double dz, const double zetp,
                                          const int lp, const double *gridpoint,
                                          cxyz_store *store) {

   // Gaussian weight at the squared distance of the point from the center.
   const double r2 = dx * dx + dy * dy + dz * dz;
   const double gaussian = exp(-zetp * r2);

   // Loading through read-only cache reduces register usage for some reason.
   const double prefactor = __ldg(gridpoint) * gaussian;

   // Short-hand for the accumulators.
   double *const regs = store->regs;

   // Manually unrolled loops based on terms in coset_inv.
   switch (store->offset) {

   case 0: // degrees 0, 1 and 2
     regs[0] += prefactor;
     if (lp >= 1) {
       regs[1] += prefactor * dx;
       regs[2] += prefactor * dy;
       regs[3] += prefactor * dz;
       if (lp >= 2) {
         regs[4] += prefactor * dx * dx;
         regs[5] += prefactor * dx * dy;
         regs[6] += prefactor * dx * dz;
         regs[7] += prefactor * dy * dy;
         regs[8] += prefactor * dy * dz;
         regs[9] += prefactor * dz * dz;
       }
     }
     break;

   case 10: // all ten terms of degree 3
     regs[0] += prefactor * dx * dx * dx;
     regs[1] += prefactor * dx * dx * dy;
     regs[2] += prefactor * dx * dx * dz;
     regs[3] += prefactor * dx * dy * dy;
     regs[4] += prefactor * dx * dy * dz;
     regs[5] += prefactor * dx * dz * dz;
     regs[6] += prefactor * dy * dy * dy;
     regs[7] += prefactor * dy * dy * dz;
     regs[8] += prefactor * dy * dz * dz;
     regs[9] += prefactor * dz * dz * dz;
     break;

   case 20: // degree 4, terms containing dx
     regs[0] += prefactor * dx * dx * dx * dx;
     regs[1] += prefactor * dx * dx * dx * dy;
     regs[2] += prefactor * dx * dx * dx * dz;
     regs[3] += prefactor * dx * dx * dy * dy;
     regs[4] += prefactor * dx * dx * dy * dz;
     regs[5] += prefactor * dx * dx * dz * dz;
     regs[6] += prefactor * dx * dy * dy * dy;
     regs[7] += prefactor * dx * dy * dy * dz;
     regs[8] += prefactor * dx * dy * dz * dz;
     regs[9] += prefactor * dx * dz * dz * dz;
     break;

   case 30: // rest of degree 4, then start of degree 5
     regs[0] += prefactor * dy * dy * dy * dy;
     regs[1] += prefactor * dy * dy * dy * dz;
     regs[2] += prefactor * dy * dy * dz * dz;
     regs[3] += prefactor * dy * dz * dz * dz;
     regs[4] += prefactor * dz * dz * dz * dz;
     if (lp >= 5) {
       regs[5] += prefactor * dx * dx * dx * dx * dx;
       regs[6] += prefactor * dx * dx * dx * dx * dy;
       regs[7] += prefactor * dx * dx * dx * dx * dz;
       regs[8] += prefactor * dx * dx * dx * dy * dy;
       regs[9] += prefactor * dx * dx * dx * dy * dz;
     }
     break;

   case 40: // degree 5 continued
     regs[0] += prefactor * dx * dx * dx * dz * dz;
     regs[1] += prefactor * dx * dx * dy * dy * dy;
     regs[2] += prefactor * dx * dx * dy * dy * dz;
     regs[3] += prefactor * dx * dx * dy * dz * dz;
     regs[4] += prefactor * dx * dx * dz * dz * dz;
     regs[5] += prefactor * dx * dy * dy * dy * dy;
     regs[6] += prefactor * dx * dy * dy * dy * dz;
     regs[7] += prefactor * dx * dy * dy * dz * dz;
     regs[8] += prefactor * dx * dy * dz * dz * dz;
     regs[9] += prefactor * dx * dz * dz * dz * dz;
     break;

   case 50: // rest of degree 5, then start of degree 6
     regs[0] += prefactor * dy * dy * dy * dy * dy;
     regs[1] += prefactor * dy * dy * dy * dy * dz;
     regs[2] += prefactor * dy * dy * dy * dz * dz;
     regs[3] += prefactor * dy * dy * dz * dz * dz;
     regs[4] += prefactor * dy * dz * dz * dz * dz;
     regs[5] += prefactor * dz * dz * dz * dz * dz;
     if (lp >= 6) {
       regs[6] += prefactor * dx * dx * dx * dx * dx * dx;
       regs[7] += prefactor * dx * dx * dx * dx * dx * dy;
       regs[8] += prefactor * dx * dx * dx * dx * dx * dz;
       regs[9] += prefactor * dx * dx * dx * dx * dy * dy;
     }
     break;

   case 60: // degree 6 continued
     regs[0] += prefactor * dx * dx * dx * dx * dy * dz;
     regs[1] += prefactor * dx * dx * dx * dx * dz * dz;
     regs[2] += prefactor * dx * dx * dx * dy * dy * dy;
     regs[3] += prefactor * dx * dx * dx * dy * dy * dz;
     regs[4] += prefactor * dx * dx * dx * dy * dz * dz;
     regs[5] += prefactor * dx * dx * dx * dz * dz * dz;
     regs[6] += prefactor * dx * dx * dy * dy * dy * dy;
     regs[7] += prefactor * dx * dx * dy * dy * dy * dz;
     regs[8] += prefactor * dx * dx * dy * dy * dz * dz;
     regs[9] += prefactor * dx * dx * dy * dz * dz * dz;
     break;

   case 70: // degree 6 continued
     regs[0] += prefactor * dx * dx * dz * dz * dz * dz;
     regs[1] += prefactor * dx * dy * dy * dy * dy * dy;
     regs[2] += prefactor * dx * dy * dy * dy * dy * dz;
     regs[3] += prefactor * dx * dy * dy * dy * dz * dz;
     regs[4] += prefactor * dx * dy * dy * dz * dz * dz;
     regs[5] += prefactor * dx * dy * dz * dz * dz * dz;
     regs[6] += prefactor * dx * dz * dz * dz * dz * dz;
     regs[7] += prefactor * dy * dy * dy * dy * dy * dy;
     regs[8] += prefactor * dy * dy * dy * dy * dy * dz;
     regs[9] += prefactor * dy * dy * dy * dy * dz * dz;
     break;

   case 80: // rest of degree 6, then start of degree 7
     regs[0] += prefactor * dy * dy * dy * dz * dz * dz;
     regs[1] += prefactor * dy * dy * dz * dz * dz * dz;
     regs[2] += prefactor * dy * dz * dz * dz * dz * dz;
     regs[3] += prefactor * dz * dz * dz * dz * dz * dz;
     if (lp >= 7) {
       regs[4] += prefactor * dx * dx * dx * dx * dx * dx * dx;
       regs[5] += prefactor * dx * dx * dx * dx * dx * dx * dy;
       regs[6] += prefactor * dx * dx * dx * dx * dx * dx * dz;
       regs[7] += prefactor * dx * dx * dx * dx * dx * dy * dy;
       regs[8] += prefactor * dx * dx * dx * dx * dx * dy * dz;
       regs[9] += prefactor * dx * dx * dx * dx * dx * dz * dz;
     }
     break;

   default: {
     // Handle higher offsets, ie. values of lp.
     // The exponents are looked up in coset_inv and the monomial is built
     // by repeated multiplication.
     for (int ireg = 0; ireg < GRID_N_CXYZ_REGISTERS; ireg++) {
       const orbital term = coset_inv[ireg + store->offset];
       double val = prefactor;
       for (int n = 0; n < term.l[0]; n++) {
         val *= dx;
       }
       for (int n = 0; n < term.l[1]; n++) {
         val *= dy;
       }
       for (int n = 0; n < term.l[2]; n++) {
         val *= dz;
       }
       add_to_register(val, ireg, store);
     }
     break;
   }
   }
 }


 /*******************************************************************************
  * \brief Integrates the grid into coefficients C_xyz.
  *        The ncoset(task->lp) coefficients are processed in batches of
  *        GRID_N_CXYZ_REGISTERS. Each batch is accumulated in a thread-private
  *        array and afterwards added atomically to cxyz.
  * \author Ole Schuett
  ******************************************************************************/
 __device__ static void grid_to_cxyz(const kernel_params *params,
                                     const smem_task *task, const double *grid,
                                     double *cxyz) {

   // Atomic adds on shared memory are pretty slow. Hence, the coefficients are
   // accumulated in registers while looping over the grid points.
   // For larger values of lp we need to do multiple sweeps over the grid.
   // Due to the higher register usage and the multiple sweeps,
   // the integrate kernel runs about 70% slower than the collocate kernel.
   const int n_cxyz = ncoset(task->lp);

   for (int first = 0; first < n_cxyz; first += GRID_N_CXYZ_REGISTERS) {

     // Fresh accumulators for the coefficients [first, first + 10).
     double cxyz_regs[GRID_N_CXYZ_REGISTERS] = {0.0};
     cxyz_store store = {.regs = cxyz_regs, .offset = first};

     // One sweep over the grid, using the kernel variant chosen by the task.
     if (!task->use_orthorhombic_kernel) {
       general_cxyz_to_grid(params, task, &store, grid);
     } else {
       ortho_cxyz_to_grid(params, task, &store, grid);
     }

     // Add register values to coefficients stored in shared memory.
     // The last batch may be only partially used, hence the bounds check.
 #pragma unroll // avoid dynamic indexing of registers
     for (int ireg = 0; ireg < GRID_N_CXYZ_REGISTERS; ireg++) {
       const int icxyz = ireg + first;
       if (icxyz < n_cxyz) {
         atomicAddDouble(&cxyz[icxyz], cxyz_regs[ireg]);
       }
     }
   }

   __syncthreads(); // because of concurrent writes to cxyz
 }


 /*******************************************************************************
  * \brief Contracts the subblock, going from cartesian harmonics to spherical.
  *        Threads are spread over the rows (x) and columns (y) of the block
  *        and, along z, over the cartesian index of b. The partial sums are
  *        added atomically to task->hab_block.
  * \author Ole Schuett
  ******************************************************************************/
 template <bool COMPUTE_TAU>
 __device__ static void store_hab(const smem_task *task, const cab_store *cab) {

   // The spherical index runs over angular momentum and then over contractions.
   // The cartesian index runs over exponents and then over angular momentum.

   // Ranges of the cartesian indices; they are the same for all block elements.
   const int ico_first = ncoset(task->la_min_basis - 1);
   const int ico_last = ncoset(task->la_max_basis);
   const int jco_first = ncoset(task->lb_min_basis - 1) + threadIdx.z;
   const int jco_last = ncoset(task->lb_max_basis);

   // This is a double matrix product. Since the block can be quite large the
   // two products are fused to conserve shared memory.
   for (int i = threadIdx.x; i < task->nsgf_setb; i += blockDim.x) {
     for (int j = threadIdx.y; j < task->nsgf_seta; j += blockDim.y) {

       // Contribution of this thread to element (i, j).
       double partial_sum = 0.0;
       for (int jco = jco_first; jco < jco_last; jco += blockDim.z) {
         const orbital b = coset_inv[jco];
         const double sphib = task->sphib[i * task->maxcob + jco];
         for (int ico = ico_first; ico < ico_last; ico++) {
           const orbital a = coset_inv[ico];
           const double sphia = task->sphia[j * task->maxcoa + ico];
           partial_sum +=
               get_hab(a, b, task->zeta, task->zetb, cab, COMPUTE_TAU) * sphia *
               sphib;
         }
       }

       // The threads along z hit the same element, hence the atomic add.
       const int idx = task->block_transposed ? (j * task->nsgfb + i)
                                              : (i * task->nsgfa + j);
       atomicAddDouble(&task->hab_block[idx], partial_sum);
     }
   }

   __syncthreads(); // Not needed, but coalesced threads are nice.
 }


 /*******************************************************************************
  * \brief Adds contributions from cab to forces and virial.
  *        Every element of the pab block is expanded with sphia and sphib
  *        into weights for the cartesian pairs (a, b). The weighted force and
  *        virial terms are added atomically to task->forces_a,
  *        task->forces_b and, if present, params->virial.
  * \author Ole Schuett
  ******************************************************************************/
 template <bool COMPUTE_TAU>
 __device__ static void store_forces_and_virial(const kernel_params *params,
                                                const smem_task *task,
                                                const cab_store *cab) {

   // Ranges of the cartesian indices; they are the same for all block elements.
   const int ico_first = ncoset(task->la_min_basis - 1);
   const int ico_last = ncoset(task->la_max_basis);
   const int jco_first = ncoset(task->lb_min_basis - 1) + threadIdx.z;
   const int jco_last = ncoset(task->lb_max_basis);

   for (int i = threadIdx.x; i < task->nsgf_setb; i += blockDim.x) {
     for (int j = threadIdx.y; j < task->nsgf_seta; j += blockDim.y) {

       // Density matrix element, scaled by the off-diagonal factor.
       const int idx = task->block_transposed ? (j * task->nsgfb + i)
                                              : (i * task->nsgfa + j);
       const double block_val = task->pab_block[idx] * task->off_diag_twice;

       for (int jco = jco_first; jco < jco_last; jco += blockDim.z) {
         const orbital b = coset_inv[jco];
         const double sphib = task->sphib[i * task->maxcob + jco];

         for (int ico = ico_first; ico < ico_last; ico++) {
           const orbital a = coset_inv[ico];
           const double sphia = task->sphia[j * task->maxcoa + ico];
           const double pabval = block_val * sphia * sphib;

           // Forces on both atoms, one cartesian direction at a time.
           for (int k = 0; k < 3; k++) {
             const double force_a =
                 get_force_a(a, b, k, task->zeta, task->zetb, cab, COMPUTE_TAU);
             atomicAddDouble(&task->forces_a[k], force_a * pabval);
             const double force_b = get_force_b(a, b, k, task->zeta, task->zetb,
                                                task->rab, cab, COMPUTE_TAU);
             atomicAddDouble(&task->forces_b[k], force_b * pabval);
           }

           // The virial is optional; it is skipped without an output buffer.
           if (params->virial == NULL) {
             continue;
           }
           for (int k = 0; k < 3; k++) {
             for (int l = 0; l < 3; l++) {
               const double virial_a = get_virial_a(
                   a, b, k, l, task->zeta, task->zetb, cab, COMPUTE_TAU);
               const double virial_b =
                   get_virial_b(a, b, k, l, task->zeta, task->zetb, task->rab,
                                cab, COMPUTE_TAU);
               const double virial = pabval * (virial_a + virial_b);
               atomicAddDouble(&params->virial[k * 3 + l], virial);
             }
           }
         }
       }
     }
   }

   __syncthreads(); // Not needed, but coalesced threads are nice.
 }


 /*******************************************************************************
  * \brief Initializes the cxyz matrix with zeros.
  *        Only threads with threadIdx.y == 0 and threadIdx.z == 0 write;
  *        they split the ncoset(task->lp) entries along x.
  * \author Ole Schuett
  ******************************************************************************/
 __device__ static void zero_cxyz(const smem_task *task, double *cxyz) {
   const bool is_writer = (threadIdx.y == 0 && threadIdx.z == 0);
   if (is_writer) {
     const int n_cxyz = ncoset(task->lp);
     for (int idx = threadIdx.x; idx < n_cxyz; idx += blockDim.x) {
       cxyz[idx] = 0.0;
     }
   }
   __syncthreads(); // because of concurrent writes to cxyz
 }


 /*******************************************************************************
  * \brief Cuda kernel for integrating all tasks of one grid level.
  *        Steps: load the task, integrate the grid into C_xyz, transform to
  *        C_ab, then store hab and, if CALCULATE_FORCES, forces and virial.
  *        The dynamic shared memory is split into the regions Cab, alpha and
  *        Cxyz according to the offsets given in params.
  * \author Ole Schuett
  ******************************************************************************/
 template <bool COMPUTE_TAU, bool CALCULATE_FORCES>
 __device__ static void integrate_kernel(const kernel_params *params) {

   // Copy task from global to shared memory and precompute some stuff.
   __shared__ smem_task task;
   load_task(params, &task);

   // Check if radius is below the resolution of the grid.
   if (2.0 * task.radius < task.dh_max) {
     return; // nothing to do
   }

   // Allot dynamic shared memory.
   extern __shared__ double shared_memory[];
   double *smem_cab = &shared_memory[params->smem_cab_offset];
   double *smem_alpha = &shared_memory[params->smem_alpha_offset];
   double *smem_cxyz = &shared_memory[params->smem_cxyz_offset];

   // Allocate Cab from global memory if it does not fit into shared memory.
   const bool cab_exceeds_smem = (params->smem_cab_length < task.n1 * task.n2);
   cab_store cab = {.data = NULL, .n1 = task.n1};
   if (cab_exceeds_smem) {
     cab.data = malloc_cab(&task);
   } else {
     cab.data = smem_cab;
   }

   // Prepare Cab and the expansion coefficients alpha.
   zero_cab(&cab, task.n1 * task.n2);
   compute_alpha(&task, smem_alpha);

   // Grid -> C_xyz -> C_ab.
   zero_cxyz(&task, smem_cxyz);
   grid_to_cxyz(params, &task, params->grid, smem_cxyz);
   cab_to_cxyz(&task, smem_alpha, &cab, smem_cxyz);

   // Write the results.
   store_hab<COMPUTE_TAU>(&task, &cab);
   if (CALCULATE_FORCES) {
     store_forces_and_virial<COMPUTE_TAU>(params, &task, &cab);
   }

   // Release Cab again if it was allocated above.
   if (cab_exceeds_smem) {
     free_cab(cab.data);
   }
 }


 /*******************************************************************************
  * \brief Specialized Cuda kernel for compute_tau=false & calculate_forces=false
  *        Thin entry point that forwards to the integrate_kernel template.
  * \author Ole Schuett
  ******************************************************************************/
 __global__ static void grid_integrate_density(const kernel_params params) {
   const bool compute_tau = false;
   const bool calculate_forces = false;
   integrate_kernel<compute_tau, calculate_forces>(&params);
 }


 /*******************************************************************************
  * \brief Specialized Cuda kernel for compute_tau=true & calculate_forces=false.
  *        Thin entry point that forwards to the integrate_kernel template.
  * \author Ole Schuett
  ******************************************************************************/
 __global__ static void grid_integrate_tau(const kernel_params params) {
   const bool compute_tau = true;
   const bool calculate_forces = false;
   integrate_kernel<compute_tau, calculate_forces>(&params);
 }


 /*******************************************************************************
  * \brief Specialized Cuda kernel for compute_tau=false & calculate_forces=true.
  *        Thin entry point that forwards to the integrate_kernel template.
  * \author Ole Schuett
  ******************************************************************************/
 __global__ static void
 grid_integrate_density_forces(const kernel_params params) {
   const bool compute_tau = false;
   const bool calculate_forces = true;
   integrate_kernel<compute_tau, calculate_forces>(&params);
 }


 /*******************************************************************************
  * \brief Specialized Cuda kernel for compute_tau=true & calculate_forces=true.
  *        Thin entry point that forwards to the integrate_kernel template.
  * \author Ole Schuett
  ******************************************************************************/
 __global__ static void grid_integrate_tau_forces(const kernel_params params) {
   const bool compute_tau = true;
   const bool calculate_forces = true;
   integrate_kernel<compute_tau, calculate_forces>(&params);
 }


 /*******************************************************************************
  * \brief Launches the Cuda kernel that integrates all tasks of one grid level.
  *        One thread block of 4x4x4 threads is launched per task in
  *        [first_task, last_task]. Forces are calculated iff forces_dev is not
  *        NULL, the virial iff virial_dev is not NULL. The sum of the maximum
  *        angular momentum differences is returned via lp_diff, even when
  *        there are no tasks.
  * \author Ole Schuett
  ******************************************************************************/
 void grid_gpu_integrate_one_grid_level(
     const grid_gpu_task_list *task_list, const int first_task,
     const int last_task, const bool compute_tau, const grid_gpu_layout *layout,
     const offloadStream_t stream, const double *pab_blocks_dev,
     const double *grid_dev, double *hab_blocks_dev, double *forces_dev,
     double *virial_dev, int *lp_diff) {

   // The presence of the output buffers selects what gets calculated.
   const bool calculate_forces = (forces_dev != NULL);
   const bool calculate_virial = (virial_dev != NULL);
   assert(!calculate_virial || calculate_forces);

   // Compute max angular momentum.
   const process_ldiffs ldiffs =
       process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);
   *lp_diff = ldiffs.la_max_diff + ldiffs.lb_max_diff; // for reporting stats
   const int la_max = task_list->lmax + ldiffs.la_max_diff;
   const int lb_max = task_list->lmax + ldiffs.lb_max_diff;
   const int lp_max = la_max + lb_max;

   const int ntasks = last_task - first_task + 1;
   if (ntasks == 0) {
     return; // Nothing to do and lp_diff already set.
   }

   init_constant_memory();

   // Small Cab blocks are stored in shared mem, larger ones in global memory.
   const int CAB_SMEM_LIMIT = ncoset(5) * ncoset(5); // = 56 * 56 = 3136

   // Compute required shared memory, counted in doubles per region.
   const int n_alpha = 3 * (lb_max + 1) * (la_max + 1) * (lp_max + 1);
   const int n_cxyz = ncoset(lp_max);
   const int n_cab = imin(CAB_SMEM_LIMIT, ncoset(lb_max) * ncoset(la_max));
   const size_t smem_per_block = (n_alpha + n_cxyz + n_cab) * sizeof(double);

   // kernel parameters
   kernel_params params;

   // Layout of the dynamic shared memory: Cab, then alpha, then Cxyz.
   params.smem_cab_length = n_cab;
   params.smem_cab_offset = 0;
   params.smem_alpha_offset = params.smem_cab_offset + n_cab;
   params.smem_cxyz_offset = params.smem_alpha_offset + n_alpha;

   // Tasks and device buffers.
   params.first_task = first_task;
   params.grid = grid_dev;
   params.tasks = task_list->tasks_dev;
   params.pab_blocks = pab_blocks_dev;
   params.hab_blocks = hab_blocks_dev;
   params.forces = forces_dev;
   params.virial = virial_dev;

   // Angular momentum differences.
   params.la_min_diff = ldiffs.la_min_diff;
   params.lb_min_diff = ldiffs.lb_min_diff;
   params.la_max_diff = ldiffs.la_max_diff;
   params.lb_max_diff = ldiffs.lb_max_diff;

   // Grid layout.
   memcpy(params.dh, layout->dh, 9 * sizeof(double));
   memcpy(params.dh_inv, layout->dh_inv, 9 * sizeof(double));
   memcpy(params.npts_global, layout->npts_global, 3 * sizeof(int));
   memcpy(params.npts_local, layout->npts_local, 3 * sizeof(int));
   memcpy(params.shift_local, layout->shift_local, 3 * sizeof(int));

   // Launch !
   const int nblocks = ntasks;
   const dim3 threads_per_block(4, 4, 4);

   // Pick the kernel specialization that matches the two flags.
   if (calculate_forces) {
     if (compute_tau) {
       grid_integrate_tau_forces<<<nblocks, threads_per_block, smem_per_block,
                                   stream>>>(params);
     } else {
       grid_integrate_density_forces<<<nblocks, threads_per_block,
                                       smem_per_block, stream>>>(params);
     }
   } else {
     if (compute_tau) {
       grid_integrate_tau<<<nblocks, threads_per_block, smem_per_block,
                            stream>>>(params);
     } else {
       grid_integrate_density<<<nblocks, threads_per_block, smem_per_block,
                                stream>>>(params);
     }
   }
   OFFLOAD_CHECK(offloadGetLastError());
 }


 #endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)

 // EOF

imin
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
Definition: dbm_miniapp.c:38

ncoset
static GRID_HOST_DEVICE int ncoset(const int l)
Number of Cartesian orbitals up to given angular momentum quantum.
Definition: grid_common.h:73

grid
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
Definition: grid_cpu_collint.h:169

i
static void const int const int i
Definition: grid_cpu_collint.h:38

store_hab
static void store_hab(const grid_basis_set *ibasis, const grid_basis_set *jbasis, const int iset, const int jset, const bool transpose, const double *hab, double *block)
Transforms hab from prim. cartesian to contracted spherical basis.
Definition: grid_cpu_task_list.c:419

grid_gpu_collint.h

grid_gpu_integrate.h

get_hab
static GRID_DEVICE double get_hab(const orbital a, const orbital b, const double zeta, const double zetb, const cab_store *cab, const bool compute_tau)
Returns element i,j of hab matrix.
Definition: grid_process_vab.h:189

process_get_ldiffs
static process_ldiffs process_get_ldiffs(bool calculate_forces, bool calculate_virial, bool compute_tau)
Returns difference in angular momentum range for given flags.
Definition: grid_process_vab.h:222

get_force_b
static GRID_DEVICE double get_force_b(const orbital a, const orbital b, const int i, const double zeta, const double zetb, const double rab[3], const cab_store *cab, const bool compute_tau)
Returns i'th component of force on atom b.
Definition: grid_process_vab.h:81

get_virial_b
static GRID_DEVICE double get_virial_b(const orbital a, const orbital b, const int i, const int j, const double zeta, const double zetb, const double rab[3], const cab_store *cab, const bool compute_tau)
Returns element i,j of virial on atom b.
Definition: grid_process_vab.h:162

get_virial_a
static GRID_DEVICE double get_virial_a(const orbital a, const orbital b, const int i, const int j, const double zeta, const double zetb, const cab_store *cab, const bool compute_tau)
Returns element i,j of virial on atom a.
Definition: grid_process_vab.h:118

get_force_a
static GRID_DEVICE double get_force_a(const orbital a, const orbital b, const int i, const double zeta, const double zetb, const cab_store *cab, const bool compute_tau)
Returns i'th component of force on atom a.
Definition: grid_process_vab.h:42

cab_to_cxyz
static void cab_to_cxyz(const int la_max, const int la_min, const int lb_max, const int lb_min, const double prefactor, const double ra[3], const double rb[3], const double rp[3], GRID_CONST_WHEN_COLLOCATE double *cab, GRID_CONST_WHEN_INTEGRATE double *cxyz)
Transforms coefficients C_ab into C_xyz.
Definition: grid_ref_collint.h:827

general_cxyz_to_grid
static void general_cxyz_to_grid(const int border_mask, const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const int border_width[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for general case.
Definition: grid_ref_collint.h:769

ortho_cxyz_to_grid
static void ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3], const double dh_inv[3][3], const double rp[3], const int npts_global[3], const int npts_local[3], const int shift_local[3], const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz, GRID_CONST_WHEN_INTEGRATE double *grid)
Collocates coefficients C_xyz onto the grid for orthorhombic case.
Definition: grid_ref_collint.h:207

ai_eri_debug::a
real(dp), dimension(3) a
Definition: ai_eri_debug.F:31

ai_eri_debug::b
real(dp), dimension(3) b
Definition: ai_eri_debug.F:31

parallel_rng_types::gaussian
integer, parameter, public gaussian
Definition: parallel_rng_types.F:73

rocm_backend::init_constant_memory
static void init_constant_memory()
Initializes the device's constant memory.
Definition: grid_hip_internal_header.h:230

rocm_backend::coset_inv
__constant__ orbital coset_inv[1330]
Definition: grid_hip_internal_header.h:63

rocm_backend::compute_alpha
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
Definition: grid_hip_internal_header.h:320

cab_store
Cab matrix container to be passed through get_force/virial to cab_get.
Definition: grid_cpu_integrate.c:24

cab_store::data
const double * data
Definition: grid_cpu_integrate.c:25

orbital
Orbital angular momentum.
Definition: grid_common.h:125

process_ldiffs
Differences in angular momentum.
Definition: grid_process_vab.h:211

process_ldiffs::lb_min_diff
int lb_min_diff
Definition: grid_process_vab.h:215

process_ldiffs::la_max_diff
int la_max_diff
Definition: grid_process_vab.h:212

process_ldiffs::lb_max_diff
int lb_max_diff
Definition: grid_process_vab.h:214

process_ldiffs::la_min_diff
int la_min_diff
Definition: grid_process_vab.h:213