(git:0de0cc2)
grid_hip_integrate.cu
1 /*----------------------------------------------------------------------------*/
2 /* CP2K: A general program to perform molecular dynamics simulations */
3 /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 /* */
5 /* SPDX-License-Identifier: BSD-3-Clause */
6 /*----------------------------------------------------------------------------*/
7 
8 /*
9  * Inspired by the GPU backend.
10  * Authors:
11  - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
12  - Advanced Micro Devices, Inc.
13 */
14 
15 #if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)
16 
17 #include <algorithm>
18 #include <assert.h>
19 #include <hip/hip_runtime.h>
20 #include <limits.h>
21 #include <math.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 
26 #include "grid_hip_context.h"
28 #include "grid_hip_process_vab.h"
29 
30 #ifdef __HIP_PLATFORM_NVIDIA__
31 #include <cooperative_groups.h>
32 #if CUDA_VERSION >= 11000
33 #include <cooperative_groups/reduce.h>
34 #endif
35 namespace cg = cooperative_groups;
36 #endif
37 
38 #if defined(_OMP_H)
39 #error "OpenMP should not be used in .cu files to accommodate HIP."
40 #endif
41 
42 namespace rocm_backend {
43 // do a warp reduction and return the final sum to thread_id = 0
44 template <typename T>
45 __device__ __inline__ T warp_reduce(T *table, const int tid) {
46  // AMD GPUs have a warp size of 64 while NVIDIA GPUs have a warp size of 32,
47  // so the first reduction step is common to both platforms.
48  if (tid < 32) {
49  table[tid] += table[tid + 32];
50  }
51  __syncthreads();
52  if (tid < 16) {
53  table[tid] += table[tid + 16];
54  }
55  __syncthreads();
56  if (tid < 8) {
57  table[tid] += table[tid + 8];
58  }
59  __syncthreads();
60  if (tid < 4) {
61  table[tid] += table[tid + 4];
62  }
63  __syncthreads();
64  if (tid < 2) {
65  table[tid] += table[tid + 2];
66  }
67  __syncthreads();
68  return (table[0] + table[1]);
69 }
70 // #endif
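// A minimal sketch of the expected calling pattern for warp_reduce (this is
// how it is used further below in compute_hab_v2); `local_value` and
// `destination` are placeholder names, not symbols of this file:
//
//   __shared__ T sum[64];
//   sum[tid] = local_value;
//   __syncthreads();
//   const T total = warp_reduce<T>(sum, tid);
//   if (tid == 0)
//     atomicAdd(destination, total);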
71 
72 template <typename T, typename T3, bool COMPUTE_TAU, bool CALCULATE_FORCES>
73 __global__ __launch_bounds__(64) void compute_hab_v2(const kernel_params dev_) {
74  // Copy task from global to shared memory and precompute some stuff.
75  extern __shared__ T shared_memory[];
76  // T *smem_cab = &shared_memory[dev_.smem_cab_offset];
77  T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
78  const int tid =
79  threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
80  const int offset = dev_.sorted_blocks_offset_dev[blockIdx.x];
81  const int number_of_tasks = dev_.num_tasks_per_block_dev[blockIdx.x];
82 
83  if (number_of_tasks == 0)
84  return;
85 
86  T fa[3], fb[3];
87  T virial[9];
88 
89  fa[0] = 0.0;
90  fa[1] = 0.0;
91  fa[2] = 0.0;
92  fb[0] = 0.0;
93  fb[1] = 0.0;
94  fb[2] = 0.0;
95 
96  virial[0] = 0.0;
97  virial[1] = 0.0;
98  virial[2] = 0.0;
99  virial[3] = 0.0;
100  virial[4] = 0.0;
101  virial[5] = 0.0;
102  virial[6] = 0.0;
103  virial[7] = 0.0;
104  virial[8] = 0.0;
105 
106  for (int tk = 0; tk < number_of_tasks; tk++) {
107  __shared__ smem_task<T> task;
108  const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
109  if (dev_.tasks[task_id].skip_task)
110  continue;
111  fill_smem_task_coef(dev_, task_id, task);
112 
113  T *coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
114  T *smem_cab = &dev_.ptr_dev[6][dev_.tasks[task_id].cab_offset];
115 
116  compute_alpha(task, smem_alpha);
117  __syncthreads();
118  cxyz_to_cab(task, smem_alpha, coef_, smem_cab);
119  __syncthreads();
120 
121  for (int i = tid / 8; i < task.nsgf_setb; i += 8) {
122  for (int j = tid % 8; j < task.nsgf_seta; j += 8) {
123  T res = 0.0;
124 
125  T block_val1 = 0.0;
126 
127  if (CALCULATE_FORCES) {
128  if (task.block_transposed) {
129  block_val1 =
130  task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
131  } else {
132  block_val1 =
133  task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
134  }
135  }
136 
137  for (int jco = task.first_cosetb; jco < task.ncosetb; jco++) {
138  const auto &b = coset_inv[jco];
139  T block_val = 0.0;
140  const T sphib = task.sphib[i * task.maxcob + jco];
141  for (int ico = task.first_coseta; ico < task.ncoseta; ico++) {
142  const auto &a = coset_inv[ico];
143  const T hab = get_hab<COMPUTE_TAU, T>(a, b, task.zeta, task.zetb,
144  task.n1, smem_cab);
145  const T sphia = task.sphia[j * task.maxcoa + ico];
146  block_val += hab * sphia * sphib;
147 
148  if (CALCULATE_FORCES) {
149  for (int k = 0; k < 3; k++) {
150  fa[k] += block_val1 * sphia * sphib *
151  get_force_a<COMPUTE_TAU, T>(
152  a, b, k, task.zeta, task.zetb, task.n1, smem_cab);
153  fb[k] +=
154  block_val1 * sphia * sphib *
155  get_force_b<COMPUTE_TAU, T>(a, b, k, task.zeta, task.zetb,
156  task.rab, task.n1, smem_cab);
157  }
158  if (dev_.ptr_dev[5] != nullptr) {
159  for (int k = 0; k < 3; k++) {
160  for (int l = 0; l < 3; l++) {
161  virial[3 * k + l] +=
162  (get_virial_a<COMPUTE_TAU, T>(a, b, k, l, task.zeta,
163  task.zetb, task.n1,
164  smem_cab) +
165  get_virial_b<COMPUTE_TAU, T>(a, b, k, l, task.zeta,
166  task.zetb, task.rab,
167  task.n1, smem_cab)) *
168  block_val1 * sphia * sphib;
169  }
170  }
171  }
172  }
173  }
174 
175  res += block_val;
176  }
177 
178  // we can use shuffle_down if it exists for T
179 
180  // these atomic operations are not strictly needed since each block is
181  // updated by a single thread block. However, the grid_miniapp will fail
182  // without them.
183  if (task.block_transposed) {
184  atomicAdd(task.hab_block + j * task.nsgfb + i, res);
185  } else {
186  atomicAdd(task.hab_block + i * task.nsgfa + j, res);
187  }
188  }
189  }
190  __syncthreads();
191  }
192 
193  if (CALCULATE_FORCES) {
194  const int task_id = dev_.task_sorted_by_blocks_dev[offset];
195  const auto &glb_task = dev_.tasks[task_id];
196  const int iatom = glb_task.iatom;
197  const int jatom = glb_task.jatom;
198  T *forces_a = &dev_.ptr_dev[4][3 * iatom];
199  T *forces_b = &dev_.ptr_dev[4][3 * jatom];
200 
201  T *sum = (T *)shared_memory;
202  if (dev_.ptr_dev[5] != nullptr) {
203 
204  for (int i = 0; i < 9; i++) {
205  sum[tid] = virial[i];
206  __syncthreads();
207 
208  virial[i] = warp_reduce<T>(sum, tid);
209  __syncthreads();
210 
211  if (tid == 0)
212  atomicAdd(dev_.ptr_dev[5] + i, virial[i]);
213  __syncthreads();
214  }
215  }
216 
217  for (int i = 0; i < 3; i++) {
218  sum[tid] = fa[i];
219  __syncthreads();
220  fa[i] = warp_reduce<T>(sum, tid);
221  __syncthreads();
222  if (tid == 0)
223  atomicAdd(forces_a + i, fa[i]);
224  __syncthreads();
225 
226  sum[tid] = fb[i];
227  __syncthreads();
228  fb[i] = warp_reduce<T>(sum, tid);
229  __syncthreads();
230  if (tid == 0)
231  atomicAdd(forces_b + i, fb[i]);
232  __syncthreads();
233  }
234  }
235 }
236 
237 /*******************************************************************************
238  * Cuda kernel for calculating the coefficients of a potential (density,
239  etc.) for a given pair of Gaussian basis sets.
240 
241 We compute the discretized version of the following integral
242 
243 $$\int_{-\infty}^{\infty} V_{ijk} \, P^\alpha_i P^\beta_j P^\gamma_k
244 \exp(-\eta |r_{ijk} - r_c|^2)$$
245 
246 where in practice the integration is restricted to a finite domain. The
247 discrete form has this shape
248 
249 $$
250 \sum_{ijk \in \mathrm{domain}} V_{ijk} (x_i - x_c)^\alpha (y_j - y_c)^\beta
251 (z_k - z_c)^\gamma \exp(-\eta |r_{ijk} - r_c|^2)
252 $$
253 
254 where $0 \le \alpha + \beta + \gamma \le l_{max}$.
255 
256 It is formally the same operation as collocate (from a discrete point of view),
257 but the implementation differs for technical reasons.
258 
259 So most of the code is the same, except for the core of the routine, which is
260 specialized for integration.
261 
262 ******************************************************************************/
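/* A scalar reference version of the discrete sum above, for one exponent
 * triple (alpha, beta, gamma). The grid accessor V(i, j, k) and the point
 * coordinates x(i), y(j), z(k) are hypothetical placeholders and not part of
 * this backend:
 *
 *   double coef = 0.0;
 *   for (int k = 0; k < nz; k++)
 *     for (int j = 0; j < ny; j++)
 *       for (int i = 0; i < nx; i++) {
 *         const double dx = x(i) - xc, dy = y(j) - yc, dz = z(k) - zc;
 *         coef += V(i, j, k) * pow(dx, alpha) * pow(dy, beta) *
 *                 pow(dz, gamma) * exp(-eta * (dx * dx + dy * dy + dz * dz));
 *       }
 *
 * The kernel below accumulates all such coefficients up to lmax at once,
 * distributing the grid points over the threads of a block.
 */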
263 template <typename T, typename T3, bool distributed__, bool orthorhombic_,
264  int lbatch = 10>
265 __global__
266 __launch_bounds__(64) void integrate_kernel(const kernel_params dev_) {
267  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
268  return;
269 
270  const int tid =
271  threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
272 
273  __shared__ T dh_inv_[9];
274  __shared__ T dh_[9];
275 
276  for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
277  dh_inv_[d] = dev_.dh_inv_[d];
278 
279  for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
280  dh_[d] = dev_.dh_[d];
281 
282  __shared__ smem_task_reduced<T, T3> task;
283  fill_smem_task_reduced(dev_, dev_.first_task + blockIdx.x, task);
284 
285  if (tid == 0) {
286  task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
287  task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
288  task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];
289 
290  if (distributed__) {
291  if (task.apply_border_mask) {
292  compute_window_size(
293  dev_.grid_local_size_,
294  dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
295  dev_.grid_border_width_, &task.window_size, &task.window_shift);
296  }
297  }
298  }
299  __syncthreads();
300 
301  __shared__ T accumulator[lbatch][64];
302  __shared__ T sum[lbatch];
303 
304  // we use a multi-pass algorithm here because shared memory (or register)
305  // usage would become too high for high angular momenta
306 
307  const short int size_loop =
308  (ncoset(task.lp) / lbatch + ((ncoset(task.lp) % lbatch) != 0)) * lbatch;
309  const short int length = ncoset(task.lp);
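  // Example: for task.lp = 4, ncoset(4) = 35, so size_loop = 40 and the loop
  // below makes 4 passes of lbatch = 10 coefficients each.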
310  for (int ico = 0; ico < size_loop; ico += lbatch) {
311 #pragma unroll lbatch
312  for (int i = 0; i < lbatch; i++)
313  accumulator[i][tid] = 0.0;
314 
315  __syncthreads();
316 
317  for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
318  int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];
319 
320  if (z2 < 0)
321  z2 += dev_.grid_full_size_[0];
322 
323  if (distributed__) {
324  // known at compile time. Will be stripped away
325  if (task.apply_border_mask) {
326  /* check if the point is within the window */
327  if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
328  continue;
329  }
330  }
331  }
332 
333  /* compute the coordinates of the point in atomic coordinates */
334  int ymin = 0;
335  int ymax = task.cube_size.y - 1;
336  T kremain = 0.0;
337 
338  if (orthorhombic_ && !task.apply_border_mask) {
339  ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
340  ymin *= ymin;
341  kremain = task.discrete_radius * task.discrete_radius -
342  ymin * dh_[8] * dh_[8];
343  ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
344  ymax = 1 - ymin - task.lb_cube.y;
345  ymin = ymin - task.lb_cube.y;
346  }
347 
348  for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
349  int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];
350 
351  if (y2 < 0)
352  y2 += dev_.grid_full_size_[1];
353 
354  if (distributed__) {
355  /* check if the point is within the window */
356  if (task.apply_border_mask) {
357  if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
358  continue;
359  }
360  }
361  }
362 
363  int xmin = 0;
364  int xmax = task.cube_size.x - 1;
365 
366  if (orthorhombic_ && !task.apply_border_mask) {
367  xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
368  xmin *= xmin;
369  xmin =
370  ceil(-1.0e-8 - sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
371  dh_inv_[0]);
372  xmax = 1 - xmin - task.lb_cube.x;
373  xmin -= task.lb_cube.x;
374  }
375 
376  for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
377  int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];
378 
379  if (x2 < 0)
380  x2 += dev_.grid_full_size_[2];
381 
382  if (distributed__) {
383  /* check if the point is within the window */
384  if (task.apply_border_mask) {
385  if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
386  continue;
387  }
388  }
389  }
390 
391  // we make no distinction between the orthorhombic and non-orthorhombic
392  // cases here
393  T3 r3;
394 
395  if (orthorhombic_) {
396  r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
397  r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
398  r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
399  } else {
400  r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
401  (y + task.lb_cube.y + task.roffset.y),
402  (z + task.lb_cube.z + task.roffset.z));
403  }
404  // check whether the point is inside the sphere. Note that this check
405  // does not apply to the orthorhombic case when the full sphere is inside
406  // the region of interest.
407 
408  if (distributed__) {
409  if (((task.radius * task.radius) <=
410  (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)) &&
411  (!orthorhombic_ || task.apply_border_mask))
412  continue;
413  } else {
414  if (!orthorhombic_) {
415  if ((task.radius * task.radius) <=
416  (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z))
417  continue;
418  }
419  }
420 
421  // read the next point, assuming the load is non-blocking until the
422  // register is actually needed for computation. This is true on
423  // NVIDIA hardware
424 
425  T grid_value =
426  __ldg(&dev_.ptr_dev[1][(z2 * dev_.grid_local_size_[1] + y2) *
427  dev_.grid_local_size_[2] +
428  x2]);
429 
430  const T r3x2 = r3.x * r3.x;
431  const T r3y2 = r3.y * r3.y;
432  const T r3z2 = r3.z * r3.z;
433 
434  const T r3xy = r3.x * r3.y;
435  const T r3xz = r3.x * r3.z;
436  const T r3yz = r3.y * r3.z;
437 
438  grid_value *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
439 
440  switch (ico / lbatch) {
441  case 0: {
442  accumulator[0][tid] += grid_value;
443 
444  if (task.lp >= 1) {
445  accumulator[1][tid] += grid_value * r3.x;
446  accumulator[2][tid] += grid_value * r3.y;
447  accumulator[3][tid] += grid_value * r3.z;
448  }
449 
450  if (task.lp >= 2) {
451  accumulator[4][tid] += grid_value * r3x2;
452  accumulator[5][tid] += grid_value * r3xy;
453  accumulator[6][tid] += grid_value * r3xz;
454  accumulator[7][tid] += grid_value * r3y2;
455  accumulator[8][tid] += grid_value * r3yz;
456  accumulator[9][tid] += grid_value * r3z2;
457  }
458  } break;
459  case 1: {
460  if (task.lp >= 3) {
461  accumulator[0][tid] += grid_value * r3x2 * r3.x;
462  accumulator[1][tid] += grid_value * r3x2 * r3.y;
463  accumulator[2][tid] += grid_value * r3x2 * r3.z;
464  accumulator[3][tid] += grid_value * r3.x * r3y2;
465  accumulator[4][tid] += grid_value * r3xy * r3.z;
466  accumulator[5][tid] += grid_value * r3.x * r3z2;
467  accumulator[6][tid] += grid_value * r3y2 * r3.y;
468  accumulator[7][tid] += grid_value * r3y2 * r3.z;
469  accumulator[8][tid] += grid_value * r3.y * r3z2;
470  accumulator[9][tid] += grid_value * r3z2 * r3.z;
471  }
472  } break;
473  case 2: {
474  if (task.lp >= 4) {
475  accumulator[0][tid] += grid_value * r3x2 * r3x2;
476  accumulator[1][tid] += grid_value * r3x2 * r3xy;
477  accumulator[2][tid] += grid_value * r3x2 * r3xz;
478  accumulator[3][tid] += grid_value * r3x2 * r3y2;
479  accumulator[4][tid] += grid_value * r3x2 * r3yz;
480  accumulator[5][tid] += grid_value * r3x2 * r3z2;
481  accumulator[6][tid] += grid_value * r3xy * r3y2;
482  accumulator[7][tid] += grid_value * r3xz * r3y2;
483  accumulator[8][tid] += grid_value * r3xy * r3z2;
484  accumulator[9][tid] += grid_value * r3xz * r3z2;
485  }
486  } break;
487  case 3: {
488  if (task.lp >= 4) {
489  accumulator[0][tid] += grid_value * r3y2 * r3y2;
490  accumulator[1][tid] += grid_value * r3y2 * r3yz;
491  accumulator[2][tid] += grid_value * r3y2 * r3z2;
492  accumulator[3][tid] += grid_value * r3yz * r3z2;
493  accumulator[4][tid] += grid_value * r3z2 * r3z2;
494  }
495  if (task.lp >= 5) {
496  accumulator[5][tid] += grid_value * r3x2 * r3x2 * r3.x;
497  accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3.y;
498  accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3.z;
499  accumulator[8][tid] += grid_value * r3x2 * r3.x * r3y2;
500  accumulator[9][tid] += grid_value * r3x2 * r3xy * r3.z;
501  }
502  } break;
503  case 4: {
504  accumulator[0][tid] += grid_value * r3x2 * r3.x * r3z2;
505  accumulator[1][tid] += grid_value * r3x2 * r3y2 * r3.y;
506  accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3.z;
507  accumulator[3][tid] += grid_value * r3x2 * r3.y * r3z2;
508  accumulator[4][tid] += grid_value * r3x2 * r3z2 * r3.z;
509  accumulator[5][tid] += grid_value * r3.x * r3y2 * r3y2;
510  accumulator[6][tid] += grid_value * r3.x * r3y2 * r3yz;
511  accumulator[7][tid] += grid_value * r3.x * r3y2 * r3z2;
512  accumulator[8][tid] += grid_value * r3xy * r3z2 * r3.z;
513  accumulator[9][tid] += grid_value * r3.x * r3z2 * r3z2;
514  } break;
515  case 5: {
516  accumulator[0][tid] += grid_value * r3y2 * r3y2 * r3.y;
517  accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3.z;
518  accumulator[2][tid] += grid_value * r3y2 * r3.y * r3z2;
519  accumulator[3][tid] += grid_value * r3y2 * r3z2 * r3.z;
520  accumulator[4][tid] += grid_value * r3.y * r3z2 * r3z2;
521  accumulator[5][tid] += grid_value * r3z2 * r3z2 * r3.z;
522  if (task.lp >= 6) {
523  accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3x2; // x^6
524  accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3xy; // x^5 y
525  accumulator[8][tid] += grid_value * r3x2 * r3x2 * r3xz; // x^5 z
526  accumulator[9][tid] += grid_value * r3x2 * r3x2 * r3y2; // x^4 y^2
527  }
528  } break;
529  case 6: {
530  accumulator[0][tid] += grid_value * r3x2 * r3x2 * r3yz; // x^4 y z
531  accumulator[1][tid] += grid_value * r3x2 * r3x2 * r3z2; // x^4 z^2
532  accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3xy; // x^3 y^3
533  accumulator[3][tid] += grid_value * r3x2 * r3y2 * r3xz; // x^3 y^2 z
534  accumulator[4][tid] += grid_value * r3x2 * r3xy * r3z2; // x^3 y z^2
535  accumulator[5][tid] += grid_value * r3x2 * r3z2 * r3xz; // x^3 z^3
536  accumulator[6][tid] += grid_value * r3x2 * r3y2 * r3y2; // x^2 y^4
537  accumulator[7][tid] += grid_value * r3x2 * r3y2 * r3yz; // x^2 y^3 z
538  accumulator[8][tid] +=
539  grid_value * r3x2 * r3y2 * r3z2; // x^2 y^2 z^2
540  accumulator[9][tid] += grid_value * r3x2 * r3z2 * r3yz; // x^2 y z^3
541  } break;
542  case 7: {
543  accumulator[0][tid] += grid_value * r3x2 * r3z2 * r3z2; // x^2 z^4
544  accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3xy; // x y^5
545  accumulator[2][tid] += grid_value * r3y2 * r3y2 * r3xz; // x y^4 z
546  accumulator[3][tid] += grid_value * r3y2 * r3xy * r3z2; // x y^3 z^2
547  accumulator[4][tid] += grid_value * r3y2 * r3z2 * r3xz; // x y^2 z^3
548  accumulator[5][tid] += grid_value * r3xy * r3z2 * r3z2; // x y z^4
549  accumulator[6][tid] += grid_value * r3z2 * r3z2 * r3xz; // x z^5
550  accumulator[7][tid] += grid_value * r3y2 * r3y2 * r3y2; // y^6
551  accumulator[8][tid] += grid_value * r3y2 * r3y2 * r3yz; // y^5 z
552  accumulator[9][tid] += grid_value * r3y2 * r3y2 * r3z2; // y^4 z^2
553  } break;
554  default:
555  for (int ic = 0; (ic < lbatch) && ((ic + ico) < length); ic++) {
556  auto &co = coset_inv[ic + ico];
557  T tmp = 1.0;
558  for (int po = 0; po < co.l[2]; po++)
559  tmp *= r3.z;
560  for (int po = 0; po < co.l[1]; po++)
561  tmp *= r3.y;
562  for (int po = 0; po < co.l[0]; po++)
563  tmp *= r3.x;
564  accumulator[ic][tid] += tmp * grid_value;
565  }
566  break;
567  }
568  }
569  }
570  }
571 
572  __syncthreads();
573 
574  // we know there is only one wavefront per block, so lbatch threads could
575  // reduce the values saved by all threads of the wavefront and store the
576  // results; instead we do the equivalent of a shuffle_down by hand
577  for (int i = 0; i < min(length - ico, lbatch); i++) {
578  if (tid < 32) {
579  accumulator[i][tid] += accumulator[i][tid + 32];
580  }
581  __syncthreads();
582  if (tid < 16) {
583  accumulator[i][tid] += accumulator[i][tid + 16];
584  }
585  __syncthreads();
586  if (tid < 8) {
587  accumulator[i][tid] += accumulator[i][tid + 8];
588  }
589  __syncthreads();
590  if (tid < 4) {
591  accumulator[i][tid] += accumulator[i][tid + 4];
592  }
593  __syncthreads();
594  if (tid < 2) {
595  accumulator[i][tid] += accumulator[i][tid + 2];
596  }
597  __syncthreads();
598  if (tid == 0)
599  sum[i] = accumulator[i][0] + accumulator[i][1];
600  }
601  __syncthreads();
602 
603  for (int i = tid; i < min(length - ico, lbatch); i += lbatch)
604  dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset + i +
605  ico] = sum[i];
606  __syncthreads();
607  // if (tid == 0)
608  // printf("%.15lf\n", sum[0]);
609  }
610 }
611 
612 kernel_params
613 context_info::set_kernel_parameters(const int level,
614  const smem_parameters &smem_params) {
615  kernel_params params;
616  params.smem_cab_offset = smem_params.smem_cab_offset();
617  params.smem_alpha_offset = smem_params.smem_alpha_offset();
618  params.first_task = 0;
619 
620  params.la_min_diff = smem_params.ldiffs().la_min_diff;
621  params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
622 
623  params.la_max_diff = smem_params.ldiffs().la_max_diff;
624  params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
625  params.first_task = 0;
626  params.tasks = this->tasks_dev.data();
627  params.task_sorted_by_blocks_dev = task_sorted_by_blocks_dev.data();
628  params.sorted_blocks_offset_dev = sorted_blocks_offset_dev.data();
629  params.num_tasks_per_block_dev = this->num_tasks_per_block_dev_.data();
630  params.block_offsets = this->block_offsets_dev.data();
631  params.la_min_diff = smem_params.ldiffs().la_min_diff;
632  params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
633  params.la_max_diff = smem_params.ldiffs().la_max_diff;
634  params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
635 
636  params.ptr_dev[0] = pab_block_.data();
637 
638  if (level >= 0) {
639  params.ptr_dev[1] = grid_[level].data();
640  for (int i = 0; i < 3; i++) {
641  memcpy(params.dh_, grid_[level].dh(), 9 * sizeof(double));
642  memcpy(params.dh_inv_, grid_[level].dh_inv(), 9 * sizeof(double));
643  params.grid_full_size_[i] = grid_[level].full_size(i);
644  params.grid_local_size_[i] = grid_[level].local_size(i);
645  params.grid_lower_corner_[i] = grid_[level].lower_corner(i);
646  params.grid_border_width_[i] = grid_[level].border_width(i);
647  }
648  params.first_task = first_task_per_level_[level];
649  }
650  params.ptr_dev[2] = this->coef_dev_.data();
651  params.ptr_dev[3] = hab_block_.data();
652  params.ptr_dev[4] = forces_.data();
653  params.ptr_dev[5] = virial_.data();
654  params.ptr_dev[6] = this->cab_dev_.data();
655  params.sphi_dev = this->sphi_dev.data();
656  return params;
657 }
658 
659 /*******************************************************************************
660  * \brief Launches the Cuda kernel that integrates all tasks of one grid level.
661  ******************************************************************************/
662 void context_info::integrate_one_grid_level(const int level, int *lp_diff) {
663  if (number_of_tasks_per_level_[level] == 0)
664  return;
666 
667  // Compute max angular momentum.
668  const ldiffs_value ldiffs =
669  process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);
670 
671  smem_parameters smem_params(ldiffs, lmax());
672 
673  *lp_diff = smem_params.lp_diff();
675 
676  kernel_params params = set_kernel_parameters(level, smem_params);
677 
678  /* WARNING : if you change the block size, be aware that the number
679  * of warps is hardcoded when we do the block reduction in the integrate
680  * kernel. The number of warps should either be given explicitly as a
681  * template parameter, or the switch statements inside the integrate
682  * kernel should be adjusted accordingly. */
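  /* For reference: dim3(4, 4, 4) gives 64 threads per block, i.e. one full AMD
   * wavefront (or two 32-thread NVIDIA warps), which is what the hand-written
   * reductions in the kernels above assume. */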
683 
684  const dim3 threads_per_block(4, 4, 4);
685  if (grid_[level].is_distributed()) {
686  if (grid_[level].is_orthorhombic()) {
687  integrate_kernel<double, double3, true, true, 10>
688  <<<number_of_tasks_per_level_[level], threads_per_block, 0,
689  level_streams[level]>>>(params);
690  } else {
691  integrate_kernel<double, double3, true, false, 10>
692  <<<number_of_tasks_per_level_[level], threads_per_block, 0,
693  level_streams[level]>>>(params);
694  }
695  } else {
696  if (grid_[level].is_orthorhombic()) {
697  integrate_kernel<double, double3, false, true, 10>
698  <<<number_of_tasks_per_level_[level], threads_per_block, 0,
699  level_streams[level]>>>(params);
700  } else {
701  integrate_kernel<double, double3, false, false, 10>
702  <<<number_of_tasks_per_level_[level], threads_per_block, 0,
703  level_streams[level]>>>(params);
704  }
705  }
706 }
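/* A possible host-side calling sequence for the integration pass (sketch only;
 * `ctx` and `nlevels` are placeholders for a fully initialized context_info
 * and its number of grid levels):
 *
 *   int lp_diff = -1;
 *   for (int level = 0; level < nlevels; level++)
 *     ctx->integrate_one_grid_level(level, &lp_diff);
 *   // ...followed by the hab/force computation below and stream
 *   // synchronization.
 */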
707 
709 
711 
712  // Compute max angular momentum.
713  const ldiffs_value ldiffs =
714  process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);
715 
716  smem_parameters smem_params(ldiffs, lmax());
718 
719  kernel_params params = set_kernel_parameters(-1, smem_params);
720 
721  /* WARNING: if you change the block size, keep in mind that the number
722  * of warps is hardcoded when we do the block reduction in the integrate
723  * kernel. */
724 
725  const dim3 threads_per_block(4, 4, 4);
726 
727  if (!compute_tau && !calculate_forces) {
728  compute_hab_v2<double, double3, false, false>
729  <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
730  this->main_stream>>>(params);
731  return;
732  }
733 
734  if (!compute_tau && calculate_forces) {
735  compute_hab_v2<double, double3, false, true>
736  <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
737  this->main_stream>>>(params);
738  return;
739  }
740 
741  if (compute_tau && calculate_forces) {
742  compute_hab_v2<double, double3, true, true>
743  <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
744  this->main_stream>>>(params);
745  }
746 
747  if (compute_tau && !calculate_forces) {
748  compute_hab_v2<double, double3, true, false>
749  <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
750  this->main_stream>>>(params);
751  }
752 }
753 }; // namespace rocm_backend
754 #endif // defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)