(git:0d88fc2)
Loading...
Searching...
No Matches
grid_gpu_integrate.cu
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2026 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8/*
9 * inspirations from the gpu backend
10 * Authors :
11 - Mathieu Taillefumier (ETH Zurich / CSCS)
12 - Advanced Micro Devices, Inc.
13 - Ole Schuett
14*/
15
16#include <algorithm>
17#include <assert.h>
18#include <limits.h>
19#include <math.h>
20#include <stdio.h>
21#include <stdlib.h>
22#include <string.h>
23
24#include "grid_gpu_context.h"
27
28#if defined(_OMP_H)
29#error "OpenMP should not be used in .cu files to accommodate HIP."
30#endif
31
32namespace rocm_backend {
33
34// do a block reduction for a block of size 64 and return the final sum to
35// thread_id = 0
36template <typename T>
37__device__ __inline__ T block_reduce_64(T *table, const T val_, const int tid) {
38 // AMD GPU have warp size of 64 while nvidia GPUs have warpSize of 32 so the
39 // first step is common to both platforms.
40 table[tid] = val_;
41 __syncthreads();
42
43 T val = 0.0;
44
45 if (tid < 32) {
46 val = table[tid] + table[tid + 32];
47
48 for (int offset = 16; offset > 0; offset >>= 1) {
49#if defined(__CUDACC__)
50 val += __shfl_down_sync(0xffffffff, val, offset);
51#else
52 val += __shfl_down(val, offset);
53#endif
54 }
55 }
56
57 // prevents threads >= 32 (which never touch table[]) from racing ahead and
58 // overwriting it for the next call before the reduction above has read it.
59 __syncthreads();
60 return val;
61}
62
63template <typename T, bool COMPUTE_TAU>
64__global__ __launch_bounds__(64) void compute_hab(const kernel_params dev_) {
65 // Copy task from global to shared memory and precompute some stuff.
66 extern __shared__ T shared_memory[];
67 // T *smem_cab = &shared_memory[dev_.smem_cab_offset];
68 const int number_of_tasks = dev_.num_tasks_per_block_dev[block_index()];
69
70 if (number_of_tasks == 0)
71 return;
72
73 T *smem_alpha = &shared_memory[0];
74 const int tid = thread_global_index();
75 const int offset = dev_.sorted_blocks_offset_dev[block_index()];
76
77 T *__restrict__ smem_cab = nullptr;
78
79 smem_cab = allocate_workspace<T>(dev_);
80
81 for (int tk = 0; tk < number_of_tasks; tk++) {
82 __shared__ smem_task<T> task;
83 const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
84 if (dev_.tasks[task_id].skip_task)
85 continue;
86
87 // all warps need to be synchronized here before modifying the task
88 // information.
89 __syncthreads();
90 fill_smem_task_coef(dev_, task_id, task);
91
92 T *__restrict__ coef_ = reinterpret_cast<T *>(__builtin_assume_aligned(
93 &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset], 32));
94
95 __syncthreads();
96 compute_alpha(task, smem_alpha);
97 __syncthreads();
98 cxyz_to_cab(task, smem_alpha, coef_, smem_cab);
99 __syncthreads();
100
101 for (int i = tid / 8; i < task.nsgf_setb; i += 8) {
102 for (int j = tid % 8; j < task.nsgf_seta; j += 8) {
103 T tmp = 0.0;
104 for (int jco = task.first_cosetb; jco < task.ncosetb; jco++) {
105 const T sphib = task.sphib[i * task.maxcob + jco];
106 const auto &b = coset_inv[jco];
107 for (int ico = task.first_coseta; ico < task.ncoseta; ico++) {
108 const auto &a = coset_inv[ico];
109 const T hab = get_hab<COMPUTE_TAU, T>(a, b, task.zeta, task.zetb,
110 task.n1, smem_cab);
111 const T sphia_times_sphib =
112 task.sphia[j * task.maxcoa + ico] * sphib;
113 tmp += hab * sphia_times_sphib;
114 }
115 }
116 if (task.block_transposed) {
117 task.hab_block[j * task.nsgfb + i] += tmp;
118 } else {
119 task.hab_block[i * task.nsgfa + j] += tmp;
120 }
121 }
122 }
123 }
124}
125
126template <typename T, typename T3, bool COMPUTE_TAU>
127__global__
128__launch_bounds__(64) void compute_hab_forces(const kernel_params dev_) {
129 // Copy task from global to shared memory and precompute some stuff.
130 extern __shared__ T shared_memory[];
131 const int number_of_tasks = dev_.num_tasks_per_block_dev[block_index()];
132
133 if (number_of_tasks == 0)
134 return;
135
136 T *smem_alpha = &shared_memory[0];
137 const int tid = thread_global_index();
138 const int offset = dev_.sorted_blocks_offset_dev[block_index()];
139
140 T *__restrict__ smem_cab = allocate_workspace<T>(dev_);
141
142 T fa[3], fb[3];
143 T virial[9];
144
145 fa[0] = 0.0;
146 fa[1] = 0.0;
147 fa[2] = 0.0;
148 fb[0] = 0.0;
149 fb[1] = 0.0;
150 fb[2] = 0.0;
151
152 virial[0] = 0.0;
153 virial[1] = 0.0;
154 virial[2] = 0.0;
155 virial[3] = 0.0;
156 virial[4] = 0.0;
157 virial[5] = 0.0;
158 virial[6] = 0.0;
159 virial[7] = 0.0;
160 virial[8] = 0.0;
161
162 for (int tk = 0; tk < number_of_tasks; tk++) {
163 __shared__ smem_task<T> task;
164 const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
165 if (dev_.tasks[task_id].skip_task)
166 continue;
167 fill_smem_task_coef(dev_, task_id, task);
168
169 T *__restrict__ coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
170 __syncthreads();
171 compute_alpha(task, smem_alpha);
172 __syncthreads();
173 cxyz_to_cab(task, smem_alpha, coef_, smem_cab);
174 __syncthreads();
175
176 for (int i = tid / 8; i < task.nsgf_setb; i += 8) {
177 for (int j = tid % 8; j < task.nsgf_seta; j += 8) {
178 T tmp = 0.0;
179 T block_value = 0.0;
180 if (task.block_transposed) {
181 block_value =
182 task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
183 } else {
184 block_value =
185 task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
186 }
187 for (int jco = task.first_cosetb; jco < task.ncosetb; jco++) {
188 const T sphib = task.sphib[i * task.maxcob + jco];
189 const auto &b = coset_inv[jco];
190 for (int ico = task.first_coseta; ico < task.ncoseta; ico++) {
191 const auto &a = coset_inv[ico];
192 const T hab = get_hab<COMPUTE_TAU, T>(a, b, task.zeta, task.zetb,
193 task.n1, smem_cab);
194 T sphia_times_sphib = task.sphia[j * task.maxcoa + ico] * sphib;
195 tmp += hab * sphia_times_sphib;
196
197 sphia_times_sphib *= block_value;
198 fa[0] += sphia_times_sphib *
199 get_force_a<COMPUTE_TAU, T>(a, b, 0, task.zeta, task.zetb,
200 task.n1, smem_cab);
201 fa[1] += sphia_times_sphib *
202 get_force_a<COMPUTE_TAU, T>(a, b, 1, task.zeta, task.zetb,
203 task.n1, smem_cab);
204 fa[2] += sphia_times_sphib *
205 get_force_a<COMPUTE_TAU, T>(a, b, 2, task.zeta, task.zetb,
206 task.n1, smem_cab);
207
208 fb[0] += sphia_times_sphib *
209 get_force_b<COMPUTE_TAU, T>(a, b, 0, task.zeta, task.zetb,
210 task.rab, task.n1, smem_cab);
211 fb[1] += sphia_times_sphib *
212 get_force_b<COMPUTE_TAU, T>(a, b, 1, task.zeta, task.zetb,
213 task.rab, task.n1, smem_cab);
214 fb[2] += sphia_times_sphib *
215 get_force_b<COMPUTE_TAU, T>(a, b, 2, task.zeta, task.zetb,
216 task.rab, task.n1, smem_cab);
217
218 if (dev_.ptr_dev[5] != nullptr) {
219 virial[0] +=
220 sphia_times_sphib *
221 (get_virial_a<COMPUTE_TAU, T>(a, b, 0, 0, task.zeta,
222 task.zetb, task.n1, smem_cab) +
223 get_virial_b<COMPUTE_TAU, T>(a, b, 0, 0, task.zeta,
224 task.zetb, task.rab, task.n1,
225 smem_cab));
226 virial[1] +=
227 sphia_times_sphib *
228 (get_virial_a<COMPUTE_TAU, T>(a, b, 0, 1, task.zeta,
229 task.zetb, task.n1, smem_cab) +
230 get_virial_b<COMPUTE_TAU, T>(a, b, 0, 1, task.zeta,
231 task.zetb, task.rab, task.n1,
232 smem_cab));
233 virial[2] +=
234 sphia_times_sphib *
235 (get_virial_a<COMPUTE_TAU, T>(a, b, 0, 2, task.zeta,
236 task.zetb, task.n1, smem_cab) +
237 get_virial_b<COMPUTE_TAU, T>(a, b, 0, 2, task.zeta,
238 task.zetb, task.rab, task.n1,
239 smem_cab));
240 virial[3] +=
241 sphia_times_sphib *
242 (get_virial_a<COMPUTE_TAU, T>(a, b, 1, 0, task.zeta,
243 task.zetb, task.n1, smem_cab) +
244 get_virial_b<COMPUTE_TAU, T>(a, b, 1, 0, task.zeta,
245 task.zetb, task.rab, task.n1,
246 smem_cab));
247 virial[4] +=
248 sphia_times_sphib *
249 (get_virial_a<COMPUTE_TAU, T>(a, b, 1, 1, task.zeta,
250 task.zetb, task.n1, smem_cab) +
251 get_virial_b<COMPUTE_TAU, T>(a, b, 1, 1, task.zeta,
252 task.zetb, task.rab, task.n1,
253 smem_cab));
254 virial[5] +=
255 sphia_times_sphib *
256 (get_virial_a<COMPUTE_TAU, T>(a, b, 1, 2, task.zeta,
257 task.zetb, task.n1, smem_cab) +
258 get_virial_b<COMPUTE_TAU, T>(a, b, 1, 2, task.zeta,
259 task.zetb, task.rab, task.n1,
260 smem_cab));
261 virial[6] +=
262 sphia_times_sphib *
263 (get_virial_a<COMPUTE_TAU, T>(a, b, 2, 0, task.zeta,
264 task.zetb, task.n1, smem_cab) +
265 get_virial_b<COMPUTE_TAU, T>(a, b, 2, 0, task.zeta,
266 task.zetb, task.rab, task.n1,
267 smem_cab));
268 virial[7] +=
269 sphia_times_sphib *
270 (get_virial_a<COMPUTE_TAU, T>(a, b, 2, 1, task.zeta,
271 task.zetb, task.n1, smem_cab) +
272 get_virial_b<COMPUTE_TAU, T>(a, b, 2, 1, task.zeta,
273 task.zetb, task.rab, task.n1,
274 smem_cab));
275 virial[8] +=
276 sphia_times_sphib *
277 (get_virial_a<COMPUTE_TAU, T>(a, b, 2, 2, task.zeta,
278 task.zetb, task.n1, smem_cab) +
279 get_virial_b<COMPUTE_TAU, T>(a, b, 2, 2, task.zeta,
280 task.zetb, task.rab, task.n1,
281 smem_cab));
282 }
283 }
284 }
285
286 if (task.block_transposed) {
287 task.hab_block[j * task.nsgfb + i] += tmp;
288 } else {
289 task.hab_block[i * task.nsgfa + j] += tmp;
290 }
291 }
292 }
293 __syncthreads();
294 }
295
296 // theoretically not needed
297 __syncthreads();
298
299 const int task_id = dev_.task_sorted_by_blocks_dev[offset];
300 const auto &glb_task = dev_.tasks[task_id];
301 const int iatom = glb_task.iatom;
302 const int jatom = glb_task.jatom;
303 T *forces_a = &dev_.ptr_dev[4][3 * iatom];
304 T *forces_b = &dev_.ptr_dev[4][3 * jatom];
305
306 T *sum = (T *)shared_memory;
307 if (dev_.ptr_dev[5] != nullptr) {
308
309 for (int i = 0; i < 9; i++) {
310 virial[i] = block_reduce_64<T>(sum, virial[i], tid);
311
312 if (tid == 0)
313 atomicAdd(dev_.ptr_dev[5] + i, virial[i]);
314 }
315 }
316
317 for (int i = 0; i < 3; i++) {
318 fa[i] = block_reduce_64<T>(sum, fa[i], tid);
319
320 if (tid == 0)
321 atomicAdd(forces_a + i, fa[i]);
322
323 fb[i] = block_reduce_64<T>(sum, fb[i], tid);
324
325 if (tid == 0)
326 atomicAdd(forces_b + i, fb[i]);
327 }
328}
329
330/*******************************************************************************
331 * Cuda kernel for calcualting the coefficients of a potential (density,
332 etc...) for a given pair of gaussian basis sets
333
334We compute the discretized version of the following integral
335
336$$\int _\infty ^\infty V_{ijk} P^\alpha_iP^beta_j P ^ \gamma _ k Exp(- \eta
337|r_{ijk} - r_c|^2)$$
338
339where in practice the summation is over a finite domain. The discrete form has
340this shape
341
342$$
343\sum_{ijk < dmoain} V_{ijk} (x_i - x_c)^\alpha (y_j - y_c)^\beta (z_k - z_c) ^
344\gamma Exp(- \eta |r_{ijk} - r_c|^2)
345$$
346
347where $0 \le \alpha + \beta + \gamma \le lmax$
348
349It is formely the same operation than collocate (from a discrete point of view)
350but the implementation differ because of technical reasons.
351
352So most of the code is the same except the core of the routine that is
353specialized to the integration.
354
355******************************************************************************/
356template <typename T, typename T3, bool distributed__, bool orthogonal_,
357 int lbatch = 10>
358__global__
359__launch_bounds__(64) void integrate_kernel(const kernel_params dev_) {
360 if (dev_.tasks[dev_.first_task + block_index()].skip_task)
361 return;
362
363 const int tid = thread_global_index();
364
365 // __shared__ T dh_inv_[9];
366 __shared__ T dh_[9];
367
368 // for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
369 // dh_inv_[d] = dev_.dh_inv_[d];
370
371 if (tid < 9)
372 dh_[tid] = dev_.dh_[tid];
373
374 __shared__ smem_task_reduced<T, T3> task;
375 fill_smem_task_reduced(dev_, dev_.first_task + block_index(), task);
376
377 if (tid == 0) {
378 setup_task_cube_center<T, T3, distributed__>(dev_, task);
379 }
380 __syncthreads();
381
382 __shared__ T accumulator[lbatch][64];
383
384 // we use a multi pass algorithm here because shared memory usage (or
385 // register) would become too high for high angular momentum
386
387 const short int size_loop =
388 (ncoset(task.lp) / lbatch + ((ncoset(task.lp) % lbatch) != 0)) * lbatch;
389 const short int length = ncoset(task.lp);
390 for (int ico = 0; ico < size_loop; ico += lbatch) {
391#pragma unroll lbatch
392 for (int i = 0; i < lbatch; i++)
393 accumulator[i][tid] = 0.0;
394
395 __syncthreads();
396
397 for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
398 int z2 = wrap_grid_index(z + task.cube_center.z, dev_.grid_full_size_.z);
399
400 if (distributed__) {
401 // known at compile time. Will be stripped away
402 if (task.apply_border_mask) {
403 /* check if the point is within the window */
404 if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
405 continue;
406 }
407 }
408 }
409
410 /* compute the coordinates of the point in atomic coordinates */
411 int ymin = 0;
412 int ymax = task.cube_size.y - 1;
413 T kremain = 0.0;
414
415 if (orthogonal_ && !task.apply_border_mask) {
416 kremain = calculate_ymix_ymax_boundaries(task, z, ymin, ymax);
417 }
418
419 for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
420 int y2 =
421 wrap_grid_index(y + task.cube_center.y, dev_.grid_full_size_.y);
422
423 if (distributed__) {
424 /* check if the point is within the window */
425 if (task.apply_border_mask) {
426 if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
427 continue;
428 }
429 }
430 }
431
432 int xmin = 0;
433 int xmax = task.cube_size.x - 1;
434
435 if (orthogonal_ && !task.apply_border_mask) {
436 calculate_xmin_xmax_boundaries<T, T3>(task, y, kremain, xmin, xmax);
437 }
438
439 for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
440 int x2 =
441 wrap_grid_index(x + task.cube_center.x, dev_.grid_full_size_.x);
442
443 if (distributed__) {
444 /* check if the point is within the window */
445 if (task.apply_border_mask) {
446 if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
447 continue;
448 }
449 }
450 }
451
452 // I make no distinction between orthogonal and non orthogonal
453 // cases
454 T3 r3;
455
456 r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
457 (y + task.lb_cube.y + task.roffset.y),
458 (z + task.lb_cube.z + task.roffset.z));
459 // check if the point is inside the sphere or not. Note that it does
460 // not apply for the orthogonal case when the full sphere is inside
461 // the region of interest.
462 const T r3x2 = r3.x * r3.x;
463 const T r3y2 = r3.y * r3.y;
464 const T r3z2 = r3.z * r3.z;
465
466 if (distributed__) {
467 // check if the point is inside the sphere or not. Note that it does
468 // not apply for the orthorhombic case when the full sphere is
469 // inside the region of interest.
470
471 if (((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)) &&
472 (!orthogonal_ || task.apply_border_mask))
473 continue;
474 } else {
475 // we do not need to do this test for the orthorhombic case
476 if ((!orthogonal_) &&
477 ((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)))
478 continue;
479 }
480
481 // read the next point assuming that reading is non blocking untill
482 // the register is actually needed for computation. This is true on
483 // NVIDIA hardware
484
485 T grid_value =
486 __ldg(&dev_.ptr_dev[1][(z2 * dev_.grid_local_size_.y + y2) *
487 dev_.grid_local_size_.x +
488 x2]);
489
490 const T r3xy = r3.x * r3.y;
491 const T r3xz = r3.x * r3.z;
492 const T r3yz = r3.y * r3.z;
493
494 grid_value *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
495
496 switch (ico / lbatch) {
497 case 0: {
498 accumulator[0][tid] += grid_value;
499
500 if (task.lp >= 1) {
501 accumulator[1][tid] += grid_value * r3.x;
502 accumulator[2][tid] += grid_value * r3.y;
503 accumulator[3][tid] += grid_value * r3.z;
504 }
505
506 if (task.lp >= 2) {
507 accumulator[4][tid] += grid_value * r3x2;
508 accumulator[5][tid] += grid_value * r3xy;
509 accumulator[6][tid] += grid_value * r3xz;
510 accumulator[7][tid] += grid_value * r3y2;
511 accumulator[8][tid] += grid_value * r3yz;
512 accumulator[9][tid] += grid_value * r3z2;
513 }
514 if (task.lp >= 3) {
515 T tmp = grid_value * r3x2;
516 accumulator[10][tid] += tmp * r3.x;
517 accumulator[11][tid] += tmp * r3.y;
518 accumulator[12][tid] += tmp * r3.z;
519 tmp = grid_value * r3.x;
520 accumulator[13][tid] += tmp * r3y2;
521 accumulator[14][tid] += tmp * r3yz;
522 accumulator[15][tid] += tmp * r3z2;
523 tmp = grid_value * r3y2;
524 accumulator[16][tid] += tmp * r3.y;
525 accumulator[17][tid] += tmp * r3.z;
526 tmp = grid_value * r3z2;
527 accumulator[18][tid] += tmp * r3.y;
528 accumulator[19][tid] += tmp * r3.z;
529 }
530 } break;
531 case 1: {
532 if (task.lp >= 4) {
533 T tmp = grid_value * r3x2;
534 accumulator[0][tid] += tmp * r3x2;
535 accumulator[1][tid] += tmp * r3xy;
536 accumulator[2][tid] += tmp * r3xz;
537 accumulator[3][tid] += tmp * r3y2;
538 accumulator[4][tid] += tmp * r3yz;
539 accumulator[5][tid] += tmp * r3z2;
540 tmp = grid_value * r3y2;
541 accumulator[6][tid] += tmp * r3xy;
542 accumulator[7][tid] += tmp * r3xz;
543 tmp = grid_value * r3z2;
544 accumulator[8][tid] += tmp * r3xy;
545 accumulator[9][tid] += tmp * r3xz;
546 }
547 if (task.lp >= 4) {
548 T tmp = grid_value * r3y2;
549 accumulator[10][tid] += tmp * r3y2;
550 accumulator[11][tid] += tmp * r3yz;
551 accumulator[12][tid] += tmp * r3z2;
552 accumulator[13][tid] += grid_value * r3yz * r3z2;
553 accumulator[14][tid] += grid_value * r3z2 * r3z2;
554 }
555 if (task.lp >= 5) {
556 T tmp = grid_value * r3x2 * r3x2;
557 accumulator[15][tid] += tmp * r3.x;
558 accumulator[16][tid] += tmp * r3.y;
559 accumulator[17][tid] += tmp * r3.z;
560 tmp = grid_value * r3x2;
561 accumulator[18][tid] += tmp * r3.x * r3y2;
562 accumulator[19][tid] += tmp * r3xy * r3.z;
563 }
564 } break;
565 case 2: {
566 T tmp = grid_value * r3x2;
567 accumulator[0][tid] += tmp * r3.x * r3z2;
568 accumulator[1][tid] += tmp * r3y2 * r3.y;
569 accumulator[2][tid] += tmp * r3y2 * r3.z;
570 accumulator[3][tid] += tmp * r3.y * r3z2;
571 accumulator[4][tid] += tmp * r3z2 * r3.z;
572 tmp = grid_value * r3.x * r3y2;
573 accumulator[5][tid] += tmp * r3y2;
574 accumulator[6][tid] += tmp * r3yz;
575 accumulator[7][tid] += tmp * r3z2;
576 tmp = grid_value * r3.x * r3z2;
577 accumulator[8][tid] += tmp * r3yz;
578 accumulator[9][tid] += tmp * r3z2;
579 tmp = grid_value * r3y2 * r3.y;
580 accumulator[10][tid] += tmp * r3y2;
581 accumulator[11][tid] += tmp * r3yz;
582 accumulator[12][tid] += tmp * r3z2;
583 accumulator[13][tid] += grid_value * r3y2 * r3z2 * r3.z;
584 accumulator[14][tid] += grid_value * r3.y * r3z2 * r3z2;
585 accumulator[15][tid] += grid_value * r3z2 * r3z2 * r3.z;
586 if (task.lp >= 6) {
587 tmp = grid_value * r3x2 * r3x2;
588 accumulator[16][tid] += tmp * r3x2; // x^6
589 accumulator[17][tid] += tmp * r3xy; // x^5 y
590 accumulator[18][tid] += tmp * r3xz; // x^5 z
591 accumulator[19][tid] += tmp * r3y2; // x^4 y^2
592 }
593 } break;
594 case 3: {
595 T tmp = grid_value * r3x2;
596 accumulator[0][tid] += tmp * r3x2 * r3yz; // x^4 y z
597 accumulator[1][tid] += tmp * r3x2 * r3z2; // x^4 z^2
598 accumulator[2][tid] += tmp * r3y2 * r3xy; // x^3 y^3
599 accumulator[3][tid] += tmp * r3y2 * r3xz; // x^3 y^2 z
600 accumulator[4][tid] += tmp * r3xy * r3z2; // x^3 y z^2
601 accumulator[5][tid] += tmp * r3z2 * r3xz; // x^3 z^3
602 accumulator[6][tid] += tmp * r3y2 * r3y2; // x^2 y^4
603 accumulator[7][tid] += tmp * r3y2 * r3yz; // x^3 y^2 z
604 accumulator[8][tid] += tmp * r3y2 * r3z2; // x^2 y^2 z^2
605 accumulator[9][tid] += tmp * r3z2 * r3yz; // x^2 y z^3
606 accumulator[10][tid] += tmp * r3z2 * r3z2; // x^2 z^4
607 tmp = grid_value * r3y2 * r3y2;
608 accumulator[11][tid] += tmp * r3xy; // x y^5
609 accumulator[12][tid] += tmp * r3xz; // x y^4 z
610 accumulator[13][tid] +=
611 grid_value * r3y2 * r3xy * r3z2; // x y^3 z^2
612 accumulator[14][tid] +=
613 grid_value * r3y2 * r3z2 * r3xz; // x y^2 z^3
614 accumulator[15][tid] += grid_value * r3xy * r3z2 * r3z2; // x y z^4
615 accumulator[16][tid] += grid_value * r3z2 * r3z2 * r3xz; // x z^5
616 accumulator[17][tid] += tmp * r3y2; // y^6
617 accumulator[18][tid] += tmp * r3yz; // y^5 z
618 accumulator[19][tid] += tmp * r3z2; // y^4 z^2
619 } break;
620 default:
621 for (int ic = 0; (ic < lbatch) && ((ic + ico) < length); ic++) {
622 auto &co = coset_inv[ic + ico];
623 T tmp = 1.0;
624 for (int po = 0; po < (co.l[2] >> 1); po++)
625 tmp *= r3z2;
626 if (co.l[2] & 0x1)
627 tmp *= r3.z;
628 for (int po = 0; po < (co.l[1] >> 1); po++)
629 tmp *= r3y2;
630 if (co.l[1] & 0x1)
631 tmp *= r3.y;
632 for (int po = 0; po < (co.l[0] >> 1); po++)
633 tmp *= r3x2;
634 if (co.l[0] & 0x1)
635 tmp *= r3.x;
636 accumulator[ic][tid] += tmp * grid_value;
637 }
638 break;
639 }
640 }
641 }
642 }
643
644 const int max_i = min(length - ico, lbatch);
645
646 __syncthreads();
647
648 // we know there is only 1 wavefront in each block lbatch threads could
649 // reduce the values saved by all threads of the warp and save results do a
650 // shuffle_down by hand
651
652 if (tid < 32) {
653 for (int i = 0; i < max_i; i++) {
654 // Load local value
655 T val = accumulator[i][tid] + accumulator[i][tid + 32];
656
657 // Warp-level reduction (32 lanes) After this loop, lane 0 of warp 0
658 // holds the result All threads should execute this operation so
659 // __activemask() is incorrect here.
660
661 // The ifdef statement is needed because hip does not necessarily
662 // support the instruction either. Reverting back to __shfl_down is
663 // required.
664 //
665 for (int offset = 16; offset > 0; offset >>= 1) {
666#if defined(__CUDACC__)
667 val += __shfl_down_sync(0xffffffff, val, offset);
668#else
669 val += __shfl_down(val, offset);
670#endif
671 }
672
673 if (tid == 0)
674 accumulator[i][0] = val;
675 }
676 __syncwarp();
677 }
678
679 if (tid < min(length - ico, lbatch))
680 dev_.ptr_dev[2][dev_.tasks[dev_.first_task + block_index()].coef_offset +
681 tid + ico] = accumulator[tid][0];
682 __syncthreads();
683 }
684}
685
686/*******************************************************************************
687 * \brief Launches the Cuda kernel that integrates all tasks of one grid level.
688 ******************************************************************************/
689void context_info::integrate_one_grid_level(const int level, int *lp_diff) {
690 if (number_of_tasks_per_level_[level] == 0)
691 return;
693
694 // Compute max angular momentum.
695 const ldiffs_value ldiffs =
697
698 smem_parameters smem_params(ldiffs, lmax());
699
700 *lp_diff = smem_params.lp_diff();
702
703 kernel_params params = set_kernel_parameters(level, smem_params);
704
705 /* WARNING : if you change the block size please be aware of that the number
706 * of warps is hardcoded when we do the block reduction in the integrate
707 * kernel. The number of warps should be explicitly indicated in the
708 * templating parameters or simply adjust the switch statements inside the
709 * integrate kernels */
710
711 const dim3 threads_per_block(4, 4, 4);
712 if (grid_[level].is_distributed()) {
713 if (grid_[level].is_orthogonal()) {
714 integrate_kernel<double, double3, true, true, 20>
715 <<<number_of_tasks_per_level_[level], threads_per_block, 0,
716 level_streams[level]>>>(params);
717 } else {
718 integrate_kernel<double, double3, true, false, 20>
719 <<<number_of_tasks_per_level_[level], threads_per_block, 0,
720 level_streams[level]>>>(params);
721 }
722 } else {
723 if (grid_[level].is_orthogonal()) {
724 integrate_kernel<double, double3, false, true, 20>
725 <<<number_of_tasks_per_level_[level], threads_per_block, 0,
726 level_streams[level]>>>(params);
727 } else {
728 integrate_kernel<double, double3, false, false, 20>
729 <<<number_of_tasks_per_level_[level], threads_per_block, 0,
730 level_streams[level]>>>(params);
731 }
732 }
733}
734
736
738
739 // Compute max angular momentum.
740 const ldiffs_value ldiffs =
742
743 smem_parameters smem_params(ldiffs, lmax());
745
746 kernel_params params = set_kernel_parameters(-1, smem_params);
747
748 /* WARNING if you change the block size. The number
749 * of warps is hardcoded when we do the block reduction in the integrate
750 * kernel. */
751
752 const dim3 threads_per_block(4, 4, 4);
753
754 if (!compute_tau && !calculate_forces) {
755 compute_hab<double, false>
756 <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
757 this->main_stream>>>(params);
758 return;
759 }
760
762 compute_hab<double, true>
763 <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
764 this->main_stream>>>(params);
765 }
766
768 compute_hab_forces<double, double3, false>
769 <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
770 this->main_stream>>>(params);
771 return;
772 }
773
775 compute_hab_forces<double, double3, true>
776 <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
777 this->main_stream>>>(params);
778 }
779}
780}; // namespace rocm_backend
std::vector< grid_info< double > > grid_
std::vector< offloadStream_t > level_streams
std::vector< int > number_of_tasks_per_level_
void integrate_one_grid_level(const int level, int *lp_diff)
Launches the Cuda kernel that integrates all tasks of one grid level.
static void const int const int i
__device__ __inline__ T block_reduce_64(T *table, const T val_, const int tid)
ldiffs_value process_get_ldiffs(bool calculate_forces, bool calculate_virial, bool compute_tau)
Returns difference in angular momentum range for given flags.
__inline__ __device__ unsigned int block_index()
__device__ __inline__ void fill_smem_task_coef(const kernel_params &dev, const int task_id, smem_task< T > &task)
Copies a task from global to shared memory and does precomputations.
__inline__ __device__ unsigned int thread_global_index()
static void init_constant_memory()
Initializes the device's constant memory.
__device__ static __inline__ void cxyz_to_cab(const smem_task< T > &task, const T *__restrict__ alpha, const T *__restrict__ cxyz, T *__restrict__ cab)
Transforms coefficients C_xyz into C_ab.
__constant__ orbital coset_inv[1330]
__inline__ __device__ int wrap_grid_index(const int idx, const int full_size)
Wraps a cube-local coordinate into the periodic full grid along one axis. Single point of entry share...
__inline__ __device__ void compute_alpha(const smem_task< T > &task, T *__restrict__ alpha)
Computes the polynomial expansion coefficients: (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,...
__device__ __inline__ void fill_smem_task_reduced(const kernel_params &dev, const int task_id, smem_task_reduced< T, T3 > &task)
Copies a task from global to shared memory.
__inline__ __device__ double3 compute_coordinates(const double *__restrict__ dh_, const double x, const double y, const double z)
__global__ __launch_bounds__(64) void calculate_coefficients(const kernel_params dev_)
__device__ __inline__ T calculate_ymix_ymax_boundaries(smem_task_reduced< T, T3 > &task, const int z, int &ymin, int &ymax)
Parameters of the collocate kernel.
Differences in angular momentum.
data needed for collocate and integrate kernels
data needed for calculating the coefficients, forces, and stress