grid_hip_integrate.cu
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8/*
9 * inspirations from the GPU backend
10 * Authors :
11 - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
12 - Advanced Micro Devices, Inc.
13*/
14
15#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)
16
17#include <algorithm>
18#include <assert.h>
19#include <hip/hip_runtime.h>
20#include <limits.h>
21#include <math.h>
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25
26#include "grid_hip_context.h"
29
30#ifdef __HIP_PLATFORM_NVIDIA__
31#include <cooperative_groups.h>
32#if CUDA_VERSION >= 11000
33#include <cooperative_groups/reduce.h>
34#endif
35namespace cg = cooperative_groups;
36#endif
37
38#if defined(_OMP_H)
39#error "OpenMP should not be used in .cu files to accommodate HIP."
40#endif
41
42namespace rocm_backend {
43// do a warp reduction and return the final sum to thread_id = 0
44template <typename T>
45__device__ __inline__ T warp_reduce(T *table, const int tid) {
46 // AMD GPUs have a warp size of 64 while NVIDIA GPUs have a warp size of 32,
47 // so the first step is common to both platforms.
48 if (tid < 32) {
49 table[tid] += table[tid + 32];
50 }
51 __syncthreads();
52 if (tid < 16) {
53 table[tid] += table[tid + 16];
54 }
55 __syncthreads();
56 if (tid < 8) {
57 table[tid] += table[tid + 8];
58 }
59 __syncthreads();
60 if (tid < 4) {
61 table[tid] += table[tid + 4];
62 }
63 __syncthreads();
64 if (tid < 2) {
65 table[tid] += table[tid + 2];
66 }
67 __syncthreads();
68 return (table[0] + table[1]);
69}
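// A sketch only (not part of the original file): the "shuffle_down" comments
// further down suggest a register-based alternative to the shared-memory
// reduction above. Assuming __shfl_down is available for T (it is for double
// under HIP), a reduction over a single wavefront/warp could look like this;
// on NVIDIA (warpSize == 32) the first cross-warp step through shared memory,
// as done in warp_reduce, would still be needed for a 64-thread block.
template <typename T>
__device__ __inline__ T warp_reduce_shfl(T value) {
  for (int offset = warpSize / 2; offset > 0; offset >>= 1)
    value += __shfl_down(value, offset); // add the value held by lane tid + offset
  return value; // lane 0 ends up holding the sum over the whole wavefront
}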
70// #endif
71
72template <typename T, typename T3, bool COMPUTE_TAU, bool CALCULATE_FORCES>
73__global__ __launch_bounds__(64) void compute_hab_v2(const kernel_params dev_) {
74 // Copy task from global to shared memory and precompute some stuff.
75 extern __shared__ T shared_memory[];
76 // T *smem_cab = &shared_memory[dev_.smem_cab_offset];
77 T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
78 const int tid =
79 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
80 const int offset = dev_.sorted_blocks_offset_dev[blockIdx.x];
81 const int number_of_tasks = dev_.num_tasks_per_block_dev[blockIdx.x];
82
83 if (number_of_tasks == 0)
84 return;
85
86 T fa[3], fb[3];
87 T virial[9];
88
89 fa[0] = 0.0;
90 fa[1] = 0.0;
91 fa[2] = 0.0;
92 fb[0] = 0.0;
93 fb[1] = 0.0;
94 fb[2] = 0.0;
95
96 virial[0] = 0.0;
97 virial[1] = 0.0;
98 virial[2] = 0.0;
99 virial[3] = 0.0;
100 virial[4] = 0.0;
101 virial[5] = 0.0;
102 virial[6] = 0.0;
103 virial[7] = 0.0;
104 virial[8] = 0.0;
105
106 for (int tk = 0; tk < number_of_tasks; tk++) {
107 __shared__ smem_task<T> task;
108 const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
109 if (dev_.tasks[task_id].skip_task)
110 continue;
111 fill_smem_task_coef(dev_, task_id, task);
112
113 T *coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
114 T *smem_cab = &dev_.ptr_dev[6][dev_.tasks[task_id].cab_offset];
115
116 compute_alpha(task, smem_alpha);
117 __syncthreads();
118 cxyz_to_cab(task, smem_alpha, coef_, smem_cab);
119 __syncthreads();
120 for (int jco = task.first_cosetb; jco < task.ncosetb; jco++) {
121 const auto &b = coset_inv[jco];
122 for (int ico = task.first_coseta; ico < task.ncoseta; ico++) {
123 const auto &a = coset_inv[ico];
124 const T hab = get_hab<COMPUTE_TAU, T>(a, b, task.zeta, task.zetb,
125 task.n1, smem_cab);
126 __shared__ T shared_forces_a[3];
127 __shared__ T shared_forces_b[3];
128 __shared__ T shared_virial[9];
129
130 if (CALCULATE_FORCES) {
131 if (tid < 3) {
132 shared_forces_a[tid] = get_force_a<COMPUTE_TAU, T>(
133 a, b, tid, task.zeta, task.zetb, task.n1, smem_cab);
134 shared_forces_b[tid] = get_force_b<COMPUTE_TAU, T>(
135 a, b, tid, task.zeta, task.zetb, task.rab, task.n1, smem_cab);
136 }
137 if ((tid < 9) && (dev_.ptr_dev[5] != nullptr)) {
138 shared_virial[tid] =
139 get_virial_a<COMPUTE_TAU, T>(a, b, tid / 3, tid % 3, task.zeta,
140 task.zetb, task.n1, smem_cab) +
141 get_virial_b<COMPUTE_TAU, T>(a, b, tid / 3, tid % 3, task.zeta,
142 task.zetb, task.rab, task.n1,
143 smem_cab);
144 }
145 }
146 __syncthreads();
147 for (int i = tid / 8; i < task.nsgf_setb; i += 8) {
148 const T sphib = task.sphib[i * task.maxcob + jco];
149 for (int j = tid % 8; j < task.nsgf_seta; j += 8) {
150 const T sphia_times_sphib =
151 task.sphia[j * task.maxcoa + ico] * sphib;
152 // T block_val = hab * sphia_times_sphib;
153
154 if (CALCULATE_FORCES) {
155 T block_val1 = 0.0;
156 if (task.block_transposed) {
157 block_val1 =
158 task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
159 } else {
160 block_val1 =
161 task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
162 }
163 for (int k = 0; k < 3; k++) {
164 fa[k] += block_val1 * sphia_times_sphib * shared_forces_a[k];
165 // get_force_a<COMPUTE_TAU, T>(
166 // a, b, k, task.zeta, task.zetb, task.n1, smem_cab);
167 fb[k] += block_val1 * sphia_times_sphib * shared_forces_b[k];
168 // get_force_b<COMPUTE_TAU, T>(a, b, k, task.zeta, task.zetb,
169 // task.rab, task.n1, smem_cab);
170 }
171 if (dev_.ptr_dev[5] != nullptr) {
172 for (int k = 0; k < 3; k++) {
173 for (int l = 0; l < 3; l++) {
174 virial[3 * k + l] += shared_virial[3 * k + l] * block_val1 *
175 sphia_times_sphib;
176 }
177 }
178 }
179 }
180 // we can use shuffle_down if it exists for T
181
182 // these atomic operations are not needed since each block element is
183 // updated by a single thread. However, the grid_miniapp will
184 // fail if they are not there.
185 if (task.block_transposed) {
186 task.hab_block[j * task.nsgfb + i] += hab * sphia_times_sphib;
187 // atomicAdd(task.hab_block + j * task.nsgfb + i, hab
188 // * sphia_times_sphib);
189 } else {
190 task.hab_block[i * task.nsgfa + j] += hab * sphia_times_sphib;
191 // atomicAdd(task.hab_block + i * task.nsgfa + j, hab *
192 // sphia_times_sphib);
193 }
194 }
195 }
196 __syncthreads();
197 }
198 }
199 }
200
201 if (CALCULATE_FORCES) {
202 const int task_id = dev_.task_sorted_by_blocks_dev[offset];
203 const auto &glb_task = dev_.tasks[task_id];
204 const int iatom = glb_task.iatom;
205 const int jatom = glb_task.jatom;
206 T *forces_a = &dev_.ptr_dev[4][3 * iatom];
207 T *forces_b = &dev_.ptr_dev[4][3 * jatom];
208
209 T *sum = (T *)shared_memory;
210 if (dev_.ptr_dev[5] != nullptr) {
211
212 for (int i = 0; i < 9; i++) {
213 sum[tid] = virial[i];
214 __syncthreads();
215
216 virial[i] = warp_reduce<T>(sum, tid);
217 __syncthreads();
218
219 if (tid == 0)
220 atomicAdd(dev_.ptr_dev[5] + i, virial[i]);
221 __syncthreads();
222 }
223 }
224
225 for (int i = 0; i < 3; i++) {
226 sum[tid] = fa[i];
227 __syncthreads();
228 fa[i] = warp_reduce<T>(sum, tid);
229 __syncthreads();
230 if (tid == 0)
231 atomicAdd(forces_a + i, fa[i]);
232 __syncthreads();
233
234 sum[tid] = fb[i];
235 __syncthreads();
236 fb[i] = warp_reduce<T>(sum, tid);
237 __syncthreads();
238 if (tid == 0)
239 atomicAdd(forces_b + i, fb[i]);
240 __syncthreads();
241 }
242 }
243}
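/*
 * Reading aid for the kernel above: per task, the nested jco/ico/i/j loops
 * perform the basis contraction (indices as in the code, non-transposed case)
 *
 * $$ \mathrm{hab\_block}[i,j] \mathrel{+}= \sum_{\mathrm{jco},\mathrm{ico}}
 *    \mathrm{sphib}[i,\mathrm{jco}] \, \mathrm{hab}(\mathrm{ico},\mathrm{jco})
 *    \, \mathrm{sphia}[j,\mathrm{ico}] $$
 *
 * with i running over nsgf_setb and j over nsgf_seta. When CALCULATE_FORCES
 * is set, the same sphia*sphib weight, multiplied by the matching pab_block
 * element, scales the force and virial contributions accumulated in fa, fb
 * and virial before the final warp_reduce and atomicAdd.
 */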
244
245/*******************************************************************************
246 * CUDA kernel for calculating the coefficients of a potential (density,
247 etc...) for a given pair of Gaussian basis sets
248
249We compute the discretized version of the following integral
250
251$$\int_{-\infty}^{\infty} V_{ijk} P^\alpha_i P^\beta_j P^\gamma_k \exp(-\eta
252|r_{ijk} - r_c|^2)$$
253
254where in practice the summation is over a finite domain. The discrete form has
255this shape
256
257$$
258\sum_{ijk \in \mathrm{domain}} V_{ijk} (x_i - x_c)^\alpha (y_j - y_c)^\beta
259(z_k - z_c)^\gamma \exp(-\eta |r_{ijk} - r_c|^2)
260$$
261
262where $0 \le \alpha + \beta + \gamma \le l_{max}$.
263
264It is formally the same operation as collocate (from a discrete point of view)
265but the implementations differ for technical reasons.
266
267So most of the code is the same except for the core of the routine, which is
268specialized for integration.
269
270******************************************************************************/
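/*
 * The number of Cartesian coefficients (alpha, beta, gamma) with
 * alpha + beta + gamma <= l is the tetrahedral number (l+1)(l+2)(l+3)/6,
 * which is why the kernel below works through them in batches of
 * lbatch = 10. A minimal host-side illustration (ncoset_ref is a
 * hypothetical stand-in for the ncoset helper used in the kernel):
 */
constexpr int ncoset_ref(const int l) {
  return (l + 1) * (l + 2) * (l + 3) / 6;
}
static_assert(ncoset_ref(2) == 10, "one batch of 10 covers everything up to l = 2");
static_assert(ncoset_ref(6) == 84, "l = 6 already requires 9 passes of lbatch = 10");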
271template <typename T, typename T3, bool distributed__, bool orthorhombic_,
272 int lbatch = 10>
273__global__
274__launch_bounds__(64) void integrate_kernel(const kernel_params dev_) {
275 if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
276 return;
277
278 const int tid =
279 threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
280
281 __shared__ T dh_inv_[9];
282 __shared__ T dh_[9];
283
284 for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
285 dh_inv_[d] = dev_.dh_inv_[d];
286
287 for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
288 dh_[d] = dev_.dh_[d];
289
290 __shared__ smem_task_reduced<T, T3> task;
291 fill_smem_task_reduced(dev_, dev_.first_task + blockIdx.x, task);
292
293 if (tid == 0) {
294 task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
295 task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
296 task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];
297
298 if (distributed__) {
299 if (task.apply_border_mask) {
300 compute_window_size(
301 dev_.grid_local_size_,
302 dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
303 dev_.grid_border_width_, &task.window_size, &task.window_shift);
304 }
305 }
306 }
307 __syncthreads();
308
309 __shared__ T accumulator[lbatch][64];
310 __shared__ T sum[lbatch];
311
312 // we use a multi-pass algorithm here because the shared memory (or
313 // register) usage would become too high for high angular momenta
314
315 const short int size_loop =
316 (ncoset(task.lp) / lbatch + ((ncoset(task.lp) % lbatch) != 0)) * lbatch;
317 const short int length = ncoset(task.lp);
318 for (int ico = 0; ico < size_loop; ico += lbatch) {
319#pragma unroll lbatch
320 for (int i = 0; i < lbatch; i++)
321 accumulator[i][tid] = 0.0;
322
323 __syncthreads();
324
325 for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
326 int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];
327
328 if (z2 < 0)
329 z2 += dev_.grid_full_size_[0];
330
331 if (distributed__) {
332 // known at compile time. Will be stripped away
333 if (task.apply_border_mask) {
334 /* check if the point is within the window */
335 if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
336 continue;
337 }
338 }
339 }
340
341 /* compute the coordinates of the point in atomic coordinates */
342 int ymin = 0;
343 int ymax = task.cube_size.y - 1;
344 T kremain = 0.0;
345
346 if (orthorhombic_ && !task.apply_border_mask) {
347 ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
348 ymin *= ymin;
349 kremain = task.discrete_radius * task.discrete_radius -
350 ymin * dh_[8] * dh_[8];
351 ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
352 ymax = 1 - ymin - task.lb_cube.y;
353 ymin = ymin - task.lb_cube.y;
354 }
355
356 for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
357 int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];
358
359 if (y2 < 0)
360 y2 += dev_.grid_full_size_[1];
361
362 if (distributed__) {
363 /* check if the point is within the window */
364 if (task.apply_border_mask) {
365 if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
366 continue;
367 }
368 }
369 }
370
371 int xmin = 0;
372 int xmax = task.cube_size.x - 1;
373
374 if (orthorhombic_ && !task.apply_border_mask) {
375 xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
376 xmin *= xmin;
377 xmin =
378 ceil(-1.0e-8 - sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
379 dh_inv_[0]);
380 xmax = 1 - xmin - task.lb_cube.x;
381 xmin -= task.lb_cube.x;
382 }
383
384 for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
385 int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];
386
387 if (x2 < 0)
388 x2 += dev_.grid_full_size_[2];
389
390 if (distributed__) {
391 /* check if the point is within the window */
392 if (task.apply_border_mask) {
393 if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
394 continue;
395 }
396 }
397 }
398
399 // I make no distinction between the orthorhombic and non-orthorhombic
400 // cases
401 T3 r3;
402
403 if (orthorhombic_) {
404 r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
405 r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
406 r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
407 } else {
408 r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
409 (y + task.lb_cube.y + task.roffset.y),
410 (z + task.lb_cube.z + task.roffset.z));
411 }
412 // check whether the point is inside the sphere. Note that this does
413 // not apply to the orthorhombic case when the full sphere is inside
414 // the region of interest.
415
416 if (distributed__) {
417 if (((task.radius * task.radius) <=
418 (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)) &&
419 (!orthorhombic_ || task.apply_border_mask))
420 continue;
421 } else {
422 if (!orthorhombic_) {
423 if ((task.radius * task.radius) <=
424 (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z))
425 continue;
426 }
427 }
428
429 // read the next point, assuming that the read is non-blocking until
430 // the register is actually needed for computation. This is true on
431 // NVIDIA hardware.
432
433 T grid_value =
434 __ldg(&dev_.ptr_dev[1][(z2 * dev_.grid_local_size_[1] + y2) *
435 dev_.grid_local_size_[2] +
436 x2]);
437
438 const T r3x2 = r3.x * r3.x;
439 const T r3y2 = r3.y * r3.y;
440 const T r3z2 = r3.z * r3.z;
441
442 const T r3xy = r3.x * r3.y;
443 const T r3xz = r3.x * r3.z;
444 const T r3yz = r3.y * r3.z;
445
446 grid_value *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
447
448 switch (ico / lbatch) {
449 case 0: {
450 accumulator[0][tid] += grid_value;
451
452 if (task.lp >= 1) {
453 accumulator[1][tid] += grid_value * r3.x;
454 accumulator[2][tid] += grid_value * r3.y;
455 accumulator[3][tid] += grid_value * r3.z;
456 }
457
458 if (task.lp >= 2) {
459 accumulator[4][tid] += grid_value * r3x2;
460 accumulator[5][tid] += grid_value * r3xy;
461 accumulator[6][tid] += grid_value * r3xz;
462 accumulator[7][tid] += grid_value * r3y2;
463 accumulator[8][tid] += grid_value * r3yz;
464 accumulator[9][tid] += grid_value * r3z2;
465 }
466 } break;
467 case 1: {
468 if (task.lp >= 3) {
469 accumulator[0][tid] += grid_value * r3x2 * r3.x;
470 accumulator[1][tid] += grid_value * r3x2 * r3.y;
471 accumulator[2][tid] += grid_value * r3x2 * r3.z;
472 accumulator[3][tid] += grid_value * r3.x * r3y2;
473 accumulator[4][tid] += grid_value * r3xy * r3.z;
474 accumulator[5][tid] += grid_value * r3.x * r3z2;
475 accumulator[6][tid] += grid_value * r3y2 * r3.y;
476 accumulator[7][tid] += grid_value * r3y2 * r3.z;
477 accumulator[8][tid] += grid_value * r3.y * r3z2;
478 accumulator[9][tid] += grid_value * r3z2 * r3.z;
479 }
480 } break;
481 case 2: {
482 if (task.lp >= 4) {
483 accumulator[0][tid] += grid_value * r3x2 * r3x2;
484 accumulator[1][tid] += grid_value * r3x2 * r3xy;
485 accumulator[2][tid] += grid_value * r3x2 * r3xz;
486 accumulator[3][tid] += grid_value * r3x2 * r3y2;
487 accumulator[4][tid] += grid_value * r3x2 * r3yz;
488 accumulator[5][tid] += grid_value * r3x2 * r3z2;
489 accumulator[6][tid] += grid_value * r3xy * r3y2;
490 accumulator[7][tid] += grid_value * r3xz * r3y2;
491 accumulator[8][tid] += grid_value * r3xy * r3z2;
492 accumulator[9][tid] += grid_value * r3xz * r3z2;
493 }
494 } break;
495 case 3: {
496 if (task.lp >= 4) {
497 accumulator[0][tid] += grid_value * r3y2 * r3y2;
498 accumulator[1][tid] += grid_value * r3y2 * r3yz;
499 accumulator[2][tid] += grid_value * r3y2 * r3z2;
500 accumulator[3][tid] += grid_value * r3yz * r3z2;
501 accumulator[4][tid] += grid_value * r3z2 * r3z2;
502 }
503 if (task.lp >= 5) {
504 accumulator[5][tid] += grid_value * r3x2 * r3x2 * r3.x;
505 accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3.y;
506 accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3.z;
507 accumulator[8][tid] += grid_value * r3x2 * r3.x * r3y2;
508 accumulator[9][tid] += grid_value * r3x2 * r3xy * r3.z;
509 }
510 } break;
511 case 4: {
512 accumulator[0][tid] += grid_value * r3x2 * r3.x * r3z2;
513 accumulator[1][tid] += grid_value * r3x2 * r3y2 * r3.y;
514 accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3.z;
515 accumulator[3][tid] += grid_value * r3x2 * r3.y * r3z2;
516 accumulator[4][tid] += grid_value * r3x2 * r3z2 * r3.z;
517 accumulator[5][tid] += grid_value * r3.x * r3y2 * r3y2;
518 accumulator[6][tid] += grid_value * r3.x * r3y2 * r3yz;
519 accumulator[7][tid] += grid_value * r3.x * r3y2 * r3z2;
520 accumulator[8][tid] += grid_value * r3xy * r3z2 * r3.z;
521 accumulator[9][tid] += grid_value * r3.x * r3z2 * r3z2;
522 } break;
523 case 5: {
524 accumulator[0][tid] += grid_value * r3y2 * r3y2 * r3.y;
525 accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3.z;
526 accumulator[2][tid] += grid_value * r3y2 * r3.y * r3z2;
527 accumulator[3][tid] += grid_value * r3y2 * r3z2 * r3.z;
528 accumulator[4][tid] += grid_value * r3.y * r3z2 * r3z2;
529 accumulator[5][tid] += grid_value * r3z2 * r3z2 * r3.z;
530 if (task.lp >= 6) {
531 accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3x2; // x^6
532 accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3xy; // x^5 y
533 accumulator[8][tid] += grid_value * r3x2 * r3x2 * r3xz; // x^5 z
534 accumulator[9][tid] += grid_value * r3x2 * r3x2 * r3y2; // x^4 y^2
535 }
536 } break;
537 case 6: {
538 accumulator[0][tid] += grid_value * r3x2 * r3x2 * r3yz; // x^4 y z
539 accumulator[1][tid] += grid_value * r3x2 * r3x2 * r3z2; // x^4 z^2
540 accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3xy; // x^3 y^3
541 accumulator[3][tid] += grid_value * r3x2 * r3y2 * r3xz; // x^3 y^2 z
542 accumulator[4][tid] += grid_value * r3x2 * r3xy * r3z2; // x^3 y z^2
543 accumulator[5][tid] += grid_value * r3x2 * r3z2 * r3xz; // x^3 z^3
544 accumulator[6][tid] += grid_value * r3x2 * r3y2 * r3y2; // x^2 y^4
545 accumulator[7][tid] += grid_value * r3x2 * r3y2 * r3yz; // x^2 y^3 z
546 accumulator[8][tid] +=
547 grid_value * r3x2 * r3y2 * r3z2; // x^2 y^2 z^2
548 accumulator[9][tid] += grid_value * r3x2 * r3z2 * r3yz; // x^2 y z^3
549 } break;
550 case 7: {
551 accumulator[0][tid] += grid_value * r3x2 * r3z2 * r3z2; // x^2 z^4
552 accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3xy; // x y^5
553 accumulator[2][tid] += grid_value * r3y2 * r3y2 * r3xz; // x y^4 z
554 accumulator[3][tid] += grid_value * r3y2 * r3xy * r3z2; // x y^3 z^2
555 accumulator[4][tid] += grid_value * r3y2 * r3z2 * r3xz; // x y^2 z^3
556 accumulator[5][tid] += grid_value * r3xy * r3z2 * r3z2; // x y z^4
557 accumulator[6][tid] += grid_value * r3z2 * r3z2 * r3xz; // x z^5
558 accumulator[7][tid] += grid_value * r3y2 * r3y2 * r3y2; // y^6
559 accumulator[8][tid] += grid_value * r3y2 * r3y2 * r3yz; // y^5 z
560 accumulator[9][tid] += grid_value * r3y2 * r3y2 * r3z2; // y^4 z^2
561 } break;
562 default:
563 for (int ic = 0; (ic < lbatch) && ((ic + ico) < length); ic++) {
564 auto &co = coset_inv[ic + ico];
565 T tmp = 1.0;
566 for (int po = 0; po < (co.l[2] >> 1); po++)
567 tmp *= r3z2;
568 if (co.l[2] & 0x1)
569 tmp *= r3.z;
570 for (int po = 0; po < (co.l[1] >> 1); po++)
571 tmp *= r3y2;
572 if (co.l[1] & 0x1)
573 tmp *= r3.y;
574 for (int po = 0; po < (co.l[0] >> 1); po++)
575 tmp *= r3x2;
576 if (co.l[0] & 0x1)
577 tmp *= r3.x;
578 accumulator[ic][tid] += tmp * grid_value;
579 }
580 break;
581 }
582 }
583 }
584 }
585
586 __syncthreads();
587
588 // we know there is only 1 wavefront in each block.
589 // lbatch threads could reduce the values saved by all threads of the warp
590 // and store the results; here we do the shuffle_down by hand.
591 for (int i = 0; i < min(length - ico, lbatch); i++) {
592 if (tid < 32) {
593 accumulator[i][tid] += accumulator[i][tid + 32];
594 }
595 __syncthreads();
596 if (tid < 16) {
597 accumulator[i][tid] += accumulator[i][tid + 16];
598 }
599 __syncthreads();
600 if (tid < 8) {
601 accumulator[i][tid] += accumulator[i][tid + 8];
602 }
603 __syncthreads();
604 if (tid < 4) {
605 accumulator[i][tid] += accumulator[i][tid + 4];
606 }
607 __syncthreads();
608 if (tid < 2) {
609 accumulator[i][tid] += accumulator[i][tid + 2];
610 }
611 __syncthreads();
612 if (tid == 0)
613 sum[i] = accumulator[i][0] + accumulator[i][1];
614 }
615 __syncthreads();
616
617 for (int i = tid; i < min(length - ico, lbatch); i += lbatch)
618 dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset + i +
619 ico] = sum[i];
620 __syncthreads();
621 // if (tid == 0)
622 // printf("%.15lf\n", sum[0]);
623 }
624}
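/*
 * Reading note for the orthorhombic branch of the kernel above: with the
 * diagonal grid spacings h_x = dh_[0], h_y = dh_[4], h_z = dh_[8] and
 * R_d = task.discrete_radius, the loop bounds come from keeping only the
 * grid points inside the sphere of radius R_d around the cube center:
 *
 * $$ |y| \le \sqrt{R_d^2 - z^2 h_z^2} / h_y, \qquad
 *    |x| \le \sqrt{R_d^2 - z^2 h_z^2 - y^2 h_y^2} / h_x $$
 *
 * which is what kremain, ymin/ymax and xmin/xmax implement (shifted by
 * lb_cube so the loop indices start at zero).
 */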
625
626kernel_params
627context_info::set_kernel_parameters(const int level,
628 const smem_parameters &smem_params) {
629 kernel_params params;
630 params.smem_cab_offset = smem_params.smem_cab_offset();
631 params.smem_alpha_offset = smem_params.smem_alpha_offset();
632 params.first_task = 0;
633
634 params.la_min_diff = smem_params.ldiffs().la_min_diff;
635 params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
636
637 params.la_max_diff = smem_params.ldiffs().la_max_diff;
638 params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
639 params.first_task = 0;
640 params.tasks = this->tasks_dev.data();
641 params.task_sorted_by_blocks_dev = task_sorted_by_blocks_dev.data();
642 params.sorted_blocks_offset_dev = sorted_blocks_offset_dev.data();
643 params.num_tasks_per_block_dev = this->num_tasks_per_block_dev_.data();
644 params.block_offsets = this->block_offsets_dev.data();
645 params.la_min_diff = smem_params.ldiffs().la_min_diff;
646 params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
647 params.la_max_diff = smem_params.ldiffs().la_max_diff;
648 params.lb_max_diff = smem_params.ldiffs().lb_max_diff;
649
650 params.ptr_dev[0] = pab_block_.data();
651
652 if (level >= 0) {
653 params.ptr_dev[1] = grid_[level].data();
654 for (int i = 0; i < 3; i++) {
655 memcpy(params.dh_, grid_[level].dh(), 9 * sizeof(double));
656 memcpy(params.dh_inv_, grid_[level].dh_inv(), 9 * sizeof(double));
657 params.grid_full_size_[i] = grid_[level].full_size(i);
658 params.grid_local_size_[i] = grid_[level].local_size(i);
659 params.grid_lower_corner_[i] = grid_[level].lower_corner(i);
660 params.grid_border_width_[i] = grid_[level].border_width(i);
661 }
662 params.first_task = first_task_per_level_[level];
663 }
664 params.ptr_dev[2] = this->coef_dev_.data();
665 params.ptr_dev[3] = hab_block_.data();
666 params.ptr_dev[4] = forces_.data();
667 params.ptr_dev[5] = virial_.data();
668 params.ptr_dev[6] = this->cab_dev_.data();
669 params.sphi_dev = this->sphi_dev.data();
670 return params;
671}
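/*
 * Reading aid (hypothetical naming only, the kernels index ptr_dev
 * numerically): the slots filled in set_kernel_parameters above point to
 */
enum dev_ptr_slot {
  PTR_PAB = 0,    // pab_block_   : density matrix blocks (integrate input)
  PTR_GRID = 1,   // grid_[level] : real-space grid values
  PTR_COEF = 2,   // coef_dev_    : per-task cxyz coefficients
  PTR_HAB = 3,    // hab_block_   : hab matrix blocks (output)
  PTR_FORCES = 4, // forces_      : per-atom forces
  PTR_VIRIAL = 5, // virial_      : 9-component virial (may be nullptr)
  PTR_CAB = 6     // cab_dev_     : per-task cab scratch
};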
672
673/*******************************************************************************
674 * \brief Launches the CUDA kernel that integrates all tasks of one grid level.
675 ******************************************************************************/
676void context_info::integrate_one_grid_level(const int level, int *lp_diff) {
677 if (number_of_tasks_per_level_[level] == 0)
678 return;
680
681 // Compute max angular momentum.
682 const ldiffs_value ldiffs =
683 process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);
684
685 smem_parameters smem_params(ldiffs, lmax());
686
687 *lp_diff = smem_params.lp_diff();
689
690 kernel_params params = set_kernel_parameters(level, smem_params);
691
692 /* WARNING : if you change the block size, please be aware that the number
693 * of warps is hardcoded where we do the block reduction in the integrate
694 * kernel. The number of warps should be indicated explicitly in the
695 * template parameters, or the switch statements inside the
696 * integrate kernels should be adjusted accordingly. */
697
698 const dim3 threads_per_block(4, 4, 4);
699 if (grid_[level].is_distributed()) {
700 if (grid_[level].is_orthorhombic()) {
701 integrate_kernel<double, double3, true, true, 10>
702 <<<number_of_tasks_per_level_[level], threads_per_block, 0,
703 level_streams[level]>>>(params);
704 } else {
705 integrate_kernel<double, double3, true, false, 10>
706 <<<number_of_tasks_per_level_[level], threads_per_block, 0,
707 level_streams[level]>>>(params);
708 }
709 } else {
710 if (grid_[level].is_orthorhombic()) {
711 integrate_kernel<double, double3, false, true, 10>
712 <<<number_of_tasks_per_level_[level], threads_per_block, 0,
713 level_streams[level]>>>(params);
714 } else {
715 integrate_kernel<double, double3, false, false, 10>
716 <<<number_of_tasks_per_level_[level], threads_per_block, 0,
717 level_streams[level]>>>(params);
718 }
719 }
720}
721
723
725
726 // Compute max angular momentum.
727 const ldiffs_value ldiffs =
728 process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);
729
730 smem_parameters smem_params(ldiffs, lmax());
732
733 kernel_params params = set_kernel_parameters(-1, smem_params);
734
735 /* WARNING : if you change the block size, note that the number
736 * of warps is hardcoded where we do the block reduction in the integrate
737 * kernel. */
738
739 const dim3 threads_per_block(4, 4, 4);
740
741 if (!compute_tau && !calculate_forces) {
742 compute_hab_v2<double, double3, false, false>
743 <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
744 this->main_stream>>>(params);
745 return;
746 }
747
748 if (!compute_tau && calculate_forces) {
749 compute_hab_v2<double, double3, false, true>
750 <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
751 this->main_stream>>>(params);
752 return;
753 }
754
755 if (compute_tau && calculate_forces) {
756 compute_hab_v2<double, double3, true, true>
757 <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
758 this->main_stream>>>(params);
759 }
760
761 if (compute_tau && !calculate_forces) {
762 compute_hab_v2<double, double3, true, false>
763 <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
764 this->main_stream>>>(params);
765 }
766}
767}; // namespace rocm_backend
768#endif // defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)