grid_hip_integrate.cu
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
/* */
/* SPDX-License-Identifier: BSD-3-Clause */
/*----------------------------------------------------------------------------*/

/*
 * Inspired by the GPU backend.
 * Authors:
 *  - Dr Mathieu Taillefumier (ETH Zurich / CSCS)
 *  - Advanced Micro Devices, Inc.
 */

#if defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)

#include <algorithm>
#include <assert.h>
#include <hip/hip_runtime.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "grid_hip_context.h"

#ifdef __HIP_PLATFORM_NVIDIA__
#include <cooperative_groups.h>
#if CUDA_VERSION >= 11000
#include <cooperative_groups/reduce.h>
#endif
namespace cg = cooperative_groups;
#endif

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

namespace rocm_backend {
// do a warp reduction and return the final sum to thread_id = 0
template <typename T>
__device__ __inline__ T warp_reduce(T *table, const int tid) {
  // AMD GPUs have a warp (wavefront) size of 64 while NVIDIA GPUs have a
  // warpSize of 32, so the first step is common to both platforms.
  if (tid < 32) {
    table[tid] += table[tid + 32];
  }
  __syncthreads();
  if (tid < 16) {
    table[tid] += table[tid + 16];
  }
  __syncthreads();
  if (tid < 8) {
    table[tid] += table[tid + 8];
  }
  __syncthreads();
  if (tid < 4) {
    table[tid] += table[tid + 4];
  }
  __syncthreads();
  if (tid < 2) {
    table[tid] += table[tid + 2];
  }
  __syncthreads();
  return (table[0] + table[1]);
}
// #endif

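// A minimal sketch (not used by the kernels below) of the shuffle-based
// alternative mentioned near the hab accumulation: where __shfl_down_sync
// exists for T, the shared-memory table can be avoided within a single warp.
// The helper name warp_reduce_shfl is ours, not part of the backend.
#ifdef __HIP_PLATFORM_NVIDIA__
template <typename T> __device__ __inline__ T warp_reduce_shfl(T value) {
  // tree reduction within a 32-wide warp; lane 0 ends up with the full sum
  for (int delta = 16; delta > 0; delta >>= 1)
    value += __shfl_down_sync(0xffffffff, value, delta);
  return value;
}
#endif
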
template <typename T, typename T3, bool COMPUTE_TAU, bool CALCULATE_FORCES>
__global__ __launch_bounds__(64) void compute_hab_v2(const kernel_params dev_) {
  // Copy task from global to shared memory and precompute some stuff.
  extern __shared__ T shared_memory[];
  // T *smem_cab = &shared_memory[dev_.smem_cab_offset];
  T *smem_alpha = &shared_memory[dev_.smem_alpha_offset];
  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  const int offset = dev_.sorted_blocks_offset_dev[blockIdx.x];
  const int number_of_tasks = dev_.num_tasks_per_block_dev[blockIdx.x];

  if (number_of_tasks == 0)
    return;

  T fa[3], fb[3];
  T virial[9];

  fa[0] = 0.0;
  fa[1] = 0.0;
  fa[2] = 0.0;
  fb[0] = 0.0;
  fb[1] = 0.0;
  fb[2] = 0.0;

  virial[0] = 0.0;
  virial[1] = 0.0;
  virial[2] = 0.0;
  virial[3] = 0.0;
  virial[4] = 0.0;
  virial[5] = 0.0;
  virial[6] = 0.0;
  virial[7] = 0.0;
  virial[8] = 0.0;

  for (int tk = 0; tk < number_of_tasks; tk++) {
    __shared__ smem_task<T> task;
    const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
    if (dev_.tasks[task_id].skip_task)
      continue;
    fill_smem_task_coef(dev_, task_id, task);

    T *coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
    T *smem_cab = &dev_.ptr_dev[6][dev_.tasks[task_id].cab_offset];

    compute_alpha(task, smem_alpha);
    __syncthreads();
    cxyz_to_cab(task, smem_alpha, coef_, smem_cab);
    __syncthreads();

    for (int i = tid / 8; i < task.nsgf_setb; i += 8) {
      for (int j = tid % 8; j < task.nsgf_seta; j += 8) {
        T res = 0.0;

        T block_val1 = 0.0;

        if (CALCULATE_FORCES) {
          if (task.block_transposed) {
            block_val1 =
                task.pab_block[j * task.nsgfb + i] * task.off_diag_twice;
          } else {
            block_val1 =
                task.pab_block[i * task.nsgfa + j] * task.off_diag_twice;
          }
        }

        for (int jco = task.first_cosetb; jco < task.ncosetb; jco++) {
          const auto &b = coset_inv[jco];
          T block_val = 0.0;
          const T sphib = task.sphib[i * task.maxcob + jco];
          for (int ico = task.first_coseta; ico < task.ncoseta; ico++) {
            const auto &a = coset_inv[ico];
            const T hab = get_hab<COMPUTE_TAU, T>(a, b, task.zeta, task.zetb,
                                                  task.n1, smem_cab);
            const T sphia = task.sphia[j * task.maxcoa + ico];
            block_val += hab * sphia * sphib;

            if (CALCULATE_FORCES) {
              for (int k = 0; k < 3; k++) {
                fa[k] += block_val1 * sphia * sphib *
                         get_force_a<COMPUTE_TAU, T>(
                             a, b, k, task.zeta, task.zetb, task.n1, smem_cab);
                fb[k] +=
                    block_val1 * sphia * sphib *
                    get_force_b<COMPUTE_TAU, T>(a, b, k, task.zeta, task.zetb,
                                                task.rab, task.n1, smem_cab);
              }
              if (dev_.ptr_dev[5] != nullptr) {
                for (int k = 0; k < 3; k++) {
                  for (int l = 0; l < 3; l++) {
                    virial[3 * k + l] +=
                        (get_virial_a<COMPUTE_TAU, T>(a, b, k, l, task.zeta,
                                                      task.zetb, task.n1,
                                                      smem_cab) +
                         get_virial_b<COMPUTE_TAU, T>(a, b, k, l, task.zeta,
                                                      task.zetb, task.rab,
                                                      task.n1, smem_cab)) *
                        block_val1 * sphia * sphib;
                  }
                }
              }
            }
          }

          res += block_val;
        }

        // A shuffle_down-based reduction could be used here if it exists
        // for T.

        // These atomic operations are not strictly needed since each block
        // element is updated by a single thread; however, the grid_miniapp
        // fails without them.
        if (task.block_transposed) {
          atomicAdd(task.hab_block + j * task.nsgfb + i, res);
        } else {
          atomicAdd(task.hab_block + i * task.nsgfa + j, res);
        }
      }
    }
    __syncthreads();
  }

  if (CALCULATE_FORCES) {
    const int task_id = dev_.task_sorted_by_blocks_dev[offset];
    const auto &glb_task = dev_.tasks[task_id];
    const int iatom = glb_task.iatom;
    const int jatom = glb_task.jatom;
    T *forces_a = &dev_.ptr_dev[4][3 * iatom];
    T *forces_b = &dev_.ptr_dev[4][3 * jatom];

    T *sum = (T *)shared_memory;
    if (dev_.ptr_dev[5] != nullptr) {

      for (int i = 0; i < 9; i++) {
        sum[tid] = virial[i];
        __syncthreads();

        virial[i] = warp_reduce<T>(sum, tid);
        __syncthreads();

        if (tid == 0)
          atomicAdd(dev_.ptr_dev[5] + i, virial[i]);
        __syncthreads();
      }
    }

    for (int i = 0; i < 3; i++) {
      sum[tid] = fa[i];
      __syncthreads();
      fa[i] = warp_reduce<T>(sum, tid);
      __syncthreads();
      if (tid == 0)
        atomicAdd(forces_a + i, fa[i]);
      __syncthreads();

      sum[tid] = fb[i];
      __syncthreads();
      fb[i] = warp_reduce<T>(sum, tid);
      __syncthreads();
      if (tid == 0)
        atomicAdd(forces_b + i, fb[i]);
      __syncthreads();
    }
  }
}

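// For orientation only: a minimal host-side sketch (hypothetical helper, not
// called anywhere in this backend) of the discrete sum that the integrate
// kernel below evaluates for a single monomial (alpha, beta, gamma). The
// device kernel computes all monomials with alpha + beta + gamma <= lmax in
// one go and additionally handles periodic wrapping, border masks and
// non-orthorhombic cells.
inline double integrate_monomial_reference(const double *V, const int n[3],
                                           const double h[3],
                                           const double rc[3], const double eta,
                                           const int alpha, const int beta,
                                           const int gamma) {
  double sum = 0.0;
  for (int k = 0; k < n[2]; k++) {
    for (int j = 0; j < n[1]; j++) {
      for (int i = 0; i < n[0]; i++) {
        // distance of grid point (i, j, k) from the Gaussian center rc
        const double dx = i * h[0] - rc[0];
        const double dy = j * h[1] - rc[1];
        const double dz = k * h[2] - rc[2];
        const double r2 = dx * dx + dy * dy + dz * dz;
        sum += V[(k * n[1] + j) * n[0] + i] * pow(dx, alpha) * pow(dy, beta) *
               pow(dz, gamma) * exp(-eta * r2);
      }
    }
  }
  return sum;
}
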
/*******************************************************************************
 * CUDA/HIP kernel for calculating the coefficients of a potential (density,
 etc.) for a given pair of Gaussian basis sets.

We compute the discretized version of the following integral

$$\int_{\mathbb{R}^3} V(\mathbf{r})\,(x - x_c)^\alpha (y - y_c)^\beta
(z - z_c)^\gamma \exp(-\eta\,|\mathbf{r} - \mathbf{r}_c|^2)\, d^3r$$

where in practice the summation is over a finite domain. The discrete form has
this shape

$$
\sum_{ijk \in \mathrm{domain}} V_{ijk}\,(x_i - x_c)^\alpha (y_j - y_c)^\beta
(z_k - z_c)^\gamma \exp(-\eta\,|r_{ijk} - r_c|^2)
$$

where $0 \le \alpha + \beta + \gamma \le l_{max}$.

From a discrete point of view it is formally the same operation as collocate,
but the implementation differs for technical reasons. Most of the code is
therefore the same; only the core of the routine is specialized for
integration.

******************************************************************************/
template <typename T, typename T3, bool distributed__, bool orthorhombic_,
          int lbatch = 10>
__global__
__launch_bounds__(64) void integrate_kernel(const kernel_params dev_) {
  if (dev_.tasks[dev_.first_task + blockIdx.x].skip_task)
    return;

  const int tid =
      threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

  __shared__ T dh_inv_[9];
  __shared__ T dh_[9];

  for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
    dh_inv_[d] = dev_.dh_inv_[d];

  for (int d = tid; d < 9; d += blockDim.x * blockDim.y * blockDim.z)
    dh_[d] = dev_.dh_[d];

  __shared__ smem_task_reduced<T, T3> task;
  fill_smem_task_reduced(dev_, dev_.first_task + blockIdx.x, task);

  if (tid == 0) {
    task.cube_center.z += task.lb_cube.z - dev_.grid_lower_corner_[0];
    task.cube_center.y += task.lb_cube.y - dev_.grid_lower_corner_[1];
    task.cube_center.x += task.lb_cube.x - dev_.grid_lower_corner_[2];

    if (distributed__) {
      if (task.apply_border_mask) {
        compute_window_size(
            dev_.grid_local_size_,
            dev_.tasks[dev_.first_task + blockIdx.x].border_mask,
            dev_.grid_border_width_, &task.window_size, &task.window_shift);
      }
    }
  }
  __syncthreads();

  __shared__ T accumulator[lbatch][64];
  __shared__ T sum[lbatch];

  // We use a multi-pass algorithm here because the shared memory (or
  // register) usage would become too high for high angular momenta.

  const short int size_loop =
      (ncoset(task.lp) / lbatch + ((ncoset(task.lp) % lbatch) != 0)) * lbatch;
  const short int length = ncoset(task.lp);
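  // Example: ncoset(l) = (l + 1)(l + 2)(l + 3) / 6 monomials have combined
  // angular momentum <= l, so for task.lp = 6 length = 84 and the loop below
  // makes 9 passes of lbatch = 10, the last one only partially filled.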
  for (int ico = 0; ico < size_loop; ico += lbatch) {
#pragma unroll lbatch
    for (int i = 0; i < lbatch; i++)
      accumulator[i][tid] = 0.0;

    __syncthreads();

    for (int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
      int z2 = (z + task.cube_center.z) % dev_.grid_full_size_[0];

      if (z2 < 0)
        z2 += dev_.grid_full_size_[0];

      if (distributed__) {
        // distributed__ is known at compile time, so this branch is stripped
        // away when it is false.
        if (task.apply_border_mask) {
          /* check if the point is within the window */
          if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
            continue;
          }
        }
      }

      /* compute the coordinates of the point in atomic coordinates */
      int ymin = 0;
      int ymax = task.cube_size.y - 1;
      T kremain = 0.0;

      if (orthorhombic_ && !task.apply_border_mask) {
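        // Orthorhombic grids without a border mask: restrict the y loop to the
        // chord cut out of the sphere of radius task.discrete_radius by this
        // z plane. dh_[8] is the z grid spacing and dh_inv_[4] the inverse y
        // spacing.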
        ymin = (2 * (z + task.lb_cube.z) - 1) / 2;
        ymin *= ymin;
        kremain = task.discrete_radius * task.discrete_radius -
                  ymin * dh_[8] * dh_[8];
        ymin = ceil(-1.0e-8 - sqrt(fmax(0.0, kremain)) * dh_inv_[4]);
        ymax = 1 - ymin - task.lb_cube.y;
        ymin = ymin - task.lb_cube.y;
      }

      for (int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
        int y2 = (y + task.cube_center.y) % dev_.grid_full_size_[1];

        if (y2 < 0)
          y2 += dev_.grid_full_size_[1];

        if (distributed__) {
          /* check if the point is within the window */
          if (task.apply_border_mask) {
            if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
              continue;
            }
          }
        }

        int xmin = 0;
        int xmax = task.cube_size.x - 1;

        if (orthorhombic_ && !task.apply_border_mask) {
          xmin = (2 * (y + task.lb_cube.y) - 1) / 2;
          xmin *= xmin;
          xmin =
              ceil(-1.0e-8 - sqrt(fmax(0.0, kremain - xmin * dh_[4] * dh_[4])) *
                                 dh_inv_[0]);
          xmax = 1 - xmin - task.lb_cube.x;
          xmin -= task.lb_cube.x;
        }

        for (int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
          int x2 = (x + task.cube_center.x) % dev_.grid_full_size_[2];

          if (x2 < 0)
            x2 += dev_.grid_full_size_[2];

          if (distributed__) {
            /* check if the point is within the window */
            if (task.apply_border_mask) {
              if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
                continue;
              }
            }
          }

          // No distinction is made between the orthorhombic and
          // non-orthorhombic cases beyond the coordinate computation.
          T3 r3;

          if (orthorhombic_) {
            r3.x = (x + task.lb_cube.x + task.roffset.x) * dh_[0];
            r3.y = (y + task.lb_cube.y + task.roffset.y) * dh_[4];
            r3.z = (z + task.lb_cube.z + task.roffset.z) * dh_[8];
          } else {
            r3 = compute_coordinates(dh_, (x + task.lb_cube.x + task.roffset.x),
                                     (y + task.lb_cube.y + task.roffset.y),
                                     (z + task.lb_cube.z + task.roffset.z));
          }
          // Check whether the point is inside the sphere. Note that the check
          // does not apply to the orthorhombic case when the full sphere is
          // inside the region of interest.

          if (distributed__) {
            if (((task.radius * task.radius) <=
                 (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z)) &&
                (!orthorhombic_ || task.apply_border_mask))
              continue;
          } else {
            if (!orthorhombic_) {
              if ((task.radius * task.radius) <=
                  (r3.x * r3.x + r3.y * r3.y + r3.z * r3.z))
                continue;
            }
          }

          // Read the next point, assuming the read is non-blocking until the
          // register is actually needed for computation. This is true on
          // NVIDIA hardware.

          T grid_value =
              __ldg(&dev_.ptr_dev[1][(z2 * dev_.grid_local_size_[1] + y2) *
                                         dev_.grid_local_size_[2] +
                                     x2]);

          const T r3x2 = r3.x * r3.x;
          const T r3y2 = r3.y * r3.y;
          const T r3z2 = r3.z * r3.z;

          const T r3xy = r3.x * r3.y;
          const T r3xz = r3.x * r3.z;
          const T r3yz = r3.y * r3.z;

          grid_value *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);

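          // Accumulate grid_value * x^a y^b z^c for the lbatch monomials
          // handled in this pass. Low angular momenta are unrolled explicitly
          // below; the default branch covers the general case via coset_inv.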
          switch (ico / lbatch) {
          case 0: {
            accumulator[0][tid] += grid_value;

            if (task.lp >= 1) {
              accumulator[1][tid] += grid_value * r3.x;
              accumulator[2][tid] += grid_value * r3.y;
              accumulator[3][tid] += grid_value * r3.z;
            }

            if (task.lp >= 2) {
              accumulator[4][tid] += grid_value * r3x2;
              accumulator[5][tid] += grid_value * r3xy;
              accumulator[6][tid] += grid_value * r3xz;
              accumulator[7][tid] += grid_value * r3y2;
              accumulator[8][tid] += grid_value * r3yz;
              accumulator[9][tid] += grid_value * r3z2;
            }
          } break;
          case 1: {
            if (task.lp >= 3) {
              accumulator[0][tid] += grid_value * r3x2 * r3.x;
              accumulator[1][tid] += grid_value * r3x2 * r3.y;
              accumulator[2][tid] += grid_value * r3x2 * r3.z;
              accumulator[3][tid] += grid_value * r3.x * r3y2;
              accumulator[4][tid] += grid_value * r3xy * r3.z;
              accumulator[5][tid] += grid_value * r3.x * r3z2;
              accumulator[6][tid] += grid_value * r3y2 * r3.y;
              accumulator[7][tid] += grid_value * r3y2 * r3.z;
              accumulator[8][tid] += grid_value * r3.y * r3z2;
              accumulator[9][tid] += grid_value * r3z2 * r3.z;
            }
          } break;
          case 2: {
            if (task.lp >= 4) {
              accumulator[0][tid] += grid_value * r3x2 * r3x2;
              accumulator[1][tid] += grid_value * r3x2 * r3xy;
              accumulator[2][tid] += grid_value * r3x2 * r3xz;
              accumulator[3][tid] += grid_value * r3x2 * r3y2;
              accumulator[4][tid] += grid_value * r3x2 * r3yz;
              accumulator[5][tid] += grid_value * r3x2 * r3z2;
              accumulator[6][tid] += grid_value * r3xy * r3y2;
              accumulator[7][tid] += grid_value * r3xz * r3y2;
              accumulator[8][tid] += grid_value * r3xy * r3z2;
              accumulator[9][tid] += grid_value * r3xz * r3z2;
            }
          } break;
          case 3: {
            if (task.lp >= 4) {
              accumulator[0][tid] += grid_value * r3y2 * r3y2;
              accumulator[1][tid] += grid_value * r3y2 * r3yz;
              accumulator[2][tid] += grid_value * r3y2 * r3z2;
              accumulator[3][tid] += grid_value * r3yz * r3z2;
              accumulator[4][tid] += grid_value * r3z2 * r3z2;
            }
            if (task.lp >= 5) {
              accumulator[5][tid] += grid_value * r3x2 * r3x2 * r3.x;
              accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3.y;
              accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3.z;
              accumulator[8][tid] += grid_value * r3x2 * r3.x * r3y2;
              accumulator[9][tid] += grid_value * r3x2 * r3xy * r3.z;
            }
          } break;
          case 4: {
            accumulator[0][tid] += grid_value * r3x2 * r3.x * r3z2;
            accumulator[1][tid] += grid_value * r3x2 * r3y2 * r3.y;
            accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3.z;
            accumulator[3][tid] += grid_value * r3x2 * r3.y * r3z2;
            accumulator[4][tid] += grid_value * r3x2 * r3z2 * r3.z;
            accumulator[5][tid] += grid_value * r3.x * r3y2 * r3y2;
            accumulator[6][tid] += grid_value * r3.x * r3y2 * r3yz;
            accumulator[7][tid] += grid_value * r3.x * r3y2 * r3z2;
            accumulator[8][tid] += grid_value * r3xy * r3z2 * r3.z;
            accumulator[9][tid] += grid_value * r3.x * r3z2 * r3z2;
          } break;
          case 5: {
            accumulator[0][tid] += grid_value * r3y2 * r3y2 * r3.y;
            accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3.z;
            accumulator[2][tid] += grid_value * r3y2 * r3.y * r3z2;
            accumulator[3][tid] += grid_value * r3y2 * r3z2 * r3.z;
            accumulator[4][tid] += grid_value * r3.y * r3z2 * r3z2;
            accumulator[5][tid] += grid_value * r3z2 * r3z2 * r3.z;
            if (task.lp >= 6) {
              accumulator[6][tid] += grid_value * r3x2 * r3x2 * r3x2; // x^6
              accumulator[7][tid] += grid_value * r3x2 * r3x2 * r3xy; // x^5 y
              accumulator[8][tid] += grid_value * r3x2 * r3x2 * r3xz; // x^5 z
              accumulator[9][tid] += grid_value * r3x2 * r3x2 * r3y2; // x^4 y^2
            }
          } break;
          case 6: {
            accumulator[0][tid] += grid_value * r3x2 * r3x2 * r3yz; // x^4 y z
            accumulator[1][tid] += grid_value * r3x2 * r3x2 * r3z2; // x^4 z^2
            accumulator[2][tid] += grid_value * r3x2 * r3y2 * r3xy; // x^3 y^3
            accumulator[3][tid] += grid_value * r3x2 * r3y2 * r3xz; // x^3 y^2 z
            accumulator[4][tid] += grid_value * r3x2 * r3xy * r3z2; // x^3 y z^2
            accumulator[5][tid] += grid_value * r3x2 * r3z2 * r3xz; // x^3 z^3
            accumulator[6][tid] += grid_value * r3x2 * r3y2 * r3y2; // x^2 y^4
            accumulator[7][tid] += grid_value * r3x2 * r3y2 * r3yz; // x^2 y^3 z
            accumulator[8][tid] +=
                grid_value * r3x2 * r3y2 * r3z2; // x^2 y^2 z^2
            accumulator[9][tid] += grid_value * r3x2 * r3z2 * r3yz; // x^2 y z^3
          } break;
          case 7: {
            accumulator[0][tid] += grid_value * r3x2 * r3z2 * r3z2; // x^2 z^4
            accumulator[1][tid] += grid_value * r3y2 * r3y2 * r3xy; // x y^5
            accumulator[2][tid] += grid_value * r3y2 * r3y2 * r3xz; // x y^4 z
            accumulator[3][tid] += grid_value * r3y2 * r3xy * r3z2; // x y^3 z^2
            accumulator[4][tid] += grid_value * r3y2 * r3z2 * r3xz; // x y^2 z^3
            accumulator[5][tid] += grid_value * r3xy * r3z2 * r3z2; // x y z^4
            accumulator[6][tid] += grid_value * r3z2 * r3z2 * r3xz; // x z^5
            accumulator[7][tid] += grid_value * r3y2 * r3y2 * r3y2; // y^6
            accumulator[8][tid] += grid_value * r3y2 * r3y2 * r3yz; // y^5 z
            accumulator[9][tid] += grid_value * r3y2 * r3y2 * r3z2; // y^4 z^2
          } break;
          default:
            for (int ic = 0; (ic < lbatch) && ((ic + ico) < length); ic++) {
              auto &co = coset_inv[ic + ico];
              T tmp = 1.0;
              for (int po = 0; po < co.l[2]; po++)
                tmp *= r3.z;
              for (int po = 0; po < co.l[1]; po++)
                tmp *= r3.y;
              for (int po = 0; po < co.l[0]; po++)
                tmp *= r3.x;
              accumulator[ic][tid] += tmp * grid_value;
            }
            break;
          }
        }
      }
    }

    __syncthreads();

    // We know there is only one wavefront (64 threads) in each block. lbatch
    // threads could reduce the values accumulated by all threads of the warp
    // and store the results; instead, we do the shuffle_down by hand in
    // shared memory.
    for (int i = 0; i < min(length - ico, lbatch); i++) {
      if (tid < 32) {
        accumulator[i][tid] += accumulator[i][tid + 32];
      }
      __syncthreads();
      if (tid < 16) {
        accumulator[i][tid] += accumulator[i][tid + 16];
      }
      __syncthreads();
      if (tid < 8) {
        accumulator[i][tid] += accumulator[i][tid + 8];
      }
      __syncthreads();
      if (tid < 4) {
        accumulator[i][tid] += accumulator[i][tid + 4];
      }
      __syncthreads();
      if (tid < 2) {
        accumulator[i][tid] += accumulator[i][tid + 2];
      }
      __syncthreads();
      if (tid == 0)
        sum[i] = accumulator[i][0] + accumulator[i][1];
    }
    __syncthreads();

    for (int i = tid; i < min(length - ico, lbatch); i += lbatch)
      dev_.ptr_dev[2][dev_.tasks[dev_.first_task + blockIdx.x].coef_offset + i +
                      ico] = sum[i];
    __syncthreads();
    // if (tid == 0)
    //   printf("%.15lf\n", sum[0]);
  }
}

kernel_params
context_info::set_kernel_parameters(const int level,
                                    const smem_parameters &smem_params) {
  kernel_params params;
  params.smem_cab_offset = smem_params.smem_cab_offset();
  params.smem_alpha_offset = smem_params.smem_alpha_offset();
  params.first_task = 0;

  params.la_min_diff = smem_params.ldiffs().la_min_diff;
  params.lb_min_diff = smem_params.ldiffs().lb_min_diff;
  params.la_max_diff = smem_params.ldiffs().la_max_diff;
  params.lb_max_diff = smem_params.ldiffs().lb_max_diff;

  params.tasks = this->tasks_dev.data();
  params.task_sorted_by_blocks_dev = task_sorted_by_blocks_dev.data();
  params.sorted_blocks_offset_dev = sorted_blocks_offset_dev.data();
  params.num_tasks_per_block_dev = this->num_tasks_per_block_dev_.data();
  params.block_offsets = this->block_offsets_dev.data();

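  // Slots of ptr_dev used by the kernels: [0] density (pab) blocks, [1] grid
  // values of the selected level (when level >= 0), [2] cxyz coefficients,
  // [3] hab blocks, [4] forces, [5] virial, [6] cab scratch.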
  params.ptr_dev[0] = pab_block_.data();

  if (level >= 0) {
    params.ptr_dev[1] = grid_[level].data();
    memcpy(params.dh_, grid_[level].dh(), 9 * sizeof(double));
    memcpy(params.dh_inv_, grid_[level].dh_inv(), 9 * sizeof(double));
    for (int i = 0; i < 3; i++) {
      params.grid_full_size_[i] = grid_[level].full_size(i);
      params.grid_local_size_[i] = grid_[level].local_size(i);
      params.grid_lower_corner_[i] = grid_[level].lower_corner(i);
      params.grid_border_width_[i] = grid_[level].border_width(i);
    }
    params.first_task = first_task_per_level_[level];
  }
  params.ptr_dev[2] = this->coef_dev_.data();
  params.ptr_dev[3] = hab_block_.data();
  params.ptr_dev[4] = forces_.data();
  params.ptr_dev[5] = virial_.data();
  params.ptr_dev[6] = this->cab_dev_.data();
  params.sphi_dev = this->sphi_dev.data();
  return params;
}

/*******************************************************************************
 * \brief Launches the CUDA/HIP kernel that integrates all tasks of one grid
 *        level.
 ******************************************************************************/
void context_info::integrate_one_grid_level(const int level, int *lp_diff) {
  if (number_of_tasks_per_level_[level] == 0)
    return;

  // Compute max angular momentum.
  const ldiffs_value ldiffs =
      process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);

  smem_parameters smem_params(ldiffs, lmax());

  *lp_diff = smem_params.lp_diff();

  kernel_params params = set_kernel_parameters(level, smem_params);

  /* WARNING: if you change the block size, be aware that the number of warps
   * is hardcoded in the block reduction of the integrate kernel. The number of
   * warps should be indicated explicitly in the template parameters, or the
   * switch statements inside the integrate kernels adjusted accordingly. */

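  // 4 x 4 x 4 = 64 threads per block: exactly one AMD wavefront (two NVIDIA
  // warps), which is what warp_reduce and the in-kernel reductions assume.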
  const dim3 threads_per_block(4, 4, 4);
  if (grid_[level].is_distributed()) {
    if (grid_[level].is_orthorhombic()) {
      integrate_kernel<double, double3, true, true, 10>
          <<<number_of_tasks_per_level_[level], threads_per_block, 0,
             level_streams[level]>>>(params);
    } else {
      integrate_kernel<double, double3, true, false, 10>
          <<<number_of_tasks_per_level_[level], threads_per_block, 0,
             level_streams[level]>>>(params);
    }
  } else {
    if (grid_[level].is_orthorhombic()) {
      integrate_kernel<double, double3, false, true, 10>
          <<<number_of_tasks_per_level_[level], threads_per_block, 0,
             level_streams[level]>>>(params);
    } else {
      integrate_kernel<double, double3, false, false, 10>
          <<<number_of_tasks_per_level_[level], threads_per_block, 0,
             level_streams[level]>>>(params);
    }
  }
}

/*******************************************************************************
 * \brief Launches the CUDA/HIP kernel that computes the hab coefficients.
 ******************************************************************************/
void context_info::compute_hab_coefficients() {
  // Compute max angular momentum.
  const ldiffs_value ldiffs =
      process_get_ldiffs(calculate_forces, calculate_virial, compute_tau);

  smem_parameters smem_params(ldiffs, lmax());

  kernel_params params = set_kernel_parameters(-1, smem_params);

  /* WARNING: if you change the block size, remember that the number of warps
   * is hardcoded in the block reduction of the integrate kernel. */

  const dim3 threads_per_block(4, 4, 4);

  if (!compute_tau && !calculate_forces) {
    compute_hab_v2<double, double3, false, false>
        <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
           this->main_stream>>>(params);
    return;
  }

  if (!compute_tau && calculate_forces) {
    compute_hab_v2<double, double3, false, true>
        <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
           this->main_stream>>>(params);
    return;
  }

  if (compute_tau && calculate_forces) {
    compute_hab_v2<double, double3, true, true>
        <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
           this->main_stream>>>(params);
  }

  if (compute_tau && !calculate_forces) {
    compute_hab_v2<double, double3, true, false>
        <<<this->nblocks, threads_per_block, smem_params.smem_per_block(),
           this->main_stream>>>(params);
  }
}
} // namespace rocm_backend
#endif // defined(__OFFLOAD_HIP) && !defined(__NO_OFFLOAD_GRID)