grid_gpu_task_list.cu
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include "../../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)

#include "grid_gpu_collocate.h"
#include "grid_gpu_integrate.h"
#include "grid_gpu_task_list.h"

#include "../../offload/offload_library.h"
#include "../../offload/offload_mempool.h"
#include "../common/grid_common.h"
#include "../common/grid_constants.h"
#include "../common/grid_library.h"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif

/*******************************************************************************
 * \brief Create a single task and precompute as much as possible.
 * \author Ole Schuett
 ******************************************************************************/
static void
create_tasks(const bool orthorhombic, const int ntasks,
             const int block_offsets[], const double atom_positions[][3],
             const int atom_kinds[], const grid_basis_set *basis_sets[],
             const int level_list[], const int iatom_list[],
             const int jatom_list[], const int iset_list[],
             const int jset_list[], const int ipgf_list[],
             const int jpgf_list[], const int border_mask_list[],
             const int block_num_list[], const double radius_list[],
             const double rab_list[][3], const int npts_local[][3],
             const int shift_local[][3], const int border_width[][3],
             const double dh[][3][3], const double dh_inv[][3][3],
             const double *sphis_dev[], grid_gpu_task tasks[]) {

  for (int itask = 0; itask < ntasks; itask++) {
    grid_gpu_task *task = &tasks[itask];

    task->iatom = iatom_list[itask] - 1;
    task->jatom = jatom_list[itask] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int ipgf = ipgf_list[itask] - 1;
    const int jpgf = jpgf_list[itask] - 1;
    const int ikind = atom_kinds[task->iatom] - 1;
    const int jkind = atom_kinds[task->jatom] - 1;
    const grid_basis_set *ibasis = basis_sets[ikind];
    const grid_basis_set *jbasis = basis_sets[jkind];

    const int level = level_list[itask] - 1;
    const int border_mask = border_mask_list[itask];

    task->use_orthorhombic_kernel = (orthorhombic && border_mask == 0);

    task->zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
    task->zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
    task->zetp = task->zeta + task->zetb;
    const double f = task->zetb / task->zetp;

    task->rab2 = 0.0;
    for (int i = 0; i < 3; i++) {
      task->rab[i] = rab_list[itask][i];
      task->rab2 += task->rab[i] * task->rab[i];
      task->ra[i] = atom_positions[task->iatom][i];
      task->rb[i] = task->ra[i] + task->rab[i];
      task->rp[i] = task->ra[i] + task->rab[i] * f;
    }

    // center in grid coords, gp = MATMUL(dh_inv, rp)
    for (int i = 0; i < 3; i++) {
      task->gp[i] = dh_inv[level][0][i] * task->rp[0] +
                    dh_inv[level][1][i] * task->rp[1] +
                    dh_inv[level][2][i] * task->rp[2];
    }

    // resolution of the grid
    task->dh_max = 0.0;
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 3; j++) {
        task->dh_max = fmax(task->dh_max, fabs(dh[level][i][j]));
      }
    }

    task->radius = radius_list[itask];
    task->radius2 = task->radius * task->radius;
    task->prefactor = exp(-task->zeta * f * task->rab2);
    task->off_diag_twice = (task->iatom == task->jatom) ? 1.0 : 2.0;
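    // Note: the prefactor is the Gaussian product theorem factor
    // exp(-zeta*zetb/(zeta+zetb) * |rab|^2), since f = zetb / zetp.
    // off_diag_twice doubles the contribution of off-diagonal
    // (iatom != jatom) blocks.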

    // angular momentum range of basis set
    task->la_max_basis = ibasis->lmax[iset];
    task->lb_max_basis = jbasis->lmax[jset];
    task->la_min_basis = ibasis->lmin[iset];
    task->lb_min_basis = jbasis->lmin[jset];

    // size of entire spherical basis
    task->nsgfa = ibasis->nsgf;
    task->nsgfb = jbasis->nsgf;

    // size of spherical set
    task->nsgf_seta = ibasis->nsgf_set[iset];
    task->nsgf_setb = jbasis->nsgf_set[jset];

    // strides of the sphi transformation matrices
    task->maxcoa = ibasis->maxco;
    task->maxcob = jbasis->maxco;

    // start of spherical set within the basis
    const int sgfa = ibasis->first_sgf[iset] - 1;
    const int sgfb = jbasis->first_sgf[jset] - 1;

    // start of exponent within the cartesian set
    const int o1 = ipgf * ncoset(task->la_max_basis);
    const int o2 = jpgf * ncoset(task->lb_max_basis);

    // transformations from contracted spherical to primitive Cartesian basis
    task->sphia = &sphis_dev[ikind][sgfa * task->maxcoa + o1];
    task->sphib = &sphis_dev[jkind][sgfb * task->maxcob + o2];

    // Locate current matrix block within the buffer.
    const int block_num = block_num_list[itask] - 1;
    task->block_transposed = (task->iatom > task->jatom);
    const int block_offset = block_offsets[block_num];
    const int subblock_offset = (task->block_transposed)
                                    ? sgfa * task->nsgfb + sgfb
                                    : sgfb * task->nsgfa + sgfa;
    task->ab_block_offset = block_offset + subblock_offset;
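    // Note: for transposed blocks (iatom > jatom) the roles of the a and b
    // spherical indices are swapped when addressing the sub-block.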

    // Stuff for the ortho kernel ----------------------------------------------
    if (orthorhombic) {
      // Discretize the radius.
      const double drmin =
          fmin(dh[level][0][0], fmin(dh[level][1][1], dh[level][2][2]));
      const int imr = imax(1, (int)ceil(task->radius / drmin));
      task->disr_radius = drmin * imr;

      // Position of the gaussian product:
      // this is the actual definition of the position on the grid
      // i.e. a point rp(:) gets here grid coordinates
      // MODULO(rp(:)/dr(:),npts_global(:))+1
      // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
      // and (1,1,1) on grid here in C.
      //
      // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
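      // cube_center_shifted is the cube center in local grid indices (global
      // index minus shift_local); cube_offset is the Cartesian displacement
      // from rp to that grid point along each (diagonal) axis.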
      for (int i = 0; i < 3; i++) {
        const int cubecenter = floor(task->gp[i]);
        task->cube_center_shifted[i] = cubecenter - shift_local[level][i];
        task->cube_offset[i] = cubecenter * dh[level][i][i] - task->rp[i];
      }
    }

    // Stuff for the general kernel --------------------------------------------
    //
    // Get the min/max indices of a box that contains at least the cube which
    // encloses a sphere of the given radius around rp. If the cell is very
    // non-orthogonal this implies that many useless points are included; the
    // estimate can be improved (i.e. a sphere instead of a box should be used).
    for (int i = 0; i < 3; i++) {
      task->index_min[i] = INT_MAX;
      task->index_max[i] = INT_MIN;
    }
    for (int i = -1; i <= 1; i++) {
      for (int j = -1; j <= 1; j++) {
        for (int k = -1; k <= 1; k++) {
          const double x = task->rp[0] + i * task->radius;
          const double y = task->rp[1] + j * task->radius;
          const double z = task->rp[2] + k * task->radius;
          for (int idir = 0; idir < 3; idir++) {
            const double resc = dh_inv[level][0][idir] * x +
                                dh_inv[level][1][idir] * y +
                                dh_inv[level][2][idir] * z;
            task->index_min[idir] =
                imin(task->index_min[idir], (int)floor(resc));
            task->index_max[idir] =
                imax(task->index_max[idir], (int)ceil(resc));
          }
        }
      }
    }

    // Defaults for border_mask == 0.
    task->bounds_i[0] = 0;
    task->bounds_i[1] = npts_local[level][0] - 1;
    task->bounds_j[0] = 0;
    task->bounds_j[1] = npts_local[level][1] - 1;
    task->bounds_k[0] = 0;
    task->bounds_k[1] = npts_local[level][2] - 1;

    // See also rs_find_node() in task_list_methods.F.
    // If the bit is set then we need to exclude the border in that direction.
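    // For example, border_mask = 3 (bits 0 and 1 set) excludes both the lower
    // and the upper border along the i direction.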
    if (border_mask & (1 << 0)) {
      task->bounds_i[0] += border_width[level][0];
    }
    if (border_mask & (1 << 1)) {
      task->bounds_i[1] -= border_width[level][0];
    }
    if (border_mask & (1 << 2)) {
      task->bounds_j[0] += border_width[level][1];
    }
    if (border_mask & (1 << 3)) {
      task->bounds_j[1] -= border_width[level][1];
    }
    if (border_mask & (1 << 4)) {
      task->bounds_k[0] += border_width[level][2];
    }
    if (border_mask & (1 << 5)) {
      task->bounds_k[1] -= border_width[level][2];
    }
  }
}

/*******************************************************************************
 * \brief Allocates a task list for the GPU backend.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_gpu_create_task_list(
    const bool orthorhombic, const int ntasks, const int nlevels,
    const int natoms, const int nkinds, const int nblocks,
    const int block_offsets[], const double atom_positions[][3],
    const int atom_kinds[], const grid_basis_set *basis_sets[],
    const int level_list[], const int iatom_list[], const int jatom_list[],
    const int iset_list[], const int jset_list[], const int ipgf_list[],
    const int jpgf_list[], const int border_mask_list[],
    const int block_num_list[], const double radius_list[],
    const double rab_list[][3], const int npts_global[][3],
    const int npts_local[][3], const int shift_local[][3],
    const int border_width[][3], const double dh[][3][3],
    const double dh_inv[][3][3], grid_gpu_task_list **task_list_out) {

  // Select GPU device.
  offload_activate_chosen_device();

  if (*task_list_out != NULL) {
    // This is actually an opportunity to reuse some buffers.
    grid_gpu_free_task_list(*task_list_out);
  }

  grid_gpu_task_list *task_list =
      (grid_gpu_task_list *)malloc(sizeof(grid_gpu_task_list));

  task_list->orthorhombic = orthorhombic;
  task_list->ntasks = ntasks;
  task_list->nlevels = nlevels;
  task_list->natoms = natoms;
  task_list->nkinds = nkinds;
  task_list->nblocks = nblocks;

  // Store grid layouts.
  size_t size = nlevels * sizeof(grid_gpu_layout);
  task_list->layouts = (grid_gpu_layout *)malloc(size);
  for (int level = 0; level < nlevels; level++) {
    for (int i = 0; i < 3; i++) {
      task_list->layouts[level].npts_global[i] = npts_global[level][i];
      task_list->layouts[level].npts_local[i] = npts_local[level][i];
      task_list->layouts[level].shift_local[i] = shift_local[level][i];
      task_list->layouts[level].border_width[i] = border_width[level][i];
      for (int j = 0; j < 3; j++) {
        task_list->layouts[level].dh[i][j] = dh[level][i][j];
        task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
      }
    }
  }

  // Upload basis set sphi matrices to device.
  task_list->sphis_dev = (double **)malloc(nkinds * sizeof(double *));
  for (int i = 0; i < nkinds; i++) {
    size = basis_sets[i]->nsgf * basis_sets[i]->maxco * sizeof(double);
    task_list->sphis_dev[i] = (double *)offload_mempool_device_malloc(size);
    offloadMemcpyHtoD(task_list->sphis_dev[i], basis_sets[i]->sphi, size);
  }

  size = ntasks * sizeof(grid_gpu_task);
  grid_gpu_task *tasks_host = (grid_gpu_task *)malloc(size);
  create_tasks(orthorhombic, ntasks, block_offsets, atom_positions, atom_kinds,
               basis_sets, level_list, iatom_list, jatom_list, iset_list,
               jset_list, ipgf_list, jpgf_list, border_mask_list,
               block_num_list, radius_list, rab_list, npts_local, shift_local,
               border_width, dh, dh_inv, (const double **)task_list->sphis_dev,
               tasks_host);
  task_list->tasks_dev = (grid_gpu_task *)offload_mempool_device_malloc(size);
  offloadMemcpyHtoD(task_list->tasks_dev, tasks_host, size);

  free(tasks_host);

  // Count tasks per level.
  size = nlevels * sizeof(int);
  task_list->tasks_per_level = (int *)malloc(size);
  memset(task_list->tasks_per_level, 0, size);
  for (int i = 0; i < ntasks; i++) {
    task_list->tasks_per_level[level_list[i] - 1]++;
    assert(i == 0 || level_list[i] >= level_list[i - 1]); // expect ordered list
  }
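  // Because the tasks are ordered by level, each level owns a contiguous range
  // [first_task, last_task], which the collocate and integrate loops below
  // rely on.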

  // Find largest angular momentum.
  task_list->lmax = 0;
  for (int ikind = 0; ikind < nkinds; ikind++) {
    for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
      task_list->lmax = imax(task_list->lmax, basis_sets[ikind]->lmax[iset]);
    }
  }

  // Collect stats: tasks per angular momentum sum, with and without border mask.
  memset(task_list->stats, 0, 2 * 20 * sizeof(int));
  for (int itask = 0; itask < ntasks; itask++) {
    const int iatom = iatom_list[itask] - 1;
    const int jatom = jatom_list[itask] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int la_max = basis_sets[ikind]->lmax[iset];
    const int lb_max = basis_sets[jkind]->lmax[jset];
    const int lp = imin(la_max + lb_max, 19);
    const bool has_border_mask = (border_mask_list[itask] != 0);
    task_list->stats[has_border_mask][lp]++;
  }

  // allocate main cuda stream
  offloadStreamCreate(&task_list->main_stream);

  // allocate one cuda stream per grid level
  size = nlevels * sizeof(offloadStream_t);
  task_list->level_streams = (offloadStream_t *)malloc(size);
  for (int i = 0; i < nlevels; i++) {
    offloadStreamCreate(&task_list->level_streams[i]);
  }

  // return newly created task list
  *task_list_out = task_list;
}

/*******************************************************************************
 * \brief Deallocates the given task list; the basis_sets have to be freed
 *        separately.
 * \author Ole Schuett
 ******************************************************************************/
void grid_gpu_free_task_list(grid_gpu_task_list *task_list) {

  offload_mempool_device_free(task_list->tasks_dev);

  offloadStreamDestroy(task_list->main_stream);

  for (int i = 0; i < task_list->nlevels; i++) {
    offloadStreamDestroy(task_list->level_streams[i]);
  }
  free(task_list->level_streams);

  for (int i = 0; i < task_list->nkinds; i++) {
    offload_mempool_device_free(task_list->sphis_dev[i]);
  }
  free(task_list->sphis_dev);

  free(task_list->tasks_per_level);
  free(task_list->layouts);
  free(task_list);
}

/*******************************************************************************
 * \brief Collocate all tasks of the given list onto the given grids.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
                                  const enum grid_func func, const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  offload_buffer *grids[]) {

  // Select GPU device.
  offload_activate_chosen_device();

  // Upload blocks buffer using the main stream
  offloadMemcpyAsyncHtoD(pab_blocks->device_buffer, pab_blocks->host_buffer,
                         pab_blocks->size, task_list->main_stream);

  // record an event so the level streams can wait for the blocks to be uploaded
  offloadEvent_t input_ready_event;
  offloadEventCreate(&input_ready_event);
  offloadEventRecord(input_ready_event, task_list->main_stream);

  int lp_diff;
  int first_task = 0;
  assert(task_list->nlevels == nlevels);
  for (int level = 0; level < task_list->nlevels; level++) {
    const int last_task = first_task + task_list->tasks_per_level[level] - 1;
    const offloadStream_t level_stream = task_list->level_streams[level];
    const grid_gpu_layout *layout = &task_list->layouts[level];
    offload_buffer *grid = grids[level];

    // zero grid device buffer
    offloadMemsetAsync(grid->device_buffer, 0, grid->size, level_stream);

    // launch kernel, but only after blocks have arrived
    offloadStreamWaitEvent(level_stream, input_ready_event);
    grid_gpu_collocate_one_grid_level(
        task_list, first_task, last_task, func, layout, level_stream,
        pab_blocks->device_buffer, grid->device_buffer, &lp_diff);

    first_task = last_task + 1;
  }

  // update counters while we wait for kernels to finish
  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
    for (int lp = 0; lp < 20; lp++) {
      const int count = task_list->stats[has_border_mask][lp];
      if (task_list->orthorhombic && !has_border_mask) {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
                                 GRID_COLLOCATE_ORTHO, count);
      } else {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
                                 GRID_COLLOCATE_GENERAL, count);
      }
    }
  }

  // download result from device to host.
  for (int level = 0; level < task_list->nlevels; level++) {
    offload_buffer *grid = grids[level];
    offloadMemcpyAsyncDtoH(grid->host_buffer, grid->device_buffer, grid->size,
                           task_list->level_streams[level]);
  }

  // clean up
  offloadEventDestroy(input_ready_event);

  // wait for all the streams to finish
  offloadDeviceSynchronize();
}

/*******************************************************************************
 * \brief Integrate all tasks of the given list from the given grids.
 *        See grid_task_list.h for details.
 * \author Ole Schuett
 ******************************************************************************/
void grid_gpu_integrate_task_list(const grid_gpu_task_list *task_list,
                                  const bool compute_tau, const int natoms,
                                  const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  const offload_buffer *grids[],
                                  offload_buffer *hab_blocks,
                                  double forces[][3], double virial[3][3]) {

  // Select GPU device.
  offload_activate_chosen_device();

  // Prepare shared buffers using the main stream
  double *forces_dev = NULL;
  double *virial_dev = NULL;
  double *pab_blocks_dev = NULL;
  const size_t forces_size = 3 * natoms * sizeof(double);
  const size_t virial_size = 9 * sizeof(double);
  if (forces != NULL || virial != NULL) {
    offloadMemcpyAsyncHtoD(pab_blocks->device_buffer, pab_blocks->host_buffer,
                           pab_blocks->size, task_list->main_stream);
    pab_blocks_dev = pab_blocks->device_buffer;
  }
  if (forces != NULL) {
    forces_dev = (double *)offload_mempool_device_malloc(forces_size);
    offloadMemsetAsync(forces_dev, 0, forces_size, task_list->main_stream);
  }
  if (virial != NULL) {
    virial_dev = (double *)offload_mempool_device_malloc(virial_size);
    offloadMemsetAsync(virial_dev, 0, virial_size, task_list->main_stream);
  }

  // zero device hab blocks buffers
  offloadMemsetAsync(hab_blocks->device_buffer, 0, hab_blocks->size,
                     task_list->main_stream);

  // record event so other streams can wait for hab, pab, virial etc to be ready
  offloadEvent_t input_ready_event;
  offloadEventCreate(&input_ready_event);
  offloadEventRecord(input_ready_event, task_list->main_stream);

  int lp_diff;
  int first_task = 0;
  assert(task_list->nlevels == nlevels);
  for (int level = 0; level < task_list->nlevels; level++) {
    const int last_task = first_task + task_list->tasks_per_level[level] - 1;
    const offloadStream_t level_stream = task_list->level_streams[level];
    const grid_gpu_layout *layout = &task_list->layouts[level];
    const offload_buffer *grid = grids[level];

    // upload grid
    offloadMemcpyAsyncHtoD(grid->device_buffer, grid->host_buffer, grid->size,
                           level_stream);

    // launch kernel, but only after hab, pab, virial, etc are ready
    offloadStreamWaitEvent(level_stream, input_ready_event);
    grid_gpu_integrate_one_grid_level(
        task_list, first_task, last_task, compute_tau, layout, level_stream,
        pab_blocks_dev, grid->device_buffer, hab_blocks->device_buffer,
        forces_dev, virial_dev, &lp_diff);

    // Have main stream wait for level to complete before downloading results.
    offloadEvent_t level_done_event;
    offloadEventCreate(&level_done_event);
    offloadEventRecord(level_done_event, level_stream);
    offloadStreamWaitEvent(task_list->main_stream, level_done_event);
    offloadEventDestroy(level_done_event);

    first_task = last_task + 1;
  }

  // update counters while we wait for kernels to finish
  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
    for (int lp = 0; lp < 20; lp++) {
      const int count = task_list->stats[has_border_mask][lp];
      if (task_list->orthorhombic && !has_border_mask) {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
                                 GRID_INTEGRATE_ORTHO, count);
      } else {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
                                 GRID_INTEGRATE_GENERAL, count);
      }
    }
  }

  // download result from device to host using main stream.
  offloadMemcpyAsyncDtoH(hab_blocks->host_buffer, hab_blocks->device_buffer,
                         hab_blocks->size, task_list->main_stream);
  if (forces != NULL) {
    offloadMemcpyAsyncDtoH(forces, forces_dev, forces_size,
                           task_list->main_stream);
  }
  if (virial != NULL) {
    offloadMemcpyAsyncDtoH(virial, virial_dev, virial_size,
                           task_list->main_stream);
  }

  // wait for all the streams to finish
  offloadDeviceSynchronize();

  // clean up
  offloadEventDestroy(input_ready_event);
  if (forces != NULL) {
    offload_mempool_device_free(forces_dev);
  }
  if (virial != NULL) {
    offload_mempool_device_free(virial_dev);
  }
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)
// EOF
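
For orientation, the fractional-coordinate convention used throughout this file (gp[i] = sum_j dh_inv[j][i] * rp[j], i.e. the comments' MATMUL(dh_inv, rp) with the matrix stored transposed in the C arrays) can be exercised in isolation. The following is a minimal standalone sketch, not part of CP2K; the cell and the point are made-up example values:

#include <stdio.h>

// Convert a Cartesian point rp into fractional grid coordinates gp using the
// same index order as create_tasks(): gp[i] = sum_j dh_inv[j][i] * rp[j].
static void cart_to_grid(const double dh_inv[3][3], const double rp[3],
                         double gp[3]) {
  for (int i = 0; i < 3; i++) {
    gp[i] = dh_inv[0][i] * rp[0] + dh_inv[1][i] * rp[1] + dh_inv[2][i] * rp[2];
  }
}

int main(void) {
  // Hypothetical orthorhombic cell with grid spacings 0.25, 0.25 and 0.5 Bohr,
  // so dh_inv is diagonal with entries 4, 4 and 2.
  const double dh_inv[3][3] = {
      {4.0, 0.0, 0.0}, {0.0, 4.0, 0.0}, {0.0, 0.0, 2.0}};
  const double rp[3] = {1.0, 2.0, 3.0};
  double gp[3];
  cart_to_grid(dh_inv, rp, gp);
  printf("gp = %.1f %.1f %.1f\n", gp[0], gp[1], gp[2]); // prints: gp = 4.0 8.0 6.0
  return 0;
}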