#include "../../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "../../offload/offload_library.h"
#include "../common/grid_common.h"
#include "../common/grid_constants.h"
#include "../common/grid_library.h"
#include "grid_gpu_collocate.h"
#include "grid_gpu_integrate.h"
#include "grid_gpu_task_list.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif
static void create_tasks(
    const bool orthorhombic, const int ntasks, const int block_offsets[],
    const double atom_positions[][3], const int atom_kinds[],
    const grid_basis_set *basis_sets[], const int level_list[],
    const int iatom_list[], const int jatom_list[], const int iset_list[],
    const int jset_list[], const int ipgf_list[], const int jpgf_list[],
    const int border_mask_list[], const int block_num_list[],
    const double radius_list[], const double rab_list[][3],
    const int npts_local[][3], const int shift_local[][3],
    const int border_width[][3], const double dh[][3][3],
    const double dh_inv[][3][3], const double *sphis_dev[],
    grid_gpu_task tasks[]) {

  for (int itask = 0; itask < ntasks; itask++) {
    grid_gpu_task *task = &tasks[itask];

    // Convert the 1-based Fortran indices to 0-based C indices.
    task->iatom = iatom_list[itask] - 1;
    task->jatom = jatom_list[itask] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int ipgf = ipgf_list[itask] - 1;
    const int jpgf = jpgf_list[itask] - 1;
    const int ikind = atom_kinds[task->iatom] - 1;
    const int jkind = atom_kinds[task->jatom] - 1;
    const grid_basis_set *ibasis = basis_sets[ikind];
    const grid_basis_set *jbasis = basis_sets[jkind];
    const int level = level_list[itask] - 1;
    const int border_mask = border_mask_list[itask];

    task->use_orthorhombic_kernel = (orthorhombic && border_mask == 0);

    // Exponents of the two primitive Gaussians and of their product.
    task->zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
    task->zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
    task->zetp = task->zeta + task->zetb;
    const double f = task->zetb / task->zetp;

    // Distance vector rab and the center rp of the product Gaussian.
    task->rab2 = 0.0;
    for (int i = 0; i < 3; i++) {
      task->rab[i] = rab_list[itask][i];
      task->rab2 += task->rab[i] * task->rab[i];
      task->ra[i] = atom_positions[task->iatom][i];
      task->rb[i] = task->ra[i] + task->rab[i];
      task->rp[i] = task->ra[i] + task->rab[i] * f;
    }

    // Center of the product Gaussian in fractional grid coordinates.
    for (int i = 0; i < 3; i++) {
      task->gp[i] = dh_inv[level][0][i] * task->rp[0] +
                    dh_inv[level][1][i] * task->rp[1] +
                    dh_inv[level][2][i] * task->rp[2];
    }

    // Resolution of the grid: largest element of the cell matrix dh.
    task->dh_max = 0.0;
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 3; j++) {
        task->dh_max = fmax(task->dh_max, fabs(dh[level][i][j]));
      }
    }

    task->radius = radius_list[itask];
    task->radius2 = task->radius * task->radius;
    task->prefactor = exp(-task->zeta * f * task->rab2);
    task->off_diag_twice = (task->iatom == task->jatom) ? 1.0 : 2.0;

    // Angular momentum range of the two basis sets.
    task->la_max_basis = ibasis->lmax[iset];
    task->lb_max_basis = jbasis->lmax[jset];
    task->la_min_basis = ibasis->lmin[iset];
    task->lb_min_basis = jbasis->lmin[jset];

    // Size of the entire spherical basis.
    task->nsgfa = ibasis->nsgf;
    task->nsgfb = jbasis->nsgf;

    // Size of the spherical set.
    task->nsgf_seta = ibasis->nsgf_set[iset];
    task->nsgf_setb = jbasis->nsgf_set[jset];

    // Size of the Cartesian set.
    task->maxcoa = ibasis->maxco;
    task->maxcob = jbasis->maxco;

    // Start of the spherical set within the basis.
    const int sgfa = ibasis->first_sgf[iset] - 1;
    const int sgfb = jbasis->first_sgf[jset] - 1;

    // Start of the exponent within the Cartesian set.
    const int o1 = ipgf * ncoset(task->la_max_basis);
    const int o2 = jpgf * ncoset(task->lb_max_basis);

    // Transformations from contracted spherical to primitive Cartesian basis.
    task->sphia = &sphis_dev[ikind][sgfa * task->maxcoa + o1];
    task->sphib = &sphis_dev[jkind][sgfb * task->maxcob + o2];

    // Locate the current matrix block within the block buffer.
    const int block_num = block_num_list[itask] - 1;
    task->block_transposed = (task->iatom > task->jatom);
    const int block_offset = block_offsets[block_num];
    const int subblock_offset = (task->block_transposed)
                                    ? sgfa * task->nsgfb + sgfb
                                    : sgfb * task->nsgfa + sgfa;
    task->ab_block_offset = block_offset + subblock_offset;
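
    // Quantities used by the orthorhombic kernel: discretized radius and the
    // cube of grid points covered by this task.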
    const double drmin =
        fmin(dh[level][0][0], fmin(dh[level][1][1], dh[level][2][2]));
    const int imr = imax(1, (int)ceil(task->radius / drmin));
    task->disr_radius = drmin * imr;

    for (int i = 0; i < 3; i++) {
      const int cubecenter = (int)floor(task->gp[i]);
      task->cube_center_shifted[i] = cubecenter - shift_local[level][i];
      task->cube_offset[i] = cubecenter * dh[level][i][i] - task->rp[i];
    }
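
    // Quantities used by the general kernel: bounds of the grid-index box that
    // covers a sphere of the task's radius around rp.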
    for (int i = 0; i < 3; i++) {
      task->index_min[i] = INT_MAX;
      task->index_max[i] = INT_MIN;
    }
    for (int i = -1; i <= 1; i++) {
      for (int j = -1; j <= 1; j++) {
        for (int k = -1; k <= 1; k++) {
          const double x = task->rp[0] + i * task->radius;
          const double y = task->rp[1] + j * task->radius;
          const double z = task->rp[2] + k * task->radius;
          for (int idir = 0; idir < 3; idir++) {
            const double resc = dh_inv[level][0][idir] * x +
                                dh_inv[level][1][idir] * y +
                                dh_inv[level][2][idir] * z;
            task->index_min[idir] =
                imin(task->index_min[idir], (int)floor(resc));
            task->index_max[idir] =
                imax(task->index_max[idir], (int)ceil(resc));
          }
        }
      }
    }

    // Default bounds: the full local grid of this level.
    task->bounds_i[0] = 0;
    task->bounds_i[1] = npts_local[level][0] - 1;
    task->bounds_j[0] = 0;
    task->bounds_j[1] = npts_local[level][1] - 1;
    task->bounds_k[0] = 0;
    task->bounds_k[1] = npts_local[level][2] - 1;

    // Resolve border_mask to exclude border regions owned by other ranks.
    if (border_mask & (1 << 0))
      task->bounds_i[0] += border_width[level][0];
    if (border_mask & (1 << 1))
      task->bounds_i[1] -= border_width[level][0];
    if (border_mask & (1 << 2))
      task->bounds_j[0] += border_width[level][1];
    if (border_mask & (1 << 3))
      task->bounds_j[1] -= border_width[level][1];
    if (border_mask & (1 << 4))
      task->bounds_k[0] += border_width[level][2];
    if (border_mask & (1 << 5))
      task->bounds_k[1] -= border_width[level][2];
  }
}
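
/*******************************************************************************
 * \brief Allocates a task list for the GPU backend and uploads the tasks and
 *        the basis-set transformation matrices to the device.
 ******************************************************************************/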
void grid_gpu_create_task_list(
    const bool orthorhombic, const int ntasks, const int nlevels,
    const int natoms, const int nkinds, const int nblocks,
    const int block_offsets[], const double atom_positions[][3],
    const int atom_kinds[], const grid_basis_set *basis_sets[],
    const int level_list[], const int iatom_list[], const int jatom_list[],
    const int iset_list[], const int jset_list[], const int ipgf_list[],
    const int jpgf_list[], const int border_mask_list[],
    const int block_num_list[], const double radius_list[],
    const double rab_list[][3], const int npts_global[][3],
    const int npts_local[][3], const int shift_local[][3],
    const int border_width[][3], const double dh[][3][3],
    const double dh_inv[][3][3], grid_gpu_task_list **task_list_out) {

  // Free any previously allocated list before creating a new one.
  if (*task_list_out != NULL) {
    grid_gpu_free_task_list(*task_list_out);
  }

  grid_gpu_task_list *task_list =
      (grid_gpu_task_list *)malloc(sizeof(grid_gpu_task_list));

  task_list->orthorhombic = orthorhombic;
  task_list->ntasks = ntasks;
  task_list->nlevels = nlevels;
  task_list->natoms = natoms;
  task_list->nkinds = nkinds;
  task_list->nblocks = nblocks;

  // Store, per level, the local grid layout (sizes, shifts, cell matrices).
  size_t size = nlevels * sizeof(grid_gpu_layout);
  task_list->layouts = (grid_gpu_layout *)malloc(size);
  for (int level = 0; level < nlevels; level++) {
    for (int i = 0; i < 3; i++) {
      task_list->layouts[level].npts_global[i] = npts_global[level][i];
      task_list->layouts[level].npts_local[i] = npts_local[level][i];
      task_list->layouts[level].shift_local[i] = shift_local[level][i];
      task_list->layouts[level].border_width[i] = border_width[level][i];
      for (int j = 0; j < 3; j++) {
        task_list->layouts[level].dh[i][j] = dh[level][i][j];
        task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
      }
    }
  }

  // Upload, per atom kind, the spherical-to-Cartesian transformation matrices.
  task_list->sphis_dev = (double **)malloc(nkinds * sizeof(double *));
  for (int i = 0; i < nkinds; i++) {
    size = basis_sets[i]->nsgf * basis_sets[i]->maxco * sizeof(double);
    offloadMalloc((void **)&task_list->sphis_dev[i], size);
    offloadMemcpyHtoD(task_list->sphis_dev[i], basis_sets[i]->sphi, size);
  }

  // Collect all tasks into a single array and upload it to the device.
  size = ntasks * sizeof(grid_gpu_task);
  grid_gpu_task *tasks_host = (grid_gpu_task *)malloc(size);
  create_tasks(orthorhombic, ntasks, block_offsets, atom_positions, atom_kinds,
               basis_sets, level_list, iatom_list, jatom_list, iset_list,
               jset_list, ipgf_list, jpgf_list, border_mask_list,
               block_num_list, radius_list, rab_list, npts_local, shift_local,
               border_width, dh, dh_inv, (const double **)task_list->sphis_dev,
               tasks_host);
  offloadMalloc((void **)&task_list->tasks_dev, size);
  offloadMemcpyHtoD(task_list->tasks_dev, tasks_host, size);
  free(tasks_host);

  // Count the tasks per grid level; tasks are expected to be sorted by level.
  size = nlevels * sizeof(int);
  task_list->tasks_per_level = (int *)malloc(size);
  memset(task_list->tasks_per_level, 0, size);
  for (int i = 0; i < ntasks; i++) {
    task_list->tasks_per_level[level_list[i] - 1]++;
    assert(i == 0 || level_list[i] >= level_list[i - 1]);
  }

  // Find the largest angular momentum across all basis sets.
  task_list->lmax = 0;
  for (int ikind = 0; ikind < nkinds; ikind++) {
    for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
      task_list->lmax = imax(task_list->lmax, basis_sets[ikind]->lmax[iset]);
    }
  }

  // Collect statistics: task counts per angular momentum and border usage.
  memset(task_list->stats, 0, 2 * 20 * sizeof(int));
  for (int itask = 0; itask < ntasks; itask++) {
    const int iatom = iatom_list[itask] - 1;
    const int jatom = jatom_list[itask] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int la_max = basis_sets[ikind]->lmax[iset];
    const int lb_max = basis_sets[jkind]->lmax[jset];
    const int lp = imin(la_max + lb_max, 19);
    const bool has_border_mask = (border_mask_list[itask] != 0);
    task_list->stats[has_border_mask][lp]++;
  }

  // Create the main stream and one stream per grid level.
  offloadStreamCreate(&task_list->main_stream);

  size = nlevels * sizeof(offloadStream_t);
  task_list->level_streams = (offloadStream_t *)malloc(size);
  for (int i = 0; i < nlevels; i++) {
    offloadStreamCreate(&task_list->level_streams[i]);
  }

  *task_list_out = task_list;
}
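
/*******************************************************************************
 * \brief Deallocates the given task list together with its device buffers and
 *        streams.
 ******************************************************************************/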
void grid_gpu_free_task_list(grid_gpu_task_list *task_list) {

  offloadFree(task_list->tasks_dev);

  offloadStreamDestroy(task_list->main_stream);
  for (int i = 0; i < task_list->nlevels; i++) {
    offloadStreamDestroy(task_list->level_streams[i]);
  }
  free(task_list->level_streams);

  for (int i = 0; i < task_list->nkinds; i++) {
    offloadFree(task_list->sphis_dev[i]);
  }
  free(task_list->sphis_dev);

  free(task_list->tasks_per_level);
  free(task_list->layouts);
  free(task_list);
}
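
/*******************************************************************************
 * \brief Collocates all tasks of the given list onto the given grids,
 *        processing each grid level on its own stream.
 ******************************************************************************/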
void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
                                  const enum grid_func func, const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  offload_buffer *grids[]) {

  // Upload the density matrix blocks using the main stream.
  offloadMemcpyAsyncHtoD(pab_blocks->device_buffer, pab_blocks->host_buffer,
                         pab_blocks->size, task_list->main_stream);

  // Record an event so the level streams can wait until the blocks arrived.
  offloadEvent_t input_ready_event;
  offloadEventCreate(&input_ready_event);
  offloadEventRecord(input_ready_event, task_list->main_stream);

  // Process each grid level on its own stream.
  int lp_diff;
  int first_task = 0;
  assert(task_list->nlevels == nlevels);
  for (int level = 0; level < task_list->nlevels; level++) {
    const int last_task = first_task + task_list->tasks_per_level[level] - 1;
    const offloadStream_t level_stream = task_list->level_streams[level];
    const grid_gpu_layout *layout = &task_list->layouts[level];
    offload_buffer *grid = grids[level];

    // Zero the grid's device buffer.
    offloadMemsetAsync(grid->device_buffer, 0, grid->size, level_stream);

    // Launch the collocation kernel, but only after the blocks have arrived.
    offloadStreamWaitEvent(level_stream, input_ready_event, 0);
    grid_gpu_collocate_one_grid_level(
        task_list, first_task, last_task, func, layout, level_stream,
        pab_blocks->device_buffer, grid->device_buffer, &lp_diff);

    first_task = last_task + 1;
  }

  // Update the library counters while we wait for the kernels to finish.
  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
    for (int lp = 0; lp < 20; lp++) {
      const int count = task_list->stats[has_border_mask][lp];
      if (task_list->orthorhombic && !has_border_mask) {
        grid_library_counter_add(lp, GRID_BACKEND_GPU, GRID_COLLOCATE_ORTHO,
                                 count);
      } else {
        grid_library_counter_add(lp, GRID_BACKEND_GPU, GRID_COLLOCATE_GENERAL,
                                 count);
      }
    }
  }

  // Download the collocated grids from the device to the host.
  for (int level = 0; level < task_list->nlevels; level++) {
    offload_buffer *grid = grids[level];
    offloadMemcpyAsyncDtoH(grid->host_buffer, grid->device_buffer, grid->size,
                           task_list->level_streams[level]);
  }

  // Clean up and wait for all streams to finish.
  offloadEventDestroy(input_ready_event);
  offloadDeviceSynchronize();
}
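
/*******************************************************************************
 * \brief Integrates all tasks of the given list from the given grids,
 *        optionally accumulating forces and the virial on the device.
 ******************************************************************************/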
void grid_gpu_integrate_task_list(const grid_gpu_task_list *task_list,
                                  const bool compute_tau, const int natoms,
                                  const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  const offload_buffer *grids[],
                                  offload_buffer *hab_blocks,
                                  double forces[][3], double virial[3][3]) {

  // Device buffers for the optional force and virial contributions.
  double *forces_dev = NULL;
  double *virial_dev = NULL;
  double *pab_blocks_dev = NULL;
  const size_t forces_size = 3 * natoms * sizeof(double);
  const size_t virial_size = 9 * sizeof(double);

  // The pab blocks are only needed on the device when forces or the virial
  // are requested.
  if (forces != NULL || virial != NULL) {
    offloadMemcpyAsyncHtoD(pab_blocks->device_buffer, pab_blocks->host_buffer,
                           pab_blocks->size, task_list->main_stream);
    pab_blocks_dev = pab_blocks->device_buffer;
  }

  // Allocate and zero device buffers for the forces and the virial.
  if (forces != NULL) {
    offloadMalloc((void **)&forces_dev, forces_size);
    offloadMemsetAsync(forces_dev, 0, forces_size, task_list->main_stream);
  }
  if (virial != NULL) {
    offloadMalloc((void **)&virial_dev, virial_size);
    offloadMemsetAsync(virial_dev, 0, virial_size, task_list->main_stream);
  }

  // Zero the hab block buffer on the device.
  offloadMemsetAsync(hab_blocks->device_buffer, 0, hab_blocks->size,
                     task_list->main_stream);

  // Record an event so the level streams can wait until the input is ready.
  offloadEvent_t input_ready_event;
  offloadEventCreate(&input_ready_event);
  offloadEventRecord(input_ready_event, task_list->main_stream);

  // Process each grid level on its own stream.
  int lp_diff;
  int first_task = 0;
  assert(task_list->nlevels == nlevels);
  for (int level = 0; level < task_list->nlevels; level++) {
    const int last_task = first_task + task_list->tasks_per_level[level] - 1;
    const offloadStream_t level_stream = task_list->level_streams[level];
    const grid_gpu_layout *layout = &task_list->layouts[level];
    const offload_buffer *grid = grids[level];

    // Upload this level's grid to the device.
    offloadMemcpyAsyncHtoD(grid->device_buffer, grid->host_buffer, grid->size,
                           level_stream);

    // Launch the integration kernel, but only after the input is ready.
    offloadStreamWaitEvent(level_stream, input_ready_event, 0);
    grid_gpu_integrate_one_grid_level(
        task_list, first_task, last_task, compute_tau, layout, level_stream,
        pab_blocks_dev, grid->device_buffer, hab_blocks->device_buffer,
        forces_dev, virial_dev, &lp_diff);

    // Make the main stream wait for this level before downloading the results.
    offloadEvent_t level_done_event;
    offloadEventCreate(&level_done_event);
    offloadEventRecord(level_done_event, level_stream);
    offloadStreamWaitEvent(task_list->main_stream, level_done_event, 0);
    offloadEventDestroy(level_done_event);

    first_task = last_task + 1;
  }

  // Update the library counters while we wait for the kernels to finish.
  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
    for (int lp = 0; lp < 20; lp++) {
      const int count = task_list->stats[has_border_mask][lp];
      if (task_list->orthorhombic && !has_border_mask) {
        grid_library_counter_add(lp, GRID_BACKEND_GPU, GRID_INTEGRATE_ORTHO,
                                 count);
      } else {
        grid_library_counter_add(lp, GRID_BACKEND_GPU, GRID_INTEGRATE_GENERAL,
                                 count);
      }
    }
  }

  // Download the results from the device to the host.
  offloadMemcpyAsyncDtoH(hab_blocks->host_buffer, hab_blocks->device_buffer,
                         hab_blocks->size, task_list->main_stream);
  if (forces != NULL) {
    offloadMemcpyAsyncDtoH(forces, forces_dev, forces_size,
                           task_list->main_stream);
  }
  if (virial != NULL) {
    offloadMemcpyAsyncDtoH(virial, virial_dev, virial_size,
                           task_list->main_stream);
  }
  // Wait for all streams to finish, then clean up.
  offloadDeviceSynchronize();
  offloadEventDestroy(input_ready_event);
  if (forces != NULL) {
    offloadFree(forces_dev);
  }
  if (virial != NULL) {
    offloadFree(virial_dev);
  }
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)