#include "../../offload/offload_runtime.h"
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "../../offload/offload_library.h"
#include "../common/grid_common.h"
#include "../common/grid_constants.h"
#include "../common/grid_library.h"
#include "grid_gpu_collocate.h"
#include "grid_gpu_integrate.h"
#include "grid_gpu_task_list.h"

#if defined(_OMP_H)
#error "OpenMP should not be used in .cu files to accommodate HIP."
#endif
/*******************************************************************************
 * \brief Creates all tasks and precomputes as much as possible.
 ******************************************************************************/
static void create_tasks(
    const bool orthorhombic, const int ntasks, const int block_offsets[],
    const double atom_positions[][3], const int atom_kinds[],
    const grid_basis_set *basis_sets[], const int level_list[],
    const int iatom_list[], const int jatom_list[], const int iset_list[],
    const int jset_list[], const int ipgf_list[], const int jpgf_list[],
    const int border_mask_list[], const int block_num_list[],
    const double radius_list[], const double rab_list[][3],
    const int npts_local[][3], const int shift_local[][3],
    const int border_width[][3], const double dh[][3][3],
    const double dh_inv[][3][3], const double *sphis_dev[],
    grid_gpu_task tasks[]) {
  for (int itask = 0; itask < ntasks; itask++) {
    grid_gpu_task *task = &tasks[itask];

    // Convert the one-based Fortran indices to zero-based.
    task->iatom = iatom_list[itask] - 1;
    task->jatom = jatom_list[itask] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int ipgf = ipgf_list[itask] - 1;
    const int jpgf = jpgf_list[itask] - 1;
    const int ikind = atom_kinds[task->iatom] - 1;
    const int jkind = atom_kinds[task->jatom] - 1;
    const grid_basis_set *ibasis = basis_sets[ikind];
    const grid_basis_set *jbasis = basis_sets[jkind];
    const int level = level_list[itask] - 1;
    const int border_mask = border_mask_list[itask];

    task->use_orthorhombic_kernel = (orthorhombic && border_mask == 0);
    task->zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
    task->zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
    task->zetp = task->zeta + task->zetb;
    const double f = task->zetb / task->zetp;
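    // Gaussian product theorem: the product of the two primitives with
    // exponents zeta and zetb centered at ra and rb is again a Gaussian,
    // centered at rp = ra + f * rab with f = zetb / (zeta + zetb) and
    // carrying the prefactor exp(-zeta * f * |rab|^2) computed below.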
    task->rab2 = 0.0;
    for (int i = 0; i < 3; i++) {
      task->rab[i] = rab_list[itask][i];
      task->rab2 += task->rab[i] * task->rab[i];
      task->ra[i] = atom_positions[task->iatom][i];
      task->rb[i] = task->ra[i] + task->rab[i];
      task->rp[i] = task->ra[i] + task->rab[i] * f;
    }
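    // Map the Gaussian center rp into fractional grid coordinates through the
    // inverse cell matrix dh_inv.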
    for (int i = 0; i < 3; i++) {
      task->gp[i] = dh_inv[level][0][i] * task->rp[0] +
                    dh_inv[level][1][i] * task->rp[1] +
                    dh_inv[level][2][i] * task->rp[2];
    }
    // Resolution of the grid.
    task->dh_max = 0.0;
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 3; j++) {
        task->dh_max = fmax(task->dh_max, fabs(dh[level][i][j]));
      }
    }
    task->radius = radius_list[itask];
    task->radius2 = task->radius * task->radius;
    task->prefactor = exp(-task->zeta * f * task->rab2);
    task->off_diag_twice = (task->iatom == task->jatom) ? 1.0 : 2.0;
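    // An off-diagonal block (iatom != jatom) appears only once in the task
    // list but represents both halves of the symmetric matrix, hence its
    // contribution is counted twice.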
    // Angular momentum range of the basis sets.
    task->la_max_basis = ibasis->lmax[iset];
    task->lb_max_basis = jbasis->lmax[jset];
    task->la_min_basis = ibasis->lmin[iset];
    task->lb_min_basis = jbasis->lmin[jset];

    // Size of the entire spherical basis.
    task->nsgfa = ibasis->nsgf;
    task->nsgfb = jbasis->nsgf;

    // Size of the spherical set.
    task->nsgf_seta = ibasis->nsgf_set[iset];
    task->nsgf_setb = jbasis->nsgf_set[jset];

    // Strides of the sphi transformation matrices.
    task->maxcoa = ibasis->maxco;
    task->maxcob = jbasis->maxco;
    // Start of the spherical set within the basis.
    const int sgfa = ibasis->first_sgf[iset] - 1;
    const int sgfb = jbasis->first_sgf[jset] - 1;

    // Start of the exponent within the Cartesian set.
    const int o1 = ipgf * ncoset(task->la_max_basis);
    const int o2 = jpgf * ncoset(task->lb_max_basis);

    // Transformations from contracted spherical to primitive Cartesian basis.
    task->sphia = &sphis_dev[ikind][sgfa * task->maxcoa + o1];
    task->sphib = &sphis_dev[jkind][sgfb * task->maxcob + o2];
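    // Note: sphia points into the device-resident sphi matrix of the atom's
    // kind (leading dimension maxcoa), at spherical function sgfa, shifted by
    // o1 to the Cartesian components of this particular primitive.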
    // Locate the matrix block within the block buffer.
    const int block_num = block_num_list[itask] - 1;
    task->block_transposed = (task->iatom > task->jatom);
    const int block_offset = block_offsets[block_num];
    const int subblock_offset = (task->block_transposed)
                                    ? sgfa * task->nsgfb + sgfb
                                    : sgfb * task->nsgfa + sgfa;
    task->ab_block_offset = block_offset + subblock_offset;
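    // Only one triangle of the symmetric block matrix is stored: when
    // iatom > jatom the stored block is the transpose, which swaps the roles
    // of sgfa and sgfb in the sub-block offset above.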
    // Stuff for the orthorhombic kernel: discretize the radius.
    const double drmin =
        fmin(dh[level][0][0], fmin(dh[level][1][1], dh[level][2][2]));
    const int imr = imax(1, (int)ceil(task->radius / drmin));
    task->disr_radius = drmin * imr;
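    // disr_radius rounds the cutoff radius up to a whole number of grid
    // spacings (at least one), so the orthorhombic kernel can treat the
    // radius as an integer multiple of the smallest diagonal spacing drmin.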
    // Position of the Gaussian product within the local grid.
    for (int i = 0; i < 3; i++) {
      const int cubecenter = floor(task->gp[i]);
      task->cube_center_shifted[i] = cubecenter - shift_local[level][i];
      task->cube_offset[i] = cubecenter * dh[level][i][i] - task->rp[i];
    }
    // Stuff for the general kernel: find the grid index bounds of a box that
    // contains at least the sphere of given radius around rp. For very
    // non-orthogonal cells this overestimates the true extent.
    for (int i = 0; i < 3; i++) {
      task->index_min[i] = INT_MAX;
      task->index_max[i] = INT_MIN;
    }
    for (int i = -1; i <= 1; i++) {
      for (int j = -1; j <= 1; j++) {
        for (int k = -1; k <= 1; k++) {
          const double x = task->rp[0] + i * task->radius;
          const double y = task->rp[1] + j * task->radius;
          const double z = task->rp[2] + k * task->radius;
          for (int idir = 0; idir < 3; idir++) {
            const double resc = dh_inv[level][0][idir] * x +
                                dh_inv[level][1][idir] * y +
                                dh_inv[level][2][idir] * z;
            task->index_min[idir] =
                imin(task->index_min[idir], (int)floor(resc));
            task->index_max[idir] =
                imax(task->index_max[idir], (int)ceil(resc));
          }
        }
      }
    }
    // Default the bounds to the full local grid.
    task->bounds_i[0] = 0;
    task->bounds_i[1] = npts_local[level][0] - 1;
    task->bounds_j[0] = 0;
    task->bounds_j[1] = npts_local[level][1] - 1;
    task->bounds_k[0] = 0;
    task->bounds_k[1] = npts_local[level][2] - 1;
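    // Shrink the bounds according to the border mask: each bit masks out one
    // face of the box, bits 0/1 the lower/upper bound in i, bits 2/3 in j,
    // and bits 4/5 in k.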
    if (border_mask & (1 << 0))
      task->bounds_i[0] += border_width[level][0];
    if (border_mask & (1 << 1))
      task->bounds_i[1] -= border_width[level][0];
    if (border_mask & (1 << 2))
      task->bounds_j[0] += border_width[level][1];
    if (border_mask & (1 << 3))
      task->bounds_j[1] -= border_width[level][1];
    if (border_mask & (1 << 4))
      task->bounds_k[0] += border_width[level][2];
    if (border_mask & (1 << 5))
      task->bounds_k[1] -= border_width[level][2];
  }
}
/*******************************************************************************
 * \brief Allocates a task list for the GPU backend.
 ******************************************************************************/
void grid_gpu_create_task_list(
    const bool orthorhombic, const int ntasks, const int nlevels,
    const int natoms, const int nkinds, const int nblocks,
    const int block_offsets[], const double atom_positions[][3],
    const int atom_kinds[], const grid_basis_set *basis_sets[],
    const int level_list[], const int iatom_list[], const int jatom_list[],
    const int iset_list[], const int jset_list[], const int ipgf_list[],
    const int jpgf_list[], const int border_mask_list[],
    const int block_num_list[], const double radius_list[],
    const double rab_list[][3], const int npts_global[][3],
    const int npts_local[][3], const int shift_local[][3],
    const int border_width[][3], const double dh[][3][3],
    const double dh_inv[][3][3], grid_gpu_task_list **task_list_out) {
  // Select the GPU device.
  offload_activate_chosen_device();

  if (*task_list_out != NULL) {
    // This is actually an update and the old task list has to be released.
    grid_gpu_free_task_list(*task_list_out);
  }

  grid_gpu_task_list *task_list =
      (grid_gpu_task_list *)malloc(sizeof(grid_gpu_task_list));
  task_list->orthorhombic = orthorhombic;
  task_list->ntasks = ntasks;
  task_list->nlevels = nlevels;
  task_list->natoms = natoms;
  task_list->nkinds = nkinds;
  task_list->nblocks = nblocks;
  // Store the grid layouts.
  size_t size = nlevels * sizeof(grid_gpu_layout);
  task_list->layouts = (grid_gpu_layout *)malloc(size);
  for (int level = 0; level < nlevels; level++) {
    for (int i = 0; i < 3; i++) {
      task_list->layouts[level].npts_global[i] = npts_global[level][i];
      task_list->layouts[level].npts_local[i] = npts_local[level][i];
      task_list->layouts[level].shift_local[i] = shift_local[level][i];
      task_list->layouts[level].border_width[i] = border_width[level][i];
      for (int j = 0; j < 3; j++) {
        task_list->layouts[level].dh[i][j] = dh[level][i][j];
        task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
      }
    }
  }
  // Upload each kind's sphi transformation matrix to the device.
  task_list->sphis_dev = (double **)malloc(nkinds * sizeof(double *));
  for (int i = 0; i < nkinds; i++) {
    size = basis_sets[i]->nsgf * basis_sets[i]->maxco * sizeof(double);
    offloadMalloc((void **)&task_list->sphis_dev[i], size);
    offloadMemcpyHtoD(task_list->sphis_dev[i], basis_sets[i]->sphi, size);
  }
  // Build the tasks on the host and upload them in a single transfer.
  size = ntasks * sizeof(grid_gpu_task);
  grid_gpu_task *tasks_host = (grid_gpu_task *)malloc(size);
  create_tasks(orthorhombic, ntasks, block_offsets, atom_positions, atom_kinds,
               basis_sets, level_list, iatom_list, jatom_list, iset_list,
               jset_list, ipgf_list, jpgf_list, border_mask_list,
               block_num_list, radius_list, rab_list, npts_local, shift_local,
               border_width, dh, dh_inv, (const double **)task_list->sphis_dev,
               tasks_host);

  offloadMalloc((void **)&task_list->tasks_dev, size);
  offloadMemcpyHtoD(task_list->tasks_dev, tasks_host, size);
  free(tasks_host);
  // Count the tasks per grid level.
  size = nlevels * sizeof(int);
  task_list->tasks_per_level = (int *)malloc(size);
  memset(task_list->tasks_per_level, 0, size);
  for (int i = 0; i < ntasks; i++) {
    task_list->tasks_per_level[level_list[i] - 1]++;
    assert(i == 0 || level_list[i] >= level_list[i - 1]); // expect sorted list
  }
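  // Because the tasks are sorted by level (see assert above), each level owns
  // a contiguous index range. The collocate / integrate routines below use
  // tasks_per_level to reconstruct [first_task, last_task] for each level.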
  // Find the largest angular momentum.
  task_list->lmax = 0;
  for (int ikind = 0; ikind < nkinds; ikind++) {
    for (int iset = 0; iset < basis_sets[ikind]->nset; iset++) {
      task_list->lmax = imax(task_list->lmax, basis_sets[ikind]->lmax[iset]);
    }
  }
  // Collect kernel statistics, binned by border mask and total angular
  // momentum lp (capped at 19 to fit the 2 x 20 stats table).
  memset(task_list->stats, 0, 2 * 20 * sizeof(int));
  for (int itask = 0; itask < ntasks; itask++) {
    const int iatom = iatom_list[itask] - 1;
    const int jatom = jatom_list[itask] - 1;
    const int ikind = atom_kinds[iatom] - 1;
    const int jkind = atom_kinds[jatom] - 1;
    const int iset = iset_list[itask] - 1;
    const int jset = jset_list[itask] - 1;
    const int la_max = basis_sets[ikind]->lmax[iset];
    const int lb_max = basis_sets[jkind]->lmax[jset];
    const int lp = imin(la_max + lb_max, 19);
    const bool has_border_mask = (border_mask_list[itask] != 0);
    task_list->stats[has_border_mask][lp]++;
  }
  // Allocate the main stream plus one stream per grid level.
  offloadStreamCreate(&task_list->main_stream);
  size = nlevels * sizeof(offloadStream_t);
  task_list->level_streams = (offloadStream_t *)malloc(size);
  for (int i = 0; i < nlevels; i++) {
    offloadStreamCreate(&task_list->level_streams[i]);
  }

  // Return the newly created task list.
  *task_list_out = task_list;
}
/*******************************************************************************
 * \brief Deallocates the given task list; basis sets are freed separately.
 ******************************************************************************/
void grid_gpu_free_task_list(grid_gpu_task_list *task_list) {

  offloadFree(task_list->tasks_dev);

  offloadStreamDestroy(task_list->main_stream);
  for (int i = 0; i < task_list->nlevels; i++) {
    offloadStreamDestroy(task_list->level_streams[i]);
  }
  free(task_list->level_streams);

  for (int i = 0; i < task_list->nkinds; i++) {
    offloadFree(task_list->sphis_dev[i]);
  }
  free(task_list->sphis_dev);

  free(task_list->tasks_per_level);
  free(task_list->layouts);
  free(task_list);
}
/*******************************************************************************
 * \brief Collocates all tasks of the given list onto the given grids.
 ******************************************************************************/
void grid_gpu_collocate_task_list(const grid_gpu_task_list *task_list,
                                  const enum grid_func func, const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  offload_buffer *grids[]) {

  // Select the GPU device.
  offload_activate_chosen_device();

  // Upload the block buffer using the main stream.
  offloadMemcpyAsyncHtoD(pab_blocks->device_buffer, pab_blocks->host_buffer,
                         pab_blocks->size, task_list->main_stream);

  // Record an event so the level streams can wait until the blocks arrived.
  offloadEvent_t input_ready_event;
  offloadEventCreate(&input_ready_event);
  offloadEventRecord(input_ready_event, task_list->main_stream);
  int lp_diff;
  int first_task = 0;
  assert(task_list->nlevels == nlevels);
  for (int level = 0; level < task_list->nlevels; level++) {
    const int last_task = first_task + task_list->tasks_per_level[level] - 1;
    const offloadStream_t level_stream = task_list->level_streams[level];
    const grid_gpu_layout *layout = &task_list->layouts[level];
    offload_buffer *grid = grids[level];

    // Zero the grid's device buffer.
    offloadMemsetAsync(grid->device_buffer, 0, grid->size, level_stream);

    // Launch the kernel, but only after the blocks have arrived.
    offloadStreamWaitEvent(level_stream, input_ready_event, 0);
    grid_gpu_collocate_one_grid_level(
        task_list, first_task, last_task, func, layout, level_stream,
        pab_blocks->device_buffer, grid->device_buffer, &lp_diff);

    first_task = last_task + 1;
  }
  // Update the counters while waiting for the kernels to finish.
  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
    for (int lp = 0; lp < 20; lp++) {
      const int count = task_list->stats[has_border_mask][lp];
      if (task_list->orthorhombic && !has_border_mask) {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
                                 GRID_COLLOCATE_ORTHO, count);
      } else {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
                                 GRID_COLLOCATE_GENERAL, count);
      }
    }
  }
  // Download the results from the device to the host.
  for (int level = 0; level < task_list->nlevels; level++) {
    offload_buffer *grid = grids[level];
    offloadMemcpyAsyncDtoH(grid->host_buffer, grid->device_buffer, grid->size,
                           task_list->level_streams[level]);
  }

  // Clean up.
  offloadEventDestroy(input_ready_event);

  // Wait for all streams to finish.
  offloadDeviceSynchronize();
}
/*******************************************************************************
 * \brief Integrates all tasks of the given list from the given grids.
 ******************************************************************************/
void grid_gpu_integrate_task_list(const grid_gpu_task_list *task_list,
                                  const bool compute_tau, const int natoms,
                                  const int nlevels,
                                  const offload_buffer *pab_blocks,
                                  const offload_buffer *grids[],
                                  offload_buffer *hab_blocks,
                                  double forces[][3], double virial[3][3]) {

  // Select the GPU device.
  offload_activate_chosen_device();

  // Prepare the shared buffers using the main stream.
  double *forces_dev = NULL;
  double *virial_dev = NULL;
  double *pab_blocks_dev = NULL;
  const size_t forces_size = 3 * natoms * sizeof(double);
  const size_t virial_size = 9 * sizeof(double);
  if (forces != NULL || virial != NULL) {
    // Forces and virial require the density matrix blocks on the device.
    offloadMemcpyAsyncHtoD(pab_blocks->device_buffer, pab_blocks->host_buffer,
                           pab_blocks->size, task_list->main_stream);
    pab_blocks_dev = pab_blocks->device_buffer;
  }
  if (forces != NULL) {
    offloadMalloc((void **)&forces_dev, forces_size);
    offloadMemsetAsync(forces_dev, 0, forces_size, task_list->main_stream);
  }
  if (virial != NULL) {
    offloadMalloc((void **)&virial_dev, virial_size);
    offloadMemsetAsync(virial_dev, 0, virial_size, task_list->main_stream);
  }

  // Zero the device's hab block buffer.
  offloadMemsetAsync(hab_blocks->device_buffer, 0, hab_blocks->size,
                     task_list->main_stream);
  // Record an event so the level streams can wait until the inputs are ready.
  offloadEvent_t input_ready_event;
  offloadEventCreate(&input_ready_event);
  offloadEventRecord(input_ready_event, task_list->main_stream);
  int lp_diff;
  int first_task = 0;
  assert(task_list->nlevels == nlevels);
  for (int level = 0; level < task_list->nlevels; level++) {
    const int last_task = first_task + task_list->tasks_per_level[level] - 1;
    const offloadStream_t level_stream = task_list->level_streams[level];
    const grid_gpu_layout *layout = &task_list->layouts[level];
    const offload_buffer *grid = grids[level];

    // Upload the grid for this level.
    offloadMemcpyAsyncHtoD(grid->device_buffer, grid->host_buffer, grid->size,
                           level_stream);

    // Launch the kernel, but only after the inputs are ready.
    offloadStreamWaitEvent(level_stream, input_ready_event, 0);
    grid_gpu_integrate_one_grid_level(
        task_list, first_task, last_task, compute_tau, layout, level_stream,
        pab_blocks_dev, grid->device_buffer, hab_blocks->device_buffer,
        forces_dev, virial_dev, &lp_diff);

    // Have the main stream wait until this level has finished.
    offloadEvent_t level_done_event;
    offloadEventCreate(&level_done_event);
    offloadEventRecord(level_done_event, level_stream);
    offloadStreamWaitEvent(task_list->main_stream, level_done_event, 0);
    offloadEventDestroy(level_done_event);

    first_task = last_task + 1;
  }
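  // Since every level recorded an event that the main stream waited on, the
  // downloads issued on the main stream below start only after all level
  // kernels have completed.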
  // Update the counters while waiting for the kernels to finish.
  for (int has_border_mask = 0; has_border_mask <= 1; has_border_mask++) {
    for (int lp = 0; lp < 20; lp++) {
      const int count = task_list->stats[has_border_mask][lp];
      if (task_list->orthorhombic && !has_border_mask) {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
                                 GRID_INTEGRATE_ORTHO, count);
      } else {
        grid_library_counter_add(lp + lp_diff, GRID_BACKEND_GPU,
                                 GRID_INTEGRATE_GENERAL, count);
      }
    }
  }

  // Download the results from the device to the host.
  offloadMemcpyAsyncDtoH(hab_blocks->host_buffer, hab_blocks->device_buffer,
                         hab_blocks->size, task_list->main_stream);
  if (forces != NULL) {
    offloadMemcpyAsyncDtoH(forces, forces_dev, forces_size,
                           task_list->main_stream);
  }
  if (virial != NULL) {
    offloadMemcpyAsyncDtoH(virial, virial_dev, virial_size,
                           task_list->main_stream);
  }
  // Wait for all streams to finish.
  offloadDeviceSynchronize();

  // Clean up.
  offloadEventDestroy(input_ready_event);
  if (forces != NULL) {
    offloadFree(forces_dev);
  }
  if (virial != NULL) {
    offloadFree(virial_dev);
  }
}

#endif // defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID)