130 extern __shared__ T shared_memory[];
131 const int number_of_tasks = dev_.num_tasks_per_block_dev[
block_index()];
133 if (number_of_tasks == 0)
136 T *smem_alpha = &shared_memory[0];
138 const int offset = dev_.sorted_blocks_offset_dev[
block_index()];
140 T *__restrict__ smem_cab = allocate_workspace<T>(dev_);
162 for (
int tk = 0; tk < number_of_tasks; tk++) {
164 const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
165 if (dev_.tasks[task_id].skip_task)
169 T *__restrict__ coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
176 for (
int i = tid / 8;
i < task.nsgf_setb;
i += 8) {
177 for (
int j = tid % 8; j < task.nsgf_seta; j += 8) {
180 if (task.block_transposed) {
182 task.pab_block[j * task.nsgfb +
i] * task.off_diag_twice;
185 task.pab_block[
i * task.nsgfa + j] * task.off_diag_twice;
187 for (
int jco = task.first_cosetb; jco < task.ncosetb; jco++) {
188 const T sphib = task.sphib[
i * task.maxcob + jco];
190 for (
int ico = task.first_coseta; ico < task.ncoseta; ico++) {
192 const T hab = get_hab<COMPUTE_TAU, T>(a, b, task.zeta, task.zetb,
194 T sphia_times_sphib = task.sphia[j * task.maxcoa + ico] * sphib;
195 tmp += hab * sphia_times_sphib;
197 sphia_times_sphib *= block_value;
198 fa[0] += sphia_times_sphib *
199 get_force_a<COMPUTE_TAU, T>(a, b, 0, task.zeta, task.zetb,
201 fa[1] += sphia_times_sphib *
202 get_force_a<COMPUTE_TAU, T>(a, b, 1, task.zeta, task.zetb,
204 fa[2] += sphia_times_sphib *
205 get_force_a<COMPUTE_TAU, T>(a, b, 2, task.zeta, task.zetb,
208 fb[0] += sphia_times_sphib *
209 get_force_b<COMPUTE_TAU, T>(a, b, 0, task.zeta, task.zetb,
210 task.rab, task.n1, smem_cab);
211 fb[1] += sphia_times_sphib *
212 get_force_b<COMPUTE_TAU, T>(a, b, 1, task.zeta, task.zetb,
213 task.rab, task.n1, smem_cab);
214 fb[2] += sphia_times_sphib *
215 get_force_b<COMPUTE_TAU, T>(a, b, 2, task.zeta, task.zetb,
216 task.rab, task.n1, smem_cab);
218 if (dev_.ptr_dev[5] !=
nullptr) {
221 (get_virial_a<COMPUTE_TAU, T>(a, b, 0, 0, task.zeta,
222 task.zetb, task.n1, smem_cab) +
223 get_virial_b<COMPUTE_TAU, T>(a, b, 0, 0, task.zeta,
224 task.zetb, task.rab, task.n1,
228 (get_virial_a<COMPUTE_TAU, T>(a, b, 0, 1, task.zeta,
229 task.zetb, task.n1, smem_cab) +
230 get_virial_b<COMPUTE_TAU, T>(a, b, 0, 1, task.zeta,
231 task.zetb, task.rab, task.n1,
235 (get_virial_a<COMPUTE_TAU, T>(a, b, 0, 2, task.zeta,
236 task.zetb, task.n1, smem_cab) +
237 get_virial_b<COMPUTE_TAU, T>(a, b, 0, 2, task.zeta,
238 task.zetb, task.rab, task.n1,
242 (get_virial_a<COMPUTE_TAU, T>(a, b, 1, 0, task.zeta,
243 task.zetb, task.n1, smem_cab) +
244 get_virial_b<COMPUTE_TAU, T>(a, b, 1, 0, task.zeta,
245 task.zetb, task.rab, task.n1,
249 (get_virial_a<COMPUTE_TAU, T>(a, b, 1, 1, task.zeta,
250 task.zetb, task.n1, smem_cab) +
251 get_virial_b<COMPUTE_TAU, T>(a, b, 1, 1, task.zeta,
252 task.zetb, task.rab, task.n1,
256 (get_virial_a<COMPUTE_TAU, T>(a, b, 1, 2, task.zeta,
257 task.zetb, task.n1, smem_cab) +
258 get_virial_b<COMPUTE_TAU, T>(a, b, 1, 2, task.zeta,
259 task.zetb, task.rab, task.n1,
263 (get_virial_a<COMPUTE_TAU, T>(a, b, 2, 0, task.zeta,
264 task.zetb, task.n1, smem_cab) +
265 get_virial_b<COMPUTE_TAU, T>(a, b, 2, 0, task.zeta,
266 task.zetb, task.rab, task.n1,
270 (get_virial_a<COMPUTE_TAU, T>(a, b, 2, 1, task.zeta,
271 task.zetb, task.n1, smem_cab) +
272 get_virial_b<COMPUTE_TAU, T>(a, b, 2, 1, task.zeta,
273 task.zetb, task.rab, task.n1,
277 (get_virial_a<COMPUTE_TAU, T>(a, b, 2, 2, task.zeta,
278 task.zetb, task.n1, smem_cab) +
279 get_virial_b<COMPUTE_TAU, T>(a, b, 2, 2, task.zeta,
280 task.zetb, task.rab, task.n1,
286 if (task.block_transposed) {
287 task.hab_block[j * task.nsgfb +
i] += tmp;
289 task.hab_block[
i * task.nsgfa + j] += tmp;
299 const int task_id = dev_.task_sorted_by_blocks_dev[offset];
300 const auto &glb_task = dev_.tasks[task_id];
301 const int iatom = glb_task.iatom;
302 const int jatom = glb_task.jatom;
303 T *forces_a = &dev_.ptr_dev[4][3 * iatom];
304 T *forces_b = &dev_.ptr_dev[4][3 * jatom];
306 T *sum = (T *)shared_memory;
307 if (dev_.ptr_dev[5] !=
nullptr) {
309 for (
int i = 0;
i < 9;
i++) {
310 virial[
i] = block_reduce_64<T>(sum, virial[
i], tid);
313 atomicAdd(dev_.ptr_dev[5] +
i, virial[
i]);
317 for (
int i = 0;
i < 3;
i++) {
318 fa[
i] = block_reduce_64<T>(sum, fa[
i], tid);
321 atomicAdd(forces_a +
i, fa[
i]);
323 fb[
i] = block_reduce_64<T>(sum, fb[
i], tid);
326 atomicAdd(forces_b +
i, fb[
i]);
360 if (dev_.tasks[dev_.first_task +
block_index()].skip_task)
372 dh_[tid] = dev_.dh_[tid];
378 setup_task_cube_center<T, T3, distributed__>(dev_, task);
382 __shared__ T accumulator[lbatch][64];
387 const short int size_loop =
388 (ncoset(task.lp) / lbatch + ((ncoset(task.lp) % lbatch) != 0)) * lbatch;
389 const short int length = ncoset(task.lp);
390 for (
int ico = 0; ico < size_loop; ico += lbatch) {
392 for (
int i = 0;
i < lbatch;
i++)
393 accumulator[
i][tid] = 0.0;
397 for (
int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
398 int z2 =
wrap_grid_index(z + task.cube_center.z, dev_.grid_full_size_.z);
402 if (task.apply_border_mask) {
404 if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
412 int ymax = task.cube_size.y - 1;
415 if (orthogonal_ && !task.apply_border_mask) {
419 for (
int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
425 if (task.apply_border_mask) {
426 if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
433 int xmax = task.cube_size.x - 1;
435 if (orthogonal_ && !task.apply_border_mask) {
436 calculate_xmin_xmax_boundaries<T, T3>(task, y, kremain, xmin, xmax);
439 for (
int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
445 if (task.apply_border_mask) {
446 if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
457 (y + task.lb_cube.y + task.roffset.y),
458 (z + task.lb_cube.z + task.roffset.z));
462 const T r3x2 = r3.x * r3.x;
463 const T r3y2 = r3.y * r3.y;
464 const T r3z2 = r3.z * r3.z;
471 if (((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)) &&
472 (!orthogonal_ || task.apply_border_mask))
476 if ((!orthogonal_) &&
477 ((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)))
486 __ldg(&dev_.ptr_dev[1][(z2 * dev_.grid_local_size_.y + y2) *
487 dev_.grid_local_size_.x +
490 const T r3xy = r3.x * r3.y;
491 const T r3xz = r3.x * r3.z;
492 const T r3yz = r3.y * r3.z;
494 grid_value *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
496 switch (ico / lbatch) {
498 accumulator[0][tid] += grid_value;
501 accumulator[1][tid] += grid_value * r3.x;
502 accumulator[2][tid] += grid_value * r3.y;
503 accumulator[3][tid] += grid_value * r3.z;
507 accumulator[4][tid] += grid_value * r3x2;
508 accumulator[5][tid] += grid_value * r3xy;
509 accumulator[6][tid] += grid_value * r3xz;
510 accumulator[7][tid] += grid_value * r3y2;
511 accumulator[8][tid] += grid_value * r3yz;
512 accumulator[9][tid] += grid_value * r3z2;
515 T tmp = grid_value * r3x2;
516 accumulator[10][tid] += tmp * r3.x;
517 accumulator[11][tid] += tmp * r3.y;
518 accumulator[12][tid] += tmp * r3.z;
519 tmp = grid_value * r3.x;
520 accumulator[13][tid] += tmp * r3y2;
521 accumulator[14][tid] += tmp * r3yz;
522 accumulator[15][tid] += tmp * r3z2;
523 tmp = grid_value * r3y2;
524 accumulator[16][tid] += tmp * r3.y;
525 accumulator[17][tid] += tmp * r3.z;
526 tmp = grid_value * r3z2;
527 accumulator[18][tid] += tmp * r3.y;
528 accumulator[19][tid] += tmp * r3.z;
533 T tmp = grid_value * r3x2;
534 accumulator[0][tid] += tmp * r3x2;
535 accumulator[1][tid] += tmp * r3xy;
536 accumulator[2][tid] += tmp * r3xz;
537 accumulator[3][tid] += tmp * r3y2;
538 accumulator[4][tid] += tmp * r3yz;
539 accumulator[5][tid] += tmp * r3z2;
540 tmp = grid_value * r3y2;
541 accumulator[6][tid] += tmp * r3xy;
542 accumulator[7][tid] += tmp * r3xz;
543 tmp = grid_value * r3z2;
544 accumulator[8][tid] += tmp * r3xy;
545 accumulator[9][tid] += tmp * r3xz;
548 T tmp = grid_value * r3y2;
549 accumulator[10][tid] += tmp * r3y2;
550 accumulator[11][tid] += tmp * r3yz;
551 accumulator[12][tid] += tmp * r3z2;
552 accumulator[13][tid] += grid_value * r3yz * r3z2;
553 accumulator[14][tid] += grid_value * r3z2 * r3z2;
556 T tmp = grid_value * r3x2 * r3x2;
557 accumulator[15][tid] += tmp * r3.x;
558 accumulator[16][tid] += tmp * r3.y;
559 accumulator[17][tid] += tmp * r3.z;
560 tmp = grid_value * r3x2;
561 accumulator[18][tid] += tmp * r3.x * r3y2;
562 accumulator[19][tid] += tmp * r3xy * r3.z;
566 T tmp = grid_value * r3x2;
567 accumulator[0][tid] += tmp * r3.x * r3z2;
568 accumulator[1][tid] += tmp * r3y2 * r3.y;
569 accumulator[2][tid] += tmp * r3y2 * r3.z;
570 accumulator[3][tid] += tmp * r3.y * r3z2;
571 accumulator[4][tid] += tmp * r3z2 * r3.z;
572 tmp = grid_value * r3.x * r3y2;
573 accumulator[5][tid] += tmp * r3y2;
574 accumulator[6][tid] += tmp * r3yz;
575 accumulator[7][tid] += tmp * r3z2;
576 tmp = grid_value * r3.x * r3z2;
577 accumulator[8][tid] += tmp * r3yz;
578 accumulator[9][tid] += tmp * r3z2;
579 tmp = grid_value * r3y2 * r3.y;
580 accumulator[10][tid] += tmp * r3y2;
581 accumulator[11][tid] += tmp * r3yz;
582 accumulator[12][tid] += tmp * r3z2;
583 accumulator[13][tid] += grid_value * r3y2 * r3z2 * r3.z;
584 accumulator[14][tid] += grid_value * r3.y * r3z2 * r3z2;
585 accumulator[15][tid] += grid_value * r3z2 * r3z2 * r3.z;
587 tmp = grid_value * r3x2 * r3x2;
588 accumulator[16][tid] += tmp * r3x2;
589 accumulator[17][tid] += tmp * r3xy;
590 accumulator[18][tid] += tmp * r3xz;
591 accumulator[19][tid] += tmp * r3y2;
595 T tmp = grid_value * r3x2;
596 accumulator[0][tid] += tmp * r3x2 * r3yz;
597 accumulator[1][tid] += tmp * r3x2 * r3z2;
598 accumulator[2][tid] += tmp * r3y2 * r3xy;
599 accumulator[3][tid] += tmp * r3y2 * r3xz;
600 accumulator[4][tid] += tmp * r3xy * r3z2;
601 accumulator[5][tid] += tmp * r3z2 * r3xz;
602 accumulator[6][tid] += tmp * r3y2 * r3y2;
603 accumulator[7][tid] += tmp * r3y2 * r3yz;
604 accumulator[8][tid] += tmp * r3y2 * r3z2;
605 accumulator[9][tid] += tmp * r3z2 * r3yz;
606 accumulator[10][tid] += tmp * r3z2 * r3z2;
607 tmp = grid_value * r3y2 * r3y2;
608 accumulator[11][tid] += tmp * r3xy;
609 accumulator[12][tid] += tmp * r3xz;
610 accumulator[13][tid] +=
611 grid_value * r3y2 * r3xy * r3z2;
612 accumulator[14][tid] +=
613 grid_value * r3y2 * r3z2 * r3xz;
614 accumulator[15][tid] += grid_value * r3xy * r3z2 * r3z2;
615 accumulator[16][tid] += grid_value * r3z2 * r3z2 * r3xz;
616 accumulator[17][tid] += tmp * r3y2;
617 accumulator[18][tid] += tmp * r3yz;
618 accumulator[19][tid] += tmp * r3z2;
621 for (
int ic = 0; (ic < lbatch) && ((ic + ico) < length); ic++) {
624 for (
int po = 0; po < (co.l[2] >> 1); po++)
628 for (
int po = 0; po < (co.l[1] >> 1); po++)
632 for (
int po = 0; po < (co.l[0] >> 1); po++)
636 accumulator[ic][tid] += tmp * grid_value;
644 const int max_i = min(length - ico, lbatch);
653 for (
int i = 0;
i < max_i;
i++) {
655 T val = accumulator[
i][tid] + accumulator[
i][tid + 32];
665 for (
int offset = 16; offset > 0; offset >>= 1) {
666#if defined(__CUDACC__)
667 val += __shfl_down_sync(0xffffffff, val, offset);
669 val += __shfl_down(val, offset);
674 accumulator[
i][0] = val;
679 if (tid < min(length - ico, lbatch))
680 dev_.ptr_dev[2][dev_.tasks[dev_.first_task +
block_index()].coef_offset +
681 tid + ico] = accumulator[tid][0];