83 extern __shared__ T shared_memory[];
84 const int number_of_tasks = dev_.num_tasks_per_block_dev[
block_index()];
86 if (number_of_tasks == 0)
89 T *smem_alpha = &shared_memory[0];
91 const int offset = dev_.sorted_blocks_offset_dev[
block_index()];
92 T *smem_cab =
reinterpret_cast<double *
>(
93 __builtin_assume_aligned(allocate_workspace<T>(dev_), 32));
95 for (
int tk = 0; tk < number_of_tasks; tk++) {
97 const int task_id = dev_.task_sorted_by_blocks_dev[offset + tk];
98 if (dev_.tasks[task_id].skip_task)
102 T *__restrict__ coef_ = &dev_.ptr_dev[2][dev_.tasks[task_id].coef_offset];
106 for (
int z = tid; z < task.n1 * task.n2;
107 z += blockDim.x * blockDim.y * blockDim.z)
111 block_to_cab<T, IS_FUNC_AB>(dev_, task, smem_cab);
153 if (dev_.tasks[dev_.first_task +
block_index()].skip_task)
161 extern __shared__ T coefs_[];
164 &dev_.ptr_dev[2][dev_.tasks[dev_.first_task +
block_index()].coef_offset];
169 dh_[tid] = dev_.dh_[tid];
172 for (
int i = tid;
i < ncoset(6);
i += blockDim.x * blockDim.y * blockDim.z)
173 coefs_[
i] = coef_[
i];
179 setup_task_cube_center<T, T3, distributed__>(dev_, task);
183 for (
int z = threadIdx.z; z < task.cube_size.z; z += blockDim.z) {
184 int z2 =
wrap_grid_index(z + task.cube_center.z, dev_.grid_full_size_.z);
188 if (task.apply_border_mask) {
192 if ((z2 < task.window_shift.z) || (z2 > task.window_size.z)) {
201 int ymax = task.cube_size.y - 1;
203 if (orthogonal_ && !task.apply_border_mask) {
207 for (
int y = ymin + threadIdx.y; y <= ymax; y += blockDim.y) {
208 int y2 =
wrap_grid_index(y + task.cube_center.y, dev_.grid_full_size_.y);
211 if (task.apply_border_mask) {
213 if ((y2 < task.window_shift.y) || (y2 > task.window_size.y)) {
220 int xmax = task.cube_size.x - 1;
221 if (orthogonal_ && !task.apply_border_mask) {
222 calculate_xmin_xmax_boundaries<T, T3>(task, y, kremain, xmin, xmax);
225 for (
int x = xmin + threadIdx.x; x <= xmax; x += blockDim.x) {
230 if (task.apply_border_mask) {
233 if ((x2 < task.window_shift.x) || (x2 > task.window_size.x)) {
244 (y + task.lb_cube.y + task.roffset.y),
245 (z + task.lb_cube.z + task.roffset.z));
247 const T r3x2 = r3.x * r3.x;
248 const T r3y2 = r3.y * r3.y;
249 const T r3z2 = r3.z * r3.z;
256 if (((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)) &&
257 (!orthogonal_ || task.apply_border_mask))
261 if ((!orthogonal_) &&
262 ((task.radius * task.radius) <= (r3x2 + r3y2 + r3z2)))
271 res += coefs_[1] * r3.x;
272 res += coefs_[2] * r3.y;
273 res += coefs_[3] * r3.z;
275 const T r3xy = r3.x * r3.y;
276 const T r3xz = r3.x * r3.z;
277 const T r3yz = r3.y * r3.z;
280 res += coefs_[4] * r3x2;
281 res += coefs_[5] * r3xy;
282 res += coefs_[6] * r3xz;
283 res += coefs_[7] * r3y2;
284 res += coefs_[8] * r3yz;
285 res += coefs_[9] * r3z2;
290 (coefs_[10] * r3.x + coefs_[11] * r3.y + coefs_[12] * r3.z);
291 res += r3.x * (coefs_[13] * r3y2 + coefs_[15] * r3z2);
292 res += coefs_[14] * r3xy * r3.z;
293 res += r3y2 * (coefs_[16] * r3.y + coefs_[17] * r3.z);
294 res += r3z2 * (coefs_[18] * r3.y + coefs_[19] * r3.z);
299 (coefs_[20] * r3x2 + coefs_[21] * r3xy + coefs_[22] * r3xz +
300 coefs_[23] * r3y2 + coefs_[24] * r3yz + coefs_[25] * r3z2);
302 (coefs_[26] * r3xy + coefs_[27] * r3xz + coefs_[30] * r3y2 +
303 coefs_[31] * r3yz + coefs_[32] * r3z2);
304 res += r3z2 * (coefs_[28] * r3xy + coefs_[29] * r3xz +
305 coefs_[33] * r3yz + coefs_[34] * r3z2);
309 const T r3x4 = r3x2 * r3x2;
310 const T r3y4 = r3y2 * r3y2;
311 const T r3z4 = r3z2 * r3z2;
313 res += r3x4 * (coefs_[35] * r3.x +
316 res += r3x2 * (r3.x * (coefs_[38] * r3y2 +
319 r3y2 * (coefs_[41] * r3.y +
321 r3z2 * (coefs_[43] * r3.y +
323 res += r3.x * (coefs_[45] * r3y4 +
324 r3y2 * (coefs_[46] * r3yz +
326 r3z2 * (coefs_[48] * r3yz +
328 res += r3y2 * (r3y2 * (coefs_[50] * r3.y +
330 r3z2 * (coefs_[52] * r3.y +
332 res += r3z4 * (coefs_[54] * r3.y +
336 res += r3x4 * (coefs_[56] * r3x2 +
343 res += r3x2 * (coefs_[62] * r3y2 * r3xy +
344 coefs_[63] * r3y2 * r3xz +
345 coefs_[64] * r3xy * r3z2 +
346 coefs_[65] * r3z2 * r3xz +
348 coefs_[67] * r3y2 * r3yz +
349 coefs_[68] * r3y2 * r3z2 +
350 coefs_[69] * r3yz * r3z2 +
357 res += r3y4 * (coefs_[71] * r3xy +
362 res += r3z4 * (coefs_[75] * r3xy +
369 for (
int ic = ncoset(6); ic < ncoset(task.lp); ic++) {
372 for (
int po = 0; po < (co.l[2] >> 1); po++)
376 for (
int po = 0; po < (co.l[1] >> 1); po++)
380 for (
int po = 0; po < (co.l[0] >> 1); po++)
384 res += tmp * coef_[ic];
389 res *= exp(-(r3x2 + r3y2 + r3z2) * task.zetp);
390 atomicAdd(dev_.ptr_dev[1] +
391 (z2 * dev_.grid_local_size_.y + y2) *
392 dev_.grid_local_size_.x +
440 *lp_diff = smem_params.
lp_diff();
444 kernel_params params = set_kernel_parameters(level, smem_params);
448 const dim3 threads_per_block(4, 4, 4);
450 if (
grid_[level].is_distributed()) {
451 if (
grid_[level].is_orthogonal())
452 collocate_kernel<double, double3, true, true>
456 collocate_kernel<double, double3, true, false>
460 if (
grid_[level].is_orthogonal())
461 collocate_kernel<double, double3, false, true>
465 collocate_kernel<double, double3, false, false>