23#include "../common/grid_common.h"
28 const double *restrict
const rp,
29 double *restrict rp_c) {
31 dh_inv_[0][0] * rp[0] + dh_inv_[1][0] * rp[1] + dh_inv_[0][0] * rp[2];
33 dh_inv_[0][1] * rp[0] + dh_inv_[1][1] * rp[1] + dh_inv_[1][1] * rp[2];
35 dh_inv_[0][2] * rp[0] + dh_inv_[1][2] * rp[1] + dh_inv_[2][2] * rp[2];
45#if LIBXSMM_VERSION2(1, 17) >= \
46 LIBXSMM_VERSION2(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR) && \
47 (2079 > LIBXSMM_VERSION_PATCH)
50 m->prefetch = LIBXSMM_PREFETCH_AUTO;
54 (m->
op1 !=
'T' ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B);
56 if (m->kernel == NULL) {
58 libxsmm_dmmdispatch(m->
n, m->
m, m->
k, &m->
ldb, &m->
lda, &m->
ldc,
59 &m->
alpha, &m->
beta, &m->flags, &m->prefetch);
63 m->kernel(m->
b, m->
a, m->
c, m->
b, m->
a, m->
c);
72 if ((m->
op1 ==
'N') && (m->
op2 ==
'N'))
76 if ((m->
op1 ==
'T') && (m->
op2 ==
'N'))
80 if ((m->
op1 ==
'N') && (m->
op2 ==
'T'))
84 if ((m->
op1 ==
'T') && (m->
op2 ==
'T'))
90 if ((m->
op1 ==
'N') && (m->
op2 ==
'N'))
94 if ((m->
op1 ==
'T') && (m->
op2 ==
'N'))
98 if ((m->
op1 ==
'T') && (m->
op2 ==
'T'))
102 if ((m->
op1 ==
'N') && (m->
op2 ==
'T'))
111 assert(batch_size > 0);
113#if defined(__LIBXSMM)
114#if LIBXSMM_VERSION2(1, 17) >= \
115 LIBXSMM_VERSION2(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR) && \
116 (2079 > LIBXSMM_VERSION_PATCH)
119 m->prefetch = LIBXSMM_PREFETCH_AUTO;
123 (m->
op1 !=
'T' ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B);
125 if (m->kernel == NULL) {
127 libxsmm_dmmdispatch(m->
n, m->
m, m->
k, &m->
ldb, &m->
lda, &m->
ldc,
128 &m->
alpha, &m->
beta, &m->flags, &m->prefetch);
132 for (
int s = 0; s < batch_size - 1; s++) {
133 m->kernel(m[s].b, m[s].a, m[s].c, m[s + 1].b, m[s + 1].a, m[s + 1].c);
135 m->kernel(m[batch_size - 1].b, m[batch_size - 1].a, m[batch_size - 1].c,
136 m[batch_size - 1].b, m[batch_size - 1].a, m[batch_size - 1].c);
145 for (
int s = 0; s < batch_size; s++) {
146 if ((m[s].op1 ==
'N') && (m[s].op2 ==
'N'))
148 m[s].k, m[s].alpha, m[s].a, m[s].lda, m[s].b, m[s].ldb,
149 m[s].beta, m[s].c, m[s].ldc);
151 if ((m[s].op1 ==
'T') && (m[s].op2 ==
'N'))
153 m[s].k, m[s].alpha, m[s].a, m[s].lda, m[s].b, m[s].ldb,
154 m[s].beta, m[s].c, m[s].ldc);
156 if ((m[s].op1 ==
'N') && (m[s].op2 ==
'T'))
158 m[s].k, m[s].alpha, m[s].a, m[s].lda, m[s].b, m[s].ldb,
159 m[s].beta, m[s].c, m[s].ldc);
161 if ((m[s].op1 ==
'T') && (m[s].op2 ==
'T'))
163 m[s].alpha, m[s].a, m[s].lda, m[s].b, m[s].ldb, m[s].beta,
167 for (
int s = 0; s < batch_size; s++) {
168 if ((m[s].op1 ==
'N') && (m[s].op2 ==
'N'))
169 dgemm_(
"N",
"N", &m[s].n, &m[s].m, &m[s].k, &m[s].alpha, m[s].b,
170 &m[s].ldb, m[s].a, &m[s].lda, &m[s].beta, m[s].c, &m[s].ldc);
172 if ((m[s].op1 ==
'T') && (m[s].op2 ==
'N'))
173 dgemm_(
"N",
"T", &m[s].n, &m[s].m, &m[s].k, &m[s].alpha, m[s].b,
174 &m[s].ldb, m[s].a, &m[s].lda, &m[s].beta, m[s].c, &m[s].ldc);
176 if ((m[s].op1 ==
'T') && (m[s].op2 ==
'T'))
177 dgemm_(
"T",
"T", &m[s].n, &m[s].m, &m[s].k, &m[s].alpha, m[s].b,
178 &m[s].ldb, m[s].a, &m[s].lda, &m[s].beta, m[s].c, &m[s].ldc);
180 if ((m[s].op1 ==
'N') && (m[s].op2 ==
'T'))
181 dgemm_(
"T",
"N", &m[s].n, &m[s].m, &m[s].k, &m[s].alpha, m[s].b,
182 &m[s].ldb, m[s].a, &m[s].lda, &m[s].beta, m[s].c, &m[s].ldc);
188 const int *position,
const tensor *
const grid,
190 int position1[3] = {0, 0, 0};
193 position1[0] = position[0];
194 position1[1] = position[1];
195 position1[2] = position[2];
198 const int sizex = upper_corner[2] - lower_corner[2];
199 const int sizey = upper_corner[1] - lower_corner[1];
200 const int sizez = upper_corner[0] - lower_corner[0];
202 for (
int z = 0; z < sizez; z++) {
204 for (
int y = 0; y < sizey; y++) {
205 double *restrict src =
206 &
idx3(
grid[0], lower_corner[0] + z -
grid->window_shift[0],
207 lower_corner[1] + y -
grid->window_shift[1],
208 lower_corner[2] -
grid->window_shift[2]);
209 double *restrict dst =
210 &
idx3(subgrid[0], position1[0] + z, position1[1] + y, position1[2]);
217 for (
int x = 0; x < sizex; x++) {
228 int position1[3] = {0, 0, 0};
231 position1[0] = position[0];
232 position1[1] = position[1];
233 position1[2] = position[2];
236 const int sizex = upper_corner[2] - lower_corner[2];
237 const int sizey = upper_corner[1] - lower_corner[1];
238 const int sizez = upper_corner[0] - lower_corner[0];
240 for (
int z = 0; z < sizez; z++) {
241 double *restrict dst =
242 &
idx3(
grid[0], lower_corner[0] + z, lower_corner[1], lower_corner[2]);
243 double *restrict src =
244 &
idx3(subgrid[0], position1[0] + z, position1[1], position1[2]);
245 for (
int y = 0; y < sizey - 1; y++) {
253 for (
int x = 0; x < sizex; x++) {
263 for (
int x = 0; x < sizex; x++) {
271 const double dh[3][3],
const double dh_inv[3][3],
272 const double *rp,
double *disr_radius,
273 double *roffset,
int *cubecenter,
int *lb_cube,
274 int *ub_cube,
int *cube_size) {
281 for (
int i = 0;
i < 3;
i++) {
282 double dh_inv_rp = 0.0;
283 for (
int j = 0; j < 3; j++) {
284 dh_inv_rp += dh_inv[j][
i] * rp[j];
286 rp1[2 -
i] = dh_inv_rp;
287 cubecenter[2 -
i] = floor(dh_inv_rp);
292 const double dx[3] = {dh[2][2], dh[1][1], dh[0][0]};
293 const double dx_inv[3] = {dh_inv[2][2], dh_inv[1][1], dh_inv[0][0]};
299 const double drmin = fmin(dh[0][0], fmin(dh[1][1], dh[2][2]));
300 *disr_radius = drmin * fmax(1.0, ceil(radius / drmin));
302 for (
int i = 0;
i < 3;
i++) {
303 roffset[
i] = rp[2 -
i] - ((double)cubecenter[
i]) * dx[
i];
306 for (
int i = 0;
i < 3;
i++) {
307 lb_cube[
i] = ceil(-1e-8 - *disr_radius * dx_inv[
i]);
311 for (
int i = 0;
i < 3;
i++) {
312 ub_cube[
i] = 1 - lb_cube[
i];
316 for (
int idir = 0; idir < 3; idir++) {
317 lb_cube[idir] = INT_MAX;
318 ub_cube[idir] = INT_MIN;
323 for (
int i = -1;
i <= 1;
i++) {
324 for (
int j = -1; j <= 1; j++) {
325 for (
int k = -1; k <= 1; k++) {
326 double x[3] = { ((double)
i) * radius,
327 ((double)j) * radius,
328 ((double)k) * radius};
330 for (
int idir = 0; idir < 3; idir++) {
331 const double resc = dh_inv[0][idir] * x[0] +
332 dh_inv[1][idir] * x[1] + dh_inv[2][idir] * x[2];
333 lb_cube[2 - idir] =
imin(lb_cube[2 - idir], floor(resc));
334 ub_cube[2 - idir] =
imax(ub_cube[2 - idir], ceil(resc));
342 for (
int i = 0;
i < 3;
i++) {
343 roffset[
i] = rp1[
i] - cubecenter[
i];
346 *disr_radius = radius;
352 cube_size[0] = ub_cube[0] - lb_cube[0] + 1;
353 cube_size[1] = ub_cube[1] - lb_cube[1] + 1;
354 cube_size[2] = ub_cube[2] - lb_cube[2] + 1;
356 for (
int i = 0;
i < 3;
i++) {
364 const int *
const cube_center,
365 const int *
const lower_boundaries_cube,
366 const int *
const period,
int *
const position) {
367 for (
int i = 0;
i < 3;
i++)
368 position[
i] =
modulo(cube_center[
i] - lb_grid[
i] + lower_boundaries_cube[
i],
373 double norm1, norm2, norm3;
375 norm1 = dh[0][0] * dh[0][0] + dh[0][1] * dh[0][1] + dh[0][2] * dh[0][2];
376 norm2 = dh[1][0] * dh[1][0] + dh[1][1] * dh[1][1] + dh[1][2] * dh[1][2];
377 norm3 = dh[2][0] * dh[2][0] + dh[2][1] * dh[2][1] + dh[2][2] * dh[2][2];
379 norm1 = 1.0 / sqrt(norm1);
380 norm2 = 1.0 / sqrt(norm2);
381 norm3 = 1.0 / sqrt(norm3);
385 ((fabs(dh[0][0] * dh[2][0] + dh[0][1] * dh[2][1] + dh[0][2] * dh[2][2]) *
386 norm1 * norm3) < 1e-12);
389 ((fabs(dh[1][0] * dh[2][0] + dh[1][1] * dh[2][1] + dh[1][2] * dh[2][2]) *
390 norm2 * norm3) < 1e-12);
393 ((fabs(dh[0][0] * dh[1][0] + dh[0][1] * dh[1][1] + dh[0][2] * dh[1][2]) *
394 norm1 * norm2) < 1e-12);
398extern void dger_(
const int *M,
const int *N,
const double *alpha,
399 const double *X,
const int *incX,
const double *Y,
400 const int *incY,
double *A,
const int *lda);
401extern void dgemv_(
const char *Trans,
const int *M,
const int *N,
402 const double *alpha,
const double *A,
const int *lda,
403 const double *X,
const int *incX,
const double *beta,
404 double *Y,
const int *incY);
407 const int incX,
double *Y,
const int incY) {
408 if ((incX == 1) && (incY == 1)) {
409 for (
int i = 0;
i < N;
i++)
410 Y[
i] += alpha * X[
i];
415 for (
int i = 0;
i < N;
i++)
416 Y[
i + incY] += alpha * X[
i];
421 for (
int i = 0;
i < N;
i++)
422 Y[
i] += alpha * X[
i + incX];
426 for (
int i = 0;
i < N;
i++)
427 Y[
i + incY] += alpha * X[
i + incX];
431double cblas_ddot(
const int N,
const double *X,
const int incX,
const double *Y,
433 if ((incX == incY) && (incY == 1)) {
436 for (
int i = 0;
i < N;
i++) {
445 for (
int i = 0;
i < N;
i++) {
446 res += X[
i] * Y[
i + incY];
454 for (
int i = 0;
i < N;
i++) {
455 res += X[
i + incX] * Y[
i];
462 for (
int i = 0;
i < N;
i++) {
463 res += X[
i + incX] * Y[
i + incY];
469 const double alpha,
const double *X,
const int incX,
470 const double *Y,
const int incY,
double *A,
const int lda) {
472 dger_(&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
474 dger_(&N, &M, &alpha, X, &incX, Y, &incY, A, &lda);
481 const int M,
const int N,
const double alpha,
const double *A,
482 const int lda,
const double *X,
const int incX,
483 const double beta,
double *Y,
const int incY) {
487 dgemv_(
"T", &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
489 dgemv_(
"N", &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
493 dgemv_(
"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
495 dgemv_(
"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
502 const int cube_size,
const int x1,
int *x,
503 int *
const lower_corner,
int *
const upper_corner,
505 if (size == full_size) {
523 *upper_corner = x1 + 1;
531 for (
int i = *x + 1; (
i < cube_size) && (*upper_corner ==
map[
i]) &&
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
#define GRID_PRAGMA_SIMD(OBJS, N)
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
static void const int const int const int const int const int const double const int const int const int int GRID_CONST_WHEN_COLLOCATE double GRID_CONST_WHEN_INTEGRATE double * grid
static void const int const int i
static void const int cmax
static void const int const int const int const int const int const double const int map[3][2 *cmax+1]
int compute_cube_properties(const bool ortho, const double radius, const double dh[3][3], const double dh_inv[3][3], const double *rp, double *disr_radius, double *roffset, int *cubecenter, int *lb_cube, int *ub_cube, int *cube_size)
double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY)
void dgemv_(const char *Trans, const int *M, const int *N, const double *alpha, const double *A, const int *lda, const double *X, const int *incX, const double *beta, double *Y, const int *incY)
void extract_sub_grid(const int *lower_corner, const int *upper_corner, const int *position, const tensor *const grid, tensor *const subgrid)
void dger_(const int *M, const int *N, const double *alpha, const double *X, const int *incX, const double *Y, const int *incY, double *A, const int *lda)
void convert_to_lattice_coordinates(const double dh_inv_[3][3], const double *restrict const rp, double *restrict rp_c)
void compute_interval(const int *const map, const int full_size, const int size, const int cube_size, const int x1, int *x, int *const lower_corner, int *const upper_corner, const Interval window)
void return_cube_position(const int *const lb_grid, const int *const cube_center, const int *const lower_boundaries_cube, const int *const period, int *const position)
void cblas_dger(const CBLAS_LAYOUT Layout, const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda)
void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY)
void verify_orthogonality(const double dh[3][3], bool orthogonal[3])
void cblas_dgemv(const CBLAS_LAYOUT order, const CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY)
void dgemm_simplified(dgemm_params *const m)
void add_sub_grid(const int *lower_corner, const int *upper_corner, const int *position, const tensor *subgrid, tensor *grid)
void batched_dgemm_simplified(dgemm_params *const m, const int batch_size)