// LIBXSMM-specific configuration, active only when __LIBXSMM is defined.
// NOTE(review): the embedded listing numbers skip (12, 14, 16, 18, 19), so the
// matching #endif lines and any #include directives are not visible here.
12 #if defined(__LIBXSMM)
// Default the prefetch strategy to "none" unless the build already chose one.
14 #if !defined(DBM_LIBXSMM_PREFETCH)
16 #define DBM_LIBXSMM_PREFETCH LIBXSMM_GEMM_PREFETCH_NONE
// Version gate: when LIBXSMM_VERSION_NUMBER is below
// LIBXSMM_VERSION4(1, 17, 0, 3710), alias libxsmm_dispatch_gemm to the _v2 name.
18 #if LIBXSMM_VERSION4(1, 17, 0, 3710) > LIBXSMM_VERSION_NUMBER
19 #define libxsmm_dispatch_gemm libxsmm_dispatch_gemm_v2
// Prototype for BLAS dgemm. Every argument, scalars included, is passed by
// pointer.
// NOTE(review): the final `const int *ldc` parameter and the closing `);`
// appear in the signature summary near the end of this listing but are
// missing from this fragment.
30 void dgemm_(
const char *transa,
const char *transb,
const int *m,
const int *n,
31 const int *k,
const double *alpha,
const double *
a,
const int *lda,
32 const double *
b,
const int *ldb,
const double *beta,
double *
c,
// Private convenience wrapper around dgemm_: takes the scalar arguments by
// value and forwards their addresses, while the matrix pointers a, b and c
// are passed through unchanged.
// NOTE(review): the `ldc` parameter, the function braces and the end of the
// dgemm_ call are not visible in this fragment.
39 static inline void dbm_dgemm(
const char transa,
const char transb,
const int m,
40 const int n,
const int k,
const double alpha,
41 const double *
a,
const int lda,
const double *
b,
42 const int ldb,
const double beta,
double *
c,
// Forward to dgemm_, taking the address of each by-value scalar.
45 dgemm_(&transa, &transb, &m, &n, &k, &alpha,
a, &lda,
b, &ldb, &beta,
c,
// Body of the private task hash (described in this listing as based on
// Szudzik's elegant pairing). The block dimensions are copied into unsigned
// ints, so the multiplications below wrap around instead of overflowing a
// signed type.
// NOTE(review): the function header and the return statement are not visible.
58 const unsigned int m = task.
m, n = task.
n, k = task.
k;
// Pair (m, n) into a single value mn.
59 const unsigned int mn = (m >= n) ? m * m + m + n : m + n * n;
// Pair (mn, k) with the same formula to obtain the combined value mnk.
60 const unsigned int mnk = (mn >= k) ? mn * mn + mn + k : mn + k * k;
// Body fragment of the CPU batch routine (presumably
// dbm_multiply_cpu_process_batch; its header is not visible - confirm).
// NOTE(review): the embedded listing numbers skip, so several loop bodies,
// declarations, #else/#endif lines and closing braces are missing here.
//
// LIBXSMM build: order the tasks via buckets, then run one small-GEMM kernel
// per task and only re-dispatch the kernel when the block shape changes.
79 #if defined(__LIBXSMM)
// First pass over all tasks (loop body not visible; presumably counts tasks
// per bucket - confirm).
84 for (
int itask = 0; itask < ntasks; ++itask) {
// Running sum over the buckets array (enclosing loop not visible).
89 buckets[
i] += buckets[
i - 1];
// Execution order of the tasks: batch_order[position] = task index.
92 int batch_order[ntasks];
93 for (
int itask = 0; itask < ntasks; ++itask) {
// Store the task index at the slot given by buckets[i] (how i is derived
// and how the slot is updated is not visible here).
96 batch_order[buckets[
i]] = itask;
// Kernel configuration: B is used transposed, prefetch as configured.
100 const int flags = LIBXSMM_GEMM_FLAG_TRANS_B;
101 const int prefetch = DBM_LIBXSMM_PREFETCH;
// Shape of the currently dispatched kernel, compared against each task below.
102 int kernel_m = 0, kernel_n = 0, kernel_k = 0;
// Pointers to the next task's data only exist when prefetching is enabled.
105 #if (LIBXSMM_GEMM_PREFETCH_NONE != DBM_LIBXSMM_PREFETCH)
106 double *data_a_next = NULL, *data_b_next = NULL, *data_c_next = NULL;
// The kernel handle type depends on the LIBXSMM version: libxsmm_gemmfunction
// for versions above 1.17, otherwise libxsmm_dmmfunction together with an
// explicit beta of 1.0 (the #else/#endif lines are not visible).
108 #if LIBXSMM_VERSION2(1, 17) < LIBXSMM_VERSION_NUMBER
109 libxsmm_gemmfunction kernel_func = NULL;
111 libxsmm_dmmfunction kernel_func = NULL;
112 const double beta = 1.0;
// Main loop over the tasks.
116 for (
int itask = 0; itask < ntasks; ++itask) {
// Look up the following task in execution order; for the last task the index
// is clamped to itask itself so the access stays in range.
118 task_next = batch[batch_order[(itask + 1) < ntasks ? (itask + 1) : itask]];
// Re-dispatch the kernel only if this task's shape differs from the cached one.
120 if (task.
m != kernel_m || task.
n != kernel_n || task.
k != kernel_k) {
121 #if LIBXSMM_VERSION2(1, 17) < LIBXSMM_VERSION_NUMBER
// Shape-based API: describe an m x n x k double-precision GEMM. The values
// task.m, task.n, task.m after k are the same ones passed as lda, ldb, ldc to
// the dbm_dgemm fallback below.
122 const libxsmm_gemm_shape shape = libxsmm_create_gemm_shape(
123 task.
m, task.
n, task.
k, task.
m , task.
n ,
124 task.
m , LIBXSMM_DATATYPE_F64 ,
125 LIBXSMM_DATATYPE_F64 , LIBXSMM_DATATYPE_F64 ,
126 LIBXSMM_DATATYPE_F64 );
// A kernel is dispatched here only when alpha equals 1.0 (the other branch of
// the conditional is not visible).
127 kernel_func = (LIBXSMM_FEQ(1.0, alpha)
128 ? libxsmm_dispatch_gemm(shape, (libxsmm_bitfield)flags,
129 (libxsmm_bitfield)prefetch)
// Other version branch: libxsmm_dmmdispatch receives alpha and beta directly,
// with NULL passed for the three arguments after k.
132 kernel_func = libxsmm_dmmdispatch(task.
m, task.
n, task.
k, NULL ,
133 NULL , NULL , &alpha,
134 &beta, &flags, &prefetch);
// Use the LIBXSMM kernel when dispatch succeeded.
146 if (kernel_func != NULL) {
147 #if LIBXSMM_VERSION2(1, 17) < LIBXSMM_VERSION_NUMBER
// Parameter-struct API: the primary pointers are the operands of this task.
148 libxsmm_gemm_param gemm_param;
149 gemm_param.a.primary = data_a;
150 gemm_param.b.primary = data_b;
151 gemm_param.c.primary = data_c;
152 #if (LIBXSMM_GEMM_PREFETCH_NONE != DBM_LIBXSMM_PREFETCH)
// With prefetching enabled, the quaternary pointers are set to the next
// task's blocks.
153 gemm_param.a.quaternary = pack_a->
data + task_next.
offset_a;
154 gemm_param.b.quaternary = pack_b->
data + task_next.
offset_b;
155 gemm_param.c.quaternary = shard_c->
data + task_next.
offset_c;
157 kernel_func(&gemm_param);
// Other version branch with prefetching: the next task's pointers are passed
// as extra call arguments (rest of the call not visible).
158 #elif (LIBXSMM_GEMM_PREFETCH_NONE != DBM_LIBXSMM_PREFETCH)
159 kernel_func(data_a, data_b, data_c, pack_a->
data + task_next.
offset_a,
// Other version branch without prefetching.
163 kernel_func(data_a, data_b, data_c);
// BLAS call with beta = 1.0 (accumulate into C); presumably the fallback for
// when no kernel was dispatched (the `else` is not visible - confirm).
166 dbm_dgemm(
'N',
'T', task.
m, task.
n, task.
k, alpha, data_a, task.
m, data_b,
167 task.
n, 1.0, data_c, task.
m);
// Second task loop that calls dbm_dgemm directly; presumably the build
// without LIBXSMM (the #else line is not visible - confirm).
172 for (
int itask = 0; itask < ntasks; ++itask) {
177 dbm_dgemm(
'N',
'T', task.
m, task.
n, task.
k, alpha, data_a, task.
m, data_b,
178 task.
n, 1.0, data_c, task.
m);
static const int BATCH_NUM_BUCKETS
static void dbm_dgemm(const char transa, const char transb, const int m, const int n, const int k, const double alpha, const double *a, const int lda, const double *b, const int ldb, const double beta, double *c, const int ldc)
Private convenience wrapper to hide Fortran nature of dgemm_.
static unsigned int hash(const dbm_task_t task)
Private hash function based on Szudzik's elegant pairing. Using unsigned int to return a positive number...
void dgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc)
Prototype for BLAS dgemm.
void dbm_multiply_cpu_process_batch(const int ntasks, dbm_task_t batch[ntasks], const double alpha, const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, dbm_shard_t *shard_c)
Internal routine for executing the tasks in given batch on the CPU.
void dbm_shard_allocate_promised_blocks(dbm_shard_t *shard)
Internal routine for allocating and zeroing any promised block's data.
static void const int const int i
Internal struct for storing a pack - essentially a shard for MPI.
Internal struct for storing a matrix shard.
Internal struct for storing a task, ie. a single block multiplication.