#include "../offload/offload_runtime.h"

// Change "&& 0" to "&& 1" below to enable validation of GPU results against
// LIBXSMM; as written, the define is never set and validation stays off.
#if !defined(DBM_VALIDATE_AGAINST_LIBXSMM) && 0
#define DBM_VALIDATE_AGAINST_LIBXSMM
#endif
static float *compute_rows_max_eps(const bool trans, const dbm_matrix_t *matrix,
                                   const double filter_eps) {
  const int nrows = (trans) ? matrix->ncols : matrix->nrows;
  int *nblocks_per_row = calloc(nrows, sizeof(int));
  float *row_max_eps = malloc(nrows * sizeof(float));
  assert(row_max_eps != NULL);
  // Count the number of blocks in each row (or column when transposed).
  /* ... (for each shard of the matrix) ... */
  for (int iblock = 0; iblock < shard->nblocks; iblock++) {
    const dbm_block_t *blk = &shard->blocks[iblock];
    const int row = (trans) ? blk->col : blk->row;
    nblocks_per_row[row]++;
  }
  /* ... (counts are summed across MPI ranks, cf. dbm_mpi_sum_int) ... */

  // Distribute the filter threshold evenly across the blocks of each row.
  for (int i = 0; i < nrows; i++) {
    const float f =
        ((float)filter_eps) / ((float)imax(1, nblocks_per_row[i]));
    row_max_eps[i] = f * f; // squared, to match the squared block norms
  }
  free(nblocks_per_row);
  return row_max_eps;
}
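/* Illustrative sketch (added, not part of the original source): the
 * arithmetic behind row_max_eps. Each of a row's nblocks products may
 * contribute up to filter_eps / nblocks to the resulting block, and the
 * budget is squared because the block norms carried by the packs are stored
 * as squared values (note the f * f above and alpha2 = alpha * alpha below). */
static float example_row_threshold(const double filter_eps, const int nblocks) {
  const float f = ((float)filter_eps) / ((float)(nblocks > 0 ? nblocks : 1));
  return f * f; // later compared against alpha^2 * norm(A) * norm(B)
}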
// Member of backend_context_t:
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
  dbm_multiply_gpu_context_t gpu;
#endif
// In backend_start():
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
  dbm_multiply_gpu_start(DBM_MAX_BATCH_SIZE, dbm_get_num_shards(matrix_c),
                         matrix_c->shards, &ctx->gpu);
#endif
// In backend_upload_packs():
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
  dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx->gpu);
#endif
// In backend_process_batch():
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
  dbm_multiply_gpu_process_batch(ntasks, batch, alpha, kshard, &ctx->gpu);
#if defined(DBM_VALIDATE_AGAINST_LIBXSMM) && defined(__LIBXSMM)
  dbm_shard_gpu_t *const shard_g = &ctx->gpu.shards_c_dev[kshard];
  /* ... (a host copy of shard_c is prepared as the CPU reference,
     cf. dbm_shard_init/dbm_shard_copy) ... */
  assert(shard_c->data_size == shard_g->data_size);
  /* ... */
  offloadMemcpyAsyncDtoH(shard_c->data, shard_g->data,
                         shard_c->data_size * sizeof(double), shard_g->stream);
  /* ... (reference result computed on the CPU in the meantime) ... */
  offloadStreamSynchronize(shard_g->stream);
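  /* Note (added): offloadMemcpyAsyncDtoH() only enqueues the copy on
   * shard_g's stream; the offloadStreamSynchronize() call above is what
   * guarantees shard_c->data holds the device results before they are
   * compared against the CPU reference below. */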
  libxsmm_matdiff_info diff;
  libxsmm_matdiff_clear(&diff);
  for (int itask = 0; itask < ntasks; ++itask) {
    const dbm_task_t task = batch[itask];
    const double *const tst = &shard_c->data[task.offset_c];
    /* ... (ref points into the CPU reference shard at the same offset) ... */
    libxsmm_matdiff_info d;
    if (EXIT_SUCCESS == libxsmm_matdiff(&d, LIBXSMM_DATATYPE(double), task.m,
                                        task.n, ref, tst, NULL /*ldref*/,
                                        NULL /*ldtst*/)) {
      libxsmm_matdiff_reduce(&diff, &d);
    }
  }
  const char *const maxeps_env = getenv("DBM_MULTIPLY_MAXEPS");
  const double maxeps = (NULL == maxeps_env ? 1E-13 : fabs(atof(maxeps_env)));
  const double epsilon = libxsmm_matdiff_epsilon(&diff);
  if (maxeps < epsilon) {
    if (LIBXSMM_NOTNAN(diff.v_tst)) {
      fprintf(stderr, "INFO ACC/LIBDBM: diff=%g (|%g-%g|=%g)\n", epsilon,
              diff.v_ref, diff.v_tst, diff.linf_abs);
    } else {
      fprintf(stderr, "INFO ACC/LIBDBM: diff=%g\n", epsilon);
    }
  }
#endif // DBM_VALIDATE_AGAINST_LIBXSMM
#else
  (void)ctx; // GPU backend not compiled in: process the batch on the CPU.
  dbm_multiply_cpu_process_batch(ntasks, batch, alpha, pack_a, pack_b, shard_c);
#endif
}
// In backend_download_results():
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
  dbm_multiply_gpu_download_results(&ctx->gpu);
#endif
// In backend_stop():
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
  dbm_multiply_gpu_stop(&ctx->gpu);
#endif
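/* Illustrative sketch (added): every backend_* routine above shares this
 * shape. The device call is compiled in only when offloading is enabled;
 * otherwise the routine is a no-op, or falls back to the CPU path as in
 * backend_process_batch. */
static void example_backend_routine(backend_context_t *ctx) {
#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
  /* device-side work using &ctx->gpu would go here */
  (void)ctx;
#else
  (void)ctx; // host-only build: nothing to do
#endif
}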
static void multiply_packs(const bool transa, const bool transb,
                           const double alpha, const dbm_pack_t *pack_a,
                           const dbm_pack_t *pack_b,
                           const dbm_matrix_t *matrix_a,
                           const dbm_matrix_t *matrix_b, dbm_matrix_t *matrix_c,
                           const bool retain_sparsity,
                           const float *rows_max_eps, int64_t *flop,
                           backend_context_t *ctx) {
  const float alpha2 = alpha * alpha;
  int64_t flop_sum = 0;

  const int nshard_rows = matrix_c->dist->rows.nshards;
  const int nshard_cols = matrix_c->dist->cols.nshards;
  int *shard_row_start = calloc(nshard_rows, sizeof(int));
  int *shard_col_start = calloc(nshard_cols, sizeof(int));
  assert(NULL != shard_row_start && NULL != shard_col_start);

  const int *sum_index_sizes_a =
      (transa) ? matrix_a->row_sizes : matrix_a->col_sizes;
  const int *sum_index_sizes_b =
      (transb) ? matrix_b->col_sizes : matrix_b->row_sizes;
  const int *free_index_sizes_a =
      (transa) ? matrix_a->col_sizes : matrix_a->row_sizes;
  const int *free_index_sizes_b =
      (transb) ? matrix_b->row_sizes : matrix_b->col_sizes;
#pragma omp parallel reduction(+ : flop_sum)
  {
    // Blocks within a pack are ordered by shard; record where each shard's
    // run of blocks starts so the pair loops below can skip directly to it
    // (a serial sketch of this run-start indexing follows the function).
#pragma omp for nowait
    for (int iblock = 1; iblock < pack_a->nblocks; iblock++) {
      const int shard_row = pack_a->blocks[iblock].free_index % nshard_rows;
      const int prev_shard_row =
          pack_a->blocks[iblock - 1].free_index % nshard_rows;
      if (prev_shard_row != shard_row) {
        shard_row_start[shard_row] = iblock;
      }
    }

#pragma omp for
    for (int jblock = 1; jblock < pack_b->nblocks; jblock++) {
      const int shard_col = pack_b->blocks[jblock].free_index % nshard_cols;
      const int prev_shard_col =
          pack_b->blocks[jblock - 1].free_index % nshard_cols;
      if (prev_shard_col != shard_col) {
        shard_col_start[shard_col] = jblock;
      }
    }
#pragma omp for collapse(2) DBM_OMP_SCHEDULE
    for (int shard_row = 0; shard_row < nshard_rows; shard_row++) {
      for (int shard_col = 0; shard_col < nshard_cols; shard_col++) {
        const int ishard = shard_row * nshard_cols + shard_col;
        dbm_shard_t *shard_c = &matrix_c->shards[ishard];
        dbm_task_t batch[DBM_MAX_BATCH_SIZE];
        int ntasks = 0;

        // Blocks are ordered first by shard: start at this shard's run.
        const int iblock_start = shard_row_start[shard_row];
        int jblock_start = shard_col_start[shard_col];
        for (int iblock = iblock_start; iblock < pack_a->nblocks; iblock++) {
          const dbm_pack_block_t *blk_a = &pack_a->blocks[iblock];
          if (blk_a->free_index % nshard_rows != shard_row) {
            break; // left this shard's row range
          }
          for (int jblock = jblock_start; jblock < pack_b->nblocks; jblock++) {
            const dbm_pack_block_t *blk_b = &pack_b->blocks[jblock];
            if (blk_b->free_index % nshard_cols != shard_col) {
              break; // left this shard's column range
            }
            /* ... (blocks whose sum indices do not match are skipped) ... */

            // Filter candidate products by norm: compare the (squared) norm
            // of the result block against this row's threshold.
            const float result_norm = alpha2 * blk_a->norm * blk_b->norm;
            if (result_norm < rows_max_eps[blk_a->free_index]) {
              continue;
            }

            // Determine the block dimensions.
            const int m = free_index_sizes_a[blk_a->free_index];
            const int n = free_index_sizes_b[blk_b->free_index];
            const int k = sum_index_sizes_a[blk_a->sum_index];
            assert(k == sum_index_sizes_b[blk_b->sum_index]);

            /* ... (row/col of the C block derived from the free indices;
               blk_c looked up via dbm_shard_lookup(shard_c, row, col)) ... */
            if (blk_c == NULL && retain_sparsity) {
              continue; // sparsity pattern of C is fixed: skip this product
            } else if (blk_c == NULL) {
              assert(dbm_get_stored_coordinates(matrix_c, row, col) ==
                     matrix_c->dist->my_rank);
              /* ... (blk_c promised via dbm_shard_promise_new_block) ... */
            }

            // Count flops.
            const int64_t task_flops = 2LL * m * n * k;
            if (task_flops == 0) {
              continue; // empty block, nothing to do
            }
            flop_sum += task_flops;
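            /* Worked example (added note): one m x n x k block product
             * performs m*n*k fused multiply-adds, i.e. 2*m*n*k flops; for
             * m = n = k = 32 that is 2 * 32 * 32 * 32 = 65536. The 2LL
             * literal keeps the product in 64-bit arithmetic before it is
             * accumulated into flop_sum. */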
            /* ... (task appended to the batch; the batch is flushed to the
               backend whenever it reaches DBM_MAX_BATCH_SIZE) ... */
          }
        }
        // Flush the remaining tasks of this shard to the backend.
        backend_process_batch(ntasks, batch, alpha, pack_a, pack_b,
                              ishard, shard_c, ctx);
      }
    }
  } // end of parallel region
  free(shard_row_start);
  free(shard_col_start);
  *flop += flop_sum;
}
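/* Illustrative sketch (added): a serial equivalent of the two run-start
 * loops in multiply_packs. For keys appearing in contiguous runs, record the
 * index where each run begins; bucket 0 implicitly starts at 0, matching the
 * calloc of shard_row_start/shard_col_start. */
static void example_run_starts(const int *keys, const int nkeys,
                               int *start /* zero-initialized */) {
  for (int i = 1; i < nkeys; i++) {
    if (keys[i] != keys[i - 1]) {
      start[keys[i]] = i; // first element of a new run
    }
  }
}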
void dbm_multiply(const bool transa, const bool transb, const double alpha,
                  const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b,
                  const double beta, dbm_matrix_t *matrix_c,
                  const bool retain_sparsity, const double filter_eps,
                  int64_t *flop) {
  assert(omp_get_num_threads() == 1);
  // The "sum" index is contracted over; the "free" indices survive into C.
  const int num_sum_index_a = (transa) ? matrix_a->nrows : matrix_a->ncols;
  const int num_sum_index_b = (transb) ? matrix_b->ncols : matrix_b->nrows;
  const int num_free_index_a = (transa) ? matrix_a->ncols : matrix_a->nrows;
  const int num_free_index_b = (transb) ? matrix_b->nrows : matrix_b->ncols;

  // Sanity check inputs.
  assert(num_sum_index_a == num_sum_index_b);
  assert(num_free_index_a == matrix_c->nrows);
  assert(num_free_index_b == matrix_c->ncols);
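  /* Worked example (added note): for C = A^T * B (transa = true, transb =
   * false), A's rows carry the summation index and A's columns the free
   * index, so num_sum_index_a == matrix_a->nrows and num_free_index_a ==
   * matrix_a->ncols; the asserts then require A and B to agree on the
   * summation extent and the free extents to match C's shape. */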
  // Prepare matrix_c: apply the beta scaling before accumulating alpha*A*B.
  dbm_scale(matrix_c, beta);
    /* ... (the communication iterator loops over pairs of packs) ... */
    multiply_packs(transa, transb, alpha, pack_a, pack_b, matrix_a, matrix_b,
                   matrix_c, retain_sparsity, rows_max_eps, flop, ctx);
  /* ... */
  // Final filter pass over the result matrix.
  dbm_filter(matrix_c, filter_eps);
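/* Illustrative usage sketch (added; hypothetical matrices a, b, c with
 * compatible row/column block sizes): C := 0.0 * C + 1.0 * A * B with a
 * 1e-10 filter threshold. Must be called outside of an OpenMP parallel
 * region, per the assert at the top of dbm_multiply(). */
static void example_multiply(const dbm_matrix_t *a, const dbm_matrix_t *b,
                             dbm_matrix_t *c) {
  int64_t flop = 0;
  dbm_multiply(false, false, 1.0, a, b, 0.0, c,
               false /*retain_sparsity*/, 1e-10 /*filter_eps*/, &flop);
}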
#define DBM_MAX_BATCH_SIZE
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
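A minimal implementation consistent with this description (an illustrative sketch; the definition site is not shown here):
static inline int imax(int x, int y) { return (x > y) ? x : y; }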
void dbm_library_counter_increment(const int m, const int n, const int k)
Add given block multiplication to stats. This routine is thread-safe.
static int dbm_get_shard_index(const dbm_matrix_t *matrix, const int row, const int col)
Internal routine for getting a block's shard index.
static int dbm_get_num_shards(const dbm_matrix_t *matrix)
Internal routine that returns the number of shards for given matrix.
void dbm_mempool_host_free(const void *memory)
Private routine for releasing memory back to the pool.
void * dbm_mempool_host_malloc(size_t size)
Private routine for allocating host or device memory from the pool.
void dbm_mpi_sum_int(int *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_INT.
static void backend_upload_packs(const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, backend_context_t *ctx)
Private routine for handing newly arrived packs to the backend.
static float * compute_rows_max_eps(const bool trans, const dbm_matrix_t *matrix, const double filter_eps)
Private routine for computing the max filter threshold for each row.
static backend_context_t * backend_start(const dbm_matrix_t *matrix_c)
Private routine for initializing the multiplication backend.
static void backend_stop(backend_context_t *ctx)
Private routine for shutting down the multiplication backend.
static void backend_download_results(backend_context_t *ctx)
Private routine for downloading results of the multiplication backend.
static void backend_process_batch(const int ntasks, const dbm_task_t batch[ntasks], const double alpha, const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, const int kshard, dbm_shard_t *shard_c, backend_context_t *ctx)
Private routine for sending a batch to the multiplication backend.
static void multiply_packs(const bool transa, const bool transb, const double alpha, const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b, dbm_matrix_t *matrix_c, const bool retain_sparsity, const float *rows_max_eps, int64_t *flop, backend_context_t *ctx)
Private routine for multiplying two packs.
dbm_comm_iterator_t * dbm_comm_iterator_start(const bool transa, const bool transb, const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b, const dbm_matrix_t *matrix_c)
Internal routine for creating a communication iterator.
void dbm_comm_iterator_stop(dbm_comm_iterator_t *iter)
Internal routine for releasing the given communication iterator.
bool dbm_comm_iterator_next(dbm_comm_iterator_t *iter, dbm_pack_t **pack_a, dbm_pack_t **pack_b)
Internal routine for retrieving the next pair of packs from the given iterator.
void dbm_multiply_cpu_process_batch(const int ntasks, const dbm_task_t batch[ntasks], const double alpha, const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, dbm_shard_t *shard_c)
Internal routine for executing the tasks in given batch on the CPU.
Private hash function based on Szudzik's elegant pairing. Using unsigned int to return a positive number even after overflow.
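Szudzik's elegant pairing maps two non-negative integers to a unique integer. A sketch of the scheme this description refers to (illustrative; the exact signature in the source may differ):
static inline unsigned int szudzik_pair(const unsigned int a, const unsigned int b) {
  // Injective while a*a + a + b fits into unsigned int; unsigned arithmetic
  // wraps around instead of overflowing, keeping the result non-negative.
  return (a >= b) ? (a * a + a + b) : (a + b * b);
}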
void dbm_shard_release(dbm_shard_t *shard)
Internal routine for releasing a shard.
dbm_block_t * dbm_shard_promise_new_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for allocating the metadata of a new block.
dbm_block_t * dbm_shard_lookup(const dbm_shard_t *shard, const int row, const int col)
Internal routine for looking up a block from a shard.
void dbm_shard_allocate_promised_blocks(dbm_shard_t *shard)
Internal routine for allocating and zeroing any promised block's data.
void dbm_shard_init(dbm_shard_t *shard)
Internal routine for initializing a shard.
void dbm_shard_copy(dbm_shard_t *shard_a, const dbm_shard_t *shard_b)
Internal routine for copying content of shard_b into shard_a.
Private struct for storing the context of the multiplication backend.
Internal struct for storing a block's metadata.
Internal struct for storing a communication iterator.
Internal struct for storing a matrix.
dbm_distribution_t * dist
Internal struct for storing a dbm_block_t plus its norm.
Internal struct for storing a pack - essentially a shard for MPI.
dbm_pack_block_t * blocks
Internal struct for storing a matrix shard.
Internal struct for storing a task, i.e. a single block multiplication.