15 #if defined(__LIBXSMM)
19 #include "../offload/offload_library.h"
29 if (output_unit == 0) {
30 printf(
"%s", message);
38 static inline int imin(
int x,
int y) {
return (x < y ? x : y); }
45 const int row_size,
const int col_size,
47 int cart_dims[2], cart_periods[2], cart_coords[2];
51 int *row_dist = malloc(nrows *
sizeof(
int));
52 int *col_dist = malloc(ncols *
sizeof(
int));
53 for (
int i = 0;
i < nrows;
i++) {
54 row_dist[
i] =
i % cart_dims[0];
56 for (
int i = 0;
i < ncols;
i++) {
57 col_dist[
i] =
i % cart_dims[1];
66 int *row_sizes = malloc(nrows *
sizeof(
int));
67 int *col_sizes = malloc(ncols *
sizeof(
int));
68 for (
int i = 0;
i < nrows;
i++) {
69 row_sizes[
i] = row_size;
71 for (
int i = 0;
i < ncols;
i++) {
72 col_sizes[
i] = col_size;
75 dbm_create(&matrix, dist,
"some name", nrows, ncols, row_sizes, col_sizes);
88 const int *row_sizes, *col_sizes;
95 #pragma omp for collapse(2)
96 for (
int row = 0; row < nrows; row++) {
97 for (
int col = 0; col < ncols; col++) {
104 int *reserve_row = malloc(nblocks *
sizeof(
int));
105 int *reserve_col = malloc(nblocks *
sizeof(
int));
107 #pragma omp for collapse(2)
108 for (
int row = 0; row < nrows; row++) {
109 for (
int col = 0; col < ncols; col++) {
112 reserve_row[iblock] = row;
113 reserve_col[iblock] = col;
118 assert(iblock == nblocks);
135 int row, col, row_size, col_size;
138 const int block_size = row_size * col_size;
139 for (
int i = 0;
i < block_size;
i++) {
161 const double time_start_multiply = omp_get_wtime();
162 dbm_multiply(
false,
false, 1.0, matrix_a, matrix_b, 1.0, matrix_c,
false,
164 const double time_end_multiply = omp_get_wtime();
168 const double expected = (int64_t)M * (int64_t)m * (int64_t)N * (int64_t)n *
169 (int64_t)K * (int64_t)K * (int64_t)k * (int64_t)k;
177 printf(
"%5i x %5i x %5i with %3i x %3i x %3i blocks: ", M, N, K, m, n, k);
179 if (checksum == expected) {
182 const double duration = time_end_multiply - time_start_multiply;
183 printf(
"%6.3f s => %6.1f GFLOP/s\n", duration, 1e-9 * flop / duration);
188 fprintf(stderr,
"Expected checksum %f but got %f.\n", expected, checksum);
197 int main(
int argc,
char *argv[]) {
210 int dims[2] = {0, 0};
212 const int periods[2] = {
true,
true};
217 printf(
"OpenMP-threads: %i GPUs: %i", omp_get_max_threads(),
219 #if defined(__LIBXSMM)
220 printf(
" Libxsmm: %s", LIBXSMM_VERSION);
222 printf(
" Libxsmm: n/a");
224 #if defined(__parallel)
225 printf(
" MPI-ranks: %i MPI-cart: %i x %i", nranks, dims[0], dims[1]);
257 FILE *
const file = fopen(argv[1],
"r");
259 const char delims[] =
"x,;:|/\t ";
260 int mnk[] = {0, 0, 0},
i = 1, j = 0;
262 (NULL == file || NULL != fgets(buffer,
sizeof(buffer), file))) {
263 const char *arg = strtok(NULL != file ? buffer : argv[
i], delims);
264 for (; NULL != arg; arg = strtok(NULL, delims)) {
275 }
else if (++
i < argc) {
280 0 < mnk[2] ? mnk[2] : mnk[0], comm);
281 mnk[0] = mnk[1] = mnk[2] = 0;
void dbm_get_col_sizes(const dbm_matrix_t *matrix, int *ncols, const int **col_sizes)
Returns the column block sizes of the given matrix.
void dbm_get_row_sizes(const dbm_matrix_t *matrix, int *nrows, const int **row_sizes)
Returns the row block sizes of the given matrix.
int main(int argc, char *argv[])
Stand-alone miniapp for smoke-testing and benchmarking dbm_multiply.
static int imin(int x, int y)
Returns the smaller of two given integer (missing from the C standard)
static void set_all_blocks(dbm_matrix_t *matrix)
Private routine for setting all blocks to 1.0.
static void print_func(char *message, int output_unit)
Wrapper for printf, passed to dbm_library_print_stats.
void benchmark_multiply(const int M, const int N, const int K, const int m, const int n, const int k, const dbm_mpi_comm_t comm)
Run a benchmark of dbm_multiply with given block sizes.
static void reserve_all_blocks(dbm_matrix_t *matrix)
Private routine for reserving all blocks of the given matrix.
static dbm_matrix_t * create_some_matrix(const int nrows, const int ncols, const int row_size, const int col_size, const dbm_mpi_comm_t comm)
Private routine for creating a distribution and an empty matrix.
void dbm_mpi_finalize()
Wrapper around MPI_Finalize.
int dbm_mpi_comm_rank(const dbm_mpi_comm_t comm)
Wrapper around MPI_Comm_rank.
int dbm_mpi_comm_size(const dbm_mpi_comm_t comm)
Wrapper around MPI_Comm_size.
void dbm_mpi_init(int *argc, char ***argv)
Wrapper around MPI_Init.
dbm_mpi_comm_t dbm_mpi_cart_create(const dbm_mpi_comm_t comm_old, const int ndims, const int dims[], const int periods[], const int reorder)
Wrapper around MPI_Cart_create.
void dbm_mpi_sum_int64(int64_t *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_INT64_T.
void dbm_mpi_dims_create(const int nnodes, const int ndims, int dims[])
Wrapper around MPI_Dims_create.
int dbm_mpi_comm_c2f(const dbm_mpi_comm_t comm)
Wrapper around MPI_Comm_c2f.
void dbm_mpi_comm_free(dbm_mpi_comm_t *comm)
Wrapper around MPI_Comm_free.
void dbm_mpi_cart_get(const dbm_mpi_comm_t comm, int maxdims, int dims[], int periods[], int coords[])
Wrapper around MPI_Cart_get.
dbm_mpi_comm_t dbm_mpi_get_comm_world()
Returns MPI_COMM_WORLD.
static void const int const int i
subroutine, public dbm_multiply(transa, transb, alpha, matrix_a, matrix_b, beta, matrix_c, retain_sparsity, filter_eps, flop)
Computes matrix product: matrix_c = alpha * matrix_a * matrix_b + beta * matrix_c.
subroutine, public dbm_distribution_release(dist)
Decreases the reference counter of the given distribution.
subroutine, public dbm_library_init()
Initialize DBM library.
subroutine, public dbm_iterator_next_block(iterator, row, column, block, row_size, col_size)
Returns the next block from the given iterator.
subroutine, public dbm_get_stored_coordinates(matrix, row, column, processor)
Returns the MPI rank on which the given block should be stored.
subroutine, public dbm_create(matrix, name, dist, row_block_sizes, col_block_sizes)
Creates a new matrix.
logical function, public dbm_iterator_blocks_left(iterator)
Tests whether the given iterator has any block left.
subroutine, public dbm_reserve_blocks(matrix, rows, cols)
Adds given list of blocks efficiently. The blocks will be filled with zeros.
subroutine, public dbm_library_finalize()
Finalize DBM library.
subroutine, public dbm_iterator_stop(iterator)
Releases the given iterator.
real(kind=dp) function, public dbm_checksum(matrix)
Computes a checksum of the given matrix.
subroutine, public dbm_iterator_start(iterator, matrix)
Creates an iterator for the blocks of the given matrix. The iteration order is not stable.
subroutine, public dbm_library_print_stats(mpi_comm, output_unit)
Print DBM library statistics.
subroutine, public dbm_release(matrix)
Releases a matrix and all its ressources.
subroutine, public dbm_distribution_new(dist, mp_comm, row_dist_block, col_dist_block)
Creates a new two dimensional distribution.
subroutine, public offload_set_chosen_device(device_id)
Selects the chosen device to be used.
integer function, public offload_get_device_count()
Returns the number of available devices.
Internal struct for storing a two dimensional distribution.
Internal struct for storing a block iterator.
Internal struct for storing a matrix.
dbm_distribution_t * dist