45 const int row_size,
const int col_size,
47 int cart_dims[2], cart_periods[2], cart_coords[2];
51 int *row_dist = malloc(nrows *
sizeof(
int));
52 int *col_dist = malloc(ncols *
sizeof(
int));
53 for (
int i = 0;
i < nrows;
i++) {
54 row_dist[
i] =
i % cart_dims[0];
56 for (
int i = 0;
i < ncols;
i++) {
57 col_dist[
i] =
i % cart_dims[1];
61 dbm_distribution_new(&dist, fortran_comm, nrows, ncols, row_dist, col_dist);
66 int *row_sizes = malloc(nrows *
sizeof(
int));
67 int *col_sizes = malloc(ncols *
sizeof(
int));
68 for (
int i = 0;
i < nrows;
i++) {
69 row_sizes[
i] = row_size;
71 for (
int i = 0;
i < ncols;
i++) {
72 col_sizes[
i] = col_size;
75 dbm_create(&matrix, dist,
"some name", nrows, ncols, row_sizes, col_sizes);
76 dbm_distribution_release(dist);
88 const int *row_sizes, *col_sizes;
95#pragma omp for collapse(2)
96 for (
int row = 0; row < nrows; row++) {
97 for (
int col = 0; col < ncols; col++) {
98 if (dbm_get_stored_coordinates(matrix, row, col) ==
104 int *reserve_row = malloc(nblocks *
sizeof(
int));
105 int *reserve_col = malloc(nblocks *
sizeof(
int));
107#pragma omp for collapse(2)
108 for (
int row = 0; row < nrows; row++) {
109 for (
int col = 0; col < ncols; col++) {
110 if (dbm_get_stored_coordinates(matrix, row, col) ==
112 reserve_row[iblock] = row;
113 reserve_col[iblock] = col;
118 assert(iblock == nblocks);
119 dbm_reserve_blocks(matrix, nblocks, reserve_row, reserve_col);
161 const double time_start_multiply = omp_get_wtime();
162 dbm_multiply(
false,
false, 1.0, matrix_a, matrix_b, 1.0, matrix_c,
false,
164 const double time_end_multiply = omp_get_wtime();
168 const double expected = (int64_t)M * (int64_t)m * (int64_t)N * (int64_t)n *
169 (int64_t)K * (int64_t)K * (int64_t)k * (int64_t)k;
170 const double checksum = dbm_checksum(matrix_c);
172 dbm_release(matrix_a);
173 dbm_release(matrix_b);
174 dbm_release(matrix_c);
177 printf(
"%5i x %5i x %5i with %3i x %3i x %3i blocks: ", M, N, K, m, n, k);
179 if (checksum == expected) {
182 const double duration = time_end_multiply - time_start_multiply;
183 printf(
"%6.3f s => %6.1f GFLOP/s\n", duration, 1e-9 * flop / duration);
188 fprintf(stderr,
"Expected checksum %f but got %f.\n", expected, checksum);
197int main(
int argc,
char *argv[]) {
198 int result = EXIT_SUCCESS;
206 if (offload_get_device_count() > 0) {
207 offload_set_chosen_device(my_rank % offload_get_device_count());
211 int dims[2] = {0, 0};
213 const int periods[2] = {
true,
true};
218 printf(
"OpenMP-threads: %i GPUs: %i", omp_get_max_threads(),
219 imin(offload_get_device_count(), nranks));
220#if defined(__LIBXSMM)
221 printf(
" Libxsmm: %s", LIBXSMM_VERSION);
223 printf(
" Libxsmm: n/a");
225#if defined(__parallel)
226 printf(
" MPI-ranks: %i MPI-cart: %i x %i", nranks, dims[0], dims[1]);
258 FILE *
const file = fopen(argv[1],
"r");
260 const char delims[] =
"x,;:|/\t ";
261 int mnk[] = {0, 0, 0},
i = 1, j = 0;
263 (NULL == file || NULL != fgets(buffer,
sizeof(buffer), file))) {
264 const char *arg = strtok(NULL != file ? buffer : argv[
i], delims);
265 for (; NULL != arg && j < 3; arg = strtok(NULL, delims), ++j) {
270 }
else if (++
i < argc) {
274 const int extra = (NULL == arg ? 0 : atoi(arg));
283 0 < mnk[2] ? mnk[2] : mnk[0], comm);
284 mnk[0] = mnk[1] = mnk[2] = 0;
286 fprintf(stderr,
"ERROR: invalid argument(s)\n");
287 result = EXIT_FAILURE;
296 if (EXIT_SUCCESS == result) {
299 dbm_library_finalize();
void benchmark_multiply(const int M, const int N, const int K, const int m, const int n, const int k, const dbm_mpi_comm_t comm)
Run a benchmark of dbm_multiply with given block sizes.