48 int cart_dims[2], cart_periods[2], cart_coords[2];
52 assert(0 < nrows && 0 < ncols);
53 int *row_dist = malloc(nrows *
sizeof(
int));
54 int *col_dist = malloc(ncols *
sizeof(
int));
55 assert(row_dist != NULL && col_dist != NULL);
56 for (
int i = 0;
i < nrows;
i++) {
57 row_dist[
i] =
i % cart_dims[0];
59 for (
int i = 0;
i < ncols;
i++) {
60 col_dist[
i] =
i % cart_dims[1];
64 dbm_distribution_new(&dist, fortran_comm, nrows, ncols, row_dist, col_dist);
// Fragment of a routine that fills per-row and per-column block-size arrays
// and passes them to dbm_create(). Only part of the parameter list and of the
// body is visible in this listing; the function name, the remaining
// parameters (nrows_min, ncols_max, nrows, ncols, dist, ...) and the closing
// braces are outside the excerpt.
76 const int nrows_max,
const int ncols_min,
// At least one block row and one block column are required.
82 assert(0 < nrows && 0 < ncols);
// One size entry per block row and per block column.
83 int *row_sizes = malloc(nrows *
sizeof(
int));
84 int *col_sizes = malloc(ncols *
sizeof(
int));
// NOTE(review): the allocation check is an assert, so it is compiled out
// when NDEBUG is defined.
85 assert(row_sizes != NULL && col_sizes != NULL);
// The size bounds must be positive and ordered (min <= max).
86 assert(0 < nrows_min && nrows_min <= nrows_max);
87 assert(0 < ncols_min && ncols_min <= ncols_max);
// Variable row sizes: draw each one with rand().
// NOTE(review): rand() % row_size + 1 yields values in
// 1..(nrows_max - nrows_min + 1); that is the interval
// [nrows_min, nrows_max] only when nrows_min == 1 - confirm this is intended.
88 if (nrows_min != nrows_max) {
89 const int row_size = nrows_max - nrows_min + 1;
90 for (
int i = 0;
i < nrows;
i++) {
91 row_sizes[
i] = rand() % row_size + 1;
// Uniform row sizes: every entry is nrows_max. Presumably this loop is the
// else branch of the test above (the else line is not visible here).
94 for (
int i = 0;
i < nrows;
i++) {
95 row_sizes[
i] = nrows_max;
// Same scheme for the column sizes; the same review note applies to
// rand() % col_size + 1.
98 if (ncols_min != ncols_max) {
99 const int col_size = ncols_max - ncols_min + 1;
100 for (
int i = 0;
i < ncols;
i++) {
101 col_sizes[
i] = rand() % col_size + 1;
// Uniform column sizes (presumably the else branch, as for the rows).
104 for (
int i = 0;
i < ncols;
i++) {
105 col_sizes[
i] = ncols_max;
// Create the matrix from the distribution and the two size arrays, then
// release dist. NOTE(review): presumably dbm_create keeps whatever it needs
// from dist and from the size arrays - confirm; no free() of row_sizes or
// col_sizes is visible in this excerpt.
109 dbm_create(&matrix, dist,
"some name", nrows, ncols, row_sizes, col_sizes);
110 dbm_distribution_release(dist);
// Fragment of a routine that collects block coordinates of a matrix and
// reserves them with a single dbm_reserve_blocks() call. The function header,
// the declarations of nrows, ncols, nblocks and iblock, and the right-hand
// side of both loop conditions are outside this excerpt.
//
// Pointers to the row / column block sizes; their assignment is not visible
// here.
122 const int *row_sizes, *col_sizes;
// First pass over every (row, col) pair. The two loops are collapsed into one
// iteration space and work-shared by the OpenMP "for" construct (the
// enclosing parallel region is not visible here). The test compares
// dbm_get_stored_coordinates(matrix, row, col) with a value that is cut off;
// presumably this pass counts the matching blocks into nblocks - confirm.
129#pragma omp for collapse(2)
130 for (
int row = 0; row < nrows; row++) {
131 for (
int col = 0; col < ncols; col++) {
132 if (dbm_get_stored_coordinates(matrix, row, col) ==
// Coordinate lists with room for nblocks entries each.
138 int *reserve_row = NULL, *reserve_col = NULL;
140 reserve_row = malloc(nblocks *
sizeof(
int));
141 reserve_col = malloc(nblocks *
sizeof(
int));
// NOTE(review): allocation check via assert; it is compiled out when NDEBUG
// is defined.
142 assert(reserve_row != NULL && reserve_col != NULL);
// Second pass over the same iteration space: store the coordinates of each
// matching block at index iblock (the increment of iblock is not visible).
145#pragma omp for collapse(2)
146 for (
int row = 0; row < nrows; row++) {
147 for (
int col = 0; col < ncols; col++) {
148 if (dbm_get_stored_coordinates(matrix, row, col) ==
150 reserve_row[iblock] = row;
151 reserve_col[iblock] = col;
// Sanity check: the number of recorded coordinates must equal nblocks.
156 assert(iblock == nblocks);
// Hand all recorded coordinates to the library in one call.
157 dbm_reserve_blocks(matrix, nblocks, reserve_row, reserve_col);
// Fragment of a routine that multiplies matrix_a by matrix_b into matrix_c,
// optionally checks the result against a second multiplication, and prints
// timing. The function header, several argument lists and the declarations of
// M, N, K, m, n, k, flop, matrix_d and dist_shared are outside this excerpt.
//
// Result matrix named "result", created with M and N as its row and column
// counts; its row sizes are taken from matrix_a (remaining arguments are cut
// off). dist_c is released right after the matrix has been created.
194 dbm_create(&matrix_c, dist_c,
"result", M, N, matrix_a->
row_sizes,
196 dbm_distribution_release(dist_c);
// Verification switch read from the environment:
//   DBM_MULTIPLY_VERIFY unset  -> skip_verify == 0
//   DBM_MULTIPLY_VERIFY = v    -> skip_verify == atoi(v) + 1
// so "0" gives 1 and "-1" gives 0.
203 const char *
const verify_env = getenv(
"DBM_MULTIPLY_VERIFY");
204 const int skip_verify = (NULL == verify_env ? 0 : (atoi(verify_env) + 1));
// skip_verify == 0: build matrix_d with matrix_c's name and shape on
// dist_shared and copy matrix_c into it before the timed multiplication, so
// both start from the same values.
206 if (0 == skip_verify) {
208 dbm_create(&matrix_d, dist_shared, matrix_c->
name, matrix_c->
nrows,
210 dbm_copy(matrix_d, matrix_c);
// Timed multiplication into matrix_c, bracketed by omp_get_wtime() calls.
// NOTE(review): presumably the two leading "false" flags disable
// transposition and the two 1.0 values are alpha and beta - confirm against
// the dbm_multiply declaration; the trailing arguments are cut off.
214 const double time_start_multiply = omp_get_wtime();
215 dbm_multiply(
false,
false, 1.0, matrix_a, matrix_b, 1.0, matrix_c,
false,
217 const double time_end_multiply = omp_get_wtime();
// Print the problem dimensions and block dimensions (no newline yet; the
// timing printf further down ends the line).
220 printf(
"%5i x %5i x %5i with %3i x %3i x %3i blocks: ", M, N, K, m, n, k);
// Validation: repeat the same multiplication into matrix_d, then compare it
// with matrix_c via dbm_maxeps(). A difference above 1E-5 is reported on
// stderr (what else happens on failure is not visible here). matrix_d is
// released afterwards.
223 if (NULL != matrix_d) {
224 dbm_multiply(
false,
false, 1.0, matrix_a, matrix_b, 1.0, matrix_d,
false,
227 const double maxeps = 1E-5, epsilon =
dbm_maxeps(matrix_d, matrix_c);
228 if (maxeps < epsilon) {
230 fprintf(stderr,
"Failed validation (epsilon=%f).\n", epsilon);
233 dbm_release(matrix_d);
// Elapsed wall-clock time of the first multiplication and the resulting rate;
// flop is set outside this excerpt (presumably by dbm_multiply - confirm).
238 const double duration = time_end_multiply - time_start_multiply;
239 printf(
"%6.3f s => %6.1f GFLOP/s\n", duration, 1e-9 * flop / duration);
// Release the three operand/result matrices.
243 dbm_release(matrix_a);
244 dbm_release(matrix_b);
245 dbm_release(matrix_c);
252int main(
int argc,
char *argv[]) {
253 int result = EXIT_SUCCESS;
264 if (offload_get_device_count() > 0) {
265 offload_set_chosen_device(my_rank % offload_get_device_count());
269 int dims[2] = {0, 0};
271 const int periods[2] = {
true,
true};
275 printf(
"OpenMP-threads: %i GPUs: %i", omp_get_max_threads(),
276 imin(offload_get_device_count(), nranks));
277#if defined(__LIBXSMM)
278 printf(
" Libxsmm: %s", LIBXSMM_VERSION);
280 printf(
" Libxsmm: n/a");
282#if defined(__parallel)
283 printf(
" MPI-ranks: %i MPI-cart: %i x %i", nranks, dims[0], dims[1]);
315 FILE *
const file = fopen(argv[1],
"r");
317 const char delims[] =
"x,;:|/\t ";
318 int mnk[] = {0, 0, 0},
i = 1, j = 0;
320 (NULL == file || NULL != fgets(buffer,
sizeof(buffer), file))) {
321 const char *arg = strtok(NULL != file ? buffer : argv[
i], delims);
322 for (; NULL != arg && j < 3; arg = strtok(NULL, delims), ++j) {
327 }
else if (++
i < argc) {
331 const int m = mnk[0];
332 const int n = (0 < mnk[1] ? mnk[1] : m);
333 const int k = (0 < mnk[2] ? mnk[2] : m);
334 int M = (NULL == arg ? 0 : atoi(arg)), N, K;
336 arg = strtok(NULL, delims);
337 N = (NULL == arg ? 1 : atoi(arg));
338 arg = strtok(NULL, delims);
339 K = (NULL == arg ? 1 : atoi(arg));
344 mnk[0] = mnk[1] = mnk[2] = 0;
346 fprintf(stderr,
"ERROR: invalid argument(s)\n");
347 result = EXIT_FAILURE;
356 if (EXIT_SUCCESS == result) {
358 dbm_library_print_stats(fortran_comm, &
print_func, my_rank);
359 offload_mempool_stats_print(fortran_comm, &
print_func, my_rank);
361 dbm_library_finalize();