dbm_miniapp.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations         */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include <assert.h>
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(__LIBXSMM)
#include <libxsmm.h>
#endif

#include "../offload/offload_library.h"
#include "dbm_library.h"
#include "dbm_matrix.h"
#include "dbm_mpi.h"

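/*
 * Usage sketch (inferred from main() below; the binary name depends on the
 * build system):
 *   ./dbm_miniapp                  run the built-in benchmark suite
 *   ./dbm_miniapp <filename>       read one block-size triplet per line
 *   ./dbm_miniapp <m> [<n> [<k>]]  benchmark a single block-size triplet
 * Triplet components may be separated by any of "x,;:|/", tab, or space,
 * e.g. "23x23x23". Each triplet multiplies matrices of 128x128x128 blocks.
 */
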
/*******************************************************************************
 * \brief Wrapper for printf, passed to dbm_library_print_stats.
 * \author Ole Schuett
 ******************************************************************************/
static void print_func(char *message, int output_unit) {
  if (output_unit == 0) { // i.e. my_rank == 0
    printf("%s", message);
  }
}

/*******************************************************************************
 * \brief Returns the smaller of two given integers (missing from the C standard).
 * \author Ole Schuett
 ******************************************************************************/
static inline int imin(int x, int y) { return (x < y ? x : y); }

/*******************************************************************************
 * \brief Private routine for creating a distribution and an empty matrix.
 * \author Ole Schuett
 ******************************************************************************/
static dbm_matrix_t *create_some_matrix(const int nrows, const int ncols,
                                        const int row_size, const int col_size,
                                        const dbm_mpi_comm_t comm) {
  int cart_dims[2], cart_periods[2], cart_coords[2];
  dbm_mpi_cart_get(comm, 2, cart_dims, cart_periods, cart_coords);

  // Create distribution.
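  // Block rows/columns are assigned round-robin to the rows/columns of the
  // 2D process grid.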
  int *row_dist = malloc(nrows * sizeof(int));
  int *col_dist = malloc(ncols * sizeof(int));
  for (int i = 0; i < nrows; i++) {
    row_dist[i] = i % cart_dims[0];
  }
  for (int i = 0; i < ncols; i++) {
    col_dist[i] = i % cart_dims[1];
  }
  const int fortran_comm = dbm_mpi_comm_c2f(comm);
  dbm_distribution_t *dist = NULL;
  dbm_distribution_new(&dist, fortran_comm, nrows, ncols, row_dist, col_dist);
  free(row_dist);
  free(col_dist);

  // Create matrix.
  int *row_sizes = malloc(nrows * sizeof(int));
  int *col_sizes = malloc(ncols * sizeof(int));
  for (int i = 0; i < nrows; i++) {
    row_sizes[i] = row_size;
  }
  for (int i = 0; i < ncols; i++) {
    col_sizes[i] = col_size;
  }
  dbm_matrix_t *matrix = NULL;
  dbm_create(&matrix, dist, "some name", nrows, ncols, row_sizes, col_sizes);
  dbm_distribution_release(dist);
  free(row_sizes);
  free(col_sizes);
  return matrix;
}

/*******************************************************************************
 * \brief Private routine for reserving all blocks of the given matrix.
 * \author Ole Schuett
 ******************************************************************************/
static void reserve_all_blocks(dbm_matrix_t *matrix) {
  int nrows, ncols;
  const int *row_sizes, *col_sizes;
  dbm_get_row_sizes(matrix, &nrows, &row_sizes);
  dbm_get_col_sizes(matrix, &ncols, &col_sizes);

#pragma omp parallel
  {
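    // Two passes over the same loop nest: first count this thread's blocks,
    // then record their coordinates. This assumes both "omp for" loops assign
    // iterations to threads identically (the case for static scheduling).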
    int nblocks = 0;
#pragma omp for collapse(2)
    for (int row = 0; row < nrows; row++) {
      for (int col = 0; col < ncols; col++) {
        if (dbm_get_stored_coordinates(matrix, row, col) ==
            matrix->dist->my_rank) {
          ++nblocks;
        }
      }
    }
    int *reserve_row = malloc(nblocks * sizeof(int));
    int *reserve_col = malloc(nblocks * sizeof(int));
    int iblock = 0;
#pragma omp for collapse(2)
    for (int row = 0; row < nrows; row++) {
      for (int col = 0; col < ncols; col++) {
        if (dbm_get_stored_coordinates(matrix, row, col) ==
            matrix->dist->my_rank) {
          reserve_row[iblock] = row;
          reserve_col[iblock] = col;
          iblock++;
        }
      }
    }
    assert(iblock == nblocks);
    dbm_reserve_blocks(matrix, nblocks, reserve_row, reserve_col);
    free(reserve_row);
    free(reserve_col);
  }
}

/*******************************************************************************
 * \brief Private routine for setting all blocks to 1.0.
 * \author Ole Schuett
 ******************************************************************************/
static void set_all_blocks(dbm_matrix_t *matrix) {
#pragma omp parallel
  {
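    // Every thread creates its own iterator; each presumably visits its own
    // share of the locally stored blocks.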
    dbm_iterator_t *iter = NULL;
    dbm_iterator_start(&iter, matrix);
    while (dbm_iterator_blocks_left(iter)) {
      int row, col, row_size, col_size;
      double *block;
      dbm_iterator_next_block(iter, &row, &col, &block, &row_size, &col_size);
      const int block_size = row_size * col_size;
      for (int i = 0; i < block_size; i++) {
        block[i] = 1.0;
      }
    }
    dbm_iterator_stop(iter);
  }
}

/*******************************************************************************
 * \brief Run a benchmark of dbm_multiply with given block sizes.
 * \author Ole Schuett
 ******************************************************************************/
void benchmark_multiply(const int M, const int N, const int K, const int m,
                        const int n, const int k, const dbm_mpi_comm_t comm) {
  dbm_matrix_t *matrix_a = create_some_matrix(M, K, m, k, comm);
  dbm_matrix_t *matrix_b = create_some_matrix(K, N, k, n, comm);
  dbm_matrix_t *matrix_c = create_some_matrix(M, N, m, n, comm);
  reserve_all_blocks(matrix_a);
  reserve_all_blocks(matrix_b);
  set_all_blocks(matrix_a);
  set_all_blocks(matrix_b);

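  // Computes C += A * B with a filter threshold of 1e-8. dbm_multiply returns
  // the rank-local flop count, which is summed over all ranks further below.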
  int64_t flop = 0;
  const double time_start_multiply = omp_get_wtime();
  dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_c, false,
               1e-8, &flop);
  const double time_end_multiply = omp_get_wtime();

  // Validate checksum.
  // Since all matrix elements were set to 1.0 the checksum is an integer.
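  // Each element of matrix_c equals K * k, and dbm_checksum sums the squared
  // elements, hence the expected value M*m * N*n * (K*k)^2.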
  const double expected = (int64_t)M * (int64_t)m * (int64_t)N * (int64_t)n *
                          (int64_t)K * (int64_t)K * (int64_t)k * (int64_t)k;
  const double checksum = dbm_checksum(matrix_c);

  dbm_release(matrix_a);
  dbm_release(matrix_b);
  dbm_release(matrix_c);

  if (dbm_mpi_comm_rank(comm) == 0) {
    printf("%5i x %5i x %5i with %3i x %3i x %3i blocks: ", M, N, K, m, n, k);
  }
  if (checksum == expected) {
    dbm_mpi_sum_int64(&flop, 1, comm);
    if (dbm_mpi_comm_rank(comm) == 0) {
      const double duration = time_end_multiply - time_start_multiply;
      printf("%6.3f s => %6.1f GFLOP/s\n", duration, 1e-9 * flop / duration);
      fflush(stdout);
    }
  } else {
    printf("ERROR\n");
    fprintf(stderr, "Expected checksum %f but got %f.\n", expected, checksum);
    exit(1);
  }
}

/*******************************************************************************
 * \brief Stand-alone miniapp for smoke-testing and benchmarking dbm_multiply.
 * \author Ole Schuett
 ******************************************************************************/
int main(int argc, char *argv[]) {
  dbm_mpi_init(&argc, &argv);
  dbm_library_init();

  const dbm_mpi_comm_t world_comm = dbm_mpi_get_comm_world();
  const int nranks = dbm_mpi_comm_size(world_comm);
  const int my_rank = dbm_mpi_comm_rank(world_comm);

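  // Distribute the MPI ranks round-robin over the available GPUs.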
  if (offload_get_device_count() > 0) {
    offload_set_chosen_device(my_rank % offload_get_device_count());
  }

  // Create 2D cart.
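  // dbm_mpi_dims_create (a wrapper around MPI_Dims_create) picks a balanced,
  // near-square factorization of nranks.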
  int dims[2] = {0, 0};
  dbm_mpi_dims_create(nranks, 2, dims);
  const int periods[2] = {true, true};
  dbm_mpi_comm_t comm =
      dbm_mpi_cart_create(world_comm, 2, dims, periods, false);

  if (my_rank == 0) {
    printf("OpenMP-threads: %i GPUs: %i", omp_get_max_threads(),
           imin(offload_get_device_count(), nranks));
#if defined(__LIBXSMM)
    printf(" Libxsmm: %s", LIBXSMM_VERSION);
#else
    printf(" Libxsmm: n/a");
#endif
#if defined(__parallel)
    printf(" MPI-ranks: %i MPI-cart: %i x %i", nranks, dims[0], dims[1]);
#else
    printf(" MPI: n/a");
#endif
    printf("\n\n");
    fflush(stdout);
  }

  if (1 >= argc) {
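    // Built-in suite: tall-and-skinny, roughly cubic, mixed, and
    // large-block shapes.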
    benchmark_multiply(16384, 128, 128, 4, 4, 4, comm);
    benchmark_multiply(128, 16384, 128, 4, 4, 4, comm);
    benchmark_multiply(128, 128, 16384, 4, 4, 4, comm);
    benchmark_multiply(645, 645, 645, 4, 4, 4, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(60, 500, 500, 128, 4, 4, comm);
    benchmark_multiply(500, 60, 500, 4, 128, 4, comm);
    benchmark_multiply(500, 500, 60, 4, 4, 128, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(500, 60, 60, 4, 128, 128, comm);
    benchmark_multiply(60, 500, 60, 128, 4, 128, comm);
    benchmark_multiply(60, 60, 500, 128, 128, 4, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(350, 350, 350, 23, 23, 23, comm);
    benchmark_multiply(250, 250, 250, 32, 32, 32, comm);
    benchmark_multiply(60, 60, 60, 128, 128, 128, comm);
  } else { /* read triplet(s) from file or one triplet from command line */
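    /* Each triplet gives the block sizes m[xn[xk]] for a 128x128x128-block
     * multiplication; missing components default to the first one. */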
    FILE *const file = fopen(argv[1], "r"); /* try 1st arg as filename */
    char buffer[1024];
    const char delims[] = "x,;:|/\t ";
    int mnk[] = {0, 0, 0}, i = 1, j = 0;
    while (i < argc &&
           (NULL == file || NULL != fgets(buffer, sizeof(buffer), file))) {
      const char *arg = strtok(NULL != file ? buffer : argv[i], delims);
      for (; NULL != arg; arg = strtok(NULL, delims)) {
        if (3 > j) {
          mnk[j] = atoi(arg);
          ++j;
        } else { /* malformed */
          mnk[0] = 0;
          break;
        }
      }
      if (NULL != file) {
        j = 0;
      } else if (++i < argc) {
        continue;
      }
      if (0 < mnk[0]) { /* valid MxNxK? */
        benchmark_multiply(128, 128, 128, mnk[0], 0 < mnk[1] ? mnk[1] : mnk[0],
                           0 < mnk[2] ? mnk[2] : mnk[0], comm);
        mnk[0] = mnk[1] = mnk[2] = 0;
      } else {
        i = argc;
      }
    }
    if (NULL != file) {
      fclose(file);
    }
  }

  dbm_library_print_stats(dbm_mpi_comm_c2f(comm), &print_func, my_rank);
  dbm_library_finalize();
  dbm_mpi_comm_free(&comm);
  dbm_mpi_finalize();
  return 0;
}

// EOF