(git:97501a3)
Loading...
Searching...
No Matches
dbm_miniapp.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <omp.h>
10#include <stdint.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14
15#if defined(__LIBXSMM)
16#include <libxsmm.h>
17#endif
18
19#include "../offload/offload_library.h"
20#include "dbm_library.h"
21#include "dbm_matrix.h"
22#include "dbm_mpi.h"
23
24/*******************************************************************************
25 * \brief Wrapper for printf, passed to dbm_library_print_stats.
26 * \author Ole Schuett
27 ******************************************************************************/
28static void print_func(char *message, int output_unit) {
29 if (output_unit == 0) { // i.e. my_rank == 0
30 printf("%s", message);
31 }
32}
33
34/*******************************************************************************
35 * \brief Returns the smaller of the two integers (missing from the C standard).
36 * \author Ole Schuett
37 ******************************************************************************/
38static inline int imin(int x, int y) { return (x < y ? x : y); }
39
40/*******************************************************************************
41 * \brief Private routine for creating a distribution.
42 * \author Ole Schuett
43 ******************************************************************************/
44static dbm_distribution_t *create_dist(const int nrows, const int ncols,
45 const dbm_mpi_comm_t comm) {
46 int cart_dims[2], cart_periods[2], cart_coords[2];
47 dbm_mpi_cart_get(comm, 2, cart_dims, cart_periods, cart_coords);
48
49 // Create distribution.
50 assert(0 < nrows && 0 < ncols);
51 int *row_dist = malloc(nrows * sizeof(int));
52 int *col_dist = malloc(ncols * sizeof(int));
53 assert(row_dist != NULL && col_dist != NULL);
54 for (int i = 0; i < nrows; i++) {
55 row_dist[i] = i % cart_dims[0];
56 }
57 for (int i = 0; i < ncols; i++) {
58 col_dist[i] = i % cart_dims[1];
59 }
60 const int fortran_comm = dbm_mpi_comm_c2f(comm);
61 dbm_distribution_t *dist = NULL;
62 dbm_distribution_new(&dist, fortran_comm, nrows, ncols, row_dist, col_dist);
63 free(row_dist);
64 free(col_dist);
65 return dist;
66}
67
68/*******************************************************************************
69 * \brief Private routine for creating a distribution and an empty matrix.
70 * \author Ole Schuett
71 ******************************************************************************/
72static dbm_matrix_t *
73create_some_matrix(const int nrows, const int ncols, const int nrows_min,
74 const int nrows_max, const int ncols_min,
75 const int ncols_max, const dbm_mpi_comm_t comm) {
76 // Create distribution.
77 dbm_distribution_t *dist = create_dist(nrows, ncols, comm);
78
79 // Create matrix.
80 assert(0 < nrows && 0 < ncols);
81 int *row_sizes = malloc(nrows * sizeof(int));
82 int *col_sizes = malloc(ncols * sizeof(int));
83 assert(row_sizes != NULL && col_sizes != NULL);
84 assert(0 < nrows_min && nrows_min <= nrows_max);
85 assert(0 < ncols_min && ncols_min <= ncols_max);
86 if (nrows_min != nrows_max) {
87 const int row_size = nrows_max - nrows_min + 1;
88 for (int i = 0; i < nrows; i++) {
89 row_sizes[i] = rand() % row_size + 1;
90 }
91 } else {
92 for (int i = 0; i < nrows; i++) {
93 row_sizes[i] = nrows_max;
94 }
95 }
96 if (ncols_min != ncols_max) {
97 const int col_size = ncols_max - ncols_min + 1;
98 for (int i = 0; i < ncols; i++) {
99 col_sizes[i] = rand() % col_size + 1;
100 }
101 } else {
102 for (int i = 0; i < ncols; i++) {
103 col_sizes[i] = ncols_max;
104 }
105 }
106 dbm_matrix_t *matrix = NULL;
107 dbm_create(&matrix, dist, "some name", nrows, ncols, row_sizes, col_sizes);
108 dbm_distribution_release(dist);
109 free(row_sizes);
110 free(col_sizes);
111 return matrix;
112}
113
114/*******************************************************************************
115 * \brief Private routine for reserving all blocks of the given matrix.
116 * \author Ole Schuett
117 ******************************************************************************/
118static void reserve_all_blocks(dbm_matrix_t *matrix) {
119 int nrows, ncols;
120 const int *row_sizes, *col_sizes;
121 dbm_get_row_sizes(matrix, &nrows, &row_sizes);
122 dbm_get_col_sizes(matrix, &ncols, &col_sizes);
123
124#pragma omp parallel
125 {
126 int nblocks = 0;
127#pragma omp for collapse(2)
128 for (int row = 0; row < nrows; row++) {
129 for (int col = 0; col < ncols; col++) {
130 if (dbm_get_stored_coordinates(matrix, row, col) ==
131 matrix->dist->my_rank) {
132 ++nblocks;
133 }
134 }
135 }
136 assert(0 < nblocks);
137 int *reserve_row = malloc(nblocks * sizeof(int));
138 int *reserve_col = malloc(nblocks * sizeof(int));
139 assert(reserve_row != NULL && reserve_col != NULL);
140 int iblock = 0;
141#pragma omp for collapse(2)
142 for (int row = 0; row < nrows; row++) {
143 for (int col = 0; col < ncols; col++) {
144 if (dbm_get_stored_coordinates(matrix, row, col) ==
145 matrix->dist->my_rank) {
146 reserve_row[iblock] = row;
147 reserve_col[iblock] = col;
148 iblock++;
149 }
150 }
151 }
152 assert(iblock == nblocks);
153 dbm_reserve_blocks(matrix, nblocks, reserve_row, reserve_col);
154 free(reserve_row);
155 free(reserve_col);
156 }
157}
158
159/*******************************************************************************
160 * \brief Private routine for setting all blocks.
161 * \author Ole Schuett
162 ******************************************************************************/
163static void set_all_blocks(dbm_matrix_t *matrix) {
164#pragma omp parallel
165 {
166 dbm_iterator_t *iter = NULL;
167 dbm_iterator_start(&iter, matrix);
168 while (dbm_iterator_blocks_left(iter)) {
169 int row, col, row_size, col_size;
170 double *block;
171 dbm_iterator_next_block(iter, &row, &col, &block, &row_size, &col_size);
172 const int block_size = row_size * col_size;
173 for (int i = 0; i < block_size; i++) {
174 block[i] = 1.0 / (i + 1);
175 }
176 }
177 dbm_iterator_stop(iter);
178 }
179}
180/*******************************************************************************
181 * \brief Run a benchmark of dbm_multiply with given block sizes.
182 * \author Ole Schuett
183 ******************************************************************************/
184void benchmark_multiply(const int M, const int N, const int K, const int m,
185 const int n, const int k, const dbm_mpi_comm_t comm) {
186 dbm_matrix_t *matrix_a = create_some_matrix(M, K, 1, m, k, k, comm);
187 dbm_matrix_t *matrix_b = create_some_matrix(K, N, k, k, 1, n, comm);
188 dbm_distribution_t *dist_c = create_dist(M, N, comm);
189 dbm_matrix_t *matrix_c = NULL, *matrix_d = NULL;
190 dbm_create(&matrix_c, dist_c, "result", M, N, matrix_a->row_sizes,
191 matrix_b->col_sizes);
192 dbm_distribution_release(dist_c);
193
194 reserve_all_blocks(matrix_a);
195 reserve_all_blocks(matrix_b);
196 set_all_blocks(matrix_a);
197 set_all_blocks(matrix_b);
198
199 dbm_distribution_t *const dist_shared = matrix_c->dist;
200 dbm_create(&matrix_d, dist_shared, matrix_c->name, matrix_c->nrows,
201 matrix_c->ncols, matrix_c->row_sizes, matrix_c->col_sizes);
202 dbm_copy(matrix_d, matrix_c);
203
204 int64_t flop = 0;
205 const double time_start_multiply = omp_get_wtime();
206 dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_c, false,
207 1e-8, &flop);
208 const double time_end_multiply = omp_get_wtime();
209
210 // Calculate result on the host for validation
211 dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_d, false,
212 1e-8, NULL);
213
214 if (dbm_mpi_comm_rank(comm) == 0) {
215 printf("%5i x %5i x %5i with %3i x %3i x %3i blocks: ", M, N, K, m, n, k);
216 }
217
218 // Validate result
219 const double maxeps = 1E-5, epsilon = dbm_maxeps(matrix_d, matrix_c);
220 if (maxeps >= epsilon) {
221 dbm_mpi_sum_int64(&flop, 1, comm);
222 if (dbm_mpi_comm_rank(comm) == 0) {
223 const double duration = time_end_multiply - time_start_multiply;
224 printf("%6.3f s => %6.1f GFLOP/s\n", duration, 1e-9 * flop / duration);
225 fflush(stdout);
226 }
227 } else {
228 printf("ERROR\n");
229 fprintf(stderr, "Failed validation (epsilon=%f).\n", epsilon);
230 exit(1);
231 }
232
233 dbm_release(matrix_a);
234 dbm_release(matrix_b);
235 dbm_release(matrix_c);
236 dbm_release(matrix_d);
237}
238
239/*******************************************************************************
240 * \brief Stand-alone miniapp for smoke-testing and benchmarking dbm_multiply.
241 * \author Ole Schuett
242 ******************************************************************************/
243int main(int argc, char *argv[]) {
244 int result = EXIT_SUCCESS;
245
246 srand(25071975); // seed rng
247
248 dbm_mpi_init(&argc, &argv);
249 dbm_library_init();
250
251 const dbm_mpi_comm_t world_comm = dbm_mpi_get_comm_world();
252 const int nranks = dbm_mpi_comm_size(world_comm);
253 const int my_rank = dbm_mpi_comm_rank(world_comm);
254
255 if (offload_get_device_count() > 0) {
256 offload_set_chosen_device(my_rank % offload_get_device_count());
257 }
258
259 // Create 2D cart.
260 int dims[2] = {0, 0};
261 dbm_mpi_dims_create(nranks, 2, dims);
262 const int periods[2] = {true, true};
263 dbm_mpi_comm_t comm =
264 dbm_mpi_cart_create(world_comm, 2, dims, periods, false);
265
266 if (my_rank == 0) {
267 printf("OpenMP-threads: %i GPUs: %i", omp_get_max_threads(),
268 imin(offload_get_device_count(), nranks));
269#if defined(__LIBXSMM)
270 printf(" Libxsmm: %s", LIBXSMM_VERSION);
271#else
272 printf(" Libxsmm: n/a");
273#endif
274#if defined(__parallel)
275 printf(" MPI-ranks: %i MPI-cart: %i x %i", nranks, dims[0], dims[1]);
276#else
277 printf(" MPI: n/a");
278#endif
279 printf("\n\n");
280 fflush(stdout);
281 }
282
283 if (1 >= argc) {
284 benchmark_multiply(16384, 128, 128, 4, 4, 4, comm);
285 benchmark_multiply(128, 16384, 128, 4, 4, 4, comm);
286 benchmark_multiply(128, 128, 16384, 4, 4, 4, comm);
287 benchmark_multiply(645, 645, 645, 4, 4, 4, comm);
288 if (my_rank == 0)
289 printf("\n");
290
291 benchmark_multiply(60, 500, 500, 128, 4, 4, comm);
292 benchmark_multiply(500, 60, 500, 4, 128, 4, comm);
293 benchmark_multiply(500, 500, 60, 4, 4, 128, comm);
294 if (my_rank == 0)
295 printf("\n");
296
297 benchmark_multiply(500, 60, 60, 4, 128, 128, comm);
298 benchmark_multiply(60, 500, 60, 128, 4, 128, comm);
299 benchmark_multiply(60, 60, 500, 128, 128, 4, comm);
300 if (my_rank == 0)
301 printf("\n");
302
303 benchmark_multiply(350, 350, 350, 23, 23, 23, comm);
304 benchmark_multiply(250, 250, 250, 32, 32, 32, comm);
305 benchmark_multiply(60, 60, 60, 128, 128, 128, comm);
306 } else { /* read triplet(s) from file or one triplet from command line */
307 FILE *const file = fopen(argv[1], "r"); /* try 1st arg as filename */
308 char buffer[1024];
309 const char delims[] = "x,;:|/\t ";
310 int mnk[] = {0, 0, 0}, i = 1, j = 0;
311 while (i < argc &&
312 (NULL == file || NULL != fgets(buffer, sizeof(buffer), file))) {
313 const char *arg = strtok(NULL != file ? buffer : argv[i], delims);
314 for (; NULL != arg && j < 3; arg = strtok(NULL, delims), ++j) {
315 mnk[j] = atoi(arg);
316 }
317 if (NULL != file) {
318 j = 0;
319 } else if (++i < argc) {
320 continue;
321 }
322 if (0 < mnk[0]) { /* valid MxNxK? */
323 const int m = mnk[0];
324 const int n = (0 < mnk[1] ? mnk[1] : m);
325 const int k = (0 < mnk[2] ? mnk[2] : m);
326 int M = (NULL == arg ? 0 : atoi(arg)), N, K;
327 if (0 < M) {
328 arg = strtok(NULL, delims);
329 N = (NULL == arg ? 1 : atoi(arg));
330 arg = strtok(NULL, delims);
331 K = (NULL == arg ? 1 : atoi(arg));
332 } else { /* default */
333 M = N = K = 128;
334 }
335 benchmark_multiply(M, N, K, m, n, k, comm);
336 mnk[0] = mnk[1] = mnk[2] = 0;
337 } else {
338 fprintf(stderr, "ERROR: invalid argument(s)\n");
339 result = EXIT_FAILURE;
340 i = argc; /* break */
341 }
342 }
343 if (NULL != file) {
344 fclose(file);
345 }
346 }
347
348 if (EXIT_SUCCESS == result) {
349 dbm_library_print_stats(dbm_mpi_comm_c2f(comm), &print_func, my_rank);
350 }
351 dbm_library_finalize();
352 dbm_mpi_comm_free(&comm);
354 return result;
355}
356
357// EOF
void dbm_get_col_sizes(const dbm_matrix_t *matrix, int *ncols, const int **col_sizes)
Returns the column block sizes of the given matrix.
Definition dbm_matrix.c:649
void dbm_get_row_sizes(const dbm_matrix_t *matrix, int *nrows, const int **row_sizes)
Returns the row block sizes of the given matrix.
Definition dbm_matrix.c:639
double dbm_maxeps(const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b)
Calculates maximum relative difference between matrix_a and matrix_b.
Definition dbm_matrix.c:567
static dbm_distribution_t * create_dist(const int nrows, const int ncols, const dbm_mpi_comm_t comm)
Private routine for creating a distribution.
Definition dbm_miniapp.c:44
static int imin(int x, int y)
Returns the smaller of the two integers (missing from the C standard).
Definition dbm_miniapp.c:38
static dbm_matrix_t * create_some_matrix(const int nrows, const int ncols, const int nrows_min, const int nrows_max, const int ncols_min, const int ncols_max, const dbm_mpi_comm_t comm)
Private routine for creating a distribution and an empty matrix.
Definition dbm_miniapp.c:73
static void set_all_blocks(dbm_matrix_t *matrix)
Private routine for setting all blocks.
static void print_func(char *message, int output_unit)
Wrapper for printf, passed to dbm_library_print_stats.
Definition dbm_miniapp.c:28
void benchmark_multiply(const int M, const int N, const int K, const int m, const int n, const int k, const dbm_mpi_comm_t comm)
Run a benchmark of dbm_multiply with given block sizes.
static void reserve_all_blocks(dbm_matrix_t *matrix)
Private routine for reserving all blocks of the given matrix.
void dbm_mpi_finalize()
Wrapper around MPI_Finalize.
Definition dbm_mpi.c:47
int dbm_mpi_comm_rank(const dbm_mpi_comm_t comm)
Wrapper around MPI_Comm_rank.
Definition dbm_mpi.c:95
int dbm_mpi_comm_size(const dbm_mpi_comm_t comm)
Wrapper around MPI_Comm_size.
Definition dbm_mpi.c:110
void dbm_mpi_init(int *argc, char ***argv)
Wrapper around MPI_Init.
Definition dbm_mpi.c:34
dbm_mpi_comm_t dbm_mpi_cart_create(const dbm_mpi_comm_t comm_old, const int ndims, const int dims[], const int periods[], const int reorder)
Wrapper around MPI_Cart_create.
Definition dbm_mpi.c:140
void dbm_mpi_sum_int64(int64_t *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_INT64_T.
Definition dbm_mpi.c:325
void dbm_mpi_dims_create(const int nnodes, const int ndims, int dims[])
Wrapper around MPI_Dims_create.
Definition dbm_mpi.c:125
int dbm_mpi_comm_c2f(const dbm_mpi_comm_t comm)
Wrapper around MPI_Comm_c2f.
Definition dbm_mpi.c:82
void dbm_mpi_comm_free(dbm_mpi_comm_t *comm)
Wrapper around MPI_Comm_free.
Definition dbm_mpi.c:212
void dbm_mpi_cart_get(const dbm_mpi_comm_t comm, int maxdims, int dims[], int periods[], int coords[])
Wrapper around MPI_Cart_get.
Definition dbm_mpi.c:161
dbm_mpi_comm_t dbm_mpi_get_comm_world()
Returns MPI_COMM_WORLD.
Definition dbm_mpi.c:57
int dbm_mpi_comm_t
Definition dbm_mpi.h:19
static void const int const int i
int main()
Unit test of the C-interface provided via libcp2k.h.
Internal struct for storing a two dimensional distribution.
Internal struct for storing a block iterator.
Definition dbm_matrix.h:35
Internal struct for storing a matrix.
Definition dbm_matrix.h:20
int * row_sizes
Definition dbm_matrix.h:25
char * name
Definition dbm_matrix.h:22
int * col_sizes
Definition dbm_matrix.h:26
dbm_distribution_t * dist
Definition dbm_matrix.h:21