dbm_miniapp.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2025 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/
#include "dbm_library.h"
#include "dbm_matrix.h"

#include "../mpiwrap/cp_mpi.h"
#include "../offload/offload_library.h"
#include "../offload/offload_mempool.h"

#include <assert.h>
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(__LIBXSMM)
#include <libxsmm.h>
#endif

/*******************************************************************************
 * \brief Wrapper for printf, passed to dbm_library_print_stats.
 * \author Ole Schuett
 ******************************************************************************/
static void print_func(const char *msg, int msglen, int output_unit) {
  (void)msglen; // mark used
  if (output_unit == 0) { // i.e. my_rank == 0
    printf("%s", msg);
  }
}

/*******************************************************************************
 * \brief Returns the smaller of the two integers (missing from the C standard).
 * \author Ole Schuett
 ******************************************************************************/
static inline int imin(int x, int y) { return (x < y ? x : y); }

/*******************************************************************************
 * \brief Private routine for creating a distribution.
 * \author Ole Schuett
 ******************************************************************************/
static dbm_distribution_t *create_dist(const int nrows, const int ncols,
                                       const cp_mpi_comm_t comm) {
  int cart_dims[2], cart_periods[2], cart_coords[2];
  cp_mpi_cart_get(comm, 2, cart_dims, cart_periods, cart_coords);

  // Create distribution.
  assert(0 < nrows && 0 < ncols);
  int *row_dist = malloc(nrows * sizeof(int));
  int *col_dist = malloc(ncols * sizeof(int));
  assert(row_dist != NULL && col_dist != NULL);
  for (int i = 0; i < nrows; i++) {
    row_dist[i] = i % cart_dims[0];
  }
  for (int i = 0; i < ncols; i++) {
    col_dist[i] = i % cart_dims[1];
  }
  const int fortran_comm = cp_mpi_comm_c2f(comm);
  dbm_distribution_t *dist = NULL;
  dbm_distribution_new(&dist, fortran_comm, nrows, ncols, row_dist, col_dist);
  free(row_dist);
  free(col_dist);
  return dist;
}
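
// Example of the round-robin mapping above (illustration only): on a 2 x 3
// process grid, five block rows and five block columns would be assigned as
//   row_dist = {0, 1, 0, 1, 0}   and   col_dist = {0, 1, 2, 0, 1}.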

/*******************************************************************************
 * \brief Private routine for creating a distribution and an empty matrix.
 * \author Ole Schuett
 ******************************************************************************/
static dbm_matrix_t *
create_some_matrix(const int nrows, const int ncols, const int nrows_min,
                   const int nrows_max, const int ncols_min,
                   const int ncols_max, const cp_mpi_comm_t comm) {
  // Create distribution.
  dbm_distribution_t *dist = create_dist(nrows, ncols, comm);

  // Create matrix.
  assert(0 < nrows && 0 < ncols);
  int *row_sizes = malloc(nrows * sizeof(int));
  int *col_sizes = malloc(ncols * sizeof(int));
  assert(row_sizes != NULL && col_sizes != NULL);
  assert(0 < nrows_min && nrows_min <= nrows_max);
  assert(0 < ncols_min && ncols_min <= ncols_max);
  if (nrows_min != nrows_max) {
    const int row_size = nrows_max - nrows_min + 1;
    for (int i = 0; i < nrows; i++) {
      row_sizes[i] = rand() % row_size + 1;
    }
  } else {
    for (int i = 0; i < nrows; i++) {
      row_sizes[i] = nrows_max;
    }
  }
  if (ncols_min != ncols_max) {
    const int col_size = ncols_max - ncols_min + 1;
    for (int i = 0; i < ncols; i++) {
      col_sizes[i] = rand() % col_size + 1;
    }
  } else {
    for (int i = 0; i < ncols; i++) {
      col_sizes[i] = ncols_max;
    }
  }
  dbm_matrix_t *matrix = NULL;
  dbm_create(&matrix, dist, "some name", nrows, ncols, row_sizes, col_sizes);
  dbm_distribution_release(dist);
  free(row_sizes);
  free(col_sizes);
  return matrix;
}
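
// Note on the size arguments (as used by benchmark_multiply below): callers
// pass either min == max for a fixed block size, or min == 1 for random block
// sizes drawn from 1..max. For example, create_some_matrix(M, K, 1, m, k, k,
// comm) yields row blocks of size 1..m and column blocks of fixed size k.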

/*******************************************************************************
 * \brief Private routine for reserving all blocks of the given matrix.
 * \author Ole Schuett
 ******************************************************************************/
static void reserve_all_blocks(dbm_matrix_t *matrix) {
  int nrows, ncols;
  const int *row_sizes, *col_sizes;
  dbm_get_row_sizes(matrix, &nrows, &row_sizes);
  dbm_get_col_sizes(matrix, &ncols, &col_sizes);

#pragma omp parallel
  {
    // First pass: each thread counts the locally stored blocks within its
    // share of the iteration space.
    int nblocks = 0;
#pragma omp for collapse(2)
    for (int row = 0; row < nrows; row++) {
      for (int col = 0; col < ncols; col++) {
        if (dbm_get_stored_coordinates(matrix, row, col) ==
            matrix->dist->my_rank) {
          ++nblocks;
        }
      }
    }
    int *reserve_row = NULL, *reserve_col = NULL;
    if (0 < nblocks) {
      reserve_row = malloc(nblocks * sizeof(int));
      reserve_col = malloc(nblocks * sizeof(int));
      assert(reserve_row != NULL && reserve_col != NULL);
    }
    // Second pass: collect the coordinates. Both worksharing loops must hand
    // the same iterations to each thread, which the assert below checks.
    int iblock = 0;
#pragma omp for collapse(2)
    for (int row = 0; row < nrows; row++) {
      for (int col = 0; col < ncols; col++) {
        if (dbm_get_stored_coordinates(matrix, row, col) ==
            matrix->dist->my_rank) {
          reserve_row[iblock] = row;
          reserve_col[iblock] = col;
          ++iblock;
        }
      }
    }
    assert(iblock == nblocks);
    dbm_reserve_blocks(matrix, nblocks, reserve_row, reserve_col);
    free(reserve_row);
    free(reserve_col);
  }
}

/*******************************************************************************
 * \brief Private routine for setting all blocks.
 * \author Ole Schuett
 ******************************************************************************/
static void set_all_blocks(dbm_matrix_t *matrix) {
#pragma omp parallel
  {
    dbm_iterator_t *iter = NULL;
    dbm_iterator_start(&iter, matrix);
    while (dbm_iterator_blocks_left(iter)) {
      int row, col, row_size, col_size;
      double *block;
      dbm_iterator_next_block(iter, &row, &col, &block, &row_size, &col_size);
      const int block_size = row_size * col_size;
      for (int i = 0; i < block_size; i++) {
        block[i] = 1.0 / (i + 1);
      }
    }
    dbm_iterator_stop(iter);
  }
}
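
// Note: dbm_iterator_start is called inside the parallel region, so each
// thread is expected to handle its own share of the locally stored blocks;
// the start / blocks_left / next_block / stop sequence above is the usual
// pattern for touching every local block.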

/*******************************************************************************
 * \brief Run a benchmark of dbm_multiply with given block sizes.
 * \author Ole Schuett
 ******************************************************************************/
void benchmark_multiply(const int M, const int N, const int K, const int m,
                        const int n, const int k, const cp_mpi_comm_t comm) {
  dbm_matrix_t *matrix_a = create_some_matrix(M, K, 1, m, k, k, comm);
  dbm_matrix_t *matrix_b = create_some_matrix(K, N, k, k, 1, n, comm);
  dbm_distribution_t *dist_c = create_dist(M, N, comm);
  dbm_matrix_t *matrix_c = NULL, *matrix_d = NULL;
  dbm_create(&matrix_c, dist_c, "result", M, N, matrix_a->row_sizes,
             matrix_b->col_sizes);
  dbm_distribution_release(dist_c);

  reserve_all_blocks(matrix_a);
  reserve_all_blocks(matrix_b);
  set_all_blocks(matrix_a);
  set_all_blocks(matrix_b);

  const char *const verify_env = getenv("DBM_MULTIPLY_VERIFY");
  const int skip_verify = (NULL == verify_env ? 0 : (atoi(verify_env) + 1));

  if (0 == skip_verify) {
    dbm_distribution_t *const dist_shared = matrix_c->dist;
    dbm_create(&matrix_d, dist_shared, matrix_c->name, matrix_c->nrows,
               matrix_c->ncols, matrix_c->row_sizes, matrix_c->col_sizes);
    dbm_copy(matrix_d, matrix_c);
  }

  int64_t flop = 0;
  const double time_start_multiply = omp_get_wtime();
  dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_c, false,
               1e-8, &flop);
  const double time_end_multiply = omp_get_wtime();

  if (cp_mpi_comm_rank(comm) == 0) {
    printf("%5i x %5i x %5i with %3i x %3i x %3i blocks: ", M, N, K, m, n, k);
  }

  if (NULL != matrix_d) { // Calculate result on the host for validation.
    dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_d, false,
                 1e-8, NULL);

    const double maxeps = 1E-5, epsilon = dbm_maxeps(matrix_d, matrix_c);
    if (maxeps < epsilon) {
      printf("ERROR\n");
      fprintf(stderr, "Failed validation (epsilon=%f).\n", epsilon);
      exit(1);
    }
    dbm_release(matrix_d);
  }

  cp_mpi_sum_int64(&flop, 1, comm);
  if (cp_mpi_comm_rank(comm) == 0) {
    const double duration = time_end_multiply - time_start_multiply;
    printf("%6.3f s => %6.1f GFLOP/s\n", duration, 1e-9 * flop / duration);
    fflush(stdout);
  }

  dbm_release(matrix_a);
  dbm_release(matrix_b);
  dbm_release(matrix_c);
}

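// Usage notes (a sketch based on the argument parsing in main below; the
// binary name "dbm_miniapp.x" and the file name "triplets.txt" are only
// examples):
//   ./dbm_miniapp.x                     run the built-in set of benchmarks
//   ./dbm_miniapp.x 23x23x23            128 x 128 x 128 blocks of size 23 x 23 x 23
//   ./dbm_miniapp.x 4x4x4x500x500x500   500 x 500 x 500 blocks of size 4 x 4 x 4
//   ./dbm_miniapp.x triplets.txt        one "m n k [M [N [K]]]" benchmark per line
// By default benchmark_multiply also verifies the result against a host-side
// reference; the DBM_MULTIPLY_VERIFY environment variable controls this (see
// skip_verify above).
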
/*******************************************************************************
 * \brief Stand-alone miniapp for smoke-testing and benchmarking dbm_multiply.
 * \author Ole Schuett
 ******************************************************************************/
int main(int argc, char *argv[]) {
  int result = EXIT_SUCCESS;

  srand(25071975); // seed rng

  cp_mpi_init(&argc, &argv);
  dbm_library_init();

  const cp_mpi_comm_t world_comm = cp_mpi_get_comm_world();
  const int nranks = cp_mpi_comm_size(world_comm);
  const int my_rank = cp_mpi_comm_rank(world_comm);

  if (offload_get_device_count() > 0) {
    offload_set_chosen_device(my_rank % offload_get_device_count());
  }

  // Create 2D cart.
  int dims[2] = {0, 0};
  cp_mpi_dims_create(nranks, 2, dims);
  const int periods[2] = {true, true};
  cp_mpi_comm_t comm = cp_mpi_cart_create(world_comm, 2, dims, periods, false);

  if (my_rank == 0) {
    printf("OpenMP-threads: %i GPUs: %i", omp_get_max_threads(),
           imin(offload_get_device_count(), nranks));
#if defined(__LIBXSMM)
    printf(" Libxsmm: %s", LIBXSMM_VERSION);
#else
    printf(" Libxsmm: n/a");
#endif
#if defined(__parallel)
    printf(" MPI-ranks: %i MPI-cart: %i x %i", nranks, dims[0], dims[1]);
#else
    printf(" MPI: n/a");
#endif
    printf("\n\n");
    fflush(stdout);
  }

  if (1 >= argc) {
    benchmark_multiply(16384, 128, 128, 4, 4, 4, comm);
    benchmark_multiply(128, 16384, 128, 4, 4, 4, comm);
    benchmark_multiply(128, 128, 16384, 4, 4, 4, comm);
    benchmark_multiply(645, 645, 645, 4, 4, 4, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(60, 500, 500, 128, 4, 4, comm);
    benchmark_multiply(500, 60, 500, 4, 128, 4, comm);
    benchmark_multiply(500, 500, 60, 4, 4, 128, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(500, 60, 60, 4, 128, 128, comm);
    benchmark_multiply(60, 500, 60, 128, 4, 128, comm);
    benchmark_multiply(60, 60, 500, 128, 128, 4, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(350, 350, 350, 23, 23, 23, comm);
    benchmark_multiply(250, 250, 250, 32, 32, 32, comm);
    benchmark_multiply(60, 60, 60, 128, 128, 128, comm);
  } else { /* read triplet(s) from file or one triplet from command line */
    FILE *const file = fopen(argv[1], "r"); /* try 1st arg as filename */
    char buffer[1024];
    const char delims[] = "x,;:|/\t ";
    int mnk[] = {0, 0, 0}, i = 1, j = 0;
    while (i < argc &&
           (NULL == file || NULL != fgets(buffer, sizeof(buffer), file))) {
      const char *arg = strtok(NULL != file ? buffer : argv[i], delims);
      for (; NULL != arg && j < 3; arg = strtok(NULL, delims), ++j) {
        mnk[j] = atoi(arg);
      }
      if (NULL != file) {
        j = 0;
      } else if (++i < argc) {
        continue;
      }
      if (0 < mnk[0]) { /* valid MxNxK? */
        const int m = mnk[0];
        const int n = (0 < mnk[1] ? mnk[1] : m);
        const int k = (0 < mnk[2] ? mnk[2] : m);
        int M = (NULL == arg ? 0 : atoi(arg)), N, K;
        if (0 < M) {
          arg = strtok(NULL, delims);
          N = (NULL == arg ? 1 : atoi(arg));
          arg = strtok(NULL, delims);
          K = (NULL == arg ? 1 : atoi(arg));
        } else { /* default */
          M = N = K = 128;
        }
        benchmark_multiply(M, N, K, m, n, k, comm);
        mnk[0] = mnk[1] = mnk[2] = 0;
      } else {
        fprintf(stderr, "ERROR: invalid argument(s)\n");
        result = EXIT_FAILURE;
        i = argc; /* break */
      }
    }
    if (NULL != file) {
      fclose(file);
    }
  }

  if (EXIT_SUCCESS == result) {
    const int fortran_comm = cp_mpi_comm_c2f(comm);
    dbm_library_print_stats(fortran_comm, &print_func, my_rank);
    offload_mempool_stats_print(fortran_comm, &print_func, my_rank);
  }
  dbm_library_finalize();
  cp_mpi_comm_free(&comm);
  cp_mpi_finalize(); // matches cp_mpi_init above
  return result;
}

// EOF