dbm_miniapp.c (git:374b731)
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations          */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org>               */
/*                                                                            */
/* SPDX-License-Identifier: BSD-3-Clause                                      */
/*----------------------------------------------------------------------------*/

#include <assert.h>
#include <omp.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(__LIBXSMM)
#include <libxsmm.h>
#endif

#include "../offload/offload_library.h"
#include "dbm_library.h"
#include "dbm_matrix.h"
#include "dbm_mpi.h"
#include "dbm_multiply.h"

/*******************************************************************************
 * \brief Wrapper for printf, passed to dbm_library_print_stats.
 * \author Ole Schuett
 ******************************************************************************/
static void print_func(char *message, int output_unit) {
  if (output_unit == 0) { // i.e. my_rank == 0
    printf("%s", message);
  }
}

/*******************************************************************************
 * \brief Returns the smaller of two given integers (missing from the C standard).
 * \author Ole Schuett
 ******************************************************************************/
static inline int imin(int x, int y) { return (x < y ? x : y); }

/*******************************************************************************
 * \brief Private routine for creating a distribution and an empty matrix.
 * \author Ole Schuett
 ******************************************************************************/
static dbm_matrix_t *create_some_matrix(const int nrows, const int ncols,
                                        const int row_size, const int col_size,
                                        const dbm_mpi_comm_t comm) {
  int cart_dims[2], cart_periods[2], cart_coords[2];
  dbm_mpi_cart_get(comm, 2, cart_dims, cart_periods, cart_coords);

  // Create distribution.
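  // Block rows and columns are dealt round-robin onto the process grid dims.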
  int *row_dist = malloc(nrows * sizeof(int));
  int *col_dist = malloc(ncols * sizeof(int));
  for (int i = 0; i < nrows; i++) {
    row_dist[i] = i % cart_dims[0];
  }
  for (int i = 0; i < ncols; i++) {
    col_dist[i] = i % cart_dims[1];
  }
  const int fortran_comm = dbm_mpi_comm_c2f(comm);
  dbm_distribution_t *dist = NULL;
  dbm_distribution_new(&dist, fortran_comm, nrows, ncols, row_dist, col_dist);
  free(row_dist);
  free(col_dist);

  // Create matrix.
  int *row_sizes = malloc(nrows * sizeof(int));
  int *col_sizes = malloc(ncols * sizeof(int));
  for (int i = 0; i < nrows; i++) {
    row_sizes[i] = row_size;
  }
  for (int i = 0; i < ncols; i++) {
    col_sizes[i] = col_size;
  }
  dbm_matrix_t *matrix = NULL;
  dbm_create(&matrix, dist, "some name", nrows, ncols, row_sizes, col_sizes);
  dbm_distribution_release(dist);
  free(row_sizes);
  free(col_sizes);
  return matrix;
}

/*******************************************************************************
 * \brief Private routine for reserving all blocks of the given matrix.
 * \author Ole Schuett
 ******************************************************************************/
static void reserve_all_blocks(dbm_matrix_t *matrix) {
  int nrows, ncols;
  const int *row_sizes, *col_sizes;
  dbm_get_row_sizes(matrix, &nrows, &row_sizes);
  dbm_get_col_sizes(matrix, &ncols, &col_sizes);

#pragma omp parallel
  {
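    // Pass 1 counts this thread's share of the locally stored blocks; pass 2
    // below revisits the same iterations to record their coordinates. This
    // assumes both `omp for` loops get the same (default, typically static)
    // schedule, so each thread fills exactly the slots it counted.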
    int nblocks = 0;
#pragma omp for collapse(2)
    for (int row = 0; row < nrows; row++) {
      for (int col = 0; col < ncols; col++) {
        if (dbm_get_stored_coordinates(matrix, row, col) ==
            matrix->dist->my_rank) {
          ++nblocks;
        }
      }
    }
    int *reserve_row = malloc(nblocks * sizeof(int));
    int *reserve_col = malloc(nblocks * sizeof(int));
    int iblock = 0;
#pragma omp for collapse(2)
    for (int row = 0; row < nrows; row++) {
      for (int col = 0; col < ncols; col++) {
        if (dbm_get_stored_coordinates(matrix, row, col) ==
            matrix->dist->my_rank) {
          reserve_row[iblock] = row;
          reserve_col[iblock] = col;
          iblock++;
        }
      }
    }
    assert(iblock == nblocks);
    dbm_reserve_blocks(matrix, nblocks, reserve_row, reserve_col);
    free(reserve_row);
    free(reserve_col);
  }
}

/*******************************************************************************
 * \brief Private routine for setting all blocks to 1.0.
 * \author Ole Schuett
 ******************************************************************************/
static void set_all_blocks(dbm_matrix_t *matrix) {
#pragma omp parallel
  {
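    // Each thread creates its own iterator; the DBM iterator hands each
    // locally stored block to one of the threads of the parallel region.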
    dbm_iterator_t *iter = NULL;
    dbm_iterator_start(&iter, matrix);
    while (dbm_iterator_blocks_left(iter)) {
      int row, col, row_size, col_size;
      double *block;
      dbm_iterator_next_block(iter, &row, &col, &block, &row_size, &col_size);
      const int block_size = row_size * col_size;
      for (int i = 0; i < block_size; i++) {
        block[i] = 1.0;
      }
    }
    dbm_iterator_stop(iter);
  }
}

/*******************************************************************************
 * \brief Runs a benchmark of dbm_multiply with the given block sizes.
 * \author Ole Schuett
 ******************************************************************************/
void benchmark_multiply(const int M, const int N, const int K, const int m,
                        const int n, const int k, const dbm_mpi_comm_t comm) {
  dbm_matrix_t *matrix_a = create_some_matrix(M, K, m, k, comm);
  dbm_matrix_t *matrix_b = create_some_matrix(K, N, k, n, comm);
  dbm_matrix_t *matrix_c = create_some_matrix(M, N, m, n, comm);
  reserve_all_blocks(matrix_a);
  reserve_all_blocks(matrix_b);
  set_all_blocks(matrix_a);
  set_all_blocks(matrix_b);

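  // Multiply C := 1.0 * A * B + 1.0 * C (no transposes) with a filter
  // threshold of 1e-8; since all blocks are ones, nothing is filtered away.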
  int64_t flop = 0;
  const double time_start_multiply = omp_get_wtime();
  dbm_multiply(false, false, 1.0, matrix_a, matrix_b, 1.0, matrix_c, false,
               1e-8, &flop);
  const double time_end_multiply = omp_get_wtime();

  // Validate the checksum, i.e. the sum of the squared matrix elements.
  // Every element of C is the sum of K*k ones, so the exact result is
  // M*m * N*n * (K*k)^2, an integer that can be compared exactly.
  const double expected = (int64_t)M * (int64_t)m * (int64_t)N * (int64_t)n *
                          (int64_t)K * (int64_t)K * (int64_t)k * (int64_t)k;
  const double checksum = dbm_checksum(matrix_c);

  dbm_release(matrix_a);
  dbm_release(matrix_b);
  dbm_release(matrix_c);

  if (dbm_mpi_comm_rank(comm) == 0) {
    printf("%5i x %5i x %5i with %3i x %3i x %3i blocks: ", M, N, K, m, n, k);
  }
  if (checksum == expected) {
    dbm_mpi_sum_int64(&flop, 1, comm);
    if (dbm_mpi_comm_rank(comm) == 0) {
      const double duration = time_end_multiply - time_start_multiply;
      printf("%6.3f s => %6.1f GFLOP/s\n", duration, 1e-9 * flop / duration);
      fflush(stdout);
    }
  } else {
    printf("ERROR\n");
    fprintf(stderr, "Expected checksum %f but got %f.\n", expected, checksum);
    exit(1);
  }
}

/*******************************************************************************
 * \brief Stand-alone miniapp for smoke-testing and benchmarking dbm_multiply.
 * \author Ole Schuett
 ******************************************************************************/
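/* Usage sketch (the binary name depends on the build; "dbm_miniapp" is just a
 * placeholder here):
 *   ./dbm_miniapp                  # run the built-in benchmark suite
 *   ./dbm_miniapp 23x23x23         # one multiply with 23x23x23 blocks
 *   ./dbm_miniapp triplets.txt     # read one block-size triplet per line
 * Triplet tokens may be separated by any of "x,;:|/", tabs, or spaces. */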
int main(int argc, char *argv[]) {
  int result = EXIT_SUCCESS;
  dbm_mpi_init(&argc, &argv);
  dbm_library_init();

  const dbm_mpi_comm_t world_comm = dbm_mpi_get_comm_world();
  const int nranks = dbm_mpi_comm_size(world_comm);
  const int my_rank = dbm_mpi_comm_rank(world_comm);

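  // Assign the MPI ranks round-robin to the available GPU devices.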
  if (offload_get_device_count() > 0) {
    offload_set_chosen_device(my_rank % offload_get_device_count());
  }

  // Create 2D cart.
  int dims[2] = {0, 0};
  dbm_mpi_dims_create(nranks, 2, dims);
  const int periods[2] = {true, true};
  dbm_mpi_comm_t comm =
      dbm_mpi_cart_create(world_comm, 2, dims, periods, false);

  if (my_rank == 0) {
    printf("OpenMP-threads: %i GPUs: %i", omp_get_max_threads(),
           imin(offload_get_device_count(), nranks));
#if defined(__LIBXSMM)
    printf(" Libxsmm: %s", LIBXSMM_VERSION);
#else
    printf(" Libxsmm: n/a");
#endif
#if defined(__parallel)
    printf(" MPI-ranks: %i MPI-cart: %i x %i", nranks, dims[0], dims[1]);
#else
    printf(" MPI: n/a");
#endif
    printf("\n\n");
    fflush(stdout);
  }

  if (1 >= argc) {
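    // No arguments given: run the default benchmark suite.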
    benchmark_multiply(16384, 128, 128, 4, 4, 4, comm);
    benchmark_multiply(128, 16384, 128, 4, 4, 4, comm);
    benchmark_multiply(128, 128, 16384, 4, 4, 4, comm);
    benchmark_multiply(645, 645, 645, 4, 4, 4, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(60, 500, 500, 128, 4, 4, comm);
    benchmark_multiply(500, 60, 500, 4, 128, 4, comm);
    benchmark_multiply(500, 500, 60, 4, 4, 128, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(500, 60, 60, 4, 128, 128, comm);
    benchmark_multiply(60, 500, 60, 128, 4, 128, comm);
    benchmark_multiply(60, 60, 500, 128, 128, 4, comm);
    if (my_rank == 0)
      printf("\n");

    benchmark_multiply(350, 350, 350, 23, 23, 23, comm);
    benchmark_multiply(250, 250, 250, 32, 32, 32, comm);
    benchmark_multiply(60, 60, 60, 128, 128, 128, comm);
  } else { /* read triplet(s) from file or one triplet from command line */
    FILE *const file = fopen(argv[1], "r"); /* try 1st arg as filename */
    char buffer[1024];
    const char delims[] = "x,;:|/\t ";
    int mnk[] = {0, 0, 0}, i = 1, j = 0;
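    // Tokens from successive command-line arguments accumulate into a single
    // MxNxK block-size triplet, whereas a file yields one triplet per line.
    // An optional fourth token sets the number of block rows nm (with
    // nn = nk = 1); otherwise all matrix dimensions default to 128 blocks.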
    while (i < argc &&
           (NULL == file || NULL != fgets(buffer, sizeof(buffer), file))) {
      const char *arg = strtok(NULL != file ? buffer : argv[i], delims);
      for (; NULL != arg && j < 3; arg = strtok(NULL, delims), ++j) {
        mnk[j] = atoi(arg);
      }
      if (NULL != file) {
        j = 0;
      } else if (++i < argc) {
        continue;
      }
      if (0 < mnk[0]) { /* valid MxNxK? */
        const int extra = (NULL == arg ? 0 : atoi(arg));
        int nm, nn, nk;
        if (0 < extra) {
          nn = nk = 1;
          nm = extra;
        } else { /* default */
          nm = nn = nk = 128;
        }
        benchmark_multiply(nm, nn, nk, mnk[0], 0 < mnk[1] ? mnk[1] : mnk[0],
                           0 < mnk[2] ? mnk[2] : mnk[0], comm);
        mnk[0] = mnk[1] = mnk[2] = 0;
      } else {
        fprintf(stderr, "ERROR: invalid argument(s)\n");
        result = EXIT_FAILURE;
        i = argc;
      }
    }
    if (NULL != file) {
      fclose(file);
    }
  }

  if (EXIT_SUCCESS == result) {
    dbm_library_print_stats(dbm_mpi_comm_c2f(comm), &print_func, my_rank);
  }
  dbm_library_finalize();
  dbm_mpi_comm_free(&comm);
  dbm_mpi_finalize();
  return result;
}

// EOF