(git:97501a3)
Loading...
Searching...
No Matches
dbm_multiply.c
Go to the documentation of this file.
1/*----------------------------------------------------------------------------*/
2/* CP2K: A general program to perform molecular dynamics simulations */
3/* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4/* */
5/* SPDX-License-Identifier: BSD-3-Clause */
6/*----------------------------------------------------------------------------*/
7
8#include <assert.h>
9#include <limits.h>
10#include <math.h>
11#include <omp.h>
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15
16#include "../offload/offload_runtime.h"
17#include "dbm_hyperparams.h"
18#include "dbm_internal.h"
19#include "dbm_library.h"
20#include "dbm_mempool.h"
21#include "dbm_multiply.h"
22#include "dbm_multiply_comm.h"
23#include "dbm_multiply_cpu.h"
24#include "dbm_multiply_gpu.h"
25
26/*******************************************************************************
27 * \brief Private routine for computing the max filter threshold for each row.
28 * \author Ole Schuett
29 ******************************************************************************/
30static float *compute_rows_max_eps(const bool trans, const dbm_matrix_t *matrix,
31 const double filter_eps) {
32 const int nrows = (trans) ? matrix->ncols : matrix->nrows;
33 int *nblocks_per_row = calloc(nrows, sizeof(int));
34 float *row_max_eps = malloc(nrows * sizeof(float));
35 assert((nblocks_per_row != NULL && row_max_eps != NULL) || nrows == 0);
36
37#pragma omp parallel
38 {
39#pragma omp for
40 for (int ishard = 0; ishard < dbm_get_num_shards(matrix); ishard++) {
41 dbm_shard_t *shard = &matrix->shards[ishard];
42 for (int iblock = 0; iblock < shard->nblocks; iblock++) {
43 const dbm_block_t *blk = &shard->blocks[iblock];
44 const int row = (trans) ? blk->col : blk->row;
45#pragma omp atomic
46 nblocks_per_row[row]++;
47 }
48 }
49#pragma omp master
50 dbm_mpi_sum_int(nblocks_per_row, nrows, matrix->dist->comm);
51#pragma omp barrier
52#pragma omp for
53 for (int i = 0; i < nrows; i++) {
54 const float f =
55 ((float)filter_eps) / ((float)imax(1, nblocks_per_row[i]));
56 row_max_eps[i] = f * f;
57 }
58 } // end of omp parallel region
59
60 free(nblocks_per_row);
61 return row_max_eps; // Ownership of row_max_eps transfers to caller.
62}
63
64/*******************************************************************************
65 * \brief Private struct for storing the context of the multiplication backend.
66 * \author Ole Schuett
67 ******************************************************************************/
68typedef struct {
69#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
70 dbm_multiply_gpu_context_t gpu;
71#endif
73
74/*******************************************************************************
75 * \brief Private routine for initializing the multiplication backend.
76 * \author Ole Schuett
77 ******************************************************************************/
79 backend_context_t *ctx = calloc(1, sizeof(backend_context_t));
80
81#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
82 dbm_multiply_gpu_start(DBM_MAX_BATCH_SIZE, dbm_get_num_shards(matrix_c),
83 matrix_c->shards, &ctx->gpu);
84#else
85 (void)matrix_c; // mark as used
86#endif
87
88 return ctx;
89}
90
91/*******************************************************************************
92 * \brief Private routine for handing newly arrived packs to the backend.
93 * \author Ole Schuett
94 ******************************************************************************/
95static void backend_upload_packs(const dbm_pack_t *pack_a,
96 const dbm_pack_t *pack_b,
97 backend_context_t *ctx) {
98
99#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
100 dbm_multiply_gpu_upload_packs(pack_a, pack_b, &ctx->gpu);
101#else
102 (void)pack_a; // mark as used
103 (void)pack_b;
104 (void)ctx;
105#endif
106}
107
108/*******************************************************************************
109 * \brief Private routine for sending a batch to the multiplication backend.
110 * \author Ole Schuett
111 ******************************************************************************/
112static void backend_process_batch(const int ntasks,
113 const dbm_task_t batch[ntasks],
114 const double alpha, const dbm_pack_t *pack_a,
115 const dbm_pack_t *pack_b, const int kshard,
116 const bool finish, dbm_shard_t *shard_c,
117 backend_context_t *ctx) {
118 // BLAS and LIBXSMM benefit in general from DBM_MULTIPLY_TASK_REORDER.
119 const int cpu_options = DBM_MULTIPLY_TASK_REORDER;
120
121 if (NULL != ctx) {
122#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
123 dbm_shard_gpu_t *const shard_g = &ctx->gpu.shards_c_dev[kshard];
124 dbm_multiply_gpu_process_batch(ntasks, batch, alpha, kshard, &ctx->gpu);
125 if (finish) { // Start downloading the current shard of matrix_c.
126 // Grow host buffer if necessary.
128 // Download results from device.
129 assert(shard_c->data_size == shard_g->data_size);
130 offloadMemcpyAsyncDtoH(shard_c->data, shard_g->data,
131 shard_g->data_size * sizeof(double),
132 shard_g->stream);
133 }
134#else
135 (void)kshard;
136 (void)finish;
137 dbm_multiply_cpu_process_batch(ntasks, batch, alpha, pack_a, pack_b,
138 shard_c, cpu_options);
139#endif
140 } else { // Validate against host.
141 dbm_multiply_cpu_process_batch(ntasks, batch, alpha, pack_a, pack_b,
142 shard_c,
143 cpu_options | DBM_MULTIPLY_BLAS_LIBRARY);
144 }
145}
146
147/*******************************************************************************
148 * \brief Private routine for shutting down the multiplication backend.
149 * \author Ole Schuett
150 ******************************************************************************/
152#if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_DBM)
153 dbm_multiply_gpu_stop(&ctx->gpu);
154#endif
155 free(ctx);
156}
157
158/*******************************************************************************
159 * \brief Private routine for multipling two packs.
160 * \author Ole Schuett
161 ******************************************************************************/
162static void multiply_packs(const bool transa, const bool transb,
163 const double alpha, const dbm_pack_t *pack_a,
164 const dbm_pack_t *pack_b,
165 const dbm_matrix_t *matrix_a,
166 const dbm_matrix_t *matrix_b, dbm_matrix_t *matrix_c,
167 const bool retain_sparsity,
168 const float *rows_max_eps, int64_t *flop,
169 backend_context_t *ctx) {
170 const float alpha2 = alpha * alpha;
171 int64_t flop_sum = 0;
172
173 const int nshard_rows = matrix_c->dist->rows.nshards;
174 const int nshard_cols = matrix_c->dist->cols.nshards;
175 int *shard_row_start = calloc(nshard_rows, sizeof(int));
176 int *shard_col_start = calloc(nshard_cols, sizeof(int));
177 assert(NULL != shard_row_start && NULL != shard_col_start);
178
179 const int *sum_index_sizes_a =
180 (transa) ? matrix_a->row_sizes : matrix_a->col_sizes;
181 const int *sum_index_sizes_b =
182 (transb) ? matrix_b->col_sizes : matrix_b->row_sizes;
183 const int *free_index_sizes_a =
184 (transa) ? matrix_a->col_sizes : matrix_a->row_sizes;
185 const int *free_index_sizes_b =
186 (transb) ? matrix_b->row_sizes : matrix_b->col_sizes;
187
188#pragma omp parallel reduction(+ : flop_sum)
189 {
190 // Thread-private array covering given work in piece-wise fashion.
191 dbm_task_t *batch =
193
194 // Blocks are ordered first by shard. Creating lookup tables of boundaries.
195#pragma omp for nowait
196 for (int iblock = 1; iblock < pack_a->nblocks; iblock++) {
197 const int shard_row = pack_a->blocks[iblock].free_index % nshard_rows;
198 const int prev_shard_row =
199 pack_a->blocks[iblock - 1].free_index % nshard_rows;
200 if (prev_shard_row != shard_row) {
201 shard_row_start[shard_row] = iblock;
202 }
203 }
204#pragma omp for
205 for (int jblock = 1; jblock < pack_b->nblocks; jblock++) {
206 const int shard_col = pack_b->blocks[jblock].free_index % nshard_cols;
207 const int prev_shard_col =
208 pack_b->blocks[jblock - 1].free_index % nshard_cols;
209 if (prev_shard_col != shard_col) {
210 shard_col_start[shard_col] = jblock;
211 }
212 }
213
214#pragma omp for collapse(2) DBM_OMP_SCHEDULE
215 for (int shard_row = 0; shard_row < nshard_rows; shard_row++) {
216 for (int shard_col = 0; shard_col < nshard_cols; shard_col++) {
217 const int ishard = shard_row * nshard_cols + shard_col;
218 dbm_shard_t *const shard_c = &matrix_c->shards[ishard];
219 int ntasks = 0;
220
221 // Use a merge-join to find pairs of blocks with matching sum indices.
222 // This utilizes that blocks within a shard are ordered by sum_index.
223 const int iblock_start = shard_row_start[shard_row];
224 int jblock_start = shard_col_start[shard_col];
225 for (int iblock = iblock_start; iblock < pack_a->nblocks; iblock++) {
226 const dbm_pack_block_t *blk_a = &pack_a->blocks[iblock];
227 if (blk_a->free_index % nshard_rows != shard_row) {
228 break;
229 }
230 for (int jblock = jblock_start; jblock < pack_b->nblocks; jblock++) {
231 const dbm_pack_block_t *blk_b = &pack_b->blocks[jblock];
232 if (blk_b->free_index % nshard_cols != shard_col) {
233 jblock = pack_b->nblocks; // break
234 continue;
235 }
236 if (blk_a->sum_index < blk_b->sum_index) {
237 jblock = pack_b->nblocks; // break
238 continue;
239 }
240 if (blk_a->sum_index > blk_b->sum_index) {
241 jblock_start++;
242 continue;
243 }
244 // Found block pair with blk_a->sum_index == blk_b->sum_index.
245
246 // Check norms.
247 const float result_norm = alpha2 * blk_a->norm * blk_b->norm;
248 if (result_norm < rows_max_eps[blk_a->free_index]) {
249 continue;
250 }
251
252 // Check block sizes.
253 const int m = free_index_sizes_a[blk_a->free_index];
254 const int n = free_index_sizes_b[blk_b->free_index];
255 const int k = sum_index_sizes_a[blk_a->sum_index];
256 assert(m == matrix_c->row_sizes[blk_a->free_index]);
257 assert(n == matrix_c->col_sizes[blk_b->free_index]);
258 assert(k == sum_index_sizes_b[blk_b->sum_index]);
259
260 // Get C block.
261 const int row = blk_a->free_index, col = blk_b->free_index;
262 dbm_block_t *blk_c = dbm_shard_lookup(shard_c, row, col);
263 if (blk_c == NULL && retain_sparsity) {
264 continue;
265 } else if (blk_c == NULL) {
266 assert(dbm_get_shard_index(matrix_c, row, col) == ishard);
267 assert(dbm_get_stored_coordinates(matrix_c, row, col) ==
268 matrix_c->dist->my_rank);
269 blk_c = dbm_shard_promise_new_block(shard_c, row, col, m * n);
270 }
271
272 // Count flops.
273 const int64_t task_flops = 2LL * m * n * k;
274 if (task_flops == 0) {
275 continue;
276 }
277 flop_sum += task_flops;
279
280 // Add block multiplication to batch.
281 batch[ntasks].m = m;
282 batch[ntasks].n = n;
283 batch[ntasks].k = k;
284 batch[ntasks].offset_a = blk_a->offset;
285 batch[ntasks].offset_b = blk_b->offset;
286 batch[ntasks].offset_c = blk_c->offset;
287 ++ntasks;
288
289 if (ntasks == DBM_MAX_BATCH_SIZE) {
290 backend_process_batch(ntasks, batch, alpha, pack_a, pack_b,
291 ishard, false, shard_c, ctx);
292 ntasks = 0;
293 }
294 }
295 }
296 backend_process_batch(ntasks, batch, alpha, pack_a, pack_b, ishard,
297 true, shard_c, ctx);
298 }
299 }
300
302 }
303
304 free(shard_row_start);
305 free(shard_col_start);
306
307 if (NULL != flop) {
308 *flop += flop_sum;
309 }
310}
311
312/*******************************************************************************
313 * \brief Performs a multiplication of two dbm_matrix_t matrices.
314 * See dbm_matrix.h for details.
315 * \author Ole Schuett
316 ******************************************************************************/
317void dbm_multiply(const bool transa, const bool transb, const double alpha,
318 const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b,
319 const double beta, dbm_matrix_t *matrix_c,
320 const bool retain_sparsity, const double filter_eps,
321 int64_t *flop) {
322 assert(omp_get_num_threads() == 1);
323 assert(matrix_a != NULL && matrix_b != NULL && matrix_c != NULL);
324
325 // Compute filter thresholds for each row.
326 float *rows_max_eps = compute_rows_max_eps(transa, matrix_a, filter_eps);
327
328 // Throughout the matrix multiplication code the "sum_index" and "free_index"
329 // denote the summation (aka dummy) and free index from the Einstein notation.
330 const int num_sum_index_a = (transa) ? matrix_a->nrows : matrix_a->ncols;
331 const int num_sum_index_b = (transb) ? matrix_b->ncols : matrix_b->nrows;
332 const int num_free_index_a = (transa) ? matrix_a->ncols : matrix_a->nrows;
333 const int num_free_index_b = (transb) ? matrix_b->nrows : matrix_b->ncols;
334
335 // Sanity check matrix dimensions.
336 assert(num_sum_index_a == num_sum_index_b);
337 assert(num_free_index_a == matrix_c->nrows);
338 assert(num_free_index_b == matrix_c->ncols);
339
340 // Prepare matrix_c (host).
341 dbm_scale(matrix_c, beta);
342
343 // Determine if validation shall be performed.
344 const char *const maxeps_env = getenv("DBM_MULTIPLY_MAXEPS");
345 const char *const verify_env = getenv("DBM_MULTIPLY_VERIFY");
346 const double maxeps = (NULL == maxeps_env ? 1E-1 : fabs(atof(maxeps_env)));
347 const int verify =
348 (NULL == verify_env ? (NULL == maxeps_env ? 0 : 1) : atoi(verify_env));
349 dbm_matrix_t *matrix_d = NULL;
350 if (0 != verify) {
351 dbm_distribution_t *const dist_shared = matrix_c->dist;
352 dbm_create(&matrix_d, dist_shared, matrix_c->name, matrix_c->nrows,
353 matrix_c->ncols, matrix_c->row_sizes, matrix_c->col_sizes);
354 dbm_copy(matrix_d, matrix_c);
355 }
356
357 // Start uploading matrix_c to the GPU.
358 backend_context_t *ctx = backend_start(matrix_c);
359
360 // Redistribute matrix_a and matrix_b across MPI ranks.
361 dbm_comm_iterator_t *iter =
362 dbm_comm_iterator_start(transa, transb, matrix_a, matrix_b, matrix_c);
363
364 // Count flops if requested.
365 if (NULL != flop) {
366 *flop = 0;
367 }
368
369 // Main loop.
370 dbm_pack_t *pack_a, *pack_b;
371 while (dbm_comm_iterator_next(iter, &pack_a, &pack_b)) {
372 backend_upload_packs(pack_a, pack_b, ctx);
373 multiply_packs(transa, transb, alpha, pack_a, pack_b, matrix_a, matrix_b,
374 matrix_c, retain_sparsity, rows_max_eps, flop, ctx);
375 }
376
377 // Wait for all other MPI ranks to complete, then release ressources.
379 backend_stop(ctx);
380
381 if (NULL != matrix_d) {
382 int npacks = 0;
383 iter =
384 dbm_comm_iterator_start(transa, transb, matrix_a, matrix_b, matrix_d);
385 while (dbm_comm_iterator_next(iter, &pack_a, &pack_b)) {
386 multiply_packs(transa, transb, alpha, pack_a, pack_b, matrix_a, matrix_b,
387 matrix_d, retain_sparsity, rows_max_eps, NULL, NULL);
388 ++npacks;
389 }
391 const double epsilon = dbm_maxeps(matrix_d, matrix_c);
392 if (maxeps < epsilon) {
393 if (1 == verify) {
394 fprintf(stderr, "WARN ACC/LIBDBM: npacks=%i diff=%g\n", npacks,
395 epsilon);
396 } else {
397 fprintf(stderr, "ERROR ACC/LIBDBM: npacks=%i diff=%g\n", npacks,
398 epsilon);
399 exit(EXIT_FAILURE);
400 }
401 }
402 dbm_release(matrix_d);
403 }
404
405 // Release filter thresholds.
406 free(rows_max_eps);
407
408 // Final filter pass.
409 dbm_filter(matrix_c, filter_eps);
410}
411
412// EOF
#define DBM_MAX_BATCH_SIZE
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard)
void dbm_library_counter_increment(const int m, const int n, const int k)
Add given block multiplication to stats. This routine is thread-safe.
double dbm_maxeps(const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b)
Calculates maximum relative difference between matrix_a and matrix_b.
Definition dbm_matrix.c:567
static int dbm_get_shard_index(const dbm_matrix_t *matrix, const int row, const int col)
Internal routine for getting a block's shard index.
Definition dbm_matrix.h:246
static int dbm_get_num_shards(const dbm_matrix_t *matrix)
Internal routine that returns the number of shards for given matrix.
Definition dbm_matrix.h:238
void * dbm_mempool_host_malloc(const size_t size)
Internal routine for allocating host memory from the pool.
void dbm_mempool_host_free(const void *memory)
Internal routine for releasing memory back to the pool.
void dbm_mpi_sum_int(int *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_INT.
Definition dbm_mpi.c:305
static void backend_upload_packs(const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, backend_context_t *ctx)
Private routine for handing newly arrived packs to the backend.
static float * compute_rows_max_eps(const bool trans, const dbm_matrix_t *matrix, const double filter_eps)
Private routine for computing the max filter threshold for each row.
static backend_context_t * backend_start(const dbm_matrix_t *matrix_c)
Private routine for initializing the multiplication backend.
static void backend_stop(backend_context_t *ctx)
Private routine for shutting down the multiplication backend.
static void backend_process_batch(const int ntasks, const dbm_task_t batch[ntasks], const double alpha, const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, const int kshard, const bool finish, dbm_shard_t *shard_c, backend_context_t *ctx)
Private routine for sending a batch to the multiplication backend.
static void multiply_packs(const bool transa, const bool transb, const double alpha, const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b, dbm_matrix_t *matrix_c, const bool retain_sparsity, const float *rows_max_eps, int64_t *flop, backend_context_t *ctx)
Private routine for multipling two packs.
dbm_comm_iterator_t * dbm_comm_iterator_start(const bool transa, const bool transb, const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b, const dbm_matrix_t *matrix_c)
Internal routine for creating a communication iterator.
void dbm_comm_iterator_stop(dbm_comm_iterator_t *iter)
Internal routine for releasing the given communication iterator.
bool dbm_comm_iterator_next(dbm_comm_iterator_t *iter, dbm_pack_t **pack_a, dbm_pack_t **pack_b)
Internal routine for retriving next pair of packs from given iterator.
void dbm_multiply_cpu_process_batch(int ntasks, const dbm_task_t batch[ntasks], double alpha, const dbm_pack_t *pack_a, const dbm_pack_t *pack_b, dbm_shard_t *shard_c, int options)
Internal routine for executing the tasks in given batch on the CPU.
@ DBM_MULTIPLY_BLAS_LIBRARY
@ DBM_MULTIPLY_TASK_REORDER
dbm_block_t * dbm_shard_promise_new_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for allocating the metadata of a new block.
Definition dbm_shard.c:200
dbm_block_t * dbm_shard_lookup(const dbm_shard_t *shard, const int row, const int col)
Internal routine for looking up a block from a shard.
Definition dbm_shard.c:178
void dbm_shard_allocate_promised_blocks(dbm_shard_t *shard)
Internal routine for allocating and zeroing any promised block's data.
Definition dbm_shard.c:234
static void const int const int i
Private struct for storing the context of the multiplication backend.
Internal struct for storing a block's metadata.
Definition dbm_shard.h:20
Internal struct for storing a communication iterator.
Internal struct for storing a two dimensional distribution.
dbm_mpi_comm_t comm
Internal struct for storing a matrix.
Definition dbm_matrix.h:20
int * row_sizes
Definition dbm_matrix.h:25
char * name
Definition dbm_matrix.h:22
int * col_sizes
Definition dbm_matrix.h:26
dbm_shard_t * shards
Definition dbm_matrix.h:28
dbm_distribution_t * dist
Definition dbm_matrix.h:21
Internal struct for storing a dbm_block_t plus its norm.
Internal struct for storing a pack - essentially a shard for MPI.
dbm_pack_block_t * blocks
Internal struct for storing a matrix shard.
Definition dbm_shard.h:30
double * data
Definition dbm_shard.h:43
dbm_block_t * blocks
Definition dbm_shard.h:33
int data_size
Definition dbm_shard.h:42
Internal struct for storing a task, ie. a single block multiplication.