/* Matrix creation (excerpt): the name and the block sizes are copied, and a
   reference to the distribution is held. */
                const char name[], const int nrows, const int ncols,
                const int row_sizes[nrows], const int col_sizes[ncols]) {
  assert(omp_get_num_threads() == 1);

  assert(dist->rows.length == nrows);
  assert(dist->cols.length == ncols);
  dbm_distribution_hold(dist);

  size_t size = (strlen(name) + 1) * sizeof(char);
  matrix->name = malloc(size);
  assert(matrix->name != NULL);
  memcpy(matrix->name, name, size);

  matrix->nrows = nrows;
  matrix->ncols = ncols;

  size = nrows * sizeof(int);
  memcpy(matrix->row_sizes, row_sizes, size);

  size = ncols * sizeof(int);
  memcpy(matrix->col_sizes, col_sizes, size);

  assert(matrix->shards != NULL);
#pragma omp parallel for
  /* ... per-shard initialization ... */

  assert(*matrix_out == NULL);
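
/* A minimal usage sketch for the creation routine above.  Assumptions not
 * visible in this excerpt: the routine is named dbm_create, its leading
 * parameters are the output handle and the distribution, and the public header
 * is "dbm_matrix.h".  The helper name and block sizes are illustrative. */
#include <assert.h>
#include "dbm_matrix.h"

static dbm_matrix_t *create_example_matrix(dbm_distribution_t *dist) {
  const int nrows = 3, ncols = 2;
  const int row_sizes[3] = {4, 4, 4}; /* three block rows of 4 elements each */
  const int col_sizes[2] = {4, 4};    /* two block columns of 4 elements each */
  dbm_matrix_t *matrix = NULL;        /* the routine asserts *matrix_out == NULL */
  dbm_create(&matrix, dist, "example", nrows, ncols, row_sizes, col_sizes);
  assert(matrix != NULL);
  return matrix;
}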
/* Matrix release (excerpt). */
  assert(omp_get_num_threads() == 1);
  dbm_distribution_release(matrix->dist);
/* Matrix copy (excerpt): matrix_a and matrix_b must have identical block sizes
   and the same distribution. */
  assert(omp_get_num_threads() == 1);

  for (int i = 0; i < matrix_b->nrows; i++) {
    /* ... */
  }
  for (int i = 0; i < matrix_b->ncols; i++) {
    /* ... */
  }
  assert(matrix_a->dist == matrix_b->dist);

#pragma omp parallel for DBM_OMP_SCHEDULE
/* Redistribution (excerpt): the target matrix "redist" must have the same
   block structure as the source matrix. */
  assert(omp_get_num_threads() == 1);
  assert(matrix->nrows == redist->nrows);
  for (int i = 0; i < matrix->nrows; i++) {
    assert(matrix->row_sizes[i] == redist->row_sizes[i]);
  }
  assert(matrix->ncols == redist->ncols);
  for (int i = 0; i < matrix->ncols; i++) {
    assert(matrix->col_sizes[i] == redist->col_sizes[i]);
  }

  /* 1st pass: count how many doubles (2 for the row/col metadata plus the
     block data) this rank will send to each destination rank. */
  int send_count[nranks];
  memset(send_count, 0, nranks * sizeof(int));
  for (int iblock = 0; iblock < shard->nblocks; iblock++) {
    const int block_size = row_size * col_size;
    const int rank = dbm_get_stored_coordinates(redist, blk->row, blk->col);
    assert(0 <= rank && rank < nranks);
    send_count[rank] += 2 + block_size;
  }

  int recv_count[nranks];
  /* ... recv_count is obtained by exchanging send_count between the ranks ... */

  /* Exclusive prefix sums turn the per-rank counts into buffer displacements. */
  int send_displ[nranks + 1], recv_displ[nranks + 1];
  send_displ[0] = recv_displ[0] = 0;
  for (int irank = 1; irank <= nranks; irank++) {
    send_displ[irank] = send_displ[irank - 1] + send_count[irank - 1];
    recv_displ[irank] = recv_displ[irank - 1] + recv_count[irank - 1];
  }
  const int total_send_count = send_displ[nranks];
  const int total_recv_count = recv_displ[nranks];

  /* 2nd pass: pack every block as a [row, col, data...] record at its
     destination rank's position in the send buffer. */
  int send_data_positions[nranks];
  memcpy(send_data_positions, send_displ, nranks * sizeof(int));
  for (int iblock = 0; iblock < shard->nblocks; iblock++) {
    const double *blk_data = &shard->data[blk->offset];
    const int block_size = row_size * col_size;
    const int rank = dbm_get_stored_coordinates(redist, blk->row, blk->col);
    const int pos = send_data_positions[rank];
    data_send[pos + 0] = blk->row;
    data_send[pos + 1] = blk->col;
    memcpy(&data_send[pos + 2], blk_data, block_size * sizeof(double));
    send_data_positions[rank] += 2 + block_size;
  }
  for (int irank = 0; irank < nranks; irank++) {
    assert(send_data_positions[irank] == send_displ[irank + 1]);
  }
  /* ... the packed data is exchanged between the ranks; only the tail of the
     call survives in this excerpt ... */
                            recv_count, recv_displ, comm);

  /* Unpack the received [row, col, data...] records into the new matrix. */
  int recv_data_pos = 0;
  while (recv_data_pos < total_recv_count) {
    const int row = (int)data_recv[recv_data_pos + 0];
    const int col = (int)data_recv[recv_data_pos + 1];
    assert(data_recv[recv_data_pos + 0] - (double)row == 0.0);
    assert(data_recv[recv_data_pos + 1] - (double)col == 0.0);
    dbm_put_block(redist, row, col, false, &data_recv[recv_data_pos + 2]);
    const int row_size = matrix->row_sizes[row];
    const int col_size = matrix->col_sizes[col];
    const int block_size = row_size * col_size;
    recv_data_pos += 2 + block_size;
  }
  assert(recv_data_pos == total_recv_count);
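
/* The two collective calls are elided from the excerpt above.  This is a
 * minimal sketch of how they would slot in, assuming the dbm_mpi_* wrapper
 * signatures listed at the end of this file and an assumed header "dbm_mpi.h".
 * The displacements are the prefix sums computed in the excerpt. */
#include "dbm_mpi.h"

/* Step 1: every rank tells every other rank how many doubles to expect. */
static void exchange_counts(const int *send_count, int *recv_count,
                            const dbm_mpi_comm_t comm) {
  dbm_mpi_alltoall_int(send_count, 1, recv_count, 1, comm);
}

/* Step 2: once the receive buffer has been sized to total_recv_count, a single
 * variable-sized all-to-all moves the packed [row, col, data...] records. */
static void exchange_data(const double *data_send, const int *send_count,
                          const int *send_displ, double *data_recv,
                          const int *recv_count, const int *recv_displ,
                          const dbm_mpi_comm_t comm) {
  dbm_mpi_alltoallv_double(data_send, send_count, send_displ, data_recv,
                           recv_count, recv_displ, comm);
}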
void dbm_get_block_p(dbm_matrix_t *matrix, const int row, const int col,
                     double **block, int *row_size, int *col_size) {
  assert(0 <= row && row < matrix->nrows);
  assert(0 <= col && col < matrix->ncols);
  assert(dbm_get_stored_coordinates(matrix, row, col) == matrix->dist->my_rank);
/* Storing a block (excerpt): the block must be stored on the calling rank. */
                   const bool summation, const double *block) {
  assert(0 <= row && row < matrix->nrows);
  assert(0 <= col && col < matrix->ncols);
  assert(dbm_get_stored_coordinates(matrix, row, col) == matrix->dist->my_rank);
  const int row_size = matrix->row_sizes[row];
  const int col_size = matrix->col_sizes[col];
  const int block_size = row_size * col_size;

  omp_set_lock(&shard->lock);
  /* ... blk_data points at the block's (possibly newly allocated) storage ... */
  if (summation) {
    for (int i = 0; i < block_size; i++) {
      blk_data[i] += block[i];
    }
  } else {
    memcpy(blk_data, block, block_size * sizeof(double));
  }
  omp_unset_lock(&shard->lock);
}
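
/* A minimal usage sketch for the two routines above: write a block that is
 * stored on this rank, then obtain a pointer to its data.  The helper name,
 * the my_rank parameter, and the fixed buffer size are illustrative only, not
 * part of the library API; the header name is assumed. */
#include <assert.h>
#include <stdbool.h>
#include "dbm_matrix.h"

static void write_and_read_back(dbm_matrix_t *matrix, const int row,
                                const int col, const int my_rank) {
  /* Only the owning rank may put or get this block. */
  if (dbm_get_stored_coordinates(matrix, row, col) != my_rank) {
    return;
  }
  double buffer[16] = {0.0}; /* assumes row_size * col_size <= 16 */
  buffer[0] = 1.0;
  /* summation == false overwrites any existing block data. */
  dbm_put_block(matrix, row, col, false, buffer);

  double *block = NULL;
  int row_size, col_size;
  dbm_get_block_p(matrix, row, col, &block, &row_size, &col_size);
  assert(block != NULL && row_size * col_size <= 16);
  assert(block[0] == 1.0);
}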
  assert(omp_get_num_threads() == 1);
#pragma omp parallel for DBM_OMP_SCHEDULE
/* Filtering (excerpt): blocks whose Frobenius norm falls below eps are dropped,
   the surviving blocks are compacted in place. */
  assert(omp_get_num_threads() == 1);
  const double eps2 = eps * eps;

#pragma omp parallel for DBM_OMP_SCHEDULE
    const int old_nblocks = shard->nblocks;
    for (int iblock = 0; iblock < old_nblocks; iblock++) {
      const double *old_blk_data = &shard->data[old_blk.offset];
      const int block_size = row_size * col_size;
      for (int i = 0; i < block_size; i++) {
        const double value = old_blk_data[i];
        norm += value * value;
      }
      /* Comparing the squared norm against eps^2 avoids taking a square root. */
      if (block_size > 0 && norm < eps2) {
        continue; /* the block falls below the threshold and is dropped */
      }
      /* ... a compacted block new_blk is created for the surviving data ... */
          shard, old_blk.row, old_blk.col, block_size);
      double *new_blk_data = &shard->data[new_blk->offset];
      memmove(new_blk_data, old_blk_data, block_size * sizeof(double));
/* Reserving blocks (excerpt): must be called by all threads of an OpenMP
   parallel region; each thread passes its own list of blocks. */
                        const int rows[], const int cols[]) {
  assert(omp_get_num_threads() == omp_get_max_threads() &&
         "Please call dbm_reserve_blocks within an OpenMP parallel region.");

  for (int i = 0; i < nblocks; i++) {
    omp_set_lock(&shard->lock);
    assert(0 <= rows[i] && rows[i] < matrix->nrows);
    assert(0 <= cols[i] && cols[i] < matrix->ncols);
    assert(dbm_get_stored_coordinates(matrix, rows[i], cols[i]) == my_rank);
    const int row_size = matrix->row_sizes[rows[i]];
    const int col_size = matrix->col_sizes[cols[i]];
    const int block_size = row_size * col_size;
    omp_unset_lock(&shard->lock);
  }

#pragma omp for DBM_OMP_SCHEDULE
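
/* A usage sketch for the reservation routine above: collect locally owned
 * blocks and reserve them from inside an OpenMP parallel region, as the assert
 * demands.  The helper name, the my_rank parameter, the diagonal block choice
 * (which assumes at least as many block columns as block rows), and the header
 * name are illustrative assumptions. */
#include <omp.h>
#include <stdlib.h>
#include "dbm_matrix.h"

static void reserve_local_diagonal(dbm_matrix_t *matrix, const int nrows,
                                   const int my_rank) {
#pragma omp parallel
  {
    /* Each thread reserves its own share of the locally stored diagonal blocks. */
    int *rows = malloc(nrows * sizeof(int));
    int *cols = malloc(nrows * sizeof(int));
    int nblocks = 0;
    for (int i = omp_get_thread_num(); i < nrows; i += omp_get_num_threads()) {
      if (dbm_get_stored_coordinates(matrix, i, i) == my_rank) {
        rows[nblocks] = i;
        cols[nblocks] = i;
        nblocks++;
      }
    }
    /* All threads of the team must call this routine. */
    dbm_reserve_blocks(matrix, nblocks, rows, cols);
    free(rows);
    free(cols);
  }
}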
/* Scaling (excerpt): every stored element is multiplied by alpha. */
  assert(omp_get_num_threads() == 1);
#pragma omp parallel for DBM_OMP_SCHEDULE
      shard->data[i] *= alpha;
  assert(omp_get_num_threads() == 1);
#pragma omp parallel for DBM_OMP_SCHEDULE
/* Matrix addition (excerpt): matrix_b is added block-wise into matrix_a; both
   must use the same distribution. */
  assert(omp_get_num_threads() == 1);
  assert(matrix_a->dist == matrix_b->dist);

#pragma omp parallel for DBM_OMP_SCHEDULE
    for (int iblock = 0; iblock < shard_b->nblocks; iblock++) {
      const int block_size = row_size * col_size;
      /* blk_a: the matching block in shard_a, fetched or newly allocated. */
      dbm_block_t *blk_a = dbm_shard_get_or_allocate_block(
          shard_a, blk_b.row, blk_b.col, block_size);
      double *data_a = &shard_a->data[blk_a->offset];
      const double *data_b = &shard_b->data[blk_b.offset];
      for (int i = 0; i < block_size; i++) {
        data_a[i] += data_b[i];
      }
    }
/* Block iterator (excerpt): must be started from within an OpenMP parallel
   region; the shards are distributed round-robin over the threads. */
  assert(omp_get_num_threads() == omp_get_max_threads() &&
         "Please call dbm_iterator_start within an OpenMP parallel region.");
  assert(iter != NULL);
  assert(*iter_out == NULL);

  for (int ishard = omp_get_thread_num(); /* ... */;
       ishard += omp_get_num_threads()) {

/* Parameter-list tail, presumably of the iterator's next-block routine: */
                           double **block, int *row_size, int *col_size) {
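
/* A sketch of iterating over the locally stored blocks.  dbm_iterator_start is
 * named in the assert above; the companion routines dbm_iterator_blocks_left,
 * dbm_iterator_next_block and dbm_iterator_stop are assumed to follow the usual
 * DBM iterator pattern and are not shown in this excerpt.  The helper name and
 * the header name are illustrative. */
#include <omp.h>
#include "dbm_matrix.h"

static double sum_local_elements(const dbm_matrix_t *matrix) {
  double total = 0.0;
#pragma omp parallel reduction(+ : total)
  {
    dbm_iterator_t *iter = NULL;
    dbm_iterator_start(&iter, matrix);
    while (dbm_iterator_blocks_left(iter)) {
      int row, col, row_size, col_size;
      double *block;
      dbm_iterator_next_block(iter, &row, &col, &block, &row_size, &col_size);
      for (int i = 0; i < row_size * col_size; i++) {
        total += block[i];
      }
    }
    dbm_iterator_stop(iter);
  }
  return total;
}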
/* Kahan compensated summation: adds value to *accumulator while carrying the
   running round-off error in *compensation. */
static double kahan_sum(double value, double *accumulator,
                        double *compensation) {
  double r, c;
  assert(NULL != accumulator && NULL != compensation);
  c = value - *compensation;
  r = *accumulator + c;
  *compensation = (r - *accumulator) - c;
  *accumulator = r;
  return r;
}
/* Checksum (excerpt): the sum of squared elements, accumulated with Kahan
   compensation to reduce round-off. */
  double checksum = 0.0, compensation = 0.0;
      const double value = shard->data[i];
      kahan_sum(value * value, &checksum, &compensation);
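
/* A self-contained demonstration (hypothetical driver, not part of the library)
 * of why the checksum uses compensated summation: adding a million 1.0's to
 * 1.0e16 is lost entirely by a naive sum, but preserved by the same update rule
 * that kahan_sum() applies above. */
#include <stdio.h>

static double kahan_demo(void) {
  double naive = 1.0e16, accumulator = 1.0e16, compensation = 0.0;
  for (int i = 0; i < 1000000; i++) {
    naive += 1.0; /* each 1.0 is rounded away next to 1.0e16 */
    /* same update rule as kahan_sum() above */
    const double c = 1.0 - compensation;
    const double r = accumulator + c;
    compensation = (r - accumulator) - c;
    accumulator = r;
  }
  printf("naive: %.1f  compensated: %.1f\n", naive, accumulator);
  return accumulator;
}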
/* Max-abs (excerpt): the largest absolute value of any stored element. */
      maxabs = fmax(maxabs, fabs(shard->data[i]));
/* Getters (excerpts): row/column block sizes and the lists of local
   row/column indices. */
                       const int **row_sizes) {
  *nrows = matrix->nrows;

                       const int **col_sizes) {
  *ncols = matrix->ncols;

                       const int **local_rows) {

                       const int **local_cols) {
int dbm_get_stored_coordinates(const dbm_matrix_t *matrix, const int row,
int dbm_distribution_stored_coords(const dbm_distribution_t *dist, const int row, const int col)
Returns the MPI rank on which the given block should be stored.
void dbm_get_col_sizes(const dbm_matrix_t *matrix, int *ncols, const int **col_sizes)
Returns the column block sizes of the given matrix.
void dbm_get_row_sizes(const dbm_matrix_t *matrix, int *nrows, const int **row_sizes)
Returns the row block sizes of the given matrix.
static int dbm_get_shard_index(const dbm_matrix_t *matrix, const int row, const int col)
Internal routine for getting a block's shard index.
static int dbm_get_num_shards(const dbm_matrix_t *matrix)
Internal routine that returns the number of shards for given matrix.
void dbm_mpi_free_mem(void *mem)
Wrapper around MPI_Free_mem.
void * dbm_mpi_alloc_mem(size_t size)
Wrapper around MPI_Alloc_mem.
bool dbm_mpi_comms_are_similar(const dbm_mpi_comm_t comm1, const dbm_mpi_comm_t comm2)
Wrapper around MPI_Comm_compare.
int dbm_mpi_comm_size(const dbm_mpi_comm_t comm)
Wrapper around MPI_Comm_size.
void dbm_mpi_sum_double(double *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_DOUBLE.
void dbm_mpi_max_double(double *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_MAX and datatype MPI_DOUBLE.
void dbm_mpi_alltoall_int(const int *sendbuf, const int sendcount, int *recvbuf, const int recvcount, const dbm_mpi_comm_t comm)
Wrapper around MPI_Alltoall for datatype MPI_INT.
void dbm_mpi_alltoallv_double(const double *sendbuf, const int *sendcounts, const int *sdispls, double *recvbuf, const int *recvcounts, const int *rdispls, const dbm_mpi_comm_t comm)
Wrapper around MPI_Alltoallv for datatype MPI_DOUBLE.
void dbm_shard_release(dbm_shard_t *shard)
Internal routine for releasing a shard.
dbm_block_t * dbm_shard_get_or_allocate_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for getting block or allocating a new one.
dbm_block_t * dbm_shard_promise_new_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for allocating the metadata of a new block.
dbm_block_t * dbm_shard_lookup(const dbm_shard_t *shard, const int row, const int col)
Internal routine for looking up a block from a shard.
void dbm_shard_allocate_promised_blocks(dbm_shard_t *shard)
Internal routine for allocating and zeroing any promised block's data.
dbm_block_t * dbm_shard_get_or_promise_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for getting block or promising a new one.
void dbm_shard_init(dbm_shard_t *shard)
Internal routine for initializing a shard.
void dbm_shard_copy(dbm_shard_t *shard_a, const dbm_shard_t *shard_b)
Internal routine for copying content of shard_b into shard_a.
static double kahan_sum(double value, double *accumulator, double *compensation)
Sums arrays of real/complex numbers with much reduced round-off as compared to a naive implementation...
dbm_block_t
Internal struct for storing a block's metadata.
dbm_distribution_t
Internal struct for storing a two dimensional distribution.
dbm_iterator_t
Internal struct for storing a block iterator.
dbm_matrix_t
Internal struct for storing a matrix.
dbm_shard_t
Internal struct for storing a matrix shard.