25 const char name[],
const int nrows,
const int ncols,
26 const int row_sizes[nrows],
const int col_sizes[ncols]) {
27 assert(omp_get_num_threads() == 1);
31 assert(dist->rows.length == nrows);
32 assert(dist->cols.length == ncols);
36 size_t size = (strlen(name) + 1) *
sizeof(
char);
37 matrix->
name = malloc(size);
38 memcpy(matrix->
name, name, size);
40 matrix->
nrows = nrows;
41 matrix->
ncols = ncols;
43 size = nrows *
sizeof(int);
45 memcpy(matrix->
row_sizes, row_sizes, size);
47 size = ncols *
sizeof(int);
49 memcpy(matrix->
col_sizes, col_sizes, size);
52 #pragma omp parallel for
57 assert(*matrix_out == NULL);
66 assert(omp_get_num_threads() == 1);
84 assert(omp_get_num_threads() == 1);
87 for (
int i = 0;
i < matrix_b->
nrows;
i++) {
91 for (
int i = 0;
i < matrix_b->
ncols;
i++) {
95 assert(matrix_a->
dist == matrix_b->
dist);
97 #pragma omp parallel for schedule(dynamic)
109 assert(omp_get_num_threads() == 1);
110 assert(matrix->
nrows == redist->nrows);
111 for (
int i = 0;
i < matrix->
nrows;
i++) {
112 assert(matrix->
row_sizes[
i] == redist->row_sizes[
i]);
114 assert(matrix->
ncols == redist->ncols);
115 for (
int i = 0;
i < matrix->
ncols;
i++) {
116 assert(matrix->
col_sizes[
i] == redist->col_sizes[
i]);
124 int send_count[nranks];
125 memset(send_count, 0, nranks *
sizeof(
int));
128 for (
int iblock = 0; iblock < shard->
nblocks; iblock++) {
132 const int block_size = row_size * col_size;
134 assert(0 <= rank && rank < nranks);
135 send_count[rank] += 2 + block_size;
140 int recv_count[nranks];
144 int send_displ[nranks + 1], recv_displ[nranks + 1];
145 send_displ[0] = recv_displ[0] = 0;
146 for (
int irank = 1; irank < nranks + 1; irank++) {
147 send_displ[irank] = send_displ[irank - 1] + send_count[irank - 1];
148 recv_displ[irank] = recv_displ[irank - 1] + recv_count[irank - 1];
150 const int total_send_count = send_displ[nranks];
151 const int total_recv_count = recv_displ[nranks];
152 double *data_send = malloc(total_send_count *
sizeof(
double));
153 double *data_recv = malloc(total_recv_count *
sizeof(
double));
156 int send_data_positions[nranks];
157 memcpy(send_data_positions, send_displ, nranks *
sizeof(
int));
160 for (
int iblock = 0; iblock < shard->
nblocks; iblock++) {
162 const double *blk_data = &shard->
data[blk->
offset];
165 const int block_size = row_size * col_size;
167 const int pos = send_data_positions[rank];
168 data_send[pos + 0] = blk->
row;
169 data_send[pos + 1] = blk->
col;
170 memcpy(&data_send[pos + 2], blk_data, block_size *
sizeof(
double));
171 send_data_positions[rank] += 2 + block_size;
174 for (
int irank = 0; irank < nranks; irank++) {
175 assert(send_data_positions[irank] == send_displ[irank + 1]);
180 recv_count, recv_displ, comm);
185 int recv_data_pos = 0;
186 while (recv_data_pos < total_recv_count) {
187 const int row = (int)data_recv[recv_data_pos + 0];
188 const int col = (int)data_recv[recv_data_pos + 1];
189 assert(data_recv[recv_data_pos + 0] - (
double)row == 0.0);
190 assert(data_recv[recv_data_pos + 1] - (
double)col == 0.0);
191 dbm_put_block(redist, row, col,
false, &data_recv[recv_data_pos + 2]);
192 const int row_size = matrix->
row_sizes[row];
193 const int col_size = matrix->
col_sizes[col];
194 const int block_size = row_size * col_size;
195 recv_data_pos += 2 + block_size;
197 assert(recv_data_pos == total_recv_count);
207 double **block,
int *row_size,
int *col_size) {
208 assert(0 <= row && row < matrix->nrows);
209 assert(0 <= col && col < matrix->ncols);
229 const bool summation,
const double *block) {
230 assert(0 <= row && row < matrix->nrows);
231 assert(0 <= col && col < matrix->ncols);
233 const int row_size = matrix->
row_sizes[row];
234 const int col_size = matrix->
col_sizes[col];
235 const int block_size = row_size * col_size;
239 omp_set_lock(&shard->
lock);
244 for (
int i = 0;
i < block_size;
i++) {
245 blk_data[
i] += block[
i];
248 memcpy(blk_data, block, block_size *
sizeof(
double));
250 omp_unset_lock(&shard->
lock);
258 assert(omp_get_num_threads() == 1);
260 #pragma omp parallel for schedule(dynamic)
277 assert(omp_get_num_threads() == 1);
283 #pragma omp parallel for schedule(dynamic)
286 const int old_nblocks = shard->
nblocks;
291 for (
int iblock = 0; iblock < old_nblocks; iblock++) {
293 const double *old_blk_data = &shard->
data[old_blk.
offset];
296 const int block_size = row_size * col_size;
298 for (
int i = 0;
i < block_size;
i++) {
299 norm += old_blk_data[
i] * old_blk_data[
i];
302 if (sqrt((
float)norm) < eps && block_size > 0) {
307 shard, old_blk.
row, old_blk.
col, block_size);
311 double *new_blk_data = &shard->
data[new_blk->
offset];
312 memmove(new_blk_data, old_blk_data, block_size *
sizeof(
double));
326 const int rows[],
const int cols[]) {
327 assert(omp_get_num_threads() == omp_get_max_threads() &&
328 "Please call dbm_reserve_blocks within an OpenMP parallel region.");
330 for (
int i = 0;
i < nblocks;
i++) {
333 omp_set_lock(&shard->
lock);
334 assert(0 <= rows[
i] && rows[
i] < matrix->
nrows);
335 assert(0 <= cols[
i] && cols[
i] < matrix->
ncols);
337 const int row_size = matrix->
row_sizes[rows[
i]];
338 const int col_size = matrix->
col_sizes[cols[
i]];
339 const int block_size = row_size * col_size;
341 omp_unset_lock(&shard->
lock);
345 #pragma omp for schedule(dynamic)
357 assert(omp_get_num_threads() == 1);
366 #pragma omp parallel for schedule(dynamic)
370 shard->
data[
i] *= alpha;
380 assert(omp_get_num_threads() == 1);
382 #pragma omp parallel for schedule(dynamic)
394 assert(omp_get_num_threads() == 1);
395 assert(matrix_a->
dist == matrix_b->
dist);
397 #pragma omp parallel for schedule(dynamic)
401 for (
int iblock = 0; iblock < shard_b->
nblocks; iblock++) {
408 const int block_size = row_size * col_size;
410 shard_a, blk_b.
row, blk_b.
col, block_size);
411 double *data_a = &shard_a->
data[blk_a->
offset];
412 const double *data_b = &shard_b->
data[blk_b.
offset];
413 for (
int i = 0;
i < block_size;
i++) {
414 data_a[
i] += data_b[
i];
427 assert(omp_get_num_threads() == omp_get_max_threads() &&
428 "Please call dbm_iterator_start within an OpenMP parallel region.");
437 assert(*iter_out == NULL);
447 for (
int ishard = omp_get_thread_num();
449 ishard += omp_get_num_threads()) {
468 double **block,
int *row_size,
int *col_size) {
504 double checksum = 0.0;
524 maxabs = fmax(maxabs, fabs(shard->
data[
i]));
566 const int **row_sizes) {
567 *nrows = matrix->
nrows;
576 const int **col_sizes) {
577 *ncols = matrix->
ncols;
586 const int **local_rows) {
596 const int **local_cols) {
int dbm_distribution_stored_coords(const dbm_distribution_t *dist, const int row, const int col)
Returns the MPI rank on which the given block should be stored.
void dbm_get_block_p(dbm_matrix_t *matrix, const int row, const int col, double **block, int *row_size, int *col_size)
Looks up a block from the given matrix. This routine is thread-safe. If the block is not found then a nu...
void dbm_redistribute(const dbm_matrix_t *matrix, dbm_matrix_t *redist)
Copies the content of matrix into redist. The matrices may have different distributions.
void dbm_reserve_blocks(dbm_matrix_t *matrix, const int nblocks, const int rows[], const int cols[])
Adds list of blocks efficiently. The blocks will be filled with zeros. This routine must always be ca...
bool dbm_iterator_blocks_left(const dbm_iterator_t *iter)
Tests whether the given iterator has any block left.
double dbm_maxabs(const dbm_matrix_t *matrix)
Returns the absolute value of the largest element of the entire matrix.
void dbm_filter(dbm_matrix_t *matrix, const double eps)
Removes all blocks from the matrix whose norm is below the threshold. Blocks of size zero are always ...
void dbm_get_local_cols(const dbm_matrix_t *matrix, int *nlocal_cols, const int **local_cols)
Returns the local column block sizes of the given matrix.
void dbm_add(dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b)
Adds matrix_b to matrix_a.
void dbm_get_local_rows(const dbm_matrix_t *matrix, int *nlocal_rows, const int **local_rows)
Returns the local row block sizes of the given matrix.
void dbm_iterator_stop(dbm_iterator_t *iter)
Releases the given iterator.
void dbm_get_col_sizes(const dbm_matrix_t *matrix, int *ncols, const int **col_sizes)
Returns the column block sizes of the given matrix.
void dbm_create(dbm_matrix_t **matrix_out, dbm_distribution_t *dist, const char name[], const int nrows, const int ncols, const int row_sizes[nrows], const int col_sizes[ncols])
Creates a new matrix.
const char * dbm_get_name(const dbm_matrix_t *matrix)
Returns the name of the given matrix.
void dbm_put_block(dbm_matrix_t *matrix, const int row, const int col, const bool summation, const double *block)
Adds a block to given matrix. This routine is thread-safe. If block already exist then it gets overwr...
int dbm_iterator_num_blocks(const dbm_iterator_t *iter)
Returns number of blocks the iterator will provide to calling thread.
void dbm_iterator_start(dbm_iterator_t **iter_out, const dbm_matrix_t *matrix)
Creates an iterator for the blocks of the given matrix. The iteration order is not stable....
void dbm_iterator_next_block(dbm_iterator_t *iter, int *row, int *col, double **block, int *row_size, int *col_size)
Returns the next block from the given iterator.
double dbm_checksum(const dbm_matrix_t *matrix)
Computes a checksum of the given matrix.
int dbm_get_num_blocks(const dbm_matrix_t *matrix)
Returns the number of local blocks of the given matrix.
void dbm_get_row_sizes(const dbm_matrix_t *matrix, int *nrows, const int **row_sizes)
Returns the row block sizes of the given matrix.
void dbm_scale(dbm_matrix_t *matrix, const double alpha)
Multiplies all entries in the given matrix by the given factor alpha.
void dbm_copy(dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b)
Copies content of matrix_b into matrix_a. Matrices must have the same row/col block sizes and distrib...
void dbm_clear(dbm_matrix_t *matrix)
Remove all blocks from matrix, but does not release underlying memory.
void dbm_zero(dbm_matrix_t *matrix)
Sets all blocks in the given matrix to zero.
int dbm_get_stored_coordinates(const dbm_matrix_t *matrix, const int row, const int col)
Returns the MPI rank on which the given block should be stored.
int dbm_get_nze(const dbm_matrix_t *matrix)
Returns the number of local Non-Zero Elements of the given matrix.
void dbm_release(dbm_matrix_t *matrix)
Releases a matrix and all its resources.
const dbm_distribution_t * dbm_get_distribution(const dbm_matrix_t *matrix)
Returns the distribution of the given matrix.
static int dbm_get_shard_index(const dbm_matrix_t *matrix, const int row, const int col)
Internal routine for getting a block's shard index.
static int dbm_get_num_shards(const dbm_matrix_t *matrix)
Internal routine that returns the number of shards for given matrix.
bool dbm_mpi_comms_are_similar(const dbm_mpi_comm_t comm1, const dbm_mpi_comm_t comm2)
Wrapper around MPI_Comm_compare.
int dbm_mpi_comm_size(const dbm_mpi_comm_t comm)
Wrapper around MPI_Comm_size.
void dbm_mpi_sum_double(double *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_DOUBLE.
void dbm_mpi_max_double(double *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_MAX and datatype MPI_DOUBLE.
void dbm_mpi_alltoall_int(const int *sendbuf, const int sendcount, int *recvbuf, const int recvcount, const dbm_mpi_comm_t comm)
Wrapper around MPI_Alltoall for datatype MPI_INT.
void dbm_mpi_alltoallv_double(const double *sendbuf, const int *sendcounts, const int *sdispls, double *recvbuf, const int *recvcounts, const int *rdispls, const dbm_mpi_comm_t comm)
Wrapper around MPI_Alltoallv for datatype MPI_DOUBLE.
dbm_block_t * dbm_shard_promise_new_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for allocating the metadata of a new block.
void dbm_shard_release(dbm_shard_t *shard)
Internal routine for releasing a shard.
dbm_block_t * dbm_shard_get_or_allocate_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for getting block or allocating a new one.
dbm_block_t * dbm_shard_lookup(const dbm_shard_t *shard, const int row, const int col)
Internal routine for looking up a block from a shard.
void dbm_shard_allocate_promised_blocks(dbm_shard_t *shard)
Internal routine for allocating and zeroing any promised block's data.
void dbm_shard_init(dbm_shard_t *shard)
Internal routine for initializing a shard.
void dbm_shard_copy(dbm_shard_t *shard_a, const dbm_shard_t *shard_b)
Internal routine for copying content of shard_b into shard_a.
dbm_block_t * dbm_shard_get_or_promise_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for getting block or promising a new one.
static void const int const int i
subroutine, public dbm_distribution_release(dist)
Decreases the reference counter of the given distribution.
subroutine, public dbm_distribution_hold(dist)
Increases the reference counter of the given distribution.
Internal struct for storing a block's metadata.
Internal struct for storing a two dimensional distribution.
Internal struct for storing a block iterator.
const dbm_matrix_t * matrix
Internal struct for storing a matrix.
dbm_distribution_t * dist
Internal struct for storing a matrix shard.