                const char name[], const int nrows, const int ncols,
                const int row_sizes[nrows], const int col_sizes[ncols]) {
  assert(omp_get_num_threads() == 1);

  assert(dist->rows.length == nrows);
  assert(dist->cols.length == ncols);
  dbm_distribution_hold(dist);

  size_t size = (strlen(name) + 1) * sizeof(char);
  matrix->name = malloc(size);
  assert(matrix->name != NULL && name != NULL);
  memcpy(matrix->name, name, size);

  matrix->nrows = nrows;
  matrix->ncols = ncols;

  size = nrows * sizeof(int);
  assert(matrix->row_sizes != NULL || size == 0);
  memcpy(matrix->row_sizes, row_sizes, size);

  size = ncols * sizeof(int);
  assert(matrix->col_sizes != NULL || size == 0);
  memcpy(matrix->col_sizes, col_sizes, size);

  assert(matrix->shards != NULL || num_shards == 0);
#pragma omp parallel for
  for (int ishard = 0; ishard < num_shards; ishard++) {

  assert(*matrix_out == NULL);
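  // dbm_create copies the name string and the block-size arrays, takes a
  // reference on the distribution (dbm_distribution_hold, released again in
  // dbm_release below), and initializes the shards in a parallel loop.
  // Hedged usage sketch -- the leading parameters (output handle and
  // distribution) are inferred from the asserts on *matrix_out and dist;
  // row_sizes/col_sizes are hypothetical caller-owned arrays:
  //
  //   dbm_matrix_t *matrix = NULL;
  //   dbm_create(&matrix, dist, "example", nrows, ncols, row_sizes, col_sizes);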
  assert(omp_get_num_threads() == 1);

  dbm_distribution_release(matrix->dist);
  assert(omp_get_num_threads() == 1);

  for (int i = 0; i < matrix_b->nrows; i++) {

  for (int i = 0; i < matrix_b->ncols; i++) {

  assert(matrix_a->dist == matrix_b->dist);

#pragma omp parallel for DBM_OMP_SCHEDULE
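  // dbm_copy requires identical row/column block sizes and the same
  // distribution object for source and target (asserted above); the per-shard
  // copy itself then runs inside the parallel loop opened by the pragma above.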
  assert(omp_get_num_threads() == 1);
  assert(matrix->nrows == redist->nrows);
  for (int i = 0; i < matrix->nrows; i++) {
    assert(matrix->row_sizes[i] == redist->row_sizes[i]);
  }
  assert(matrix->ncols == redist->ncols);
  for (int i = 0; i < matrix->ncols; i++) {
    assert(matrix->col_sizes[i] == redist->col_sizes[i]);
  }
  int send_count[nranks];
  memset(send_count, 0, nranks * sizeof(int));

    for (int iblock = 0; iblock < shard->nblocks; iblock++) {

      const int block_size = row_size * col_size;
      const int rank = dbm_get_stored_coordinates(redist, blk->row, blk->col);
      assert(0 <= rank && rank < nranks);
      send_count[rank] += 2 + block_size;

  int recv_count[nranks];

  int send_displ[nranks + 1], recv_displ[nranks + 1];
  send_displ[0] = recv_displ[0] = 0;
  for (int irank = 1; irank <= nranks; irank++) {
    send_displ[irank] = send_displ[irank - 1] + send_count[irank - 1];
    recv_displ[irank] = recv_displ[irank - 1] + recv_count[irank - 1];
  }
  const int total_send_count = send_displ[nranks];
  const int total_recv_count = recv_displ[nranks];
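  // Each block travels as a record of (2 + block_size) doubles: its row and
  // column index followed by the dense block data.  send_displ/recv_displ are
  // exclusive prefix sums of the per-rank counts, so send_displ[irank] is the
  // start offset of rank irank's records in the packed send buffer and
  // send_displ[nranks] the total buffer length in doubles.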
  int send_data_positions[nranks];
  memcpy(send_data_positions, send_displ, nranks * sizeof(int));

    for (int iblock = 0; iblock < shard->nblocks; iblock++) {

      const double *blk_data = &shard->data[blk->offset];

      const int block_size = row_size * col_size;
      const int rank = dbm_get_stored_coordinates(redist, blk->row, blk->col);
      const int pos = send_data_positions[rank];
      data_send[pos + 0] = blk->row;
      data_send[pos + 1] = blk->col;
      memcpy(&data_send[pos + 2], blk_data, block_size * sizeof(double));
      send_data_positions[rank] += 2 + block_size;

  for (int irank = 0; irank < nranks; irank++) {
    assert(send_data_positions[irank] == send_displ[irank + 1]);
  }
  cp_mpi_alltoallv_double(data_send, send_count, send_displ, data_recv,
                          recv_count, recv_displ, comm);
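  // cp_mpi_alltoallv_double (a thin wrapper around MPI_Alltoallv for
  // MPI_DOUBLE) exchanges the packed buffers; the counts and displacements
  // computed above are expressed in numbers of doubles, not bytes.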
  int recv_data_pos = 0;
  while (recv_data_pos < total_recv_count) {
    const int row = (int)data_recv[recv_data_pos + 0];
    const int col = (int)data_recv[recv_data_pos + 1];
    assert(data_recv[recv_data_pos + 0] - (double)row == 0.0);
    assert(data_recv[recv_data_pos + 1] - (double)col == 0.0);
    dbm_put_block(redist, row, col, false, &data_recv[recv_data_pos + 2]);
    const int row_size = matrix->row_sizes[row];
    const int col_size = matrix->col_sizes[col];
    const int block_size = row_size * col_size;
    recv_data_pos += 2 + block_size;
  }
  assert(recv_data_pos == total_recv_count);
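  // Row and column indices are transported as doubles inside the data stream,
  // hence the exact round-trip asserts above.  The length of each record is
  // recovered from the matrix's own block-size arrays, and the payload is
  // stored into the redistributed matrix via dbm_put_block with
  // summation == false, i.e. overwriting rather than accumulating.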
void dbm_get_block_p(dbm_matrix_t *matrix, const int row, const int col,
                     double **block, int *row_size, int *col_size) {
  assert(0 <= row && row < matrix->nrows);
  assert(0 <= col && col < matrix->ncols);
  assert(dbm_get_stored_coordinates(matrix, row, col) == matrix->dist->my_rank);
                   const bool summation, const double *block) {
  assert(0 <= row && row < matrix->nrows);
  assert(0 <= col && col < matrix->ncols);
  assert(dbm_get_stored_coordinates(matrix, row, col) == matrix->dist->my_rank);
  const int row_size = matrix->row_sizes[row];
  const int col_size = matrix->col_sizes[col];
  const int block_size = row_size * col_size;

  omp_set_lock(&shard->lock);

  if (summation) {
    assert(blk_data != NULL || 0 == block_size);
    for (int i = 0; i < block_size; i++) {
      blk_data[i] += block[i];
    }
  } else if (block_size != 0) {
    memcpy(blk_data, block, block_size * sizeof(double));
  }
  omp_unset_lock(&shard->lock);
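  // Writes are serialized per shard via omp_set_lock/omp_unset_lock, so
  // concurrent dbm_put_block calls that target different shards do not
  // contend.  With summation == true the incoming block is accumulated into
  // any existing data, otherwise it is overwritten.  Hedged usage sketch
  // (row/col must be stored on the calling rank; 'blk' is a hypothetical
  // dense buffer of row_size * col_size doubles):
  //
  //   dbm_put_block(matrix, row, col, false, blk);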
  assert(omp_get_num_threads() == 1);

#pragma omp parallel for DBM_OMP_SCHEDULE
  assert(omp_get_num_threads() == 1);

  const double eps2 = eps * eps;

#pragma omp parallel for DBM_OMP_SCHEDULE

    const int old_nblocks = shard->nblocks;

    for (int iblock = 0; iblock < old_nblocks; iblock++) {

      const double *old_blk_data = &shard->data[old_blk.offset];

      const int block_size = row_size * col_size;

      for (int i = 0; i < block_size; i++) {
        const double value = old_blk_data[i];
        norm += value * value;

      if (block_size > 0 && norm < eps2) {

          shard, old_blk.row, old_blk.col, block_size);

      double *new_blk_data = &shard->data[new_blk->offset];
      memmove(new_blk_data, old_blk_data, block_size * sizeof(double));
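      // dbm_filter drops every block whose squared Frobenius norm stays below
      // eps * eps (eps2 above) and re-creates the surviving blocks, compacting
      // their data towards the front of the shard.  memmove rather than memcpy
      // is required because the old and new storage ranges may overlap.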
                        const int rows[], const int cols[]) {
  assert(omp_get_num_threads() == omp_get_max_threads() &&
         "Please call dbm_reserve_blocks within an OpenMP parallel region.");

  for (int i = 0; i < nblocks; i++) {

    omp_set_lock(&shard->lock);
    assert(0 <= rows[i] && rows[i] < matrix->nrows);
    assert(0 <= cols[i] && cols[i] < matrix->ncols);
    assert(dbm_get_stored_coordinates(matrix, rows[i], cols[i]) == my_rank);
    const int row_size = matrix->row_sizes[rows[i]];
    const int col_size = matrix->col_sizes[cols[i]];
    const int block_size = row_size * col_size;

    omp_unset_lock(&shard->lock);

#pragma omp for DBM_OMP_SCHEDULE
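  // Unlike most routines in this file, dbm_reserve_blocks (and
  // dbm_iterator_start below) must be called from inside an OpenMP parallel
  // region; that is what the omp_get_num_threads() == omp_get_max_threads()
  // assert enforces.  Per-shard locks make concurrent reservations safe.
  // Hedged usage sketch ('my_rows'/'my_cols' are hypothetical coordinate
  // arrays for blocks owned by the calling rank):
  //
  //   #pragma omp parallel
  //   dbm_reserve_blocks(matrix, nblocks, my_rows, my_cols);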
  assert(omp_get_num_threads() == 1);

#pragma omp parallel for DBM_OMP_SCHEDULE

      shard->data[i] *= alpha;
  assert(omp_get_num_threads() == 1);

#pragma omp parallel for DBM_OMP_SCHEDULE

    if (shard->data != NULL) {
  assert(omp_get_num_threads() == 1);
  assert(matrix_a->dist == matrix_b->dist);

#pragma omp parallel for DBM_OMP_SCHEDULE

    for (int iblock = 0; iblock < shard_b->nblocks; iblock++) {

      const int block_size = row_size * col_size;
      dbm_block_t *blk_a = dbm_shard_get_or_allocate_block(
          shard_a, blk_b.row, blk_b.col, block_size);
      double *data_a = &shard_a->data[blk_a->offset];
      const double *data_b = &shard_b->data[blk_b.offset];
      for (int i = 0; i < block_size; i++) {
        data_a[i] += data_b[i];
      }
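      // dbm_add walks the blocks of matrix_b shard by shard and accumulates
      // them into matrix_a, allocating any missing target block on the fly.
      // Because both matrices share the same distribution (asserted above),
      // matching blocks live in the same shard on the same rank, so no
      // communication is needed.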
  assert(omp_get_num_threads() == omp_get_max_threads() &&
         "Please call dbm_iterator_start within an OpenMP parallel region.");

  assert(iter != NULL);

  assert(*iter_out == NULL);
  for (int ishard = omp_get_thread_num();
       ishard < dbm_get_num_shards(iter->matrix);
       ishard += omp_get_num_threads()) {
                             double **block, int *row_size, int *col_size) {
static double kahan_sum(double value, double *accumulator,
                        double *compensation) {
  double r, c;
  assert(NULL != accumulator && NULL != compensation);
  c = value - *compensation;
  r = *accumulator + c;
  *compensation = (r - *accumulator) - c;
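  // Kahan (compensated) summation: 'compensation' carries the low-order bits
  // that were rounded away when the previous value was added to the running
  // 'accumulator' and re-injects them into the next addition, which keeps the
  // checksum below numerically stable even for very many small terms.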
  double checksum = 0.0, compensation = 0.0;

      const double value = shard->data[i];
      kahan_sum(value * value, &checksum, &compensation);
      maxabs = fmax(maxabs, fabs(shard->data[i]));
  assert(omp_get_num_threads() == 1);
  assert(matrix_a->dist == matrix_b->dist);

#pragma omp parallel for DBM_OMP_SCHEDULE
  for (int ishard = 0; ishard < num_shards; ++ishard) {

    for (int iblock = 0; iblock < shard_a->nblocks; ++iblock) {

      const double *const data_a = &shard_a->data[blk_a->offset];
      const double *const data_b = &shard_b->data[blk_b->offset];
      const int block_size = row_size * col_size;
      for (int i = 0; i < block_size; ++i) {
        const double d = data_a[i] - data_b[i];
        const double e = fabs(0 != data_a[i] ? (d / data_a[i]) : d);
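      // The difference of each element pair is measured relative to data_a[i]
      // when that entry is nonzero and absolutely otherwise; dbm_maxeps
      // returns the maximum such difference between matrix_a and matrix_b.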
                       const int **row_sizes) {
  *nrows = matrix->nrows;

                       const int **col_sizes) {
  *ncols = matrix->ncols;

                        const int **local_rows) {

                        const int **local_cols) {

int dbm_get_stored_coordinates(const dbm_matrix_t *matrix, const int row,