23 const char name[],
const int nrows,
const int ncols,
24 const int row_sizes[nrows],
const int col_sizes[ncols]) {
25 assert(omp_get_num_threads() == 1);
29 assert(dist->rows.length == nrows);
30 assert(dist->cols.length == ncols);
31 dbm_distribution_hold(dist);
34 size_t size = (strlen(name) + 1) *
sizeof(
char);
35 matrix->
name = malloc(size);
36 assert(matrix->
name != NULL && name != NULL);
37 memcpy(matrix->
name, name, size);
39 matrix->
nrows = nrows;
40 matrix->
ncols = ncols;
42 size = nrows *
sizeof(int);
44 assert(matrix->
row_sizes != NULL || size == 0);
46 memcpy(matrix->
row_sizes, row_sizes, size);
49 size = ncols *
sizeof(int);
51 assert(matrix->
col_sizes != NULL || size == 0);
53 memcpy(matrix->
col_sizes, col_sizes, size);
58 assert(matrix->
shards != NULL || num_shards == 0);
59#pragma omp parallel for
60 for (
int ishard = 0; ishard < num_shards; ishard++) {
64 assert(*matrix_out == NULL);
73 assert(omp_get_num_threads() == 1);
77 dbm_distribution_release(matrix->
dist);
91 assert(omp_get_num_threads() == 1);
94 for (
int i = 0;
i < matrix_b->
nrows;
i++) {
98 for (
int i = 0;
i < matrix_b->
ncols;
i++) {
102 assert(matrix_a->
dist == matrix_b->
dist);
104#pragma omp parallel for DBM_OMP_SCHEDULE
116 assert(omp_get_num_threads() == 1);
117 assert(matrix->
nrows == redist->nrows);
118 for (
int i = 0;
i < matrix->
nrows;
i++) {
119 assert(matrix->
row_sizes[
i] == redist->row_sizes[
i]);
121 assert(matrix->
ncols == redist->ncols);
122 for (
int i = 0;
i < matrix->
ncols;
i++) {
123 assert(matrix->
col_sizes[
i] == redist->col_sizes[
i]);
131 int send_count[nranks];
132 memset(send_count, 0, nranks *
sizeof(
int));
135 for (
int iblock = 0; iblock < shard->
nblocks; iblock++) {
139 const int block_size = row_size * col_size;
140 const int rank = dbm_get_stored_coordinates(redist, blk->
row, blk->
col);
141 assert(0 <= rank && rank < nranks);
142 send_count[rank] += 2 + block_size;
147 int recv_count[nranks];
151 int send_displ[nranks + 1], recv_displ[nranks + 1];
152 send_displ[0] = recv_displ[0] = 0;
153 for (
int irank = 1; irank <= nranks; irank++) {
154 send_displ[irank] = send_displ[irank - 1] + send_count[irank - 1];
155 recv_displ[irank] = recv_displ[irank - 1] + recv_count[irank - 1];
157 const int total_send_count = send_displ[nranks];
158 const int total_recv_count = recv_displ[nranks];
159 double *
const data_send =
cp_mpi_alloc_mem(total_send_count *
sizeof(
double));
160 double *
const data_recv =
cp_mpi_alloc_mem(total_recv_count *
sizeof(
double));
163 int send_data_positions[nranks];
164 memcpy(send_data_positions, send_displ, nranks *
sizeof(
int));
167 for (
int iblock = 0; iblock < shard->
nblocks; iblock++) {
169 const double *
const blk_data = &shard->
data[blk->
offset];
172 const int block_size = row_size * col_size;
173 const int rank = dbm_get_stored_coordinates(redist, blk->
row, blk->
col);
174 const int pos = send_data_positions[rank];
175 data_send[pos + 0] = blk->
row;
176 data_send[pos + 1] = blk->
col;
177 memcpy(&data_send[pos + 2], blk_data, block_size *
sizeof(
double));
178 send_data_positions[rank] += 2 + block_size;
181 for (
int irank = 0; irank < nranks; irank++) {
182 assert(send_data_positions[irank] == send_displ[irank + 1]);
187 recv_count, recv_displ, comm);
192 int recv_data_pos = 0;
193 while (recv_data_pos < total_recv_count) {
194 const int row = (int)data_recv[recv_data_pos + 0];
195 const int col = (int)data_recv[recv_data_pos + 1];
196 assert(data_recv[recv_data_pos + 0] == (
double)row);
197 assert(data_recv[recv_data_pos + 1] == (
double)col);
198 dbm_put_block(redist, row, col,
false, &data_recv[recv_data_pos + 2]);
199 const int row_size = matrix->
row_sizes[row];
200 const int col_size = matrix->
col_sizes[col];
201 const int block_size = row_size * col_size;
202 recv_data_pos += 2 + block_size;
204 assert(recv_data_pos == total_recv_count);
213void dbm_get_block_p(
dbm_matrix_t *matrix,
const int row,
const int col,
214 double **block,
int *row_size,
int *col_size) {
215 assert(0 <= row && row < matrix->nrows);
216 assert(0 <= col && col < matrix->ncols);
217 assert(dbm_get_stored_coordinates(matrix, row, col) == matrix->
dist->
my_rank);
236 const bool summation,
const double *block) {
237 assert(0 <= row && row < matrix->nrows);
238 assert(0 <= col && col < matrix->ncols);
239 assert(dbm_get_stored_coordinates(matrix, row, col) == matrix->
dist->
my_rank);
240 const int row_size = matrix->
row_sizes[row];
241 const int col_size = matrix->
col_sizes[col];
242 const int block_size = row_size * col_size;
246 omp_set_lock(&shard->
lock);
249 double *
const blk_data = &shard->
data[blk->
offset];
251 assert(blk_data != NULL || 0 == block_size);
252 for (
int i = 0;
i < block_size;
i++) {
253 blk_data[
i] += block[
i];
255 }
else if (block_size != 0) {
256 memcpy(blk_data, block, block_size *
sizeof(
double));
258 omp_unset_lock(&shard->
lock);
266 assert(omp_get_num_threads() == 1);
268#pragma omp parallel for DBM_OMP_SCHEDULE
283 assert(omp_get_num_threads() == 1);
288 const double eps2 = eps * eps;
290#pragma omp parallel for DBM_OMP_SCHEDULE
293 const int old_nblocks = shard->
nblocks;
297 for (
int iblock = 0; iblock < old_nblocks; iblock++) {
299 const double *old_blk_data = &shard->
data[old_blk.
offset];
302 const int block_size = row_size * col_size;
304 for (
int i = 0;
i < block_size;
i++) {
305 const double value = old_blk_data[
i];
306 norm += value * value;
312 if (block_size > 0 && norm < eps2) {
317 shard, old_blk.
row, old_blk.
col, block_size);
321 double *new_blk_data = &shard->
data[new_blk->
offset];
322 memmove(new_blk_data, old_blk_data, block_size *
sizeof(
double));
336 const int rows[],
const int cols[]) {
337 assert(omp_get_num_threads() == omp_get_max_threads() &&
338 "Please call dbm_reserve_blocks within an OpenMP parallel region.");
341 for (
int i = 0;
i < nblocks;
i++) {
342 const int row = rows[
i], col = cols[
i];
343 assert(0 <= row && row < matrix->nrows);
344 assert(0 <= col && col < matrix->ncols);
345 assert(dbm_get_stored_coordinates(matrix, row, col) == my_rank);
348 const int row_size = matrix->
row_sizes[row];
349 const int col_size = matrix->
col_sizes[col];
350 const int block_size = row_size * col_size;
351 omp_set_lock(&shard->
lock);
353 omp_unset_lock(&shard->
lock);
357#pragma omp for DBM_OMP_SCHEDULE
369 assert(omp_get_num_threads() == 1);
378#pragma omp parallel for DBM_OMP_SCHEDULE
382 shard->
data[
i] *= alpha;
392 assert(omp_get_num_threads() == 1);
394#pragma omp parallel for DBM_OMP_SCHEDULE
397 if (shard->
data != NULL) {
408 assert(omp_get_num_threads() == 1);
409 assert(matrix_a->
dist == matrix_b->
dist);
411#pragma omp parallel for DBM_OMP_SCHEDULE
415 for (
int iblock = 0; iblock < shard_b->
nblocks; iblock++) {
421 const int block_size = row_size * col_size;
423 shard_a, blk_b.
row, blk_b.
col, block_size);
424 double *
const data_a = &shard_a->
data[blk_a->
offset];
425 const double *
const data_b = &shard_b->
data[blk_b.
offset];
426 for (
int i = 0;
i < block_size;
i++) {
427 data_a[
i] += data_b[
i];
440 assert(omp_get_num_threads() == omp_get_max_threads() &&
441 "Please call dbm_iterator_start within an OpenMP parallel region.");
443 assert(iter != NULL);
451 assert(*iter_out == NULL);
461 for (
int ishard = omp_get_thread_num();
463 ishard += omp_get_num_threads()) {
482 double **block,
int *row_size,
int *col_size) {
517static double kahan_sum(
double value,
double *accumulator,
518 double *compensation) {
520 assert(NULL != accumulator && NULL != compensation);
521 c = value - *compensation;
522 r = *accumulator + c;
523 *compensation = (r - *accumulator) - c;
533 double checksum = 0.0, compensation = 0.0;
537 const double value = shard->
data[
i];
538 kahan_sum(value * value, &checksum, &compensation);
554 maxabs = fmax(maxabs, fabs(shard->
data[
i]));
569 assert(omp_get_num_threads() == 1);
570 assert(matrix_a->
dist == matrix_b->
dist);
573#pragma omp parallel for reduction(max : epsilon) DBM_OMP_SCHEDULE
574 for (
int ishard = 0; ishard < num_shards; ++ishard) {
577 for (
int iblock = 0; iblock < shard_a->
nblocks; ++iblock) {
583 assert(NULL == blk_b || row_size == matrix_b->
row_sizes[blk_b->
row]);
584 assert(NULL == blk_b || col_size == matrix_b->
col_sizes[blk_b->
col]);
585 const double *
const data_a = &shard_a->
data[blk_a->
offset];
586 const double *
const data_b =
587 (NULL == blk_b) ? NULL : &shard_b->
data[blk_b->
offset];
588 const int block_size = row_size * col_size;
589 for (
int i = 0;
i < block_size; ++
i) {
590 const double d = data_a[
i] - (NULL == data_b ? 0.0 : data_b[
i]);
591 const double e = fabs(0 != data_a[
i] ? (d / data_a[
i]) : d);
597 for (
int iblock = 0; iblock < shard_b->
nblocks; ++iblock) {
604 const double *
const data_b = &shard_b->
data[blk_b->
offset];
605 const int block_size = row_size * col_size;
606 for (
int i = 0;
i < block_size; ++
i) {
607 const double e = fabs(data_b[
i]);
654 const int **row_sizes) {
655 *nrows = matrix->
nrows;
664 const int **col_sizes) {
665 *ncols = matrix->
ncols;
674 const int **local_rows) {
684 const int **local_cols) {
693int dbm_get_stored_coordinates(
const dbm_matrix_t *matrix,
const int row,
int cp_mpi_comm_size(const cp_mpi_comm_t comm)
Wrapper around MPI_Comm_size.
void cp_mpi_free_mem(void *mem)
Wrapper around MPI_Free_mem.
void cp_mpi_sum_double(double *values, const int count, const cp_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_SUM and datatype MPI_DOUBLE.
void cp_mpi_alltoallv_double(const double *sendbuf, const int *sendcounts, const int *sdispls, double *recvbuf, const int *recvcounts, const int *rdispls, const cp_mpi_comm_t comm)
Wrapper around MPI_Alltoallv for datatype MPI_DOUBLE.
void * cp_mpi_alloc_mem(size_t size)
Wrapper around MPI_Alloc_mem.
void cp_mpi_max_double(double *values, const int count, const cp_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_MAX and datatype MPI_DOUBLE.
void cp_mpi_alltoall_int(const int *sendbuf, const int sendcount, int *recvbuf, const int recvcount, const cp_mpi_comm_t comm)
Wrapper around MPI_Alltoall for datatype MPI_INT.
bool cp_mpi_comms_are_similar(const cp_mpi_comm_t comm1, const cp_mpi_comm_t comm2)
Wrapper around MPI_Comm_compare.
int dbm_distribution_stored_coords(const dbm_distribution_t *dist, const int row, const int col)
Returns the MPI rank on which the given block should be stored.
void dbm_get_col_sizes(const dbm_matrix_t *matrix, int *ncols, const int **col_sizes)
Returns the column block sizes of the given matrix.
void dbm_get_row_sizes(const dbm_matrix_t *matrix, int *nrows, const int **row_sizes)
Returns the row block sizes of the given matrix.
double dbm_maxeps(const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b)
Calculates maximum relative difference between matrix_a and matrix_b.
static int dbm_get_shard_index(const dbm_matrix_t *matrix, const int row, const int col)
Internal routine for getting a block's shard index.
static int dbm_get_num_shards(const dbm_matrix_t *matrix)
Internal routine that returns the number of shards for given matrix.
void dbm_shard_release(dbm_shard_t *shard)
Internal routine for releasing a shard.
dbm_block_t * dbm_shard_get_or_allocate_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for getting block or allocating a new one.
dbm_block_t * dbm_shard_promise_new_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for allocating the metadata of a new block.
dbm_block_t * dbm_shard_lookup(const dbm_shard_t *shard, const int row, const int col)
Internal routine for looking up a block from a shard.
void dbm_shard_allocate_promised_blocks(dbm_shard_t *shard)
Internal routine for allocating and zeroing any promised block's data.
dbm_block_t * dbm_shard_get_or_promise_block(dbm_shard_t *shard, const int row, const int col, const int block_size)
Internal routine for getting block or promising a new one.
void dbm_shard_init(dbm_shard_t *shard)
Internal routine for initializing a shard.
void dbm_shard_copy(dbm_shard_t *shard_a, const dbm_shard_t *shard_b)
Internal routine for copying content of shard_b into shard_a.
static void const int const int i
sums arrays of real/complex numbers with much reduced round-off as compared to a naive implementation...
Internal struct for storing a block's metadata.
Internal struct for storing a two dimensional distribution.
Internal struct for storing a block iterator.
const dbm_matrix_t * matrix
Internal struct for storing a matrix.
dbm_distribution_t * dist
Internal struct for storing a matrix shard.