static inline int imax(int x, int y) { return (x > y ? x : y); }

static int gcd(const int a, const int b) {
  if (a == 0)
    return b;
  return gcd(b % a, a); // Euclid's algorithm.
}

static int lcm(const int a, const int b) { return (a * b) / gcd(a, b); }

static inline int isum(const int n, const int input[n]) {
  int output = 0;
  for (int i = 0; i < n; i++) {
    output += input[i];
  }
  return output;
}

static inline void icumsum(const int n, const int input[n], int output[n]) {
  output[0] = 0;
  for (int i = 1; i < n; i++) {
    output[i] = output[i - 1] + input[i - 1];
  }
}
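The icumsum helper produces exclusive prefix sums, which is exactly the count-to-displacement conversion needed by the MPI exchanges further down. A minimal usage sketch with hypothetical values:

#include <stdio.h>

int main(void) {
  const int counts[4] = {3, 1, 4, 2};
  int displs[4];
  icumsum(4, counts, displs); // displs becomes {0, 3, 4, 8}
  for (int i = 0; i < 4; i++) {
    printf("rank %d: count=%d displ=%d\n", i, counts[i], displs[i]);
  }
  return 0; // last displ + last count == isum(4, counts) == 10
}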
static void create_pack_plans(const bool trans_matrix, const bool trans_dist,
                              const dbm_matrix_t *matrix,
                              const dbm_mpi_comm_t comm,
                              const dbm_dist_1d_t *dist_indices,
                              const dbm_dist_1d_t *dist_ticks, const int nticks,
                              const int npacks, plan_t *plans_per_pack[npacks],
                              int nblks_per_pack[npacks],
                              int ndata_per_pack[npacks]) {

  memset(nblks_per_pack, 0, npacks * sizeof(int));
  memset(ndata_per_pack, 0, npacks * sizeof(int));

  /* ... */

  // 1st pass: count how many blocks this thread contributes to each pack.
  int nblks_mythread[npacks];
  memset(nblks_mythread, 0, npacks * sizeof(int));
#pragma omp for schedule(static)
  /* ... (loop over the matrix shards; blk iterates over a shard's blocks) */
  for (int iblock = 0; iblock < shard->nblocks; iblock++) {
    /* ... */
    const int sum_index = (trans_matrix) ? blk->row : blk->col;
    const int itick = (1021 * sum_index) % nticks;
    const int ipack = itick / dist_ticks->nranks;
    nblks_mythread[ipack]++;
  }

  // Merge the per-thread counts into the pack totals; afterwards each
  // thread's counter holds the end offset of its own segment.
  /* ... */
  for (int ipack = 0; ipack < npacks; ipack++) {
    nblks_per_pack[ipack] += nblks_mythread[ipack];
    nblks_mythread[ipack] = nblks_per_pack[ipack];
  }
  /* ... */
  for (int ipack = 0; ipack < npacks; ipack++) {
    plans_per_pack[ipack] = malloc(nblks_per_pack[ipack] * sizeof(plan_t));
  }

  // 2nd pass: create a plan for every block.
  int ndata_mythread[npacks];
  memset(ndata_mythread, 0, npacks * sizeof(int));
#pragma omp for schedule(static)
  /* ... (same shard loop as above; schedule(static) keeps the split) */
  for (int iblock = 0; iblock < shard->nblocks; iblock++) {
    /* ... */
    const int free_index = (trans_matrix) ? blk->col : blk->row;
    const int sum_index = (trans_matrix) ? blk->row : blk->col;
    const int itick = (1021 * sum_index) % nticks;
    const int ipack = itick / dist_ticks->nranks;
    // Cartesian coordinates of the rank that will receive this block.
    const int coord_free_idx = dist_indices->index2coord[free_index];
    const int coord_sum_idx = itick % dist_ticks->nranks;
    const int coords[2] = {(trans_dist) ? coord_sum_idx : coord_free_idx,
                           (trans_dist) ? coord_free_idx : coord_sum_idx};
    const int rank = dbm_mpi_cart_rank(comm, coords);
    /* ... (row_size and col_size are looked up from the matrix) */
    ndata_mythread[ipack] += row_size * col_size;
    // Fill each pack's segment backwards by decrementing its end offset.
    const int iplan = --nblks_mythread[ipack];
    plans_per_pack[ipack][iplan].blk = blk;
    plans_per_pack[ipack][iplan].rank = rank;
    plans_per_pack[ipack][iplan].row_size = row_size;
    plans_per_pack[ipack][iplan].col_size = col_size;
  }
  /* ... */
  for (int ipack = 0; ipack < npacks; ipack++) {
    ndata_per_pack[ipack] += ndata_mythread[ipack];
  }
}
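Both passes above follow the same counting-sort idiom: count items per bucket, turn the merged counts into end offsets, then fill every bucket backwards with the --end[bucket] decrement. A serial standalone sketch of that idiom (hypothetical item type and bucket function, not part of the original source; the original applies it per OpenMP thread):

#include <stdlib.h>

// Pass 1 counts items per bucket, the counts become end offsets, and
// pass 2 places each item by decrementing its bucket's end offset.
static void bucket_by_mod(const int nitems, const int items[],
                          const int nbuckets, int *buckets[], int counts[]) {
  for (int b = 0; b < nbuckets; b++)
    counts[b] = 0;
  for (int i = 0; i < nitems; i++)
    counts[items[i] % nbuckets]++; // pass 1: count per bucket
  int end[nbuckets]; // like nblks_mythread after the merge above
  for (int b = 0; b < nbuckets; b++) {
    buckets[b] = malloc(counts[b] * sizeof(int));
    end[b] = counts[b];
  }
  for (int i = 0; i < nitems; i++) { // pass 2: fill backwards
    const int b = items[i] % nbuckets;
    buckets[b][--end[b]] = items[i];
  }
}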
static void fill_send_buffers(
    const dbm_matrix_t *matrix, const bool trans_matrix, const int nblks_send,
    const int ndata_send, plan_t plans[nblks_send], const int nranks,
    int blks_send_count[nranks], int data_send_count[nranks],
    int blks_send_displ[nranks], int data_send_displ[nranks],
    dbm_pack_block_t blks_send[nblks_send], double data_send[ndata_send]) {

  memset(blks_send_count, 0, nranks * sizeof(int));
  memset(data_send_count, 0, nranks * sizeof(int));

  /* ... */

  // 1st pass: count the number of blocks and data elements per rank.
  int nblks_mythread[nranks], ndata_mythread[nranks];
  memset(nblks_mythread, 0, nranks * sizeof(int));
  memset(ndata_mythread, 0, nranks * sizeof(int));
#pragma omp for schedule(static)
  for (int iblock = 0; iblock < nblks_send; iblock++) {
    const plan_t *plan = &plans[iblock];
    nblks_mythread[plan->rank] += 1;
    ndata_mythread[plan->rank] += plan->row_size * plan->col_size;
  }

  // Merge the per-thread counts; each thread keeps its end offsets.
  /* ... */
  for (int irank = 0; irank < nranks; irank++) {
    blks_send_count[irank] += nblks_mythread[irank];
    data_send_count[irank] += ndata_mythread[irank];
    nblks_mythread[irank] = blks_send_count[irank];
    ndata_mythread[irank] = data_send_count[irank];
  }

  // Compute the send displacements as exclusive prefix sums of the counts.
  /* ... */
  icumsum(nranks, blks_send_count, blks_send_displ);
  icumsum(nranks, data_send_count, data_send_displ);
  const int m = nranks - 1;
  assert(nblks_send == blks_send_displ[m] + blks_send_count[m]);
  assert(ndata_send == data_send_displ[m] + data_send_count[m]);

  // 2nd pass: copy the blocks into the send buffers.
  /* ... */
#pragma omp for schedule(static)
  for (int iblock = 0; iblock < nblks_send; iblock++) {
    const plan_t *plan = &plans[iblock];
    const dbm_block_t *blk = plan->blk;
    /* ... (shard holding this block is looked up here) */
    const double *blk_data = &shard->data[blk->offset];
    const int row_size = plan->row_size, col_size = plan->col_size;
    const int irank = plan->rank;

    // Walk each rank's segment backwards, mirroring the counting pass.
    nblks_mythread[irank] -= 1;
    ndata_mythread[irank] -= row_size * col_size;
    const int offset = data_send_displ[irank] + ndata_mythread[irank];
    const int jblock = blks_send_displ[irank] + nblks_mythread[irank];

    double norm = 0.0;
    if (trans_matrix) {
      // Transposed copy: read column-major, write row-major, and
      // accumulate the squared norm on the fly.
      for (int i = 0; i < row_size; i++) {
        for (int j = 0; j < col_size; j++) {
          const double element = blk_data[j * row_size + i];
          norm += element * element;
          data_send[offset + i * col_size + j] = element;
        }
      }
      blks_send[jblock].free_index = plan->blk->col;
      blks_send[jblock].sum_index = plan->blk->row;
    } else {
      // Plain copy: the block is stored contiguously as-is.
      for (int i = 0; i < row_size * col_size; i++) {
        const double element = blk_data[i];
        norm += element * element;
        data_send[offset + i] = element;
      }
      blks_send[jblock].free_index = plan->blk->row;
      blks_send[jblock].sum_index = plan->blk->col;
    }
    blks_send[jblock].norm = (float)norm;

    // Offsets are made relative here; the receive displacements are added
    // back after the exchange (see postprocess_received_blocks).
    blks_send[jblock].offset = offset - data_send_displ[irank];
  }
}
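The transposed copy above can be read in isolation: it turns a column-major block into row-major storage while accumulating its squared Frobenius norm. A standalone sketch (hypothetical function name, same indexing as above):

// src is a column-major row_size x col_size block; dst receives the
// transpose in contiguous row-major order; the squared norm is returned.
static double transpose_and_norm(const int row_size, const int col_size,
                                 const double *src, double *dst) {
  double norm = 0.0;
  for (int i = 0; i < row_size; i++) {
    for (int j = 0; j < col_size; j++) {
      const double element = src[j * row_size + i]; // column-major read
      norm += element * element;
      dst[i * col_size + j] = element; // row-major write
    }
  }
  return norm;
}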
static void postprocess_received_blocks(
    const int nranks, const int nshards, const int nblocks_recv,
    const int blks_recv_count[nranks], const int blks_recv_displ[nranks],
    const int data_recv_displ[nranks],
    dbm_pack_block_t blks_recv[nblocks_recv]) {

  int nblocks_per_shard[nshards], shard_start[nshards];
  memset(nblocks_per_shard, 0, nshards * sizeof(int));
  dbm_pack_block_t *blocks_tmp =
      malloc(nblocks_recv * sizeof(dbm_pack_block_t));

  /* ... */

  // Add the receive displacements to the block offsets, which were made
  // relative to each rank's send buffer in fill_send_buffers.
  for (int irank = 0; irank < nranks; irank++) {
    for (int i = 0; i < blks_recv_count[irank]; i++) {
      blks_recv[blks_recv_displ[irank] + i].offset += data_recv_displ[irank];
    }
  }

  // Count the received blocks per target shard.
  int nblocks_mythread[nshards];
  memset(nblocks_mythread, 0, nshards * sizeof(int));
#pragma omp for schedule(static)
  for (int iblock = 0; iblock < nblocks_recv; iblock++) {
    blocks_tmp[iblock] = blks_recv[iblock];
    const int ishard = blks_recv[iblock].free_index % nshards;
    nblocks_mythread[ishard]++;
  }
  /* ... */
  for (int ishard = 0; ishard < nshards; ishard++) {
    nblocks_per_shard[ishard] += nblocks_mythread[ishard];
    nblocks_mythread[ishard] = nblocks_per_shard[ishard];
  }
  /* ... */
  icumsum(nshards, nblocks_per_shard, shard_start);

  // Reorder the blocks so that each shard's blocks are contiguous.
  /* ... */
#pragma omp for schedule(static)
  for (int iblock = 0; iblock < nblocks_recv; iblock++) {
    const int ishard = blocks_tmp[iblock].free_index % nshards;
    const int jblock = --nblocks_mythread[ishard] + shard_start[ishard];
    blks_recv[jblock] = blocks_tmp[iblock];
  }

  // Finally sort the blocks within each shard by their sum_index.
  /* ... */
  for (int ishard = 0; ishard < nshards; ishard++) {
    if (nblocks_per_shard[ishard] > 1) {
      qsort(&blks_recv[shard_start[ishard]], nblocks_per_shard[ishard],
            sizeof(dbm_pack_block_t), &compare_pack_blocks_by_sum_index);
    }
  }
  free(blocks_tmp);
}
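The qsort call needs a comparator; a sketch consistent with the compare_pack_blocks_by_sum_index entry documented below, assuming dbm_pack_block_t carries the sum_index member assigned in fill_send_buffers:

// Orders two dbm_pack_block_t entries by their sum_index member so that
// matching blocks meet during the multiplication.
static int compare_pack_blocks_by_sum_index(const void *a, const void *b) {
  const dbm_pack_block_t *blk_a = (const dbm_pack_block_t *)a;
  const dbm_pack_block_t *blk_b = (const dbm_pack_block_t *)b;
  return blk_a->sum_index - blk_b->sum_index;
}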
static dbm_packed_matrix_t pack_matrix(const bool trans_matrix,
                                       const bool trans_dist,
                                       const dbm_matrix_t *matrix,
                                       const dbm_distribution_t *dist,
                                       const int nticks) {
  /* ... */

  // The free indices are distributed along one cartesian dimension and the
  // ticks along the other, depending on trans_dist.
  const dbm_dist_1d_t *dist_indices = (trans_dist) ? &dist->cols : &dist->rows;
  const dbm_dist_1d_t *dist_ticks = (trans_dist) ? &dist->rows : &dist->cols;

  /* ... */
  const int nsend_packs = nticks / dist_ticks->nranks;
  assert(nsend_packs * dist_ticks->nranks == nticks);

  /* ... */

  // Plan all packs.
  plan_t *plans_per_pack[nsend_packs];
  int nblks_send_per_pack[nsend_packs], ndata_send_per_pack[nsend_packs];
  create_pack_plans(trans_matrix, trans_dist, matrix, dist->comm, dist_indices,
                    dist_ticks, nticks, nsend_packs, plans_per_pack,
                    nblks_send_per_pack, ndata_send_per_pack);

  for (int ipack = 0; ipack < nsend_packs; ipack++) {
    /* ... (send buffers blks_send and data_send are allocated here) */

    // Fill the send buffers according to the plans.
    const int nranks = dist->nranks;
    int blks_send_count[nranks], data_send_count[nranks];
    int blks_send_displ[nranks], data_send_displ[nranks];
    fill_send_buffers(matrix, trans_matrix, nblks_send_per_pack[ipack],
                      ndata_send_per_pack[ipack], plans_per_pack[ipack], nranks,
                      blks_send_count, data_send_count, blks_send_displ,
                      data_send_displ, blks_send, data_send);
    free(plans_per_pack[ipack]);

    // 1st communication: exchange the block counts.
    int blks_recv_count[nranks], blks_recv_displ[nranks];
    dbm_mpi_alltoall_int(blks_send_count, 1, blks_recv_count, 1, dist->comm);
    icumsum(nranks, blks_recv_count, blks_recv_displ);
    const int nblocks_recv = isum(nranks, blks_recv_count);

    // 2nd communication: exchange the block metadata (counts in bytes).
    int blks_send_count_byte[nranks], blks_send_displ_byte[nranks];
    int blks_recv_count_byte[nranks], blks_recv_displ_byte[nranks];
    for (int i = 0; i < nranks; i++) {
      /* ... (convert counts and displacements to bytes) */
    }
    dbm_mpi_alltoallv_byte(
        blks_send, blks_send_count_byte, blks_send_displ_byte, blks_recv,
        blks_recv_count_byte, blks_recv_displ_byte, dist->comm);

    // 3rd communication: exchange the data counts.
    int data_recv_count[nranks], data_recv_displ[nranks];
    dbm_mpi_alltoall_int(data_send_count, 1, data_recv_count, 1, dist->comm);
    icumsum(nranks, data_recv_count, data_recv_displ);
    const int ndata_recv = isum(nranks, data_recv_count);

    // 4th communication: exchange the actual block data.
    /* ... (data_recv is allocated here) */
    dbm_mpi_alltoallv_double(data_send, data_send_count, data_send_displ,
                             data_recv, data_recv_count, data_recv_displ,
                             dist->comm);

    // Post-process the received blocks into a pack.
    postprocess_received_blocks(nranks, dist_indices->nshards, nblocks_recv,
                                blks_recv_count, blks_recv_displ,
                                data_recv_displ, blks_recv);
    /* ... */
  }

  // Size the shared receive buffers after the largest send pack.
  int max_nblocks = 0, max_data_size = 0;
  for (int ipack = 0; ipack < packed.nsend_packs; ipack++) {
    /* ... */
  }
  /* ... */
}
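All four communications follow one recurring idiom: exchange the counts with an all-to-all, convert counts into displacements with icumsum, then exchange the variable-length payload. A standalone sketch of that idiom in plain MPI instead of the dbm_mpi wrappers (hypothetical helper, int payload for brevity; it reuses the icumsum and isum helpers from above):

#include <mpi.h>
#include <stdlib.h>

// Every rank first learns how much it will receive, builds the receive
// displacements, and then exchanges the payload in one alltoallv.
static int *exchange_ints(const int nranks, const MPI_Comm comm,
                          const int send_count[], const int send_displ[],
                          const int *send_buf, int *nrecv_out) {
  int recv_count[nranks], recv_displ[nranks];
  MPI_Alltoall(send_count, 1, MPI_INT, recv_count, 1, MPI_INT, comm);
  icumsum(nranks, recv_count, recv_displ); // counts -> displacements
  const int nrecv = isum(nranks, recv_count);
  int *recv_buf = malloc(nrecv * sizeof(int));
  MPI_Alltoallv(send_buf, send_count, send_displ, MPI_INT,
                recv_buf, recv_count, recv_displ, MPI_INT, comm);
  *nrecv_out = nrecv;
  return recv_buf;
}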
static dbm_pack_t *sendrecv_pack(const int itick, const int nticks,
                                 dbm_packed_matrix_t *packed) {
  /* ... */

  // Compute the send rank and pack for this tick.
  const int itick_of_rank0 = (itick + nticks - my_rank) % nticks;
  const int send_rank = (my_rank + nticks - itick_of_rank0) % nranks;
  const int send_itick = (itick_of_rank0 + send_rank) % nticks;
  const int send_ipack = send_itick / nranks;
  assert(send_itick % nranks == my_rank);

  // Compute the receive rank and pack.
  const int recv_rank = itick % nranks;
  const int recv_ipack = itick / nranks;

  if (send_rank == my_rank) { // No communication needed, use the local pack.
    assert(send_rank == recv_rank && send_ipack == recv_ipack);
    /* ... */
  }
  /* ... */
}
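The arithmetic can be checked by brute force: for every rank, the computed send target must name this rank as its receive source, and the pack indices must agree. A self-contained check (hypothetical sizes; the per-rank shift of k + rank is a simplified stand-in for the coordinate-based shift used in dbm_comm_iterator_next below):

#include <assert.h>

int main(void) {
  const int nranks = 4, npacks = 3, nticks = nranks * npacks;
  for (int k = 0; k < nticks; k++) { // one global iteration
    int send_from[nranks], send_pack[nranks];
    for (int m = 0; m < nranks; m++) { // sender side, rank m
      const int itick = (k + m) % nticks; // per-rank shifted tick
      const int itick_of_rank0 = (itick + nticks - m) % nticks;
      const int send_rank = (m + nticks - itick_of_rank0) % nranks;
      const int send_itick = (itick_of_rank0 + send_rank) % nticks;
      assert(send_itick % nranks == m); // the assert from sendrecv_pack
      send_from[send_rank] = m;
      send_pack[send_rank] = send_itick / nranks;
    }
    for (int r = 0; r < nranks; r++) { // receiver side, rank r
      const int itick = (k + r) % nticks;
      assert(send_from[r] == itick % nranks); // recv_rank matches the sender
      assert(send_pack[r] == itick / nranks); // recv_ipack matches the pack
    }
  }
  return 0;
}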
  // In free_packed_matrix (documented below), each send pack is released:
  for (int ipack = 0; ipack < packed->nsend_packs; ipack++) {
    /* ... */
  }
  // In dbm_comm_iterator_next, each rank starts at a shifted tick so that
  // different ranks talk to different partners in the same iteration.
  const int shifted_itick = (iter->itick + shift) % iter->nticks;
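Putting the pieces together, a typical use of the communication iterator, based on the dbm_comm_iterator_start/next/stop signatures documented below (matrix handles and transpose flags are hypothetical):

dbm_pack_t *pack_a = NULL, *pack_b = NULL;
dbm_comm_iterator_t *iter =
    dbm_comm_iterator_start(transa, transb, matrix_a, matrix_b, matrix_c);
while (dbm_comm_iterator_next(iter, &pack_a, &pack_b)) {
  /* multiply the local shards using the blocks in pack_a and pack_b */
}
dbm_comm_iterator_stop(iter);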
static int dbm_get_shard_index(const dbm_matrix_t *matrix, const int row, const int col)
Internal routine for getting a block's shard index.
static int dbm_get_num_shards(const dbm_matrix_t *matrix)
Internal routine that returns the number of shards for the given matrix.
void dbm_mempool_free(void *mem)
Internal routine for releasing memory back to the pool.
void * dbm_mempool_host_malloc(const size_t size)
Internal routine for allocating host memory from the pool.
int dbm_mpi_sendrecv_byte(const void *sendbuf, const int sendcount, const int dest, const int sendtag, void *recvbuf, const int recvcount, const int source, const int recvtag, const dbm_mpi_comm_t comm)
Wrapper around MPI_Sendrecv for datatype MPI_BYTE.
int dbm_mpi_cart_rank(const dbm_mpi_comm_t comm, const int coords[])
Wrapper around MPI_Cart_rank.
bool dbm_mpi_comms_are_similar(const dbm_mpi_comm_t comm1, const dbm_mpi_comm_t comm2)
Wrapper around MPI_Comm_compare.
void dbm_mpi_alltoallv_byte(const void *sendbuf, const int *sendcounts, const int *sdispls, void *recvbuf, const int *recvcounts, const int *rdispls, const dbm_mpi_comm_t comm)
Wrapper around MPI_Alltoallv for datatype MPI_BYTE.
void dbm_mpi_alltoall_int(const int *sendbuf, const int sendcount, int *recvbuf, const int recvcount, const dbm_mpi_comm_t comm)
Wrapper around MPI_Alltoall for datatype MPI_INT.
int dbm_mpi_sendrecv_double(const double *sendbuf, const int sendcount, const int dest, const int sendtag, double *recvbuf, const int recvcount, const int source, const int recvtag, const dbm_mpi_comm_t comm)
Wrapper around MPI_Sendrecv for datatype MPI_DOUBLE.
void dbm_mpi_max_int(int *values, const int count, const dbm_mpi_comm_t comm)
Wrapper around MPI_Allreduce for op MPI_MAX and datatype MPI_INT.
void dbm_mpi_alltoallv_double(const double *sendbuf, const int *sendcounts, const int *sdispls, double *recvbuf, const int *recvcounts, const int *rdispls, const dbm_mpi_comm_t comm)
Wrapper around MPI_Alltoallv for datatype MPI_DOUBLE.
static void create_pack_plans(const bool trans_matrix, const bool trans_dist, const dbm_matrix_t *matrix, const dbm_mpi_comm_t comm, const dbm_dist_1d_t *dist_indices, const dbm_dist_1d_t *dist_ticks, const int nticks, const int npacks, plan_t *plans_per_pack[npacks], int nblks_per_pack[npacks], int ndata_per_pack[npacks])
Private routine for planning packs.
static void free_packed_matrix(dbm_packed_matrix_t *packed)
Private routine for releasing a packed matrix.
static void icumsum(const int n, const int input[n], int output[n])
Private routine for computing the cumulative sums of given numbers.
static void postprocess_received_blocks(const int nranks, const int nshards, const int nblocks_recv, const int blks_recv_count[nranks], const int blks_recv_displ[nranks], const int data_recv_displ[nranks], dbm_pack_block_t blks_recv[nblocks_recv])
Private routine for post-processing received blocks.
static void fill_send_buffers(const dbm_matrix_t *matrix, const bool trans_matrix, const int nblks_send, const int ndata_send, plan_t plans[nblks_send], const int nranks, int blks_send_count[nranks], int data_send_count[nranks], int blks_send_displ[nranks], int data_send_displ[nranks], dbm_pack_block_t blks_send[nblks_send], double data_send[ndata_send])
Private routine for filling send buffers.
dbm_comm_iterator_t * dbm_comm_iterator_start(const bool transa, const bool transb, const dbm_matrix_t *matrix_a, const dbm_matrix_t *matrix_b, const dbm_matrix_t *matrix_c)
Internal routine for creating a communication iterator.
static int gcd(const int a, const int b)
Private routine for computing greatest common divisor of two numbers.
static int lcm(const int a, const int b)
Private routine for computing least common multiple of two numbers.
void dbm_comm_iterator_stop(dbm_comm_iterator_t *iter)
Internal routine for releasing the given communication iterator.
static int compare_pack_blocks_by_sum_index(const void *a, const void *b)
Private comparator passed to qsort to compare two blocks by sum_index.
bool dbm_comm_iterator_next(dbm_comm_iterator_t *iter, dbm_pack_t **pack_a, dbm_pack_t **pack_b)
Internal routine for retrieving the next pair of packs from the given iterator.
static dbm_packed_matrix_t pack_matrix(const bool trans_matrix, const bool trans_dist, const dbm_matrix_t *matrix, const dbm_distribution_t *dist, const int nticks)
Private routine for redistributing a matrix along selected dimensions.
static dbm_pack_t * sendrecv_pack(const int itick, const int nticks, dbm_packed_matrix_t *packed)
Private routine for sending and receiving the pack for the given tick.
static int imax(int x, int y)
Returns the larger of two given integers (missing from the C standard).
static int isum(const int n, const int input[n])
Private routine for computing the sum of the given integers.
Internal struct for storing a block's metadata.
Internal struct for storing a communication iterator.
  dbm_packed_matrix_t packed_a
  dbm_distribution_t * dist
  dbm_packed_matrix_t packed_b
Internal struct for storing a one-dimensional distribution.
Internal struct for storing a two-dimensional distribution.
Internal struct for storing a matrix.
  dbm_distribution_t * dist
Internal struct for storing a dbm_block_t plus its norm.
Internal struct for storing a pack - essentially a shard for MPI.
  dbm_pack_block_t * blocks
Internal struct for storing a packed matrix.
  const dbm_dist_1d_t * dist_ticks
  const dbm_dist_1d_t * dist_indices
Internal struct for storing a matrix shard.
Private struct used for planning during pack_matrix.