dbm_multiply_comm.c
/*----------------------------------------------------------------------------*/
/* CP2K: A general program to perform molecular dynamics simulations */
/* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
/* */
/* SPDX-License-Identifier: BSD-3-Clause */
/*----------------------------------------------------------------------------*/

#include "dbm_multiply_comm.h"

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "dbm_hyperparams.h"
#include "dbm_mempool.h"
/*******************************************************************************
 * \brief Returns the larger of two given integers (missing from the C standard)
 * \author Ole Schuett
 ******************************************************************************/
static inline int imax(int x, int y) { return (x > y ? x : y); }

/*******************************************************************************
 * \brief Private routine for computing greatest common divisor of two numbers.
 * \author Ole Schuett
 ******************************************************************************/
static int gcd(const int a, const int b) {
  if (a == 0)
    return b;
  return gcd(b % a, a); // Euclid's algorithm.
}

/*******************************************************************************
 * \brief Private routine for computing least common multiple of two numbers.
 * \author Ole Schuett
 ******************************************************************************/
static int lcm(const int a, const int b) { return (a * b) / gcd(a, b); }
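
// Worked example (illustrative numbers only): Euclid's recursion evaluates
// gcd(6, 4) -> gcd(4, 6) -> gcd(2, 4) -> gcd(0, 2) = 2, and hence
// lcm(6, 4) = (6 * 4) / 2 = 12.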

/*******************************************************************************
 * \brief Private routine for computing the sum of the given integers.
 * \author Ole Schuett
 ******************************************************************************/
static inline int isum(const int n, const int input[n]) {
  int output = 0;
  for (int i = 0; i < n; i++) {
    output += input[i];
  }
  return output;
}

/*******************************************************************************
 * \brief Private routine for computing the cumulative sums of given numbers.
 * \author Ole Schuett
 ******************************************************************************/
static inline void icumsum(const int n, const int input[n], int output[n]) {
  output[0] = 0;
  for (int i = 1; i < n; i++) {
    output[i] = output[i - 1] + input[i - 1];
  }
}
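
// Worked example (illustrative numbers only): icumsum computes an exclusive
// prefix sum, e.g. input = {3, 1, 4} yields output = {0, 3, 4}. It is used
// below to turn per-rank counts into alltoallv displacements.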

/*******************************************************************************
 * \brief Private struct used for planning during pack_matrix.
 * \author Ole Schuett
 ******************************************************************************/
typedef struct {
  const dbm_block_t *blk; // source block
  int rank;               // target mpi rank
  int row_size;
  int col_size;
} plan_t;

/*******************************************************************************
 * \brief Private routine for planning packs.
 * \author Ole Schuett
 ******************************************************************************/
static void create_pack_plans(const bool trans_matrix, const bool trans_dist,
                              const dbm_matrix_t *matrix,
                              const dbm_mpi_comm_t comm,
                              const dbm_dist_1d_t *dist_indices,
                              const dbm_dist_1d_t *dist_ticks, const int nticks,
                              const int npacks, plan_t *plans_per_pack[npacks],
                              int nblks_per_pack[npacks],
                              int ndata_per_pack[npacks]) {

  memset(nblks_per_pack, 0, npacks * sizeof(int));
  memset(ndata_per_pack, 0, npacks * sizeof(int));
#pragma omp parallel
  {
    // 1st pass: Compute the number of blocks that will be sent in each pack.
    int nblks_mythread[npacks];
    memset(nblks_mythread, 0, npacks * sizeof(int));
#pragma omp for schedule(static)
    for (int ishard = 0; ishard < dbm_get_num_shards(matrix); ishard++) {
      dbm_shard_t *shard = &matrix->shards[ishard];
      for (int iblock = 0; iblock < shard->nblocks; iblock++) {
        const dbm_block_t *blk = &shard->blocks[iblock];
        const int sum_index = (trans_matrix) ? blk->row : blk->col;
        const int itick = (1021 * sum_index) % nticks; // 1021 = a random prime
        const int ipack = itick / dist_ticks->nranks;
        nblks_mythread[ipack]++;
      }
    }
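
    // Worked example for the mapping above (illustrative numbers only): with
    // nticks = 6 and dist_ticks->nranks = 3 there are npacks = 2 packs; a
    // block whose sum_index hashes to itick = 4 goes into pack 4 / 3 = 1 and
    // is later sent to the rank with sum coordinate 4 % 3 = 1 (see 2nd pass).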

    // Sum nblocks across threads and allocate arrays for plans.
#pragma omp critical
    for (int ipack = 0; ipack < npacks; ipack++) {
      nblks_per_pack[ipack] += nblks_mythread[ipack];
      nblks_mythread[ipack] = nblks_per_pack[ipack];
    }
#pragma omp barrier
#pragma omp for
    for (int ipack = 0; ipack < npacks; ipack++) {
      plans_per_pack[ipack] = malloc(nblks_per_pack[ipack] * sizeof(plan_t));
    }

    // 2nd pass: Plan where to send each block.
    int ndata_mythread[npacks];
    memset(ndata_mythread, 0, npacks * sizeof(int));
#pragma omp for schedule(static) // Need static to match previous loop.
    for (int ishard = 0; ishard < dbm_get_num_shards(matrix); ishard++) {
      dbm_shard_t *shard = &matrix->shards[ishard];
      for (int iblock = 0; iblock < shard->nblocks; iblock++) {
        const dbm_block_t *blk = &shard->blocks[iblock];
        const int free_index = (trans_matrix) ? blk->col : blk->row;
        const int sum_index = (trans_matrix) ? blk->row : blk->col;
        const int itick = (1021 * sum_index) % nticks; // Same mapping as above.
        const int ipack = itick / dist_ticks->nranks;
        // Compute rank to which this block should be sent.
        const int coord_free_idx = dist_indices->index2coord[free_index];
        const int coord_sum_idx = itick % dist_ticks->nranks;
        const int coords[2] = {(trans_dist) ? coord_sum_idx : coord_free_idx,
                               (trans_dist) ? coord_free_idx : coord_sum_idx};
        const int rank = dbm_mpi_cart_rank(comm, coords);
        const int row_size = matrix->row_sizes[blk->row];
        const int col_size = matrix->col_sizes[blk->col];
        ndata_mythread[ipack] += row_size * col_size;
        // Create plan.
        const int iplan = --nblks_mythread[ipack];
        plans_per_pack[ipack][iplan].blk = blk;
        plans_per_pack[ipack][iplan].rank = rank;
        plans_per_pack[ipack][iplan].row_size = row_size;
        plans_per_pack[ipack][iplan].col_size = col_size;
      }
    }
#pragma omp critical
    for (int ipack = 0; ipack < npacks; ipack++) {
      ndata_per_pack[ipack] += ndata_mythread[ipack];
    }
  } // end of omp parallel region
}

/*******************************************************************************
 * \brief Private routine for filling send buffers.
 * \author Ole Schuett
 ******************************************************************************/
static void fill_send_buffers(
    const dbm_matrix_t *matrix, const bool trans_matrix, const int nblks_send,
    const int ndata_send, plan_t plans[nblks_send], const int nranks,
    int blks_send_count[nranks], int data_send_count[nranks],
    int blks_send_displ[nranks], int data_send_displ[nranks],
    dbm_pack_block_t blks_send[nblks_send], double data_send[ndata_send]) {

  memset(blks_send_count, 0, nranks * sizeof(int));
  memset(data_send_count, 0, nranks * sizeof(int));

#pragma omp parallel
  {
    // 3rd pass: Compute per-rank nblks and ndata.
    int nblks_mythread[nranks], ndata_mythread[nranks];
    memset(nblks_mythread, 0, nranks * sizeof(int));
    memset(ndata_mythread, 0, nranks * sizeof(int));
#pragma omp for schedule(static)
    for (int iblock = 0; iblock < nblks_send; iblock++) {
      const plan_t *plan = &plans[iblock];
      nblks_mythread[plan->rank] += 1;
      ndata_mythread[plan->rank] += plan->row_size * plan->col_size;
    }

    // Sum nblks and ndata across threads.
#pragma omp critical
    for (int irank = 0; irank < nranks; irank++) {
      blks_send_count[irank] += nblks_mythread[irank];
      data_send_count[irank] += ndata_mythread[irank];
      nblks_mythread[irank] = blks_send_count[irank];
      ndata_mythread[irank] = data_send_count[irank];
    }
#pragma omp barrier

    // Compute send displacements.
#pragma omp master
    {
      icumsum(nranks, blks_send_count, blks_send_displ);
      icumsum(nranks, data_send_count, data_send_displ);
      const int m = nranks - 1;
      assert(nblks_send == blks_send_displ[m] + blks_send_count[m]);
      assert(ndata_send == data_send_displ[m] + data_send_count[m]);
    }
#pragma omp barrier

    // 4th pass: Fill blks_send and data_send arrays.
#pragma omp for schedule(static) // Need static to match previous loop.
    for (int iblock = 0; iblock < nblks_send; iblock++) {
      const plan_t *plan = &plans[iblock];
      const dbm_block_t *blk = plan->blk;
      const int ishard = dbm_get_shard_index(matrix, blk->row, blk->col);
      const dbm_shard_t *shard = &matrix->shards[ishard];
      const double *blk_data = &shard->data[blk->offset];
      const int row_size = plan->row_size, col_size = plan->col_size;
      const int irank = plan->rank;

      // The send buffers are ordered by rank, thread, and block.
      // data_send_displ[irank]: start of the data for irank within data_send.
      // ndata_mythread[irank]: the current thread's offset within that data.
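      // Worked example (illustrative numbers only): with data_send_displ[irank]
      // = 40 and ndata_mythread[irank] = 12 after the decrement below, this
      // block's data is copied to data_send[52 ... 52 + row_size * col_size).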
      nblks_mythread[irank] -= 1;
      ndata_mythread[irank] -= row_size * col_size;
      const int offset = data_send_displ[irank] + ndata_mythread[irank];
      const int jblock = blks_send_displ[irank] + nblks_mythread[irank];

      double norm = 0.0; // Compute norm as double...
      if (trans_matrix) {
        // Transpose block to allow for outer-product style multiplication.
        for (int i = 0; i < row_size; i++) {
          for (int j = 0; j < col_size; j++) {
            const double element = blk_data[j * row_size + i];
            norm += element * element;
            data_send[offset + i * col_size + j] = element;
          }
        }
        blks_send[jblock].free_index = plan->blk->col;
        blks_send[jblock].sum_index = plan->blk->row;
      } else {
        for (int i = 0; i < row_size * col_size; i++) {
          const double element = blk_data[i];
          norm += element * element;
          data_send[offset + i] = element;
        }
        blks_send[jblock].free_index = plan->blk->row;
        blks_send[jblock].sum_index = plan->blk->col;
      }
      blks_send[jblock].norm = (float)norm; // ...store norm as float.

      // After the block exchange data_recv_displ will be added to the offsets.
      blks_send[jblock].offset = offset - data_send_displ[irank];
    }
  } // end of omp parallel region
}

/*******************************************************************************
 * \brief Private comparator passed to qsort to compare two blocks by sum_index.
 * \author Ole Schuett
 ******************************************************************************/
static int compare_pack_blocks_by_sum_index(const void *a, const void *b) {
  const dbm_pack_block_t *blk_a = (const dbm_pack_block_t *)a;
  const dbm_pack_block_t *blk_b = (const dbm_pack_block_t *)b;
  return blk_a->sum_index - blk_b->sum_index;
}

/*******************************************************************************
 * \brief Private routine for post-processing received blocks.
 * \author Ole Schuett
 ******************************************************************************/
static void postprocess_received_blocks(
    const int nranks, const int nshards, const int nblocks_recv,
    const int blks_recv_count[nranks], const int blks_recv_displ[nranks],
    const int data_recv_displ[nranks],
    dbm_pack_block_t blks_recv[nblocks_recv]) {

  int nblocks_per_shard[nshards], shard_start[nshards];
  memset(nblocks_per_shard, 0, nshards * sizeof(int));
  dbm_pack_block_t *blocks_tmp =
      malloc(nblocks_recv * sizeof(dbm_pack_block_t));

#pragma omp parallel
  {
    // Add data_recv_displ to received block offsets.
    for (int irank = 0; irank < nranks; irank++) {
#pragma omp for
      for (int i = 0; i < blks_recv_count[irank]; i++) {
        blks_recv[blks_recv_displ[irank] + i].offset += data_recv_displ[irank];
      }
    }

    // First use counting sort to group blocks by their free_index shard.
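    // (Worked example, illustrative numbers only: with nshards = 4, blocks
    // with free_index 0, 5, 2 and 9 land in shards 0, 1, 2 and 1.)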
    int nblocks_mythread[nshards];
    memset(nblocks_mythread, 0, nshards * sizeof(int));
#pragma omp for schedule(static)
    for (int iblock = 0; iblock < nblocks_recv; iblock++) {
      blocks_tmp[iblock] = blks_recv[iblock];
      const int ishard = blks_recv[iblock].free_index % nshards;
      nblocks_mythread[ishard]++;
    }
#pragma omp critical
    for (int ishard = 0; ishard < nshards; ishard++) {
      nblocks_per_shard[ishard] += nblocks_mythread[ishard];
      nblocks_mythread[ishard] = nblocks_per_shard[ishard];
    }
#pragma omp barrier
#pragma omp master
    icumsum(nshards, nblocks_per_shard, shard_start);
#pragma omp barrier
#pragma omp for schedule(static) // Need static to match previous loop.
    for (int iblock = 0; iblock < nblocks_recv; iblock++) {
      const int ishard = blocks_tmp[iblock].free_index % nshards;
      const int jblock = --nblocks_mythread[ishard] + shard_start[ishard];
      blks_recv[jblock] = blocks_tmp[iblock];
    }

    // Then sort blocks within each shard by their sum_index.
#pragma omp for
    for (int ishard = 0; ishard < nshards; ishard++) {
      if (nblocks_per_shard[ishard] > 1) {
        qsort(&blks_recv[shard_start[ishard]], nblocks_per_shard[ishard],
              sizeof(dbm_pack_block_t), &compare_pack_blocks_by_sum_index);
      }
    }
  } // end of omp parallel region

  free(blocks_tmp);
}

/*******************************************************************************
 * \brief Private routine for redistributing a matrix along selected dimensions.
 * \author Ole Schuett
 ******************************************************************************/
static dbm_packed_matrix_t pack_matrix(const bool trans_matrix,
                                       const bool trans_dist,
                                       const dbm_matrix_t *matrix,
                                       const dbm_distribution_t *dist,
                                       const int nticks) {

  assert(dbm_mpi_comms_are_similar(matrix->dist->comm, dist->comm));

  // The row/col indices are distributed along one cart dimension and the
  // ticks are distributed along the other cart dimension.
  const dbm_dist_1d_t *dist_indices = (trans_dist) ? &dist->cols : &dist->rows;
  const dbm_dist_1d_t *dist_ticks = (trans_dist) ? &dist->rows : &dist->cols;

  // Allocate packed matrix.
  const int nsend_packs = nticks / dist_ticks->nranks;
  assert(nsend_packs * dist_ticks->nranks == nticks);
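  // (Worked example, illustrative numbers only: with dist_ticks->nranks = 3
  // and nticks = 6, each rank prepares nsend_packs = 2 send packs.)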
  dbm_packed_matrix_t packed;
  packed.dist_indices = dist_indices;
  packed.dist_ticks = dist_ticks;
  packed.nsend_packs = nsend_packs;
  packed.send_packs = malloc(nsend_packs * sizeof(dbm_pack_t));

  // Plan all packs.
  plan_t *plans_per_pack[nsend_packs];
  int nblks_send_per_pack[nsend_packs], ndata_send_per_pack[nsend_packs];
  create_pack_plans(trans_matrix, trans_dist, matrix, dist->comm, dist_indices,
                    dist_ticks, nticks, nsend_packs, plans_per_pack,
                    nblks_send_per_pack, ndata_send_per_pack);

  // Cannot parallelize over packs because there might be too few of them.
  for (int ipack = 0; ipack < nsend_packs; ipack++) {
    // Allocate send buffers.
    dbm_pack_block_t *blks_send =
        malloc(nblks_send_per_pack[ipack] * sizeof(dbm_pack_block_t));
    double *data_send =
        dbm_mempool_host_malloc(ndata_send_per_pack[ipack] * sizeof(double));

    // Fill send buffers according to plans.
    const int nranks = dist->nranks;
    int blks_send_count[nranks], data_send_count[nranks];
    int blks_send_displ[nranks], data_send_displ[nranks];
    fill_send_buffers(matrix, trans_matrix, nblks_send_per_pack[ipack],
                      ndata_send_per_pack[ipack], plans_per_pack[ipack], nranks,
                      blks_send_count, data_send_count, blks_send_displ,
                      data_send_displ, blks_send, data_send);
    free(plans_per_pack[ipack]);

    // 1st communication: Exchange block counts.
    int blks_recv_count[nranks], blks_recv_displ[nranks];
    dbm_mpi_alltoall_int(blks_send_count, 1, blks_recv_count, 1, dist->comm);
    icumsum(nranks, blks_recv_count, blks_recv_displ);
    const int nblocks_recv = isum(nranks, blks_recv_count);

    // 2nd communication: Exchange blocks.
    dbm_pack_block_t *blks_recv =
        malloc(nblocks_recv * sizeof(dbm_pack_block_t));
    int blks_send_count_byte[nranks], blks_send_displ_byte[nranks];
    int blks_recv_count_byte[nranks], blks_recv_displ_byte[nranks];
    for (int i = 0; i < nranks; i++) { // TODO: this is ugly!
      blks_send_count_byte[i] = blks_send_count[i] * sizeof(dbm_pack_block_t);
      blks_send_displ_byte[i] = blks_send_displ[i] * sizeof(dbm_pack_block_t);
      blks_recv_count_byte[i] = blks_recv_count[i] * sizeof(dbm_pack_block_t);
      blks_recv_displ_byte[i] = blks_recv_displ[i] * sizeof(dbm_pack_block_t);
    }
    dbm_mpi_alltoallv_byte(
        blks_send, blks_send_count_byte, blks_send_displ_byte, blks_recv,
        blks_recv_count_byte, blks_recv_displ_byte, dist->comm);
    free(blks_send);

    // 3rd communication: Exchange data counts.
    // TODO: could be computed from blks_recv.
    int data_recv_count[nranks], data_recv_displ[nranks];
    dbm_mpi_alltoall_int(data_send_count, 1, data_recv_count, 1, dist->comm);
    icumsum(nranks, data_recv_count, data_recv_displ);
    const int ndata_recv = isum(nranks, data_recv_count);

    // 4th communication: Exchange data.
    double *data_recv = dbm_mempool_host_malloc(ndata_recv * sizeof(double));
    dbm_mpi_alltoallv_double(data_send, data_send_count, data_send_displ,
                             data_recv, data_recv_count, data_recv_displ,
                             dist->comm);
    dbm_mempool_free(data_send);

    // Post-process received blocks and assemble them into a pack.
    postprocess_received_blocks(nranks, dist_indices->nshards, nblocks_recv,
                                blks_recv_count, blks_recv_displ,
                                data_recv_displ, blks_recv);
    packed.send_packs[ipack].nblocks = nblocks_recv;
    packed.send_packs[ipack].data_size = ndata_recv;
    packed.send_packs[ipack].blocks = blks_recv;
    packed.send_packs[ipack].data = data_recv;
  }

  // Allocate pack_recv.
  int max_nblocks = 0, max_data_size = 0;
  for (int ipack = 0; ipack < packed.nsend_packs; ipack++) {
    max_nblocks = imax(max_nblocks, packed.send_packs[ipack].nblocks);
    max_data_size = imax(max_data_size, packed.send_packs[ipack].data_size);
  }
  dbm_mpi_max_int(&max_nblocks, 1, packed.dist_ticks->comm);
  dbm_mpi_max_int(&max_data_size, 1, packed.dist_ticks->comm);
  packed.max_nblocks = max_nblocks;
  packed.max_data_size = max_data_size;
  packed.recv_pack.blocks =
      malloc(packed.max_nblocks * sizeof(dbm_pack_block_t));
  packed.recv_pack.data =
      dbm_mempool_host_malloc(packed.max_data_size * sizeof(double));

  return packed; // Ownership of packed transfers to caller.
}

/*******************************************************************************
 * \brief Private routine for sending and receiving the pack for the given tick.
 * \author Ole Schuett
 ******************************************************************************/
static dbm_pack_t *sendrecv_pack(const int itick, const int nticks,
                                 dbm_packed_matrix_t *packed) {
  const int nranks = packed->dist_ticks->nranks;
  const int my_rank = packed->dist_ticks->my_rank;

  // Compute send rank and pack.
  const int itick_of_rank0 = (itick + nticks - my_rank) % nticks;
  const int send_rank = (my_rank + nticks - itick_of_rank0) % nranks;
  const int send_itick = (itick_of_rank0 + send_rank) % nticks;
  const int send_ipack = send_itick / nranks;
  assert(send_itick % nranks == my_rank);

  // Compute receive rank and pack.
  const int recv_rank = itick % nranks;
  const int recv_ipack = itick / nranks;
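  // Worked example (illustrative numbers only): with nranks = 3, nticks = 6,
  // my_rank = 2 and itick = 4 the formulas above give itick_of_rank0 = 2,
  // send_rank = 0, send_itick = 2, send_ipack = 0, recv_rank = 1 and
  // recv_ipack = 1, i.e. this rank sends send_packs[0] to rank 0 while
  // receiving a pack from rank 1 into recv_pack.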

  if (send_rank == my_rank) {
    assert(send_rank == recv_rank && send_ipack == recv_ipack);
    return &packed->send_packs[send_ipack]; // Local pack, no mpi needed.
  } else {
    const dbm_pack_t *send_pack = &packed->send_packs[send_ipack];

    // Exchange blocks.
    const int nblocks_in_bytes = dbm_mpi_sendrecv_byte(
        /*sendbuf=*/send_pack->blocks,
        /*sendcount=*/send_pack->nblocks * sizeof(dbm_pack_block_t),
        /*dest=*/send_rank,
        /*sendtag=*/send_ipack,
        /*recvbuf=*/packed->recv_pack.blocks,
        /*recvcount=*/packed->max_nblocks * sizeof(dbm_pack_block_t),
        /*source=*/recv_rank,
        /*recvtag=*/recv_ipack,
        /*comm=*/packed->dist_ticks->comm);

    assert(nblocks_in_bytes % sizeof(dbm_pack_block_t) == 0);
    packed->recv_pack.nblocks = nblocks_in_bytes / sizeof(dbm_pack_block_t);

    // Exchange data.
    packed->recv_pack.data_size = dbm_mpi_sendrecv_double(
        /*sendbuf=*/send_pack->data,
        /*sendcount=*/send_pack->data_size,
        /*dest=*/send_rank,
        /*sendtag=*/send_ipack,
        /*recvbuf=*/packed->recv_pack.data,
        /*recvcount=*/packed->max_data_size,
        /*source=*/recv_rank,
        /*recvtag=*/recv_ipack,
        /*comm=*/packed->dist_ticks->comm);

    return &packed->recv_pack;
  }
}

/*******************************************************************************
 * \brief Private routine for releasing a packed matrix.
 * \author Ole Schuett
 ******************************************************************************/
static void free_packed_matrix(dbm_packed_matrix_t *packed) {
  free(packed->recv_pack.blocks);
  dbm_mempool_free(packed->recv_pack.data);
  for (int ipack = 0; ipack < packed->nsend_packs; ipack++) {
    free(packed->send_packs[ipack].blocks);
    dbm_mempool_free(packed->send_packs[ipack].data);
  }
  free(packed->send_packs);
}

/*******************************************************************************
 * \brief Internal routine for creating a communication iterator.
 * \author Ole Schuett
 ******************************************************************************/
dbm_comm_iterator_t *dbm_comm_iterator_start(const bool transa,
                                             const bool transb,
                                             const dbm_matrix_t *matrix_a,
                                             const dbm_matrix_t *matrix_b,
                                             const dbm_matrix_t *matrix_c) {

  dbm_comm_iterator_t *iter = malloc(sizeof(dbm_comm_iterator_t));
  iter->dist = matrix_c->dist;

  // During each communication tick we'll fetch a pack_a and pack_b.
  // Since the cart might be non-square, the number of communication ticks is
  // chosen as the least common multiple of the cart's dimensions.
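  // (Worked example, illustrative numbers only: a 2 x 3 cart yields
  // nticks = lcm(2, 3) = 6.)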
  iter->nticks = lcm(iter->dist->rows.nranks, iter->dist->cols.nranks);
  iter->itick = 0;

  // 1st arg = source dimension, 2nd arg = target dimension, false=rows, true=columns.
  iter->packed_a =
      pack_matrix(transa, false, matrix_a, iter->dist, iter->nticks);
  iter->packed_b =
      pack_matrix(!transb, true, matrix_b, iter->dist, iter->nticks);

  return iter;
}

/*******************************************************************************
 * \brief Internal routine for retrieving next pair of packs from given iterator.
 * \author Ole Schuett
 ******************************************************************************/
bool dbm_comm_iterator_next(dbm_comm_iterator_t *iter, dbm_pack_t **pack_a,
                            dbm_pack_t **pack_b) {
  if (iter->itick >= iter->nticks) {
    return false; // end of iterator reached
  }

  // Start each rank at a different tick to spread the load on the sources.
  const int shift = iter->dist->rows.my_rank + iter->dist->cols.my_rank;
  const int shifted_itick = (iter->itick + shift) % iter->nticks;
  *pack_a = sendrecv_pack(shifted_itick, iter->nticks, &iter->packed_a);
  *pack_b = sendrecv_pack(shifted_itick, iter->nticks, &iter->packed_b);

  iter->itick++;
  return true;
}

/*******************************************************************************
 * \brief Internal routine for releasing the given communication iterator.
 * \author Ole Schuett
 ******************************************************************************/
void dbm_comm_iterator_stop(dbm_comm_iterator_t *iter) {
  free_packed_matrix(&iter->packed_a);
  free_packed_matrix(&iter->packed_b);
  free(iter);
}

// EOF