(git:6a2e663)
mp_perf_test.F
Go to the documentation of this file.
1 !--------------------------------------------------------------------------------------------------!
2 ! CP2K: A general program to perform molecular dynamics simulations !
3 ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 ! !
5 ! SPDX-License-Identifier: GPL-2.0-or-later !
6 !--------------------------------------------------------------------------------------------------!
7 
8 ! **************************************************************************************************
9 !> \brief Interface to the message passing library MPI
10 !> \par History
11 !> JGH (02-Jan-2001): New error handling
12 !> Performance tools
13 !> JGH (14-Jan-2001): New routines mp_comm_compare, mp_cart_coords,
14 !> mp_rank_compare, mp_alltoall
15 !> JGH (06-Feb-2001): New routines mp_comm_free
16 !> JGH (22-Mar-2001): New routines mp_comm_dup
17 !> fawzi (04-NOV-2004): storable performance info (for f77 interface)
18 !> Wrapper routine for mpi_gatherv added (22.12.2005,MK)
19 !> JGH (13-Feb-2006): Flexible precision
20 !> JGH (15-Feb-2006): single precision mp_alltoall
21 !> \author JGH
22 ! **************************************************************************************************
24  USE kinds, ONLY: dp
25  USE message_passing, ONLY: mp_comm_type
26  ! some benchmarking code
27 #include "../base/base_uses.f90"
28 
29 #if defined(__parallel)
30 #if defined(__MPI_F08)
31  USE mpi_f08, ONLY: mpi_wtime
32 #else
33  USE mpi, ONLY: mpi_wtime
34 #endif
35 #endif
36 
37  PRIVATE
38 
39  PUBLIC :: mpi_perf_test
40 
41 CONTAINS
42 
43 ! **************************************************************************************************
44 !> \brief Tests the MPI library
45 !> \param comm the relevant, initialized communicator
46 !> \param npow number of sizes to test, 10**1 .. 10**npow
47 !> \param output_unit where to direct output
48 !> \par History
49 !> JGH 6-Feb-2001 : Test and performance code
50 !> \author JGH 1-JAN-2001
51 !> \note
52 !> quickly adapted benchmark code, will only work on an even number of CPUs.
53 ! **************************************************************************************************
54  SUBROUTINE mpi_perf_test(comm, npow, output_unit)
55  CLASS(mp_comm_type), INTENT(IN) :: comm
56  INTEGER, INTENT(IN) :: npow, output_unit
57 
58 #if defined(__parallel)
59 
60  INTEGER :: i, itask, itests, j, jtask, left, nbufmax, &
61  ncount, ngrid, nloc, nprocs, ntot, partner, right, taskid, tag, source
62  INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, scount, sdispl
63  LOGICAL :: ionode
64  REAL(kind=dp) :: maxdiff, t1, &
65  t2, t3, t4, t5
66  REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: buffer1, buffer2, buffer3, &
67  lgrid, lgrid2, lgrid3
68  REAL(kind=dp), ALLOCATABLE, &
69  DIMENSION(:, :) :: grid, grid2, grid3, &
70  send_timings, send_timings2
71  REAL(kind=dp), PARAMETER :: threshold = 1.0e-8_dp
72 
73  ! set system sizes !
74  ngrid = 10**npow
75 
76  taskid = comm%mepos
77  nprocs = comm%num_pe
78  ionode = comm%is_source()
79  IF (ionode .AND. output_unit > 0) THEN
80  WRITE (output_unit, *) "Running with ", nprocs
81  WRITE (output_unit, *) "running messages with npow = ", npow
82  WRITE (output_unit, *) "use MPI X in the input for larger (e.g. 6) of smaller (e.g. 3) messages"
83  IF (modulo(nprocs, 2) .NE. 0) WRITE (output_unit, *) "Testing only with an even number of tasks"
84  END IF
85 
86  IF (modulo(nprocs, 2) .NE. 0) RETURN
87 
88  ! equal loads
89  nloc = ngrid/nprocs
90  ntot = nprocs*nloc
91  nbufmax = 10**npow
92  !
93  ALLOCATE (rcount(nprocs))
94  ALLOCATE (scount(nprocs))
95  ALLOCATE (sdispl(nprocs))
96  ALLOCATE (rdispl(nprocs))
97  ALLOCATE (buffer1(nbufmax))
98  ALLOCATE (buffer2(nbufmax))
99  ALLOCATE (buffer3(nbufmax))
100  ALLOCATE (grid(nloc, nprocs))
101  ALLOCATE (grid2(nloc, nprocs))
102  ALLOCATE (grid3(nloc, nprocs))
103  ALLOCATE (lgrid(nloc))
104  ALLOCATE (lgrid2(nloc))
105  ALLOCATE (lgrid3(nloc))
106  ALLOCATE (send_timings(0:nprocs - 1, 0:nprocs - 1))
107  ALLOCATE (send_timings2(0:nprocs - 1, 0:nprocs - 1))
108  buffer1 = 0.0_dp
109  buffer2 = 0.0_dp
110  buffer3 = 0.0_dp
111  ! timings
112  send_timings = 0.0_dp
113  send_timings2 = 0.0_dp
114  ! -------------------------------------------------------------------------------------------
115  ! ------------------------------ some in memory tests ---------------------
116  ! -------------------------------------------------------------------------------------------
117  CALL comm%sync()
118  IF (ionode .AND. output_unit > 0) THEN
119  WRITE (output_unit, *) "Testing in memory copies just 1 CPU "
120  WRITE (output_unit, *) " could tell something about the motherboard / cache / compiler "
121  END IF
122  DO i = 1, npow
123  ncount = 10**i
124  t2 = 0.0e0_dp
125  IF (ncount .GT. nbufmax) cpabort("")
126  DO j = 1, 3**(npow - i)
127  CALL comm%sync()
128  t1 = mpi_wtime()
129  buffer2(1:ncount) = buffer1(1:ncount)
130  t2 = t2 + mpi_wtime() - t1 + threshold
131  END DO
132  CALL comm%max(t2, 0)
133  IF (ionode .AND. output_unit > 0) THEN
134  WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
135  END IF
136  END DO
137  ! -------------------------------------------------------------------------------------------
138  ! ------------------------------ some in memory tests ---------------------
139  ! -------------------------------------------------------------------------------------------
140  CALL comm%sync()
141  IF (ionode .AND. output_unit > 0) THEN
142  WRITE (output_unit, *) "Testing in memory copies all cpus"
143  WRITE (output_unit, *) " is the memory bandwidth affected on an SMP machine ?"
144  END IF
145  DO i = 1, npow
146  ncount = 10**i
147  t2 = 0.0e0_dp
148  IF (ncount .GT. nbufmax) cpabort("")
149  DO j = 1, 3**(npow - i)
150  CALL comm%sync()
151  t1 = mpi_wtime()
152  buffer2(1:ncount) = buffer1(1:ncount)
153  t2 = t2 + mpi_wtime() - t1 + threshold
154  END DO
155  CALL comm%max(t2, 0)
156  IF (ionode .AND. output_unit > 0) THEN
157  WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
158  END IF
159  END DO
160  ! -------------------------------------------------------------------------------------------
161  ! ------------------------------ first test point to point communication ---------------------
162  ! -------------------------------------------------------------------------------------------
163  CALL comm%sync()
164  IF (ionode .AND. output_unit > 0) THEN
165  WRITE (output_unit, *) "Testing truly point to point communication (i with j only)"
166  WRITE (output_unit, *) " is there some different connection between i j (e.g. shared memory comm)"
167  END IF
168  ncount = 10**npow
169  IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "For messages of ", ncount*8, " bytes"
170  IF (ncount .GT. nbufmax) cpabort("")
171  DO itask = 0, nprocs - 1
172  DO jtask = itask + 1, nprocs - 1
173  CALL comm%sync()
174  t1 = mpi_wtime()
175  IF (taskid .EQ. itask) THEN
176  CALL comm%send(buffer1, jtask, itask*jtask)
177  END IF
178  IF (taskid .EQ. jtask) THEN
179  source = itask
180  tag = itask*jtask
181  CALL comm%recv(buffer1, source, tag)
182  END IF
183  send_timings(itask, jtask) = mpi_wtime() - t1 + threshold
184  END DO
185  END DO
186  CALL comm%max(send_timings, 0)
187  IF (ionode .AND. output_unit > 0) THEN
188  DO itask = 0, nprocs - 1
189  DO jtask = itask + 1, nprocs - 1
190  WRITE (output_unit, '(I4,I4,F12.4,A)') itask, jtask, ncount*8.0e-6_dp/send_timings(itask, jtask), " MB/s"
191  END DO
192  END DO
193  END IF
194  CALL comm%sync()
195  ! -------------------------------------------------------------------------------------------
196  ! ------------------------------ second test point to point communication -------------------
197  ! -------------------------------------------------------------------------------------------
198  IF (ionode .AND. output_unit > 0) THEN
199  WRITE (output_unit, *) "Testing all nearby point to point communication (0,1)(2,3)..."
200  WRITE (output_unit, *) " these could / should all be on the same shared memory node "
201  END IF
202  DO i = 1, npow
203  ncount = 10**i
204  t2 = 0.0e0_dp
205  IF (ncount .GT. nbufmax) cpabort("")
206  DO j = 1, 3**(npow - i)
207  CALL comm%sync()
208  t1 = mpi_wtime()
209  IF (modulo(taskid, 2) == 0) THEN
210  CALL comm%send(buffer1, taskid + 1, 0)
211  ELSE
212  source = taskid - 1
213  tag = 0
214  CALL comm%recv(buffer1, source, tag)
215  END IF
216  t2 = t2 + mpi_wtime() - t1 + threshold
217  END DO
218  CALL comm%max(t2, 0)
219  IF (ionode .AND. output_unit > 0) THEN
220  WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
221  END IF
222  END DO
223  CALL comm%sync()
224  ! -------------------------------------------------------------------------------------------
225  ! ------------------------------ third test point to point communication -------------------
226  ! -------------------------------------------------------------------------------------------
227  IF (ionode .AND. output_unit > 0) THEN
228  WRITE (output_unit, *) "Testing all far point to point communication (0,nprocs/2),(1,nprocs/2+1),.."
229  WRITE (output_unit, *) " these could all be going over the network, and stress it a lot"
230  END IF
231  DO i = 1, npow
232  ncount = 10**i
233  t2 = 0.0e0_dp
234  IF (ncount .GT. nbufmax) cpabort("")
235  DO j = 1, 3**(npow - i)
236  CALL comm%sync()
237  t1 = mpi_wtime()
238  ! first half with partner
239  IF (taskid .LT. nprocs/2) THEN
240  CALL comm%send(buffer1, taskid + nprocs/2, 0)
241  ELSE
242  source = taskid - nprocs/2
243  tag = 0
244  CALL comm%recv(buffer1, source, tag)
245  END IF
246  t2 = t2 + mpi_wtime() - t1 + threshold
247  END DO
248  CALL comm%max(t2, 0)
249  IF (ionode .AND. output_unit > 0) THEN
250  WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
251  END IF
252  END DO
253  ! -------------------------------------------------------------------------------------------
254  ! ------------------------------ test root to all broadcast -------------------
255  ! -------------------------------------------------------------------------------------------
256  CALL comm%sync()
257  IF (ionode .AND. output_unit > 0) THEN
258  WRITE (output_unit, *) "Testing root to all broadcast "
259  WRITE (output_unit, *) " using trees at least ? "
260  END IF
261  DO i = 1, npow
262  ncount = 10**i
263  t2 = 0.0e0_dp
264  IF (ncount .GT. nbufmax) cpabort("")
265  DO j = 1, 3**(npow - i)
266  CALL comm%sync()
267  t1 = mpi_wtime()
268  CALL comm%bcast(buffer1, 0)
269  t2 = t2 + mpi_wtime() - t1 + threshold
270  END DO
271  CALL comm%max(t2, 0)
272  IF (ionode .AND. output_unit > 0) THEN
273  WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
274  END IF
275  END DO
276  ! -------------------------------------------------------------------------------------------
277  ! ------------------------------ test parallel sum like behavior -------------------
278  ! -------------------------------------------------------------------------------------------
279  CALL comm%sync()
280  IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "Test global summation (mpi_allreduce) "
281  DO i = 1, npow
282  ncount = 10**i
283  t2 = 0.0e0_dp
284  IF (ncount .GT. nbufmax) cpabort("")
285  DO j = 1, 3**(npow - i)
286  buffer2(:) = buffer1
287  CALL comm%sync()
288  t1 = mpi_wtime()
289  CALL comm%sum(buffer2)
290  t2 = t2 + mpi_wtime() - t1 + threshold
291  END DO
292  CALL comm%max(t2, 0)
293  IF (ionode .AND. output_unit > 0) THEN
294  WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
295  END IF
296  END DO
297  ! -------------------------------------------------------------------------------------------
298  ! ------------------------------ test all to all communication -------------------
299  ! -------------------------------------------------------------------------------------------
300  CALL comm%sync()
301  IF (ionode .AND. output_unit > 0) THEN
302  WRITE (output_unit, *) "Test all to all communication (mpi_alltoallv)"
303  WRITE (output_unit, *) " mpi/network getting confused ? "
304  END IF
305  DO i = 1, npow
306  ncount = 10**i
307  t2 = 0.0e0_dp
308  IF (ncount .GT. nbufmax) cpabort("")
309  scount = ncount/nprocs
310  rcount = ncount/nprocs
311  DO j = 1, nprocs
312  sdispl(j) = (j - 1)*(ncount/nprocs)
313  rdispl(j) = (j - 1)*(ncount/nprocs)
314  END DO
315  DO j = 1, 3**(npow - i)
316  CALL comm%sync()
317  t1 = mpi_wtime()
318  CALL comm%alltoall(buffer1, scount, sdispl, buffer2, rcount, rdispl)
319  t2 = t2 + mpi_wtime() - t1 + threshold
320  END DO
321  CALL comm%max(t2, 0)
322  IF (ionode .AND. output_unit > 0) THEN
323  WRITE (output_unit, '(I9,A,F12.4,A)') 8*(ncount/nprocs)*nprocs, " Bytes ", &
324  (3**(npow - i))*(ncount/nprocs)*nprocs*8.0e-6_dp/t2, " MB/s"
325  END IF
326  END DO
327 
328  ! -------------------------------------------------------------------------------------------
329  ! ------------------------------ other stuff ---------------------
330  ! -------------------------------------------------------------------------------------------
331  IF (ionode .AND. output_unit > 0) THEN
332  WRITE (output_unit, *) " Clean tests completed "
333  WRITE (output_unit, *) " Testing MPI_REDUCE scatter"
334  END IF
335  rcount = nloc
336  DO itests = 1, 3
337  IF (ionode .AND. output_unit > 0) &
338  WRITE (output_unit, *) "------------------------------- test ", itests, " ------------------------"
339  ! *** reference ***
340  DO j = 1, nprocs
341  DO i = 1, nloc
342  grid(i, j) = modulo(i*j*taskid, itests)
343  END DO
344  END DO
345  t1 = mpi_wtime()
346  CALL comm%mp_sum_scatter_dv(grid, lgrid, rcount)
347  t2 = mpi_wtime() - t1 + threshold
348  CALL comm%max(t2)
349  IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "MPI_REDUCE_SCATTER ", t2
350  ! *** simple shift ***
351  DO j = 1, nprocs
352  DO i = 1, nloc
353  grid2(i, j) = modulo(i*j*taskid, itests)
354  END DO
355  END DO
356  t3 = mpi_wtime()
357  lgrid2 = 0.0e0_dp
358  DO i = 1, nprocs
359  lgrid2(:) = lgrid2 + grid(:, modulo(taskid - i, nprocs) + 1)
360  IF (i .EQ. nprocs) EXIT
361  CALL comm%shift(lgrid2, 1)
362  END DO
363  t4 = mpi_wtime() - t3 + threshold
364  CALL comm%max(t4)
365  maxdiff = maxval(abs(lgrid2 - lgrid))
366  CALL comm%max(maxdiff)
367  IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "MPI_SENDRECV_REPLACE ", t4, maxdiff
368  ! *** involved shift ****
369  IF (modulo(nprocs, 2) /= 0) cpabort("")
370  DO j = 1, nprocs
371  DO i = 1, nloc
372  grid3(i, j) = modulo(i*j*taskid, itests)
373  END DO
374  END DO
375  t3 = mpi_wtime()
376  ! first sum the grid in pairs (0,1),(2,3) should be within an LPAR and fast XXXXXXXXX
377  ! 0 will only need parts 0,2,4,... correctly summed
378  ! 1 will only need parts 1,3,5,... correctly summed
379  ! *** could nicely be generalised ****
380  IF (modulo(taskid, 2) == 0) THEN
381  partner = taskid + 1
382  DO i = 1, nprocs, 2 ! sum the full grid with the partner
383  CALL comm%sendrecv(grid3(:, i + 1), partner, lgrid3, partner, 17)
384  grid3(:, i) = grid3(:, i) + lgrid3(:)
385  END DO
386  ELSE
387  partner = taskid - 1
388  DO i = 1, nprocs, 2
389  CALL comm%sendrecv(grid3(:, i), partner, lgrid3, partner, 17)
390  grid3(:, i + 1) = grid3(:, i + 1) + lgrid3(:)
391  END DO
392  END IF
393  t4 = mpi_wtime() - t3 + threshold
394  ! now send a given buffer from 1 to 3 to 5 .. adding the right part of the data
395  ! since we've summed an lgrid does only need to pass by even or odd tasks
396  left = modulo(taskid - 2, nprocs)
397  right = modulo(taskid + 2, nprocs)
398  t3 = mpi_wtime()
399  lgrid3 = 0.0e0_dp
400  DO i = 1, nprocs, 2
401  lgrid3(:) = lgrid3 + grid3(:, modulo(taskid - i - 1, nprocs) + 1)
402  IF (i .EQ. nprocs - 1) EXIT
403  CALL comm%shift(lgrid3, 2)
404  END DO
405  t5 = mpi_wtime() - t3 + threshold
406  CALL comm%max(t4)
407  CALL comm%max(t5)
408  maxdiff = maxval(abs(lgrid3 - lgrid))
409  CALL comm%max(maxdiff)
410  IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "INVOLVED SHIFT ", t4 + t5, "(", t4, ",", t5, ")", maxdiff
411  END DO
412  DEALLOCATE (rcount)
413  DEALLOCATE (scount)
414  DEALLOCATE (sdispl)
415  DEALLOCATE (rdispl)
416  DEALLOCATE (buffer1)
417  DEALLOCATE (buffer2)
418  DEALLOCATE (buffer3)
419  DEALLOCATE (grid)
420  DEALLOCATE (grid2)
421  DEALLOCATE (grid3)
422  DEALLOCATE (lgrid)
423  DEALLOCATE (lgrid2)
424  DEALLOCATE (lgrid3)
425  DEALLOCATE (send_timings)
426  DEALLOCATE (send_timings2)
427 #else
428  mark_used(comm)
429  mark_used(npow)
430  IF (output_unit > 0) WRITE (output_unit, *) "No MPI tests for a serial program"
431 #endif
432  END SUBROUTINE mpi_perf_test
433 
434 END MODULE mp_perf_test
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
Definition: grid_common.h:117
Defines the basic variable types.
Definition: kinds.F:23
integer, parameter, public dp
Definition: kinds.F:34
Interface to the message passing library MPI.
Interface to the message passing library MPI.
Definition: mp_perf_test.F:23
subroutine, public mpi_perf_test(comm, npow, output_unit)
Tests the MPI library.
Definition: mp_perf_test.F:55