(git:374b731)
Loading...
Searching...
No Matches
mp_perf_test.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Interface to the message passing library MPI
10!> \par History
11!> JGH (02-Jan-2001): New error handling
12!> Performance tools
13!> JGH (14-Jan-2001): New routines mp_comm_compare, mp_cart_coords,
14!> mp_rank_compare, mp_alltoall
15!> JGH (06-Feb-2001): New routines mp_comm_free
16!> JGH (22-Mar-2001): New routines mp_comm_dup
17!> fawzi (04-NOV-2004): storable performance info (for f77 interface)
18!> Wrapper routine for mpi_gatherv added (22.12.2005,MK)
19!> JGH (13-Feb-2006): Flexible precision
20!> JGH (15-Feb-2006): single precision mp_alltoall
21!> \author JGH
22! **************************************************************************************************
24 USE kinds, ONLY: dp
26 ! some benchmarking code
27#include "../base/base_uses.f90"
28
29#if defined(__parallel)
30#if defined(__MPI_F08)
31 USE mpi_f08, ONLY: mpi_wtime
32#else
33 USE mpi, ONLY: mpi_wtime
34#endif
35#endif
36
37 PRIVATE
38
39 PUBLIC :: mpi_perf_test
40
41CONTAINS
42
43! **************************************************************************************************
44!> \brief Tests the MPI library
45!> \param comm the relevant, initialized communicator
46!> \param npow number of sizes to test, 10**1 .. 10**npow
47!> \param output_unit where to direct output
48!> \par History
49!> JGH 6-Feb-2001 : Test and performance code
50!> \author JGH 1-JAN-2001
51!> \note
52!> quickly adapted benchmark code, will only work on an even number of CPUs.
53! **************************************************************************************************
54 SUBROUTINE mpi_perf_test(comm, npow, output_unit)
55 CLASS(mp_comm_type), INTENT(IN) :: comm
56 INTEGER, INTENT(IN) :: npow, output_unit
57
58#if defined(__parallel)
59
60 INTEGER :: i, itask, itests, j, jtask, left, nbufmax, &
61 ncount, ngrid, nloc, nprocs, ntot, partner, right, taskid, tag, source
62 INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, scount, sdispl
63 LOGICAL :: ionode
64 REAL(kind=dp) :: maxdiff, t1, &
65 t2, t3, t4, t5
66 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: buffer1, buffer2, buffer3, &
67 lgrid, lgrid2, lgrid3
68 REAL(kind=dp), ALLOCATABLE, &
69 DIMENSION(:, :) :: grid, grid2, grid3, &
70 send_timings, send_timings2
71 REAL(kind=dp), PARAMETER :: threshold = 1.0e-8_dp
72
73 ! set system sizes !
74 ngrid = 10**npow
75
76 taskid = comm%mepos
77 nprocs = comm%num_pe
78 ionode = comm%is_source()
79 IF (ionode .AND. output_unit > 0) THEN
80 WRITE (output_unit, *) "Running with ", nprocs
81 WRITE (output_unit, *) "running messages with npow = ", npow
82 WRITE (output_unit, *) "use MPI X in the input for larger (e.g. 6) of smaller (e.g. 3) messages"
83 IF (modulo(nprocs, 2) .NE. 0) WRITE (output_unit, *) "Testing only with an even number of tasks"
84 END IF
85
86 IF (modulo(nprocs, 2) .NE. 0) RETURN
87
88 ! equal loads
89 nloc = ngrid/nprocs
90 ntot = nprocs*nloc
91 nbufmax = 10**npow
92 !
93 ALLOCATE (rcount(nprocs))
94 ALLOCATE (scount(nprocs))
95 ALLOCATE (sdispl(nprocs))
96 ALLOCATE (rdispl(nprocs))
97 ALLOCATE (buffer1(nbufmax))
98 ALLOCATE (buffer2(nbufmax))
99 ALLOCATE (buffer3(nbufmax))
100 ALLOCATE (grid(nloc, nprocs))
101 ALLOCATE (grid2(nloc, nprocs))
102 ALLOCATE (grid3(nloc, nprocs))
103 ALLOCATE (lgrid(nloc))
104 ALLOCATE (lgrid2(nloc))
105 ALLOCATE (lgrid3(nloc))
106 ALLOCATE (send_timings(0:nprocs - 1, 0:nprocs - 1))
107 ALLOCATE (send_timings2(0:nprocs - 1, 0:nprocs - 1))
108 buffer1 = 0.0_dp
109 buffer2 = 0.0_dp
110 buffer3 = 0.0_dp
111 ! timings
112 send_timings = 0.0_dp
113 send_timings2 = 0.0_dp
114 ! -------------------------------------------------------------------------------------------
115 ! ------------------------------ some in memory tests ---------------------
116 ! -------------------------------------------------------------------------------------------
117 CALL comm%sync()
118 IF (ionode .AND. output_unit > 0) THEN
119 WRITE (output_unit, *) "Testing in memory copies just 1 CPU "
120 WRITE (output_unit, *) " could tell something about the motherboard / cache / compiler "
121 END IF
122 DO i = 1, npow
123 ncount = 10**i
124 t2 = 0.0e0_dp
125 IF (ncount .GT. nbufmax) cpabort("")
126 DO j = 1, 3**(npow - i)
127 CALL comm%sync()
128 t1 = mpi_wtime()
129 buffer2(1:ncount) = buffer1(1:ncount)
130 t2 = t2 + mpi_wtime() - t1 + threshold
131 END DO
132 CALL comm%max(t2, 0)
133 IF (ionode .AND. output_unit > 0) THEN
134 WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
135 END IF
136 END DO
137 ! -------------------------------------------------------------------------------------------
138 ! ------------------------------ some in memory tests ---------------------
139 ! -------------------------------------------------------------------------------------------
140 CALL comm%sync()
141 IF (ionode .AND. output_unit > 0) THEN
142 WRITE (output_unit, *) "Testing in memory copies all cpus"
143 WRITE (output_unit, *) " is the memory bandwidth affected on an SMP machine ?"
144 END IF
145 DO i = 1, npow
146 ncount = 10**i
147 t2 = 0.0e0_dp
148 IF (ncount .GT. nbufmax) cpabort("")
149 DO j = 1, 3**(npow - i)
150 CALL comm%sync()
151 t1 = mpi_wtime()
152 buffer2(1:ncount) = buffer1(1:ncount)
153 t2 = t2 + mpi_wtime() - t1 + threshold
154 END DO
155 CALL comm%max(t2, 0)
156 IF (ionode .AND. output_unit > 0) THEN
157 WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
158 END IF
159 END DO
160 ! -------------------------------------------------------------------------------------------
161 ! ------------------------------ first test point to point communication ---------------------
162 ! -------------------------------------------------------------------------------------------
163 CALL comm%sync()
164 IF (ionode .AND. output_unit > 0) THEN
165 WRITE (output_unit, *) "Testing truly point to point communication (i with j only)"
166 WRITE (output_unit, *) " is there some different connection between i j (e.g. shared memory comm)"
167 END IF
168 ncount = 10**npow
169 IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "For messages of ", ncount*8, " bytes"
170 IF (ncount .GT. nbufmax) cpabort("")
171 DO itask = 0, nprocs - 1
172 DO jtask = itask + 1, nprocs - 1
173 CALL comm%sync()
174 t1 = mpi_wtime()
175 IF (taskid .EQ. itask) THEN
176 CALL comm%send(buffer1, jtask, itask*jtask)
177 END IF
178 IF (taskid .EQ. jtask) THEN
179 source = itask
180 tag = itask*jtask
181 CALL comm%recv(buffer1, source, tag)
182 END IF
183 send_timings(itask, jtask) = mpi_wtime() - t1 + threshold
184 END DO
185 END DO
186 CALL comm%max(send_timings, 0)
187 IF (ionode .AND. output_unit > 0) THEN
188 DO itask = 0, nprocs - 1
189 DO jtask = itask + 1, nprocs - 1
190 WRITE (output_unit, '(I4,I4,F12.4,A)') itask, jtask, ncount*8.0e-6_dp/send_timings(itask, jtask), " MB/s"
191 END DO
192 END DO
193 END IF
194 CALL comm%sync()
195 ! -------------------------------------------------------------------------------------------
196 ! ------------------------------ second test point to point communication -------------------
197 ! -------------------------------------------------------------------------------------------
198 IF (ionode .AND. output_unit > 0) THEN
199 WRITE (output_unit, *) "Testing all nearby point to point communication (0,1)(2,3)..."
200 WRITE (output_unit, *) " these could / should all be on the same shared memory node "
201 END IF
202 DO i = 1, npow
203 ncount = 10**i
204 t2 = 0.0e0_dp
205 IF (ncount .GT. nbufmax) cpabort("")
206 DO j = 1, 3**(npow - i)
207 CALL comm%sync()
208 t1 = mpi_wtime()
209 IF (modulo(taskid, 2) == 0) THEN
210 CALL comm%send(buffer1, taskid + 1, 0)
211 ELSE
212 source = taskid - 1
213 tag = 0
214 CALL comm%recv(buffer1, source, tag)
215 END IF
216 t2 = t2 + mpi_wtime() - t1 + threshold
217 END DO
218 CALL comm%max(t2, 0)
219 IF (ionode .AND. output_unit > 0) THEN
220 WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
221 END IF
222 END DO
223 CALL comm%sync()
224 ! -------------------------------------------------------------------------------------------
225 ! ------------------------------ third test point to point communication -------------------
226 ! -------------------------------------------------------------------------------------------
227 IF (ionode .AND. output_unit > 0) THEN
228 WRITE (output_unit, *) "Testing all far point to point communication (0,nprocs/2),(1,nprocs/2+1),.."
229 WRITE (output_unit, *) " these could all be going over the network, and stress it a lot"
230 END IF
231 DO i = 1, npow
232 ncount = 10**i
233 t2 = 0.0e0_dp
234 IF (ncount .GT. nbufmax) cpabort("")
235 DO j = 1, 3**(npow - i)
236 CALL comm%sync()
237 t1 = mpi_wtime()
238 ! first half with partner
239 IF (taskid .LT. nprocs/2) THEN
240 CALL comm%send(buffer1, taskid + nprocs/2, 0)
241 ELSE
242 source = taskid - nprocs/2
243 tag = 0
244 CALL comm%recv(buffer1, source, tag)
245 END IF
246 t2 = t2 + mpi_wtime() - t1 + threshold
247 END DO
248 CALL comm%max(t2, 0)
249 IF (ionode .AND. output_unit > 0) THEN
250 WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
251 END IF
252 END DO
253 ! -------------------------------------------------------------------------------------------
254 ! ------------------------------ test root to all broadcast -------------------
255 ! -------------------------------------------------------------------------------------------
256 CALL comm%sync()
257 IF (ionode .AND. output_unit > 0) THEN
258 WRITE (output_unit, *) "Testing root to all broadcast "
259 WRITE (output_unit, *) " using trees at least ? "
260 END IF
261 DO i = 1, npow
262 ncount = 10**i
263 t2 = 0.0e0_dp
264 IF (ncount .GT. nbufmax) cpabort("")
265 DO j = 1, 3**(npow - i)
266 CALL comm%sync()
267 t1 = mpi_wtime()
268 CALL comm%bcast(buffer1, 0)
269 t2 = t2 + mpi_wtime() - t1 + threshold
270 END DO
271 CALL comm%max(t2, 0)
272 IF (ionode .AND. output_unit > 0) THEN
273 WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
274 END IF
275 END DO
276 ! -------------------------------------------------------------------------------------------
277 ! ------------------------------ test parallel sum like behavior -------------------
278 ! -------------------------------------------------------------------------------------------
279 CALL comm%sync()
280 IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "Test global summation (mpi_allreduce) "
281 DO i = 1, npow
282 ncount = 10**i
283 t2 = 0.0e0_dp
284 IF (ncount .GT. nbufmax) cpabort("")
285 DO j = 1, 3**(npow - i)
286 buffer2(:) = buffer1
287 CALL comm%sync()
288 t1 = mpi_wtime()
289 CALL comm%sum(buffer2)
290 t2 = t2 + mpi_wtime() - t1 + threshold
291 END DO
292 CALL comm%max(t2, 0)
293 IF (ionode .AND. output_unit > 0) THEN
294 WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2, " MB/s"
295 END IF
296 END DO
297 ! -------------------------------------------------------------------------------------------
298 ! ------------------------------ test all to all communication -------------------
299 ! -------------------------------------------------------------------------------------------
300 CALL comm%sync()
301 IF (ionode .AND. output_unit > 0) THEN
302 WRITE (output_unit, *) "Test all to all communication (mpi_alltoallv)"
303 WRITE (output_unit, *) " mpi/network getting confused ? "
304 END IF
305 DO i = 1, npow
306 ncount = 10**i
307 t2 = 0.0e0_dp
308 IF (ncount .GT. nbufmax) cpabort("")
309 scount = ncount/nprocs
310 rcount = ncount/nprocs
311 DO j = 1, nprocs
312 sdispl(j) = (j - 1)*(ncount/nprocs)
313 rdispl(j) = (j - 1)*(ncount/nprocs)
314 END DO
315 DO j = 1, 3**(npow - i)
316 CALL comm%sync()
317 t1 = mpi_wtime()
318 CALL comm%alltoall(buffer1, scount, sdispl, buffer2, rcount, rdispl)
319 t2 = t2 + mpi_wtime() - t1 + threshold
320 END DO
321 CALL comm%max(t2, 0)
322 IF (ionode .AND. output_unit > 0) THEN
323 WRITE (output_unit, '(I9,A,F12.4,A)') 8*(ncount/nprocs)*nprocs, " Bytes ", &
324 (3**(npow - i))*(ncount/nprocs)*nprocs*8.0e-6_dp/t2, " MB/s"
325 END IF
326 END DO
327
328 ! -------------------------------------------------------------------------------------------
329 ! ------------------------------ other stuff ---------------------
330 ! -------------------------------------------------------------------------------------------
331 IF (ionode .AND. output_unit > 0) THEN
332 WRITE (output_unit, *) " Clean tests completed "
333 WRITE (output_unit, *) " Testing MPI_REDUCE scatter"
334 END IF
335 rcount = nloc
336 DO itests = 1, 3
337 IF (ionode .AND. output_unit > 0) &
338 WRITE (output_unit, *) "------------------------------- test ", itests, " ------------------------"
339 ! *** reference ***
340 DO j = 1, nprocs
341 DO i = 1, nloc
342 grid(i, j) = modulo(i*j*taskid, itests)
343 END DO
344 END DO
345 t1 = mpi_wtime()
346 CALL comm%mp_sum_scatter_dv(grid, lgrid, rcount)
347 t2 = mpi_wtime() - t1 + threshold
348 CALL comm%max(t2)
349 IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "MPI_REDUCE_SCATTER ", t2
350 ! *** simple shift ***
351 DO j = 1, nprocs
352 DO i = 1, nloc
353 grid2(i, j) = modulo(i*j*taskid, itests)
354 END DO
355 END DO
356 t3 = mpi_wtime()
357 lgrid2 = 0.0e0_dp
358 DO i = 1, nprocs
359 lgrid2(:) = lgrid2 + grid(:, modulo(taskid - i, nprocs) + 1)
360 IF (i .EQ. nprocs) EXIT
361 CALL comm%shift(lgrid2, 1)
362 END DO
363 t4 = mpi_wtime() - t3 + threshold
364 CALL comm%max(t4)
365 maxdiff = maxval(abs(lgrid2 - lgrid))
366 CALL comm%max(maxdiff)
367 IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "MPI_SENDRECV_REPLACE ", t4, maxdiff
368 ! *** involved shift ****
369 IF (modulo(nprocs, 2) /= 0) cpabort("")
370 DO j = 1, nprocs
371 DO i = 1, nloc
372 grid3(i, j) = modulo(i*j*taskid, itests)
373 END DO
374 END DO
375 t3 = mpi_wtime()
376 ! first sum the grid in pairs (0,1),(2,3) should be within an LPAR and fast XXXXXXXXX
377 ! 0 will only need parts 0,2,4,... correctly summed
378 ! 1 will only need parts 1,3,5,... correctly summed
379 ! *** could nicely be generalised ****
380 IF (modulo(taskid, 2) == 0) THEN
381 partner = taskid + 1
382 DO i = 1, nprocs, 2 ! sum the full grid with the partner
383 CALL comm%sendrecv(grid3(:, i + 1), partner, lgrid3, partner, 17)
384 grid3(:, i) = grid3(:, i) + lgrid3(:)
385 END DO
386 ELSE
387 partner = taskid - 1
388 DO i = 1, nprocs, 2
389 CALL comm%sendrecv(grid3(:, i), partner, lgrid3, partner, 17)
390 grid3(:, i + 1) = grid3(:, i + 1) + lgrid3(:)
391 END DO
392 END IF
393 t4 = mpi_wtime() - t3 + threshold
394 ! now send a given buffer from 1 to 3 to 5 .. adding the right part of the data
395 ! since we've summed an lgrid does only need to pass by even or odd tasks
396 left = modulo(taskid - 2, nprocs)
397 right = modulo(taskid + 2, nprocs)
398 t3 = mpi_wtime()
399 lgrid3 = 0.0e0_dp
400 DO i = 1, nprocs, 2
401 lgrid3(:) = lgrid3 + grid3(:, modulo(taskid - i - 1, nprocs) + 1)
402 IF (i .EQ. nprocs - 1) EXIT
403 CALL comm%shift(lgrid3, 2)
404 END DO
405 t5 = mpi_wtime() - t3 + threshold
406 CALL comm%max(t4)
407 CALL comm%max(t5)
408 maxdiff = maxval(abs(lgrid3 - lgrid))
409 CALL comm%max(maxdiff)
410 IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "INVOLVED SHIFT ", t4 + t5, "(", t4, ",", t5, ")", maxdiff
411 END DO
412 DEALLOCATE (rcount)
413 DEALLOCATE (scount)
414 DEALLOCATE (sdispl)
415 DEALLOCATE (rdispl)
416 DEALLOCATE (buffer1)
417 DEALLOCATE (buffer2)
418 DEALLOCATE (buffer3)
419 DEALLOCATE (grid)
420 DEALLOCATE (grid2)
421 DEALLOCATE (grid3)
422 DEALLOCATE (lgrid)
423 DEALLOCATE (lgrid2)
424 DEALLOCATE (lgrid3)
425 DEALLOCATE (send_timings)
426 DEALLOCATE (send_timings2)
427#else
428 mark_used(comm)
429 mark_used(npow)
430 IF (output_unit > 0) WRITE (output_unit, *) "No MPI tests for a serial program"
431#endif
432 END SUBROUTINE mpi_perf_test
433
434END MODULE mp_perf_test
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always return a positive number. https://gcc....
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public dp
Definition kinds.F:34
Interface to the message passing library MPI.
Interface to the message passing library MPI.
subroutine, public mpi_perf_test(comm, npow, output_unit)
Tests the MPI library.