27 #include "../base/base_uses.f90"
29 #if defined(__parallel)
30 #if defined(__MPI_F08)
31 USE mpi_f08,
ONLY: mpi_wtime
33 USE mpi,
ONLY: mpi_wtime
55 CLASS(mp_comm_type),
INTENT(IN) :: comm
56 INTEGER,
INTENT(IN) :: npow, output_unit
58 #if defined(__parallel)
60 INTEGER :: i, itask, itests, j, jtask, left, nbufmax, &
61 ncount, ngrid, nloc, nprocs, ntot, partner, right, taskid, tag, source
62 INTEGER,
ALLOCATABLE,
DIMENSION(:) :: rcount, rdispl, scount, sdispl
64 REAL(kind=
dp) :: maxdiff, t1, &
66 REAL(kind=
dp),
ALLOCATABLE,
DIMENSION(:) :: buffer1, buffer2, buffer3, &
68 REAL(kind=
dp),
ALLOCATABLE, &
69 DIMENSION(:, :) :: grid, grid2, grid3, &
70 send_timings, send_timings2
71 REAL(kind=
dp),
PARAMETER :: threshold = 1.0e-8_dp
78 ionode = comm%is_source()
79 IF (ionode .AND. output_unit > 0)
THEN
80 WRITE (output_unit, *)
"Running with ", nprocs
81 WRITE (output_unit, *)
"running messages with npow = ", npow
82 WRITE (output_unit, *)
"use MPI X in the input for larger (e.g. 6) of smaller (e.g. 3) messages"
83 IF (
modulo(nprocs, 2) .NE. 0)
WRITE (output_unit, *)
"Testing only with an even number of tasks"
86 IF (
modulo(nprocs, 2) .NE. 0)
RETURN
93 ALLOCATE (rcount(nprocs))
94 ALLOCATE (scount(nprocs))
95 ALLOCATE (sdispl(nprocs))
96 ALLOCATE (rdispl(nprocs))
97 ALLOCATE (buffer1(nbufmax))
98 ALLOCATE (buffer2(nbufmax))
99 ALLOCATE (buffer3(nbufmax))
100 ALLOCATE (grid(nloc, nprocs))
101 ALLOCATE (grid2(nloc, nprocs))
102 ALLOCATE (grid3(nloc, nprocs))
103 ALLOCATE (lgrid(nloc))
104 ALLOCATE (lgrid2(nloc))
105 ALLOCATE (lgrid3(nloc))
106 ALLOCATE (send_timings(0:nprocs - 1, 0:nprocs - 1))
107 ALLOCATE (send_timings2(0:nprocs - 1, 0:nprocs - 1))
112 send_timings = 0.0_dp
113 send_timings2 = 0.0_dp
118 IF (ionode .AND. output_unit > 0)
THEN
119 WRITE (output_unit, *)
"Testing in memory copies just 1 CPU "
120 WRITE (output_unit, *)
" could tell something about the motherboard / cache / compiler "
125 IF (ncount .GT. nbufmax) cpabort(
"")
126 DO j = 1, 3**(npow - i)
129 buffer2(1:ncount) = buffer1(1:ncount)
130 t2 = t2 + mpi_wtime() - t1 + threshold
133 IF (ionode .AND. output_unit > 0)
THEN
134 WRITE (output_unit,
'(I9,A,F12.4,A)') 8*ncount,
" Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2,
" MB/s"
141 IF (ionode .AND. output_unit > 0)
THEN
142 WRITE (output_unit, *)
"Testing in memory copies all cpus"
143 WRITE (output_unit, *)
" is the memory bandwidth affected on an SMP machine ?"
148 IF (ncount .GT. nbufmax) cpabort(
"")
149 DO j = 1, 3**(npow - i)
152 buffer2(1:ncount) = buffer1(1:ncount)
153 t2 = t2 + mpi_wtime() - t1 + threshold
156 IF (ionode .AND. output_unit > 0)
THEN
157 WRITE (output_unit,
'(I9,A,F12.4,A)') 8*ncount,
" Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2,
" MB/s"
164 IF (ionode .AND. output_unit > 0)
THEN
165 WRITE (output_unit, *)
"Testing truly point to point communication (i with j only)"
166 WRITE (output_unit, *)
" is there some different connection between i j (e.g. shared memory comm)"
169 IF (ionode .AND. output_unit > 0)
WRITE (output_unit, *)
"For messages of ", ncount*8,
" bytes"
170 IF (ncount .GT. nbufmax) cpabort(
"")
171 DO itask = 0, nprocs - 1
172 DO jtask = itask + 1, nprocs - 1
175 IF (taskid .EQ. itask)
THEN
176 CALL comm%send(buffer1, jtask, itask*jtask)
178 IF (taskid .EQ. jtask)
THEN
181 CALL comm%recv(buffer1, source, tag)
183 send_timings(itask, jtask) = mpi_wtime() - t1 + threshold
186 CALL comm%max(send_timings, 0)
187 IF (ionode .AND. output_unit > 0)
THEN
188 DO itask = 0, nprocs - 1
189 DO jtask = itask + 1, nprocs - 1
190 WRITE (output_unit,
'(I4,I4,F12.4,A)') itask, jtask, ncount*8.0e-6_dp/send_timings(itask, jtask),
" MB/s"
198 IF (ionode .AND. output_unit > 0)
THEN
199 WRITE (output_unit, *)
"Testing all nearby point to point communication (0,1)(2,3)..."
200 WRITE (output_unit, *)
" these could / should all be on the same shared memory node "
205 IF (ncount .GT. nbufmax) cpabort(
"")
206 DO j = 1, 3**(npow - i)
209 IF (
modulo(taskid, 2) == 0)
THEN
210 CALL comm%send(buffer1, taskid + 1, 0)
214 CALL comm%recv(buffer1, source, tag)
216 t2 = t2 + mpi_wtime() - t1 + threshold
219 IF (ionode .AND. output_unit > 0)
THEN
220 WRITE (output_unit,
'(I9,A,F12.4,A)') 8*ncount,
" Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2,
" MB/s"
227 IF (ionode .AND. output_unit > 0)
THEN
228 WRITE (output_unit, *)
"Testing all far point to point communication (0,nprocs/2),(1,nprocs/2+1),.."
229 WRITE (output_unit, *)
" these could all be going over the network, and stress it a lot"
234 IF (ncount .GT. nbufmax) cpabort(
"")
235 DO j = 1, 3**(npow - i)
239 IF (taskid .LT. nprocs/2)
THEN
240 CALL comm%send(buffer1, taskid + nprocs/2, 0)
242 source = taskid - nprocs/2
244 CALL comm%recv(buffer1, source, tag)
246 t2 = t2 + mpi_wtime() - t1 + threshold
249 IF (ionode .AND. output_unit > 0)
THEN
250 WRITE (output_unit,
'(I9,A,F12.4,A)') 8*ncount,
" Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2,
" MB/s"
257 IF (ionode .AND. output_unit > 0)
THEN
258 WRITE (output_unit, *)
"Testing root to all broadcast "
259 WRITE (output_unit, *)
" using trees at least ? "
264 IF (ncount .GT. nbufmax) cpabort(
"")
265 DO j = 1, 3**(npow - i)
268 CALL comm%bcast(buffer1, 0)
269 t2 = t2 + mpi_wtime() - t1 + threshold
272 IF (ionode .AND. output_unit > 0)
THEN
273 WRITE (output_unit,
'(I9,A,F12.4,A)') 8*ncount,
" Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2,
" MB/s"
280 IF (ionode .AND. output_unit > 0)
WRITE (output_unit, *)
"Test global summation (mpi_allreduce) "
284 IF (ncount .GT. nbufmax) cpabort(
"")
285 DO j = 1, 3**(npow - i)
289 CALL comm%sum(buffer2)
290 t2 = t2 + mpi_wtime() - t1 + threshold
293 IF (ionode .AND. output_unit > 0)
THEN
294 WRITE (output_unit,
'(I9,A,F12.4,A)') 8*ncount,
" Bytes ", (3**(npow - i))*ncount*8.0e-6_dp/t2,
" MB/s"
301 IF (ionode .AND. output_unit > 0)
THEN
302 WRITE (output_unit, *)
"Test all to all communication (mpi_alltoallv)"
303 WRITE (output_unit, *)
" mpi/network getting confused ? "
308 IF (ncount .GT. nbufmax) cpabort(
"")
309 scount = ncount/nprocs
310 rcount = ncount/nprocs
312 sdispl(j) = (j - 1)*(ncount/nprocs)
313 rdispl(j) = (j - 1)*(ncount/nprocs)
315 DO j = 1, 3**(npow - i)
318 CALL comm%alltoall(buffer1, scount, sdispl, buffer2, rcount, rdispl)
319 t2 = t2 + mpi_wtime() - t1 + threshold
322 IF (ionode .AND. output_unit > 0)
THEN
323 WRITE (output_unit,
'(I9,A,F12.4,A)') 8*(ncount/nprocs)*nprocs,
" Bytes ", &
324 (3**(npow - i))*(ncount/nprocs)*nprocs*8.0e-6_dp/t2,
" MB/s"
331 IF (ionode .AND. output_unit > 0)
THEN
332 WRITE (output_unit, *)
" Clean tests completed "
333 WRITE (output_unit, *)
" Testing MPI_REDUCE scatter"
337 IF (ionode .AND. output_unit > 0) &
338 WRITE (output_unit, *)
"------------------------------- test ", itests,
" ------------------------"
342 grid(i, j) =
modulo(i*j*taskid, itests)
346 CALL comm%mp_sum_scatter_dv(grid, lgrid, rcount)
347 t2 = mpi_wtime() - t1 + threshold
349 IF (ionode .AND. output_unit > 0)
WRITE (output_unit, *)
"MPI_REDUCE_SCATTER ", t2
353 grid2(i, j) =
modulo(i*j*taskid, itests)
359 lgrid2(:) = lgrid2 + grid(:,
modulo(taskid - i, nprocs) + 1)
360 IF (i .EQ. nprocs)
EXIT
361 CALL comm%shift(lgrid2, 1)
363 t4 = mpi_wtime() - t3 + threshold
365 maxdiff = maxval(abs(lgrid2 - lgrid))
366 CALL comm%max(maxdiff)
367 IF (ionode .AND. output_unit > 0)
WRITE (output_unit, *)
"MPI_SENDRECV_REPLACE ", t4, maxdiff
369 IF (
modulo(nprocs, 2) /= 0) cpabort(
"")
372 grid3(i, j) =
modulo(i*j*taskid, itests)
380 IF (
modulo(taskid, 2) == 0)
THEN
383 CALL comm%sendrecv(grid3(:, i + 1), partner, lgrid3, partner, 17)
384 grid3(:, i) = grid3(:, i) + lgrid3(:)
389 CALL comm%sendrecv(grid3(:, i), partner, lgrid3, partner, 17)
390 grid3(:, i + 1) = grid3(:, i + 1) + lgrid3(:)
393 t4 = mpi_wtime() - t3 + threshold
396 left =
modulo(taskid - 2, nprocs)
397 right =
modulo(taskid + 2, nprocs)
401 lgrid3(:) = lgrid3 + grid3(:,
modulo(taskid - i - 1, nprocs) + 1)
402 IF (i .EQ. nprocs - 1)
EXIT
403 CALL comm%shift(lgrid3, 2)
405 t5 = mpi_wtime() - t3 + threshold
408 maxdiff = maxval(abs(lgrid3 - lgrid))
409 CALL comm%max(maxdiff)
410 IF (ionode .AND. output_unit > 0)
WRITE (output_unit, *)
"INVOLVED SHIFT ", t4 + t5,
"(", t4,
",", t5,
")", maxdiff
425 DEALLOCATE (send_timings)
426 DEALLOCATE (send_timings2)
430 IF (output_unit > 0)
WRITE (output_unit, *)
"No MPI tests for a serial program"
static GRID_HOST_DEVICE int modulo(int a, int m)
Equivalent of Fortran's MODULO, which always returns a nonnegative result for a positive modulus (the result takes the sign of the divisor). https://gcc.gnu.org/onlinedocs/gfortran/MODULO.html
Defines the basic variable types.
integer, parameter, public :: dp
Interface to the message passing library MPI.
Interface to the message passing library MPI.
subroutine, public mpi_perf_test(comm, npow, output_unit)
Tests the MPI library.