(git:374b731)
Loading...
Searching...
No Matches
statistical_methods.F
Go to the documentation of this file.
1!--------------------------------------------------------------------------------------------------!
2! CP2K: A general program to perform molecular dynamics simulations !
3! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4! !
5! SPDX-License-Identifier: GPL-2.0-or-later !
6!--------------------------------------------------------------------------------------------------!
7
8! **************************************************************************************************
9!> \brief Methods to perform on the fly statistical analysis of data
10!> -) Schiferl and Wallace, J. Chem. Phys. 83 (10) 1985
11!> \author Teodoro Laino (01.2007) [tlaino]
12!> \par History
13!> - Teodoro Laino (10.2008) [tlaino] - University of Zurich
14!> module made publicly available
15! **************************************************************************************************
19 USE kinds, ONLY: dp
20 USE util, ONLY: sort
21#include "./base/base_uses.f90"
22
23 IMPLICIT NONE
24
25 PRIVATE
26 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'statistical_methods'
27 LOGICAL, PARAMETER :: debug_this_module = .false.
28 INTEGER, PARAMETER, PUBLIC :: min_sample_size = 20
29 PUBLIC :: sw_test, &
30 k_test, &
32
33CONTAINS
34
35! **************************************************************************************************
36!> \brief Shapiro - Wilk's test or W-statistic to test normality of a distribution
37!> R94 APPL. STATIST. (1995) VOL.44, NO.4
38!> Calculates the Shapiro-Wilk W test and its significance level
39!> \param ix ...
40!> \param n ...
41!> \param w ...
42!> \param pw ...
43!> \par History
44!> Teodoro Laino (02.2007) [tlaino]
45! **************************************************************************************************
46 SUBROUTINE sw_test(ix, n, w, pw)
47 REAL(kind=dp), DIMENSION(:), POINTER :: ix
48 INTEGER, INTENT(IN) :: n
49 REAL(kind=dp), INTENT(OUT) :: w, pw
50
51 REAL(kind=dp), PARAMETER :: c1(6) = (/0.000000_dp, 0.221157_dp, -0.147981_dp, -2.071190_dp, &
52 4.434685_dp, -2.706056_dp/), c2(6) = (/0.000000_dp, 0.042981_dp, -0.293762_dp, &
53 -1.752461_dp, 5.682633_dp, -3.582633_dp/), &
54 c3(4) = (/0.544000_dp, -0.399780_dp, 0.025054_dp, -0.6714e-3_dp/), &
55 c4(4) = (/1.3822_dp, -0.77857_dp, 0.062767_dp, -0.0020322_dp/), &
56 c5(4) = (/-1.5861_dp, -0.31082_dp, -0.083751_dp, 0.0038915_dp/), &
57 c6(3) = (/-0.4803_dp, -0.082676_dp, 0.0030302_dp/), g(2) = (/-2.273_dp, 0.459_dp/), &
58 one = 1.0_dp, pi6 = 1.909859_dp, qtr = 0.25_dp, small = epsilon(0.0_dp), &
59 sqrth = 0.70711_dp
60 REAL(kind=dp), PARAMETER :: stqr = 1.047198_dp, th = 0.375_dp, two = 2.0_dp, zero = 0.0_dp
61
62 INTEGER :: i, i1, j, n2, output_unit
63 INTEGER, ALLOCATABLE, DIMENSION(:) :: itmp
64 LOGICAL :: failure
65 REAL(kind=dp) :: a1, a2, an, an25, asa, fac, gamma, m, &
66 range, rsn, s, sa, sax, ssa, ssassx, &
67 ssumm2, ssx, summ2, sx, w1, xi, xsx, &
68 xx, y
69 REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: a, x
70
71 failure = .false.
72 output_unit = cp_logger_get_default_io_unit()
73 ! Check for N < 3
74 IF (n < 3 .OR. n > 5000) THEN
75 IF (output_unit > 0) WRITE (output_unit, '(A)') &
76 "Shapiro Wilk test: number of points less than 3 or greated than 5000."
77 IF (output_unit > 0) WRITE (output_unit, '(A)') &
78 "Not able to perform the test!"
79 END IF
80 ! Sort input array of data in ascending order
81 IF (mod(n, 2) == 0) THEN
82 n2 = n/2
83 ELSE
84 n2 = (n - 1)/2
85 END IF
86 ALLOCATE (x(n))
87 ALLOCATE (itmp(n))
88 ALLOCATE (a(n2))
89 x(:) = ix
90 CALL sort(x, n, itmp)
91 ! Check for zero range
92 range = x(n) - x(1)
93 IF (range < small) failure = .true.
94 IF (failure .AND. (output_unit > 0)) THEN
95 WRITE (output_unit, '(A)') "Shapiro Wilk test: two data points are numerically identical."
96 WRITE (output_unit, '(A)') "Not able to perform the test!"
97 END IF
98 pw = one
99 w = one
100 an = n
101 ! Calculates coefficients for the test
102 IF (n == 3) THEN
103 a(1) = sqrth
104 ELSE
105 an25 = an + qtr
106 summ2 = zero
107 DO i = 1, n2
108 CALL ppnd7((i - th)/an25, a(i))
109 summ2 = summ2 + a(i)**2
110 END DO
111 summ2 = summ2*two
112 ssumm2 = sqrt(summ2)
113 rsn = one/sqrt(an)
114 a1 = poly(c1, 6, rsn) - a(1)/ssumm2
115 ! Normalize coefficients
116 IF (n > 5) THEN
117 i1 = 3
118 a2 = -a(2)/ssumm2 + poly(c2, 6, rsn)
119 fac = sqrt((summ2 - two*a(1)**2 - two*a(2)**2)/(one - two*a1**2 - two*a2**2))
120 a(1) = a1
121 a(2) = a2
122 ELSE
123 i1 = 2
124 fac = sqrt((summ2 - two*a(1)**2)/(one - two*a1**2))
125 a(1) = a1
126 END IF
127 DO i = i1, n2
128 a(i) = -a(i)/fac
129 END DO
130 END IF
131 ! scaled X
132 xx = x(1)/range
133 sx = xx
134 sa = -a(1)
135 j = n - 1
136 DO i = 2, n
137 xi = x(i)/range
138 sx = sx + xi
139 IF (i /= j) sa = sa + sign(1, i - j)*a(min(i, j))
140 xx = xi
141 j = j - 1
142 END DO
143 ! Calculate W statistic as squared correlation
144 ! between data and coefficients
145 sa = sa/n
146 sx = sx/n
147 ssa = zero
148 ssx = zero
149 sax = zero
150 j = n
151 DO i = 1, n
152 IF (i /= j) THEN
153 asa = sign(1, i - j)*a(min(i, j)) - sa
154 ELSE
155 asa = -sa
156 END IF
157 xsx = x(i)/range - sx
158 ssa = ssa + asa*asa
159 ssx = ssx + xsx*xsx
160 sax = sax + asa*xsx
161 j = j - 1
162 END DO
163 ! W1 equals (1-W) calculated to avoid excessive rounding error
164 ! for W very near 1 (a potential problem in very large samples)
165 ssassx = sqrt(ssa*ssx)
166 w1 = (ssassx - sax)*(ssassx + sax)/(ssa*ssx)
167 w = one - w1
168 ! Calculate significance level for W (exact for N=3)
169 IF (n == 3) THEN
170 pw = pi6*(asin(sqrt(w)) - stqr)
171 ELSE
172 y = log(w1)
173 xx = log(an)
174 m = zero
175 s = one
176 IF (n <= 11) THEN
177 gamma = poly(g, 2, an)
178 IF (y >= gamma) THEN
179 pw = small
180 ELSE
181 y = -log(gamma - y)
182 m = poly(c3, 4, an)
183 s = exp(poly(c4, 4, an))
184 pw = alnorm((y - m)/s, .true.)
185 END IF
186 ELSE
187 m = poly(c5, 4, xx)
188 s = exp(poly(c6, 3, xx))
189 pw = alnorm((y - m)/s, .true.)
190 END IF
191 END IF
192 DEALLOCATE (x)
193 DEALLOCATE (itmp)
194 DEALLOCATE (a)
195
196 END SUBROUTINE sw_test
197
198! **************************************************************************************************
199!> \brief Produces the normal deviate Z corresponding to a given lower tail area of P
200!> Z is accurate to about 1 part in 10**7.
201!> AS241 APPL. STATIST. (1988) VOL. 37, NO. 3, 477- 484.
202!> \param p ...
203!> \param normal_dev ...
204!> \par History
205!> Original version by Alain J. Miller - 1996
206!> Teodoro Laino (02.2007) [tlaino]
207! **************************************************************************************************
208 SUBROUTINE ppnd7(p, normal_dev)
209 REAL(kind=dp), INTENT(IN) :: p
210 REAL(kind=dp), INTENT(OUT) :: normal_dev
211
212 REAL(kind=dp), PARAMETER :: a0 = 3.3871327179e+00_dp, a1 = 5.0434271938e+01_dp, &
213 a2 = 1.5929113202e+02_dp, a3 = 5.9109374720e+01_dp, b1 = 1.7895169469e+01_dp, &
214 b2 = 7.8757757664e+01_dp, b3 = 6.7187563600e+01_dp, c0 = 1.4234372777e+00_dp, &
215 c1 = 2.7568153900e+00_dp, c2 = 1.3067284816e+00_dp, c3 = 1.7023821103e-01_dp, &
216 const1 = 0.180625_dp, const2 = 1.6_dp, d1 = 7.3700164250e-01_dp, &
217 d2 = 1.2021132975e-01_dp, e0 = 6.6579051150e+00_dp, e1 = 3.0812263860e+00_dp, &
218 e2 = 4.2868294337e-01_dp, e3 = 1.7337203997e-02_dp, f1 = 2.4197894225e-01_dp, &
219 f2 = 1.2258202635e-02_dp, half = 0.5_dp, one = 1.0_dp
220 REAL(kind=dp), PARAMETER :: split1 = 0.425_dp, split2 = 5.0_dp, zero = 0.0_dp
221
222 REAL(kind=dp) :: q, r
223
224 q = p - half
225 IF (abs(q) <= split1) THEN
226 r = const1 - q*q
227 normal_dev = q*(((a3*r + a2)*r + a1)*r + a0)/ &
228 (((b3*r + b2)*r + b1)*r + one)
229 RETURN
230 ELSE
231 IF (q < zero) THEN
232 r = p
233 ELSE
234 r = one - p
235 END IF
236 IF (r <= zero) THEN
237 normal_dev = zero
238 RETURN
239 END IF
240 r = sqrt(-log(r))
241 IF (r <= split2) THEN
242 r = r - const2
243 normal_dev = (((c3*r + c2)*r + c1)*r + c0)/((d2*r + d1)*r + one)
244 ELSE
245 r = r - split2
246 normal_dev = (((e3*r + e2)*r + e1)*r + e0)/((f2*r + f1)*r + one)
247 END IF
248 IF (q < zero) normal_dev = -normal_dev
249 RETURN
250 END IF
251 END SUBROUTINE ppnd7
252
253! **************************************************************************************************
254!> \brief Evaluates the tail area of the standardised normal curve
255!> from x to infinity if upper is .true. or
256!> from minus infinity to x if upper is .false.
257!> AS66 Applied Statistics (1973) vol.22, no.3
258!> \param x ...
259!> \param upper ...
260!> \return ...
261!> \par History
262!> Original version by Alain J. Miller - 1996
263!> Teodoro Laino (02.2007) [tlaino]
264! **************************************************************************************************
265 FUNCTION alnorm(x, upper) RESULT(fn_val)
266 REAL(kind=dp), INTENT(IN) :: x
267 LOGICAL, INTENT(IN) :: upper
268 REAL(kind=dp) :: fn_val
269
270 REAL(kind=dp), PARAMETER :: a1 = 5.75885480458_dp, a2 = 2.62433121679_dp, &
271 a3 = 5.92885724438_dp, b1 = -29.8213557807_dp, b2 = 48.6959930692_dp, c1 = -3.8052e-8_dp, &
272 c2 = 3.98064794e-4_dp, c3 = -0.151679116635_dp, c4 = 4.8385912808_dp, &
273 c5 = 0.742380924027_dp, c6 = 3.99019417011_dp, con = 1.28_dp, d1 = 1.00000615302_dp, &
274 d2 = 1.98615381364_dp, d3 = 5.29330324926_dp, d4 = -15.1508972451_dp, &
275 d5 = 30.789933034_dp, half = 0.5_dp, ltone = 7.0_dp, one = 1.0_dp, p = 0.398942280444_dp, &
276 q = 0.39990348504_dp, r = 0.398942280385_dp, utzero = 18.66_dp, zero = 0.0_dp
277
278 LOGICAL :: up
279 REAL(kind=dp) :: y, z
280
281 up = upper
282 z = x
283 IF (z < zero) THEN
284 up = .NOT. up
285 z = -z
286 END IF
287 IF (.NOT. (z <= ltone .OR. up .AND. z <= utzero)) THEN
288 fn_val = zero
289 IF (.NOT. up) fn_val = one - fn_val
290 RETURN
291 END IF
292 y = half*z*z
293 IF (z <= con) THEN
294 fn_val = r*exp(-y)/(z + c1 + d1/(z + c2 + d2/(z + c3 + d3/(z + c4 + d4/(z + c5 + d5/(z + c6))))))
295 ELSE
296 fn_val = half - z*(p - q*y/(y + a1 + b1/(y + a2 + b2/(y + a3))))
297 END IF
298 IF (.NOT. up) fn_val = one - fn_val
299
300 END FUNCTION alnorm
301
302! **************************************************************************************************
303!> \brief Calculates the algebraic polynomial of order nored-1 with
304!> array of coefficients c. Zero order coefficient is c(1)
305!> AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
306!> \param c ...
307!> \param nord ...
308!> \param x ...
309!> \return ...
310!> \par History
311!> Original version by Alain J. Miller - 1996
312!> Teodoro Laino (02.2007) [tlaino]
313! **************************************************************************************************
314 FUNCTION poly(c, nord, x) RESULT(fn_val)
315
316 REAL(kind=dp), INTENT(IN) :: c(:)
317 INTEGER, INTENT(IN) :: nord
318 REAL(kind=dp), INTENT(IN) :: x
319 REAL(kind=dp) :: fn_val
320
321 INTEGER :: i, j, n2
322 REAL(kind=dp) :: p
323
324 fn_val = c(1)
325 IF (nord == 1) RETURN
326 p = x*c(nord)
327 IF (nord == 2) THEN
328 fn_val = fn_val + p
329 RETURN
330 END IF
331 n2 = nord - 2
332 j = n2 + 1
333 DO i = 1, n2
334 p = (p + c(j))*x
335 j = j - 1
336 END DO
337 fn_val = fn_val + p
338 END FUNCTION poly
339
340! **************************************************************************************************
341!> \brief Kandall's test for correlation
342!> \param xdata ...
343!> \param istart ...
344!> \param n ...
345!> \param tau ...
346!> \param z ...
347!> \param prob ...
348!> \par History
349!> Teodoro Laino (02.2007) [tlaino]
350!> \note
351!> tau: Kendall's Tau
352!> z: number of std devs from 0 of tau
353!> prob: tau's probability
354! **************************************************************************************************
355 SUBROUTINE k_test(xdata, istart, n, tau, z, prob)
356 REAL(kind=dp), DIMENSION(:), POINTER :: xdata
357 INTEGER, INTENT(IN) :: istart, n
358 REAL(kind=dp) :: tau, z, prob
359
360 INTEGER :: is, j, k, nt
361 REAL(kind=dp) :: a1, var
362
363 nt = n - istart + 1
364 IF (nt .GE. min_sample_size) THEN
365 is = 0
366 DO j = istart, n - 1
367 DO k = j + 1, n
368 a1 = xdata(j) - xdata(k)
369 IF (a1 .GT. 0.0_dp) THEN
370 is = is + 1
371 ELSE IF (a1 .LT. 0.0_dp) THEN
372 is = is - 1
373 END IF
374 END DO
375 END DO
376 tau = real(is, kind=dp)
377 var = real(nt, kind=dp)*real(nt - 1, kind=dp)*real(2*nt + 5, kind=dp)/18.0_dp
378 z = tau/sqrt(var)
379 prob = erf(abs(z)/sqrt(2.0_dp))
380 ELSE
381 tau = 0.0_dp
382 z = 0.0_dp
383 prob = 1.0_dp
384 END IF
385 END SUBROUTINE k_test
386
387! **************************************************************************************************
388!> \brief Von Neumann test for serial correlation
389!> \param xdata ...
390!> \param n ...
391!> \param r ...
392!> \param u ...
393!> \param prob ...
394!> \par History
395!> Teodoro Laino (02.2007) [tlaino]
396! **************************************************************************************************
397 SUBROUTINE vn_test(xdata, n, r, u, prob)
398 REAL(kind=dp), DIMENSION(:), POINTER :: xdata
399 INTEGER, INTENT(IN) :: n
400 REAL(kind=dp) :: r, u, prob
401
402 INTEGER :: i
403 REAL(kind=dp) :: q, s, var, x
404
405 IF (n .GE. min_sample_size) THEN
406 x = 0.0_dp
407 q = 0.0_dp
408 s = 0.0_dp
409 DO i = 1, n - 1
410 x = x + xdata(i)
411 q = q + (xdata(i + 1) - xdata(i))**2
412 END DO
413 x = x + xdata(n)
414 x = x/real(n, kind=dp)
415 DO i = 1, n
416 s = s + (xdata(i) - x)**2
417 END DO
418 s = s/real(n - 1, kind=dp)
419 q = q/real(2*(n - 1), kind=dp)
420 r = q/s
421 var = sqrt(1.0_dp/real(n + 1, kind=dp)*(1.0_dp + 1.0_dp/real(n - 1, kind=dp)))
422 u = (r - 1.0_dp)/var
423 prob = erf(abs(u)/sqrt(2.0_dp))
424 ELSE
425 r = 0.0_dp
426 u = 0.0_dp
427 prob = 1.0_dp
428 END IF
429
430 END SUBROUTINE vn_test
431
432! **************************************************************************************************
433!> \brief Performs tests on statistical methods
434!> Debug use only
435!> \param xdata ...
436!> \param globenv ...
437!> \par History
438!> Teodoro Laino (02.2007) [tlaino]
439! **************************************************************************************************
440 SUBROUTINE tests(xdata, globenv)
441 REAL(kind=dp), DIMENSION(:), POINTER :: xdata
442 TYPE(global_environment_type), POINTER :: globenv
443
444 INTEGER :: i, n
445 REAL(kind=dp) :: prob, pw, r, tau, u, w, z
446 REAL(kind=dp), DIMENSION(:), POINTER :: ydata
447
448 IF (debug_this_module) THEN
449 n = 50 ! original sample size
450 NULLIFY (xdata)
451 ALLOCATE (xdata(n))
452 DO i = 1, 10
453 xdata(i) = 5.0_dp - real(i, kind=dp)/2.0_dp + 0.1*globenv%gaussian_rng_stream%next()
454 WRITE (3, *) xdata(i)
455 END DO
456 DO i = 11, n
457 xdata(i) = 0.1*globenv%gaussian_rng_stream%next()
458 END DO
459
460 ! Test for trend
461 DO i = 1, n
462 CALL k_test(xdata, i, n, tau, z, prob)
463 IF (prob <= 0.2_dp) EXIT
464 END DO
465 WRITE (*, *) "Mann-Kendall test", i
466
467 ! Test for normality distribution and for serial correlation
468 DO i = 1, n
469 ALLOCATE (ydata(n - i + 1))
470 ydata = xdata(i:n)
471 CALL sw_test(ydata, n - i + 1, w, pw)
472 CALL vn_test(ydata, n - i + 1, r, u, prob)
473 WRITE (*, *) "Shapiro Wilks test", i, w, pw
474 WRITE (*, *) "Von Neu", i, r, u, prob
475 DEALLOCATE (ydata)
476 END DO
477
478 DEALLOCATE (xdata)
479 END IF
480 END SUBROUTINE tests
481
482END MODULE statistical_methods
483
static GRID_HOST_DEVICE double fac(const int i)
Factorial function, e.g. fac(5) = 5! = 120.
Definition grid_common.h:48
static GRID_HOST_DEVICE orbital up(const int i, const orbital a)
Increase i'th component of given orbital angular momentum.
various routines to log and control the output. The idea is that decisions about where to log should ...
integer function, public cp_logger_get_default_io_unit(logger)
returns the unit nr for the ionode (-1 on all other processors) skips as well checks if the procs cal...
Calculation of the incomplete Gamma function F_n(t) for multi-center integrals over Cartesian Gaussia...
Definition gamma.F:15
Define type storing the global information of a run. Keep the amount of stored data small....
Defines the basic variable types.
Definition kinds.F:23
integer, parameter, public dp
Definition kinds.F:34
Methods to perform on the fly statistical analysis of data -) Schiferl and Wallace,...
subroutine, public vn_test(xdata, n, r, u, prob)
Von Neumann test for serial correlation.
integer, parameter, public min_sample_size
subroutine, public k_test(xdata, istart, n, tau, z, prob)
Kandall's test for correlation.
subroutine, public sw_test(ix, n, w, pw)
Shapiro - Wilk's test or W-statistic to test normality of a distribution R94 APPL....
All kind of helpful little routines.
Definition util.F:14
contains the initially parsed file and the initial parallel environment