(git:0de0cc2)
statistical_methods.F
Go to the documentation of this file.
1 !--------------------------------------------------------------------------------------------------!
2 ! CP2K: A general program to perform molecular dynamics simulations !
3 ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 ! !
5 ! SPDX-License-Identifier: GPL-2.0-or-later !
6 !--------------------------------------------------------------------------------------------------!
7 
8 ! **************************************************************************************************
9 !> \brief Methods to perform on the fly statistical analysis of data
10 !> -) Schiferl and Wallace, J. Chem. Phys. 83 (10) 1985
11 !> \author Teodoro Laino (01.2007) [tlaino]
12 !> \par History
13 !> - Teodoro Laino (10.2008) [tlaino] - University of Zurich
14 !> module made publicly available
15 ! **************************************************************************************************
18  USE global_types, ONLY: global_environment_type
19  USE kinds, ONLY: dp
20  USE util, ONLY: sort
21 #include "./base/base_uses.f90"
22 
23  IMPLICIT NONE
24 
25  PRIVATE
26  CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'statistical_methods'
27  LOGICAL, PARAMETER :: debug_this_module = .false.
28  INTEGER, PARAMETER, PUBLIC :: min_sample_size = 20
29  PUBLIC :: sw_test, &
30  k_test, &
31  vn_test
32 
33 CONTAINS
34 
35 ! **************************************************************************************************
36 !> \brief Shapiro - Wilk's test or W-statistic to test normality of a distribution
37 !> R94 APPL. STATIST. (1995) VOL.44, NO.4
38 !> Calculates the Shapiro-Wilk W test and its significance level
39 !> \param ix ...
40 !> \param n ...
41 !> \param w ...
42 !> \param pw ...
43 !> \par History
44 !> Teodoro Laino (02.2007) [tlaino]
45 ! **************************************************************************************************
46  SUBROUTINE sw_test(ix, n, w, pw)
47  REAL(kind=dp), DIMENSION(:), POINTER :: ix
48  INTEGER, INTENT(IN) :: n
49  REAL(kind=dp), INTENT(OUT) :: w, pw
50 
51  REAL(kind=dp), PARAMETER :: c1(6) = (/0.000000_dp, 0.221157_dp, -0.147981_dp, -2.071190_dp, &
52  4.434685_dp, -2.706056_dp/), c2(6) = (/0.000000_dp, 0.042981_dp, -0.293762_dp, &
53  -1.752461_dp, 5.682633_dp, -3.582633_dp/), &
54  c3(4) = (/0.544000_dp, -0.399780_dp, 0.025054_dp, -0.6714e-3_dp/), &
55  c4(4) = (/1.3822_dp, -0.77857_dp, 0.062767_dp, -0.0020322_dp/), &
56  c5(4) = (/-1.5861_dp, -0.31082_dp, -0.083751_dp, 0.0038915_dp/), &
57  c6(3) = (/-0.4803_dp, -0.082676_dp, 0.0030302_dp/), g(2) = (/-2.273_dp, 0.459_dp/), &
58  one = 1.0_dp, pi6 = 1.909859_dp, qtr = 0.25_dp, small = epsilon(0.0_dp), &
59  sqrth = 0.70711_dp
60  REAL(kind=dp), PARAMETER :: stqr = 1.047198_dp, th = 0.375_dp, two = 2.0_dp, zero = 0.0_dp
61 
62  INTEGER :: i, i1, j, n2, output_unit
63  INTEGER, ALLOCATABLE, DIMENSION(:) :: itmp
64  LOGICAL :: failure
65  REAL(kind=dp) :: a1, a2, an, an25, asa, fac, gamma, m, &
66  range, rsn, s, sa, sax, ssa, ssassx, &
67  ssumm2, ssx, summ2, sx, w1, xi, xsx, &
68  xx, y
69  REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: a, x
70 
71  failure = .false.
72  output_unit = cp_logger_get_default_io_unit()
73  ! Check for N < 3
74  IF (n < 3 .OR. n > 5000) THEN
75  IF (output_unit > 0) WRITE (output_unit, '(A)') &
76  "Shapiro Wilk test: number of points less than 3 or greated than 5000."
77  IF (output_unit > 0) WRITE (output_unit, '(A)') &
78  "Not able to perform the test!"
79  END IF
80  ! Sort input array of data in ascending order
81  IF (mod(n, 2) == 0) THEN
82  n2 = n/2
83  ELSE
84  n2 = (n - 1)/2
85  END IF
86  ALLOCATE (x(n))
87  ALLOCATE (itmp(n))
88  ALLOCATE (a(n2))
89  x(:) = ix
90  CALL sort(x, n, itmp)
91  ! Check for zero range
92  range = x(n) - x(1)
93  IF (range < small) failure = .true.
94  IF (failure .AND. (output_unit > 0)) THEN
95  WRITE (output_unit, '(A)') "Shapiro Wilk test: two data points are numerically identical."
96  WRITE (output_unit, '(A)') "Not able to perform the test!"
97  END IF
98  pw = one
99  w = one
100  an = n
101  ! Calculates coefficients for the test
102  IF (n == 3) THEN
103  a(1) = sqrth
104  ELSE
105  an25 = an + qtr
106  summ2 = zero
107  DO i = 1, n2
108  CALL ppnd7((i - th)/an25, a(i))
109  summ2 = summ2 + a(i)**2
110  END DO
111  summ2 = summ2*two
112  ssumm2 = sqrt(summ2)
113  rsn = one/sqrt(an)
114  a1 = poly(c1, 6, rsn) - a(1)/ssumm2
115  ! Normalize coefficients
116  IF (n > 5) THEN
117  i1 = 3
118  a2 = -a(2)/ssumm2 + poly(c2, 6, rsn)
119  fac = sqrt((summ2 - two*a(1)**2 - two*a(2)**2)/(one - two*a1**2 - two*a2**2))
120  a(1) = a1
121  a(2) = a2
122  ELSE
123  i1 = 2
124  fac = sqrt((summ2 - two*a(1)**2)/(one - two*a1**2))
125  a(1) = a1
126  END IF
127  DO i = i1, n2
128  a(i) = -a(i)/fac
129  END DO
130  END IF
131  ! scaled X
132  xx = x(1)/range
133  sx = xx
134  sa = -a(1)
135  j = n - 1
136  DO i = 2, n
137  xi = x(i)/range
138  sx = sx + xi
139  IF (i /= j) sa = sa + sign(1, i - j)*a(min(i, j))
140  xx = xi
141  j = j - 1
142  END DO
143  ! Calculate W statistic as squared correlation
144  ! between data and coefficients
145  sa = sa/n
146  sx = sx/n
147  ssa = zero
148  ssx = zero
149  sax = zero
150  j = n
151  DO i = 1, n
152  IF (i /= j) THEN
153  asa = sign(1, i - j)*a(min(i, j)) - sa
154  ELSE
155  asa = -sa
156  END IF
157  xsx = x(i)/range - sx
158  ssa = ssa + asa*asa
159  ssx = ssx + xsx*xsx
160  sax = sax + asa*xsx
161  j = j - 1
162  END DO
163  ! W1 equals (1-W) calculated to avoid excessive rounding error
164  ! for W very near 1 (a potential problem in very large samples)
165  ssassx = sqrt(ssa*ssx)
166  w1 = (ssassx - sax)*(ssassx + sax)/(ssa*ssx)
167  w = one - w1
168  ! Calculate significance level for W (exact for N=3)
169  IF (n == 3) THEN
170  pw = pi6*(asin(sqrt(w)) - stqr)
171  ELSE
172  y = log(w1)
173  xx = log(an)
174  m = zero
175  s = one
176  IF (n <= 11) THEN
177  gamma = poly(g, 2, an)
178  IF (y >= gamma) THEN
179  pw = small
180  ELSE
181  y = -log(gamma - y)
182  m = poly(c3, 4, an)
183  s = exp(poly(c4, 4, an))
184  pw = alnorm((y - m)/s, .true.)
185  END IF
186  ELSE
187  m = poly(c5, 4, xx)
188  s = exp(poly(c6, 3, xx))
189  pw = alnorm((y - m)/s, .true.)
190  END IF
191  END IF
192  DEALLOCATE (x)
193  DEALLOCATE (itmp)
194  DEALLOCATE (a)
195 
196  END SUBROUTINE sw_test
197 
198 ! **************************************************************************************************
199 !> \brief Produces the normal deviate Z corresponding to a given lower tail area of P
200 !> Z is accurate to about 1 part in 10**7.
201 !> AS241 APPL. STATIST. (1988) VOL. 37, NO. 3, 477- 484.
202 !> \param p ...
203 !> \param normal_dev ...
204 !> \par History
205 !> Original version by Alain J. Miller - 1996
206 !> Teodoro Laino (02.2007) [tlaino]
207 ! **************************************************************************************************
208  SUBROUTINE ppnd7(p, normal_dev)
209  REAL(kind=dp), INTENT(IN) :: p
210  REAL(kind=dp), INTENT(OUT) :: normal_dev
211 
212  REAL(kind=dp), PARAMETER :: a0 = 3.3871327179e+00_dp, a1 = 5.0434271938e+01_dp, &
213  a2 = 1.5929113202e+02_dp, a3 = 5.9109374720e+01_dp, b1 = 1.7895169469e+01_dp, &
214  b2 = 7.8757757664e+01_dp, b3 = 6.7187563600e+01_dp, c0 = 1.4234372777e+00_dp, &
215  c1 = 2.7568153900e+00_dp, c2 = 1.3067284816e+00_dp, c3 = 1.7023821103e-01_dp, &
216  const1 = 0.180625_dp, const2 = 1.6_dp, d1 = 7.3700164250e-01_dp, &
217  d2 = 1.2021132975e-01_dp, e0 = 6.6579051150e+00_dp, e1 = 3.0812263860e+00_dp, &
218  e2 = 4.2868294337e-01_dp, e3 = 1.7337203997e-02_dp, f1 = 2.4197894225e-01_dp, &
219  f2 = 1.2258202635e-02_dp, half = 0.5_dp, one = 1.0_dp
220  REAL(kind=dp), PARAMETER :: split1 = 0.425_dp, split2 = 5.0_dp, zero = 0.0_dp
221 
222  REAL(kind=dp) :: q, r
223 
224  q = p - half
225  IF (abs(q) <= split1) THEN
226  r = const1 - q*q
227  normal_dev = q*(((a3*r + a2)*r + a1)*r + a0)/ &
228  (((b3*r + b2)*r + b1)*r + one)
229  RETURN
230  ELSE
231  IF (q < zero) THEN
232  r = p
233  ELSE
234  r = one - p
235  END IF
236  IF (r <= zero) THEN
237  normal_dev = zero
238  RETURN
239  END IF
240  r = sqrt(-log(r))
241  IF (r <= split2) THEN
242  r = r - const2
243  normal_dev = (((c3*r + c2)*r + c1)*r + c0)/((d2*r + d1)*r + one)
244  ELSE
245  r = r - split2
246  normal_dev = (((e3*r + e2)*r + e1)*r + e0)/((f2*r + f1)*r + one)
247  END IF
248  IF (q < zero) normal_dev = -normal_dev
249  RETURN
250  END IF
251  END SUBROUTINE ppnd7
252 
253 ! **************************************************************************************************
254 !> \brief Evaluates the tail area of the standardised normal curve
255 !> from x to infinity if upper is .true. or
256 !> from minus infinity to x if upper is .false.
257 !> AS66 Applied Statistics (1973) vol.22, no.3
258 !> \param x ...
259 !> \param upper ...
260 !> \return ...
261 !> \par History
262 !> Original version by Alain J. Miller - 1996
263 !> Teodoro Laino (02.2007) [tlaino]
264 ! **************************************************************************************************
265  FUNCTION alnorm(x, upper) RESULT(fn_val)
266  REAL(kind=dp), INTENT(IN) :: x
267  LOGICAL, INTENT(IN) :: upper
268  REAL(kind=dp) :: fn_val
269 
270  REAL(kind=dp), PARAMETER :: a1 = 5.75885480458_dp, a2 = 2.62433121679_dp, &
271  a3 = 5.92885724438_dp, b1 = -29.8213557807_dp, b2 = 48.6959930692_dp, c1 = -3.8052e-8_dp, &
272  c2 = 3.98064794e-4_dp, c3 = -0.151679116635_dp, c4 = 4.8385912808_dp, &
273  c5 = 0.742380924027_dp, c6 = 3.99019417011_dp, con = 1.28_dp, d1 = 1.00000615302_dp, &
274  d2 = 1.98615381364_dp, d3 = 5.29330324926_dp, d4 = -15.1508972451_dp, &
275  d5 = 30.789933034_dp, half = 0.5_dp, ltone = 7.0_dp, one = 1.0_dp, p = 0.398942280444_dp, &
276  q = 0.39990348504_dp, r = 0.398942280385_dp, utzero = 18.66_dp, zero = 0.0_dp
277 
278  LOGICAL :: up
279  REAL(kind=dp) :: y, z
280 
281  up = upper
282  z = x
283  IF (z < zero) THEN
284  up = .NOT. up
285  z = -z
286  END IF
287  IF (.NOT. (z <= ltone .OR. up .AND. z <= utzero)) THEN
288  fn_val = zero
289  IF (.NOT. up) fn_val = one - fn_val
290  RETURN
291  END IF
292  y = half*z*z
293  IF (z <= con) THEN
294  fn_val = r*exp(-y)/(z + c1 + d1/(z + c2 + d2/(z + c3 + d3/(z + c4 + d4/(z + c5 + d5/(z + c6))))))
295  ELSE
296  fn_val = half - z*(p - q*y/(y + a1 + b1/(y + a2 + b2/(y + a3))))
297  END IF
298  IF (.NOT. up) fn_val = one - fn_val
299 
300  END FUNCTION alnorm
301 
302 ! **************************************************************************************************
303 !> \brief Calculates the algebraic polynomial of order nored-1 with
304 !> array of coefficients c. Zero order coefficient is c(1)
305 !> AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
306 !> \param c ...
307 !> \param nord ...
308 !> \param x ...
309 !> \return ...
310 !> \par History
311 !> Original version by Alain J. Miller - 1996
312 !> Teodoro Laino (02.2007) [tlaino]
313 ! **************************************************************************************************
314  FUNCTION poly(c, nord, x) RESULT(fn_val)
315 
316  REAL(kind=dp), INTENT(IN) :: c(:)
317  INTEGER, INTENT(IN) :: nord
318  REAL(kind=dp), INTENT(IN) :: x
319  REAL(kind=dp) :: fn_val
320 
321  INTEGER :: i, j, n2
322  REAL(kind=dp) :: p
323 
324  fn_val = c(1)
325  IF (nord == 1) RETURN
326  p = x*c(nord)
327  IF (nord == 2) THEN
328  fn_val = fn_val + p
329  RETURN
330  END IF
331  n2 = nord - 2
332  j = n2 + 1
333  DO i = 1, n2
334  p = (p + c(j))*x
335  j = j - 1
336  END DO
337  fn_val = fn_val + p
338  END FUNCTION poly
339 
340 ! **************************************************************************************************
341 !> \brief Kandall's test for correlation
342 !> \param xdata ...
343 !> \param istart ...
344 !> \param n ...
345 !> \param tau ...
346 !> \param z ...
347 !> \param prob ...
348 !> \par History
349 !> Teodoro Laino (02.2007) [tlaino]
350 !> \note
351 !> tau: Kendall's Tau
352 !> z: number of std devs from 0 of tau
353 !> prob: tau's probability
354 ! **************************************************************************************************
355  SUBROUTINE k_test(xdata, istart, n, tau, z, prob)
356  REAL(kind=dp), DIMENSION(:), POINTER :: xdata
357  INTEGER, INTENT(IN) :: istart, n
358  REAL(kind=dp) :: tau, z, prob
359 
360  INTEGER :: is, j, k, nt
361  REAL(kind=dp) :: a1, var
362 
363  nt = n - istart + 1
364  IF (nt .GE. min_sample_size) THEN
365  is = 0
366  DO j = istart, n - 1
367  DO k = j + 1, n
368  a1 = xdata(j) - xdata(k)
369  IF (a1 .GT. 0.0_dp) THEN
370  is = is + 1
371  ELSE IF (a1 .LT. 0.0_dp) THEN
372  is = is - 1
373  END IF
374  END DO
375  END DO
376  tau = real(is, kind=dp)
377  var = real(nt, kind=dp)*real(nt - 1, kind=dp)*real(2*nt + 5, kind=dp)/18.0_dp
378  z = tau/sqrt(var)
379  prob = erf(abs(z)/sqrt(2.0_dp))
380  ELSE
381  tau = 0.0_dp
382  z = 0.0_dp
383  prob = 1.0_dp
384  END IF
385  END SUBROUTINE k_test
386 
387 ! **************************************************************************************************
388 !> \brief Von Neumann test for serial correlation
389 !> \param xdata ...
390 !> \param n ...
391 !> \param r ...
392 !> \param u ...
393 !> \param prob ...
394 !> \par History
395 !> Teodoro Laino (02.2007) [tlaino]
396 ! **************************************************************************************************
397  SUBROUTINE vn_test(xdata, n, r, u, prob)
398  REAL(kind=dp), DIMENSION(:), POINTER :: xdata
399  INTEGER, INTENT(IN) :: n
400  REAL(kind=dp) :: r, u, prob
401 
402  INTEGER :: i
403  REAL(kind=dp) :: q, s, var, x
404 
405  IF (n .GE. min_sample_size) THEN
406  x = 0.0_dp
407  q = 0.0_dp
408  s = 0.0_dp
409  DO i = 1, n - 1
410  x = x + xdata(i)
411  q = q + (xdata(i + 1) - xdata(i))**2
412  END DO
413  x = x + xdata(n)
414  x = x/real(n, kind=dp)
415  DO i = 1, n
416  s = s + (xdata(i) - x)**2
417  END DO
418  s = s/real(n - 1, kind=dp)
419  q = q/real(2*(n - 1), kind=dp)
420  r = q/s
421  var = sqrt(1.0_dp/real(n + 1, kind=dp)*(1.0_dp + 1.0_dp/real(n - 1, kind=dp)))
422  u = (r - 1.0_dp)/var
423  prob = erf(abs(u)/sqrt(2.0_dp))
424  ELSE
425  r = 0.0_dp
426  u = 0.0_dp
427  prob = 1.0_dp
428  END IF
429 
430  END SUBROUTINE vn_test
431 
432 ! **************************************************************************************************
433 !> \brief Performs tests on statistical methods
434 !> Debug use only
435 !> \param xdata ...
436 !> \param globenv ...
437 !> \par History
438 !> Teodoro Laino (02.2007) [tlaino]
439 ! **************************************************************************************************
440  SUBROUTINE tests(xdata, globenv)
441  REAL(kind=dp), DIMENSION(:), POINTER :: xdata
442  TYPE(global_environment_type), POINTER :: globenv
443 
444  INTEGER :: i, n
445  REAL(kind=dp) :: prob, pw, r, tau, u, w, z
446  REAL(kind=dp), DIMENSION(:), POINTER :: ydata
447 
448  IF (debug_this_module) THEN
449  n = 50 ! original sample size
450  NULLIFY (xdata)
451  ALLOCATE (xdata(n))
452  DO i = 1, 10
453  xdata(i) = 5.0_dp - real(i, kind=dp)/2.0_dp + 0.1*globenv%gaussian_rng_stream%next()
454  WRITE (3, *) xdata(i)
455  END DO
456  DO i = 11, n
457  xdata(i) = 0.1*globenv%gaussian_rng_stream%next()
458  END DO
459 
460  ! Test for trend
461  DO i = 1, n
462  CALL k_test(xdata, i, n, tau, z, prob)
463  IF (prob <= 0.2_dp) EXIT
464  END DO
465  WRITE (*, *) "Mann-Kendall test", i
466 
467  ! Test for normality distribution and for serial correlation
468  DO i = 1, n
469  ALLOCATE (ydata(n - i + 1))
470  ydata = xdata(i:n)
471  CALL sw_test(ydata, n - i + 1, w, pw)
472  CALL vn_test(ydata, n - i + 1, r, u, prob)
473  WRITE (*, *) "Shapiro Wilks test", i, w, pw
474  WRITE (*, *) "Von Neu", i, r, u, prob
475  DEALLOCATE (ydata)
476  END DO
477 
478  DEALLOCATE (xdata)
479  END IF
480  END SUBROUTINE tests
481 
482 END MODULE statistical_methods
483 
static GRID_HOST_DEVICE double fac(const int i)
Factorial function, e.g. fac(5) = 5! = 120.
Definition: grid_common.h:48
static GRID_HOST_DEVICE orbital up(const int i, const orbital a)
Increase i'th component of given orbital angular momentum.
Definition: grid_common.h:133
various routines to log and control the output. The idea is that decisions about where to log should ...
integer function, public cp_logger_get_default_io_unit(logger)
returns the unit nr for the ionode (-1 on all other processors) skips as well checks if the procs cal...
Calculation of the incomplete Gamma function F_n(t) for multi-center integrals over Cartesian Gaussia...
Definition: gamma.F:15
Define type storing the global information of a run. Keep the amount of stored data small....
Definition: global_types.F:21
Defines the basic variable types.
Definition: kinds.F:23
integer, parameter, public dp
Definition: kinds.F:34
Methods to perform on the fly statistical analysis of data -) Schiferl and Wallace,...
subroutine, public vn_test(xdata, n, r, u, prob)
Von Neumann test for serial correlation.
integer, parameter, public min_sample_size
subroutine, public k_test(xdata, istart, n, tau, z, prob)
Kandall's test for correlation.
subroutine, public sw_test(ix, n, w, pw)
Shapiro - Wilk's test or W-statistic to test normality of a distribution R94 APPL....
All kind of helpful little routines.
Definition: util.F:14