Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
GoFTest.cxx
Go to the documentation of this file.
1// @(#)root/mathcore:$Id$
2// Authors: Bartolomeu Rabacal 05/2010
3/**********************************************************************
4 * *
5 * Copyright (c) 2006 , LCG ROOT MathLib Team *
6 * *
7 * *
8 **********************************************************************/
9// implementation file for GoFTest
10
11
12#include <algorithm>
13#include <functional>
14#include <iostream>
15#include <map>
16#include <numeric>
17#include <string.h>
18#include <cassert>
19
20#include "Math/Error.h"
21#include "Math/Math.h"
22#include "Math/IFunction.h"
23#include "Math/IFunctionfwd.h"
24#include "Math/Integrator.h"
27
28#include "Math/GoFTest.h"
29
30#include "Fit/BinData.h"
31
32#include "TStopwatch.h"
33
34/* Note: The references mentioned here are stated in GoFTest.h */
35
36namespace ROOT {
37namespace Math {
38
39 struct CDFWrapper : public IGenFunction {
40 // wrapper around a cdf funciton to re-scale for the range
41 Double_t fXmin; // lower range for x
42 Double_t fXmax; // lower range for x
43 Double_t fNorm; // normalization
44 const IGenFunction* fCDF; // cdf pointer (owned by the class)
45
46
47 virtual ~CDFWrapper() { if (fCDF) delete fCDF; }
48
49 CDFWrapper(const IGenFunction& cdf, Double_t xmin=0, Double_t xmax=-1) :
50 fCDF(cdf.Clone())
51 {
52 if (xmin >= xmax) {
53 fNorm = 1;
54 fXmin = -std::numeric_limits<double>::infinity();
55 fXmax = std::numeric_limits<double>::infinity();
56 }
57 else {
58 fNorm = cdf(xmax) - cdf(xmin);
59 fXmin = xmin;
60 fXmax = xmax;
61 }
62 }
63
64 Double_t DoEval(Double_t x) const {
65 if (x <= fXmin) return 0;
66 if (x >= fXmax) return 1.0;
67 return (*fCDF)(x)/fNorm;
68 }
69
70 IGenFunction* Clone() const {
71 return new CDFWrapper(*fCDF,fXmin,fXmax);
72 }
73 };
74
75
76 class PDFIntegral : public IGenFunction {
77 Double_t fXmin; // lower range for x
78 Double_t fXmax; // lower range for x
79 Double_t fNorm; // normalization
80 mutable IntegratorOneDim fIntegral;
81 const IGenFunction* fPDF; // pdf pointer (owned by the class)
82 public:
83
84 virtual ~PDFIntegral() { if (fPDF) delete fPDF; }
85
86 PDFIntegral(const IGenFunction& pdf, Double_t xmin = 0, Double_t xmax = -1) :
87 fXmin(xmin),
88 fXmax(xmax),
89 fNorm(1),
90 fPDF(pdf.Clone())
91 {
92 // compute normalization
93 fIntegral.SetFunction(*fPDF); // N.B. must be fPDF (the cloned copy) and not pdf which can disappear
94 if (fXmin >= fXmax) {
95 fXmin = -std::numeric_limits<double>::infinity();
96 fXmax = std::numeric_limits<double>::infinity();
97 }
98 if (fXmin == -std::numeric_limits<double>::infinity() && fXmax == std::numeric_limits<double>::infinity() ) {
99 fNorm = fIntegral.Integral();
100 }
101 else if (fXmin == -std::numeric_limits<double>::infinity() )
102 fNorm = fIntegral.IntegralLow(fXmax);
103 else if (fXmax == std::numeric_limits<double>::infinity() )
104 fNorm = fIntegral.IntegralUp(fXmin);
105 else
106 fNorm = fIntegral.Integral(fXmin, fXmax);
107 }
108
109 Double_t DoEval(Double_t x) const {
110 if (x <= fXmin) return 0;
111 if (x >= fXmax) return 1.0;
112 if (fXmin == -std::numeric_limits<double>::infinity() )
113 return fIntegral.IntegralLow(x)/fNorm;
114 else
115 return fIntegral.Integral(fXmin,x)/fNorm;
116 }
117
118 IGenFunction* Clone() const {
119 return new PDFIntegral(*fPDF, fXmin, fXmax);
120 }
121 };
122
124 if (!(kGaussian <= dist && dist <= kExponential)) {
125 MATH_ERROR_MSG("SetDistribution", "Cannot set distribution type! Distribution type option must be ennabled.");
126 return;
127 }
128 fDist = dist;
129 SetCDF();
130 }
131
132 GoFTest::GoFTest( UInt_t sample1Size, const Double_t* sample1, UInt_t sample2Size, const Double_t* sample2 )
133 : fDist(kUndefined),
134 fSamples(std::vector<std::vector<Double_t> >(2)),
135 fTestSampleFromH0(kFALSE) {
136 Bool_t badSampleArg = sample1 == 0 || sample1Size == 0;
137 if (badSampleArg) {
138 std::string msg = "'sample1";
139 msg += !sample1Size ? "Size' cannot be zero" : "' cannot be zero-length";
140 MATH_ERROR_MSG("GoFTest", msg.c_str());
141 assert(!badSampleArg);
142 }
143 badSampleArg = sample2 == 0 || sample2Size == 0;
144 if (badSampleArg) {
145 std::string msg = "'sample2";
146 msg += !sample2Size ? "Size' cannot be zero" : "' cannot be zero-length";
147 MATH_ERROR_MSG("GoFTest", msg.c_str());
148 assert(!badSampleArg);
149 }
150 std::vector<const Double_t*> samples(2);
151 std::vector<UInt_t> samplesSizes(2);
152 samples[0] = sample1;
153 samples[1] = sample2;
154 samplesSizes[0] = sample1Size;
155 samplesSizes[1] = sample2Size;
156 SetSamples(samples, samplesSizes);
158 }
159
160 GoFTest::GoFTest(UInt_t sampleSize, const Double_t* sample, EDistribution dist)
161 : fDist(dist),
162 fSamples(std::vector<std::vector<Double_t> >(1)),
163 fTestSampleFromH0(kTRUE) {
164 Bool_t badSampleArg = sample == 0 || sampleSize == 0;
165 if (badSampleArg) {
166 std::string msg = "'sample";
167 msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
168 MATH_ERROR_MSG("GoFTest", msg.c_str());
169 assert(!badSampleArg);
170 }
171 std::vector<const Double_t*> samples(1, sample);
172 std::vector<UInt_t> samplesSizes(1, sampleSize);
173 SetSamples(samples, samplesSizes);
175 SetCDF();
176 }
177
179
180 void GoFTest::SetSamples(std::vector<const Double_t*> samples, const std::vector<UInt_t> samplesSizes) {
181 fCombinedSamples.assign(std::accumulate(samplesSizes.begin(), samplesSizes.end(), 0u), 0.0);
182 UInt_t combinedSamplesSize = 0;
183 for (UInt_t i = 0; i < samples.size(); ++i) {
184 fSamples[i].assign(samples[i], samples[i] + samplesSizes[i]);
185 std::sort(fSamples[i].begin(), fSamples[i].end());
186 for (UInt_t j = 0; j < samplesSizes[i]; ++j) {
187 fCombinedSamples[combinedSamplesSize + j] = samples[i][j];
188 }
189 combinedSamplesSize += samplesSizes[i];
190 }
191 std::sort(fCombinedSamples.begin(), fCombinedSamples.end());
192
193 Bool_t degenerateSamples = *(fCombinedSamples.begin()) == *(fCombinedSamples.end() - 1);
194 if (degenerateSamples) {
195 std::string msg = "Degenerate sample";
196 msg += samplesSizes.size() > 1 ? "s!" : "!";
197 msg += " Sampling values all identical.";
198 MATH_ERROR_MSG("SetSamples", msg.c_str());
199 assert(!degenerateSamples);
200 }
201 }
202
204 fMean = std::accumulate(fSamples[0].begin(), fSamples[0].end(), 0.0) / fSamples[0].size();
205 fSigma = TMath::Sqrt(1. / (fSamples[0].size() - 1) * (std::inner_product(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), 0.0) - fSamples[0].size() * TMath::Power(fMean, 2)));
206 }
207
208 void GoFTest::operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const {
209 switch (test) {
210 default:
211 case kAD:
212 AndersonDarlingTest(pvalue, testStat);
213 break;
214 case kAD2s:
215 AndersonDarling2SamplesTest(pvalue, testStat);
216 break;
217 case kKS:
218 KolmogorovSmirnovTest(pvalue, testStat);
219 break;
220 case kKS2s:
221 KolmogorovSmirnov2SamplesTest(pvalue, testStat);
222 }
223 }
224
226 Double_t result = 0.0;
227 switch (test) {
228 default:
229 case kAD:
230 result = AndersonDarlingTest(option);
231 break;
232 case kAD2s:
233 result = AndersonDarling2SamplesTest(option);
234 break;
235 case kKS:
236 result = KolmogorovSmirnovTest(option);
237 break;
238 case kKS2s:
239 result = KolmogorovSmirnov2SamplesTest(option);
240 }
241 return result;
242 }
243
244 void GoFTest::SetCDF() { // Setting parameter-free distributions
245 IGenFunction* cdf = 0;
246 switch (fDist) {
247 case kLogNormal:
248 LogSample();
249 /* fall through */
250 case kGaussian :
252 break;
253 case kExponential:
255 break;
256 case kUserDefined:
257 case kUndefined:
258 default:
259 break;
260 }
261 fCDF.reset(cdf);
262 }
263
265 if (fDist > kUserDefined) {
266 MATH_WARN_MSG("SetDistributionFunction","Distribution type is changed to user defined");
267 }
269 // function will be cloned inside the wrapper PDFIntegral of CDFWrapper classes
270 if (isPDF)
271 fCDF.reset(new PDFIntegral(f, xmin, xmax) );
272 else
273 fCDF.reset(new CDFWrapper(f, xmin, xmax) );
274 }
275
276 void GoFTest::Instantiate(const Double_t* sample, UInt_t sampleSize) {
277 // initialization function for the template constructors
278 Bool_t badSampleArg = sample == 0 || sampleSize == 0;
279 if (badSampleArg) {
280 std::string msg = "'sample";
281 msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
282 MATH_ERROR_MSG("GoFTest", msg.c_str());
283 assert(!badSampleArg);
284 }
285 fCDF.reset((IGenFunction*)0);
287 fMean = 0;
288 fSigma = 0;
289 fSamples = std::vector<std::vector<Double_t> >(1);
291 SetSamples(std::vector<const Double_t*>(1, sample), std::vector<UInt_t>(1, sampleSize));
292 }
293
296 }
297
299 return ROOT::Math::exponential_cdf(x, 1.0 / fMean);
300 }
301
303 transform(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(),
304 std::function<Double_t(Double_t)>(TMath::Log));
306 }
307
308/*
309 Taken from (1)
310*/
311 Double_t GoFTest::GetSigmaN(const std::vector<UInt_t> & ns, UInt_t N) {
312 // compute moments of AD distribution (from Scholz-Stephen paper, paragraph 3)
313
314 Double_t sigmaN = 0.0, h = 0.0, H = 0.0, g = 0.0, a, b, c, d, k = ns.size();
315
316 for (UInt_t i = 0; i < ns.size(); ++i) {
317 H += 1.0 / double( ns[i] );
318 }
319
320 // use approximate formulas for large N
321 // cache Sum( 1 / i)
322 if (N < 2000) {
323 std::vector<double> invI(N);
324 for (UInt_t i = 1; i <= N - 1; ++i) {
325 invI[i] = 1.0 / i;
326 h += invI[i];
327 }
328 for (UInt_t i = 1; i <= N - 2; ++i) {
329 double tmp = invI[N-i];
330 for (UInt_t j = i + 1; j <= N - 1; ++j) {
331 g += tmp * invI[j];
332 }
333 }
334 }
335 else {
336 // for N larger than 2000 error difference in g is ~ 5 10^-3 while in h is at the level of 10^-5
337 const double emc = 0.5772156649015328606065120900824024; // Euler-Mascheroni constant
338 h = std::log(double(N-1) ) + emc;
339 g = (M_PI)*(M_PI)/6.0;
340 }
341 double k2 = std::pow(k,2);
342 a = (4 * g - 6) * k + (10 - 6 * g) * H - 4 * g + 6;
343 b = (2 * g - 4) * k2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
344 c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 *g + 6) * k + (2 * h - 6) * H + 4 * h;
345 d = (2 * h + 6) * k2 - 4 * h * k;
346 sigmaN += a * std::pow(double(N),3) + b * std::pow(double(N),2) + c * N + d;
347 sigmaN /= ( double(N - 1) * double(N - 2) * double(N - 3) );
348 sigmaN = TMath::Sqrt(sigmaN);
349 return sigmaN;
350 }
351
352
354
355 /*
356 Computation of p-values according to
357 "K-Sample Anderson-Darling Tests" by F.W. Scholz
358 and M.A. Stephens (1987), Journal of the American Statistical Association,
359 Vol 82, No. 399, pp 918-924.
360 Code from kSamples package from R (author F. Scholtz)
361
362 This function uses the upper T_m quantiles as obtained via simulation of
363 the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500
364 for each sample, and after standardization, in order to emulate the Table 1
365 values given in the above reference. However, here we estimate p-quantiles
366 for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,
367 .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999,
368 .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999
369 First the appropriate p-quantiles are determined from those simulated
370 for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m.
371 Since we use only m=2 we avoid this interpolation.
372
373 Next linear inetrpolation to find the observed p value given the observed test statistic value.
374 We use interpolation in the test statistic -> log((1-p)/p) domain
375 and we extrapolatelinearly) beyond p = .00001 and .99999.
376 */
377
378 // sample values
379 //double ms[] = { 1, 2, 3, 4, 6, 8, 10, TMath::Infinity() };
380 //int ns = ms.size();
381 const int ns = 8;
382 double ts[ ] = { -1.1954, -1.5806, -1.8172,
383 -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394,
384 -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166,
385 -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719,
386 -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348,
387 -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637,
388 -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761,
389 -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355,
390 -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251,
391 -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187,
392 -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491,
393 -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598,
394 -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816,
395 -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496,
396 -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987,
397 -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834,
398 -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844,
399 -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596,
400 0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348,
401 0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246,
402 0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254,
403 1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663,
404 1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975,
405 1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451,
406 2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423,
407 3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651,
408 3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714,
409 3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791,
410 3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657,
411 4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749,
412 4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642,
413 4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499,
414 5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806,
415 5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731,
416 6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567,
417 6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501,
418 6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283,
419 7.4418, 6.9524, 6.6195, 4.2649 };
420
421
422
423
424
425 // p values bins
426 double p[] = { .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,.1,.2,.3,.4,.5,.6,.7,.8,.9,
427 .925,.95,.975,.99,.9925,.995,.9975,.999,.99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 };
428
429 //int nbins = p.size();
430 const int nbins = 35;
431 //assert ( nbins*ns == ts.size() );
432
433 // get ts values for nsamples = 2
434 // corresponding value is for m=nsamples-1
435 int offset = 0; // for m = 1 (i.e. for nsamples = 2)
436 if (nsamples != 2) {
437 MATH_ERROR_MSG("InterpolatePValues", "Interpolation not implemented for nsamples not equal to 2");
438 return 0;
439 }
440 std::vector<double> ts2(nbins); // ts values for nsamples = 2
441 std::vector<double> lp(nbins); // log ( p / (1-p) )
442 for (int i = 0; i < nbins; ++i)
443 {
444 ts2[i] = ts[offset+ i * ns];
445 p[i] = 1.-p[i];
446 lp[i] = std::log( p[i]/(1.-p[i] ) );
447 }
448 // do linear interpolation to find right lp value for given observed test staistic value
449 //auto it = std::lower_bound(ts2.begin(), ts2.end(), tx );
450 int i1 = std::distance(ts2.begin(), std::lower_bound(ts2.begin(), ts2.end(), tx ) ) - 1;
451 int i2 = i1+1;
452 // if tx is before min of tabluated data
453 if (i1 < 0) {
454 i1 = 0;
455 i2 = 1;
456 }
457 // if tx is after max of tabulated data
458 if (i2 >= int(ts2.size()) ) {
459 i1 = ts2.size()-2;
460 i2 = ts2.size()-1;
461 }
462
463 //std::cout << i1 << " , " << i2 << std::endl;
464 assert(i1 < (int) lp.size() && i2 < (int) lp.size() );
465 double lp1 = lp[i1];
466 double lp2 = lp[i2];
467 double tx1 = ts2[i1];
468 double tx2 = ts2[i2];
469
470 //std::cout << " tx1,2 " << tx1 << " " << tx2 << std::endl;
471 /// find interpolated (or extrapolated value)(
472 double lp0 = (lp1-lp2) * (tx - tx2)/ ( tx1-tx2) + lp2;
473
474
475 double p0 = exp(lp0)/(1. + exp(lp0) );
476 return p0;
477
478 }
479
480
481/*
482 Taken from (2)
484 Double_t pvalue = 0.0;
485 if (A2 <= 0.0) {
486 return pvalue;
487 } else if (A2 < 2.) {
488 pvalue = std::pow(A2, -0.5) * std::exp(-1.2337141 / A2) * (2.00012 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * A2) * A2) * A2) * A2) * A2);
489 } else {
490 pvalue = std::exp(-1. * std::exp(1.0776 - (2.30695 - (0.43424 - (.082433 - (0.008056 - 0.0003146 * A2) * A2) * A2) * A2) * A2));
491 }
492 return 1. - pvalue;
493 }
494
495
496// code from kSamples (R) F. Scholz
497
498/* computes the k-sample Anderson-Darling test statistics in both original
499 and alternative versions for the nonparametric (rank) test described in
500 Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests,
501 Journal of the American Statistical Association, Vol 82, No. 399,
502 pp. 918-924
503
504 Arguments:
505 adk: double array with length 2, stores AkN2 and AakN2
506 k: integer, number of samples being compared
507 x: double array storing the concatenated samples in the same order as ns
508 ns: integer array storing the k sample sizes, corresponding to x
509 zstar: double array storing the l distinct ordered observations in the
510 pooled sample
511 l: integer, length of zstar
512
513 Outputs:
514 when the computation ends, AkN2 and AakN2 are stored in the given memory
515 pointed by adk
516*/
517
518/* counts and returns the number of occurrence of a given number
519 in a double array */
520int getCount(double z, const double *dat, int n) {
521 int i;
522 int count = 0;
523
524 for (i = 0; i < n; i++) {
525 if (dat[i] == z) {
526 count++;
527 }
528 }
529
530 return(count);
531}
532
533/* computes and returns the sum of elements in a given integer array */
534int getSum(const int *x, int n) {
535 int i;
536 int sum = 0;
537
538 for (i = 0; i < n; i++) {
539 sum += x[i];
540 }
541
542 return(sum);
543}
544
545
546void adkTestStat(double *adk, const std::vector<std::vector<double> > & samples, const std::vector<double> & zstar) {
547
548 int i;
549 int j;
550
551 int nsum; /* total sample size = n_1 + ... + n_k */
552 int k = samples.size();
553 int l = zstar.size();
554
555 /* fij records the number of observations in the ith sample coinciding
556 with zstar[j], where i = 1, ..., k, and j = 1, ..., l */
557 std::vector<int> fij (k*l);
558 /* lvec is an integer vector with length l,
559 whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity
560 of zstar[j] */
561 std::vector<int> lvec(l);
562
563 /* for computation */
564 double mij;
565 double maij;
566 double innerSum;
567 double aInnerSum;
568 double bj;
569 double baj;
570 double tmp;
571
572 /* samples is a two-dimensional double array with length k;
573 it stores an array of k pointers to double arrays which are
574 the k samples beeing compared */
575// double **samples;
576
577 /* dynamically allocate memory */
578 //std::vector< std::vector<double> > samples(k);
579 std::vector<int> ns(k);
580 nsum = 0;
581 for (i = 0; i < k; i++) {
582 ns[i] = samples[i].size();
583 nsum += ns[i];
584 }
585
586 /* fij: k*l integer matrix, where l is the length of zstar and
587 k is the number of samples being compared
588 lvec: integer vector of length l, records the multiplicity of
589 each element of zstar */
590 for (j = 0; j < l; j++) {
591 lvec[j] = 0;
592 for (i = 0; i < k; i++) {
593 fij[i + j*k] = getCount(zstar[j], &samples[i][0], ns[i]);
594 lvec[j] += fij[i + j*k];
595 }
596 }
597
598 // loop on samples to compute the adk's
599 // Formula (6) and (7) of the paper
600 adk[0] = adk[1] = 0;
601 for (i = 0; i < k; i++) {
602 mij = 0;
603 maij = 0;
604 innerSum = 0;
605 aInnerSum = 0;
606
607 for (j = 0; j < l; j++) {
608 mij += fij[i + j*k];
609 maij = mij - (double) fij[i + j*k] / 2.0;
610 bj = getSum(&lvec[0], j + 1);
611 baj = bj - (double) lvec[j] / 2.0;
612
613 if (j < l - 1) {
614 tmp = (double) nsum * mij - (double) ns[i] * bj;
615 innerSum = innerSum + (double) lvec[j] * tmp * tmp /
616 (bj * ((double) nsum - bj));
617 }
618
619 tmp = (double) nsum * maij - (double) ns[i] * baj;
620 aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp /
621 (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0);
622 }
623
624 adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/
625 adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */
626 }
627
628 /* k-sample Anderson-Darling test statistics in both original and
629 alternative versions, AkN2 and AakN2, are stored in the given
630 double array adk */
631 adk[0] = adk[0] / (double) nsum; /* AkN2*/
632 adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */
633
634 // /* free pointers */
635 // for (i = 0; i < k; i++) {
636 // free(samples[i]);
637 // }
638 // free(samples);
639
640}
641
642
643/*
644 Taken from (1) -- Named for 2 samples but implemented for K. Restricted to K = 2 by the class's constructors
645*/
647 pvalue = -1;
648 testStat = -1;
649 if (fTestSampleFromH0) {
650 MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
651 return;
652 }
653 std::vector<Double_t> z(fCombinedSamples);
654 // unique removes all consecutives duplicates elements. This is exactly what we wants
655 // for example unique of v={1,2,2,3,1,2,3,3} results in {1,2,3,1,2,3} which is exactly what we wants
656 std::vector<Double_t>::iterator endUnique = std::unique(z.begin(), z.end()); //z_j's in (1)
657 z.erase(endUnique, z.end() );
658 std::vector<UInt_t> h; // h_j's in (1)
659 std::vector<Double_t> H; // H_j's in (1)
660 UInt_t N = fCombinedSamples.size();
661 Double_t A2 = 0.0; // Anderson-Darling A^2 Test Statistic
662
663#ifdef USE_OLDIMPL
664
665 TStopwatch w; w.Start();
666
667 unsigned int nSamples = fSamples.size();
668
669 // old implementation
670 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
671 UInt_t n = std::count(fCombinedSamples.begin(), fCombinedSamples.end(), *data);
672 h.push_back(n);
673 H.push_back(std::count_if(fCombinedSamples.begin(), fCombinedSamples.end(),
674 std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
675 }
676 std::cout << "time for H";
677 w.Print();
678 w.Reset(); w.Start();
679 std::vector<std::vector<Double_t> > F(nSamples); // F_ij's in (1)
680 for (UInt_t i = 0; i < nSamples; ++i) {
681 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
682 UInt_t n = std::count(fSamples[i].begin(), fSamples[i].end(), *data);
683 F[i].push_back(std::count_if(fSamples[i].begin(), fSamples[i].end(),
684 std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
685 }
686 }
687 std::cout << "time for F";
688 w.Print();
689 for (UInt_t i = 0; i < nSamples; ++i) {
690 Double_t sum_result = 0.0;
691 UInt_t j = 0;
692 w.Reset(); w.Start();
693 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
694 sum_result += h[j] * TMath::Power(N * F[i][j]- fSamples[i].size() * H[j], 2) / (H[j] * (N - H[j]) - N * h[j] / 4.0);
695 ++j;
696 }
697 std::cout << "time for sum_resut";
698 w.Print();
699 std::cout << "sum_result " << sum_result << std::endl;
700 A2 += 1.0 / fSamples[i].size() * sum_result;
701 }
702 A2 *= (N - 1) / (TMath::Power(N, 2)); // A2_akN in (1)
703
704 std::cout << "A2 - old Bartolomeo code " << A2 << std::endl;
705#endif
706 // w.Reset();
707 // w.Start();
708
709 double adk[2] = {0,0};
710
711 //debug
712 // std::cout << "combined samples\n";
713 // for (int i = 0; i < fCombinedSamples.size(); ++i)
714 // std::cout << fCombinedSamples[i] << " ,";
715 // std::cout << std::endl;
716 // std::cout << ns[0] << " " << ns[1] << std::endl;
717 // std::cout << "Z\n";
718 // for (int i = 0; i < z.size(); ++i)
719 // std::cout << z[i] << " ,";
720 // std::cout << std::endl;
721
722 // use function from kSamples code
723 adkTestStat(adk, fSamples, z );
724 // w.Print();
725 // std::cout << "A2 - new kSamples code " << adk[0] << " " << adk[1] << std::endl;
726
727 A2 = adk[0];
728
729 // compute the normalized test statistic
730
731 std::vector<UInt_t> ns(fSamples.size());
732 for (unsigned int k = 0; k < ns.size(); ++k) ns[k] = fSamples[k].size();
733 Double_t sigmaN = GetSigmaN(ns, N);
734 A2 -= fSamples.size() - 1;
735 A2 /= sigmaN; // standartized test statistic
736
737 pvalue = PValueADKSamples(2,A2);
738 testStat = A2;
739 return;
740 }
741
742
743/*
744 Compute Anderson Darling test for two binned data set.
745 A binned data set can be seen as many identical observation happening at the center of the bin
746 In this way it is trivial to apply the formula (6) in the paper of W. Scholz, M. Stephens, "K-Sample Anderson-Darling Tests"
747 to the case of histograms. See also http://arxiv.org/pdf/0804.0380v1.pdf paragraph 3.3.5
748 It is importat that empty bins are not present
749*/
751 pvalue = -1;
752 testStat = -1;
753 //
754 // compute cumulative sum of bin counts
755 // std::vector<double> sum1(data1.Size() );
756 // std::vector<double> sum2(data2.Size() );
757 // std::vector<double> sumAll(data1.Size() + data2.Size() );
758
759 if (data1.NDim() != 1 && data2.NDim() != 1) {
760 MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Bin Data set must be one-dimensional ");
761 return;
762 }
763 unsigned int n1 = data1.Size();
764 unsigned int n2 = data2.Size();
765 double ntot1 = 0;
766 double ntot2 = 0;
767
768
769 // make a combined data set and sort it
770 std::vector<double> xdata(n1+n2);
771 for (unsigned int i = 0; i < n1; ++i) {
772 double value = 0;
773 const double * x = data1.GetPoint(i, value);
774 xdata[i] = *x;
775 ntot1 += value;
776 }
777 for (unsigned int i = 0; i < n2; ++i) {
778 double value = 0;
779 const double * x = data2.GetPoint(i, value);
780 xdata[n1+i] = *x;
781 ntot2 += value;
782 }
783 double nall = ntot1+ntot2;
784 // sort the combined data
785 std::vector<unsigned int> index(n1+n2);
786 TMath::Sort(n1+n2, &xdata[0], &index[0], false );
787
788 // now compute the sums for the tests
789 double sum1 = 0;
790 double sum2 = 0;
791 double sumAll = 0;
792 double adsum = 0;
793 unsigned int j = 0;
794
795 while( j < n1+n2 ) {
796// for (unsigned int j = 0; j < n1+n2; ++j) {
797 // skip equal observations
798 double x = xdata[ index[j] ];
799 unsigned int k = j;
800 // loop on the bins with the same center value
801 double t = 0;
802 do {
803 unsigned int i = index[k];
804 double value = 0;
805 if (i < n1 ) {
806 value = data1.Value(i);
807 sum1 += value;
808 }
809 else {
810 // from data2
811 i -= n1;
812 assert(i < n2);
813 value = data2.Value(i);
814 sum2 += value;
815 }
816 sumAll += value;
817 t += value;
818 //std::cout << "j " << j << " k " << k << " data " << x << " index " << index[k] << " value " << value << std::endl;
819 k++;
820 } while ( k < n1+n2 && xdata[ index[k] ] == x );
821
822
823 j = k;
824 // skip last point
825 if (j < n1+n2) {
826 double tmp1 = ( nall * sum1 - ntot1 * sumAll );
827 double tmp2 = ( nall * sum2 - ntot2 * sumAll );
828 adsum += t * (tmp1*tmp1/ntot1 + tmp2*tmp2/ntot2) / ( sumAll * (nall - sumAll) ) ;
829
830 //std::cout << "comp sum " << adsum << " " << t << " " << sumAll << " s1 " << sum1 << " s2 " << sum2 << " tmp1 " << tmp1 << " tmp2 " << tmp2 << std::endl;
831 }
832 }
833 double A2 = adsum / nall;
834
835 // compute the normalized test statistic
836 std::vector<unsigned int> ns(2);
837 ns[0] = ntot1;
838 ns[1] = ntot2;
839 //std::cout << " ad2 = " << A2 << " nall " << nall;
840
841 Double_t sigmaN = GetSigmaN(ns,nall);
842 A2 -= 1;
843 A2 /= sigmaN; // standartized test statistic
844
845 //std::cout << " sigmaN " << sigmaN << " new A2 " << A2;
846
847 pvalue = PValueADKSamples(2,A2);
848 //std::cout << " pvalue = " << pvalue << std::endl;
849 testStat = A2;
850 return;
851 }
852
853
855 Double_t pvalue, testStat;
856 AndersonDarling2SamplesTest(pvalue, testStat);
857 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
858 }
859
860/*
861 Taken from (3)
862*/ void GoFTest::AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const {
863 pvalue = -1;
864 testStat = -1;
865 if (!fTestSampleFromH0) {
866 MATH_ERROR_MSG("AndersonDarlingTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
867 return;
868 }
869 if (fDist == kUndefined) {
870 MATH_ERROR_MSG("AndersonDarlingTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
871 return;
872 }
873 Double_t A2 = 0.0;
874 Int_t n = fSamples[0].size();
875 for (Int_t i = 0; i < n ; ++i) {
876 Double_t x1 = fSamples[0][i];
877 Double_t w1 = (*fCDF)(x1);
878 Double_t result = (2 * (i + 1) - 1) * TMath::Log(w1) + (2 * (n - (i + 1)) + 1) * TMath::Log(1 - w1);
879 A2 += result;
880 }
881 (A2 /= -n) -= n;
882 if (A2 != A2) {
883 MATH_ERROR_MSG("AndersonDarlingTest", "Cannot compute p-value: data below or above the distribution's thresholds. Check sample consistency.");
884 return;
885 }
886 pvalue = PValueAD1Sample(A2);
887 testStat = A2;
888 }
889
891 Double_t pvalue, testStat;
892 AndersonDarlingTest(pvalue, testStat);
893 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
894 }
895
897 pvalue = -1;
898 testStat = -1;
899 if (fTestSampleFromH0) {
900 MATH_ERROR_MSG("KolmogorovSmirnov2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
901 return;
902 }
903 const UInt_t na = fSamples[0].size();
904 const UInt_t nb = fSamples[1].size();
905 std::vector<Double_t> a(na);
906 std::vector<Double_t> b(nb);
907 std::copy(fSamples[0].begin(), fSamples[0].end(), a.begin());
908 std::copy(fSamples[1].begin(), fSamples[1].end(), b.begin());
909 pvalue = TMath::KolmogorovTest(na, a.data(), nb, b.data(), 0);
910 testStat = TMath::KolmogorovTest(na, a.data(), nb, b.data(), "M");
911 }
912
914 Double_t pvalue, testStat;
915 KolmogorovSmirnov2SamplesTest(pvalue, testStat);
916 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
917 }
918
919/*
920 Algorithm taken from (3) in page 737
921*/ void GoFTest::KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const {
922 pvalue = -1;
923 testStat = -1;
924 if (!fTestSampleFromH0) {
925 MATH_ERROR_MSG("KolmogorovSmirnovTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
926 return;
927 }
928 if (fDist == kUndefined) {
929 MATH_ERROR_MSG("KolmogorovSmirnovTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
930 return;
931 }
932 Double_t Fo = 0.0, Dn = 0.0;
933 UInt_t n = fSamples[0].size();
934 for (UInt_t i = 0; i < n; ++i) {
935 Double_t Fn = (i + 1.0) / n;
936 Double_t F = (*fCDF)(fSamples[0][i]);
937 Double_t result = std::max(TMath::Abs(Fn - F), TMath::Abs(Fo - Fn));
938 if (result > Dn) Dn = result;
939 Fo = Fn;
940 }
941 pvalue = TMath::KolmogorovProb(Dn * (TMath::Sqrt(n) + 0.12 + 0.11 / TMath::Sqrt(n)));
942 testStat = Dn;
943 }
944
946 Double_t pvalue, testStat;
947 KolmogorovSmirnovTest(pvalue, testStat);
948 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
949 }
950
951
952
953
954
955} // ROOT namespace
956} // Math namespace
957
double
#define MATH_ERROR_MSG(loc, str)
Definition Error.h:83
#define MATH_WARN_MSG(loc, str)
Definition Error.h:80
#define d(i)
Definition RSha256.hxx:102
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
#define g(i)
Definition RSha256.hxx:105
#define a(i)
Definition RSha256.hxx:99
#define h(i)
Definition RSha256.hxx:106
static const double x1[5]
#define M_PI
Definition Rotated.cxx:105
char Char_t
Definition RtypesCore.h:37
const Bool_t kFALSE
Definition RtypesCore.h:92
double Double_t
Definition RtypesCore.h:59
const Bool_t kTRUE
Definition RtypesCore.h:91
#define N
float xmin
float xmax
double exp(double)
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition BinData.h:52
const double * GetPoint(unsigned int ipoint, double &value) const
retrieve at the same time a pointer to the coordinate data and the fit value More efficient than call...
Definition BinData.h:369
double Value(unsigned int ipoint) const
return the value for the given fit point
Definition BinData.h:216
unsigned int Size() const
return number of fit points
Definition FitData.h:303
unsigned int NDim() const
return coordinate data dimension
Definition FitData.h:311
static Double_t PValueADKSamples(UInt_t nsamples, Double_t A2)
Definition GoFTest.cxx:353
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:208
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition GoFTest.cxx:264
std::unique_ptr< IGenFunction > fCDF
Definition GoFTest.h:182
Bool_t fTestSampleFromH0
Definition GoFTest.h:194
EDistribution fDist
Definition GoFTest.h:185
void SetSamples(std::vector< const Double_t * > samples, const std::vector< UInt_t > samplesSizes)
Definition GoFTest.cxx:180
std::vector< Double_t > fCombinedSamples
Definition GoFTest.h:190
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:921
void SetDistribution(EDistribution dist)
Definition GoFTest.cxx:123
void Instantiate(const Double_t *sample, UInt_t sampleSize)
Definition GoFTest.cxx:276
Double_t GaussianCDF(Double_t x) const
Definition GoFTest.cxx:294
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:646
static Double_t GetSigmaN(const std::vector< UInt_t > &ns, UInt_t N)
Definition GoFTest.cxx:311
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:896
std::vector< std::vector< Double_t > > fSamples
Definition GoFTest.h:192
Double_t PValueAD1Sample(Double_t A2) const
Definition GoFTest.cxx:483
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:862
Double_t ExponentialCDF(Double_t x) const
Definition GoFTest.cxx:298
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition IFunction.h:135
Template class to wrap any member function of a class taking a double and returning a double in a 1D ...
Stopwatch class.
Definition TStopwatch.h:28
void Start(Bool_t reset=kTRUE)
Start the stopwatch.
void Reset()
Definition TStopwatch.h:52
void Print(Option_t *option="") const
Print the real and cpu time passed between the start and stop events.
double normal_cdf(double x, double sigma=1, double x0=0)
Cumulative distribution function of the normal (Gaussian) distribution (lower tail).
double exponential_cdf(double x, double lambda, double x0=0)
Cumulative distribution function of the exponential distribution (lower tail).
Double_t x[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
#define F(x, y, z)
#define H(x, y, z)
Namespace for new Math classes and functions.
void adkTestStat(double *adk, const std::vector< std::vector< double > > &samples, const std::vector< double > &zstar)
Definition GoFTest.cxx:546
int getCount(double z, const double *dat, int n)
Definition GoFTest.cxx:520
int getSum(const int *x, int n)
Definition GoFTest.cxx:534
IBaseFunctionOneDim IGenFunction
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Double_t KolmogorovTest(Int_t na, const Double_t *a, Int_t nb, const Double_t *b, Option_t *option)
Statistical test whether two one-dimensional sets of points are compatible with coming from the same ...
Definition TMath.cxx:782
Double_t Log(Double_t x)
Definition TMath.h:760
Double_t Sqrt(Double_t x)
Definition TMath.h:691
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Definition TMath.h:735
Double_t KolmogorovProb(Double_t z)
Calculates the Kolmogorov distribution function,.
Definition TMath.cxx:656
void Sort(Index n, const Element *a, Index *index, Bool_t down=kTRUE)
Definition TMathBase.h:362
Short_t Abs(Short_t d)
Definition TMathBase.h:120
Definition test.py:1
auto * l
Definition textangle.C:4
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345