Logo ROOT   6.16/01
Reference Guide
GoFTest.cxx
Go to the documentation of this file.
1// @(#)root/mathcore:$Id$
2// Authors: Bartolomeu Rabacal 05/2010
3/**********************************************************************
4 * *
5 * Copyright (c) 2006 , LCG ROOT MathLib Team *
6 * *
7 * *
8 **********************************************************************/
9// implementation file for GoFTest
10
11
12#include <algorithm>
13#include <functional>
14#include <iostream>
15#include <map>
16#include <numeric>
17#include <string.h>
18#include <cassert>
19
20#include "Math/Error.h"
21#include "Math/Math.h"
22#include "Math/IFunction.h"
23#include "Math/IFunctionfwd.h"
24#include "Math/Integrator.h"
27
28#include "Math/GoFTest.h"
29
30#include "Fit/BinData.h"
31
32#include "TStopwatch.h"
33
34/* Note: The references mentioned here are stated in GoFTest.h */
35
36namespace ROOT {
37namespace Math {
38
39 struct CDFWrapper : public IGenFunction {
40 // wrapper around a cdf funciton to re-scale for the range
41 Double_t fXmin; // lower range for x
42 Double_t fXmax; // lower range for x
43 Double_t fNorm; // normalization
44 const IGenFunction* fCDF; // cdf pointer (owned by the class)
45
46
47 virtual ~CDFWrapper() { if (fCDF) delete fCDF; }
48
49 CDFWrapper(const IGenFunction& cdf, Double_t xmin=0, Double_t xmax=-1) :
50 fCDF(cdf.Clone())
51 {
52 if (xmin >= xmax) {
53 fNorm = 1;
54 fXmin = -std::numeric_limits<double>::infinity();
55 fXmax = std::numeric_limits<double>::infinity();
56 }
57 else {
58 fNorm = cdf(xmax) - cdf(xmin);
59 fXmin = xmin;
60 fXmax = xmax;
61 }
62 }
63
64 Double_t DoEval(Double_t x) const {
65 if (x <= fXmin) return 0;
66 if (x >= fXmax) return 1.0;
67 return (*fCDF)(x)/fNorm;
68 }
69
70 IGenFunction* Clone() const {
71 return new CDFWrapper(*fCDF,fXmin,fXmax);
72 }
73 };
74
75
76 class PDFIntegral : public IGenFunction {
77 Double_t fXmin; // lower range for x
78 Double_t fXmax; // lower range for x
79 Double_t fNorm; // normalization
80 mutable IntegratorOneDim fIntegral;
81 const IGenFunction* fPDF; // pdf pointer (owned by the class)
82 public:
83
84 virtual ~PDFIntegral() { if (fPDF) delete fPDF; }
85
86 PDFIntegral(const IGenFunction& pdf, Double_t xmin = 0, Double_t xmax = -1) :
87 fXmin(xmin),
88 fXmax(xmax),
89 fNorm(1),
90 fPDF(pdf.Clone())
91 {
92 // compute normalization
93 fIntegral.SetFunction(*fPDF); // N.B. must be fPDF (the cloned copy) and not pdf which can disappear
94 if (fXmin >= fXmax) {
95 fXmin = -std::numeric_limits<double>::infinity();
96 fXmax = std::numeric_limits<double>::infinity();
97 }
98 if (fXmin == -std::numeric_limits<double>::infinity() && fXmax == std::numeric_limits<double>::infinity() ) {
99 fNorm = fIntegral.Integral();
100 }
101 else if (fXmin == -std::numeric_limits<double>::infinity() )
102 fNorm = fIntegral.IntegralLow(fXmax);
103 else if (fXmax == std::numeric_limits<double>::infinity() )
104 fNorm = fIntegral.IntegralUp(fXmin);
105 else
106 fNorm = fIntegral.Integral(fXmin, fXmax);
107 }
108
109 Double_t DoEval(Double_t x) const {
110 if (x <= fXmin) return 0;
111 if (x >= fXmax) return 1.0;
112 if (fXmin == -std::numeric_limits<double>::infinity() )
113 return fIntegral.IntegralLow(x)/fNorm;
114 else
115 return fIntegral.Integral(fXmin,x)/fNorm;
116 }
117
118 IGenFunction* Clone() const {
119 return new PDFIntegral(*fPDF, fXmin, fXmax);
120 }
121 };
122
124 if (!(kGaussian <= dist && dist <= kExponential)) {
125 MATH_ERROR_MSG("SetDistribution", "Cannot set distribution type! Distribution type option must be ennabled.");
126 return;
127 }
128 fDist = dist;
129 SetCDF();
130 }
131
132 GoFTest::GoFTest( UInt_t sample1Size, const Double_t* sample1, UInt_t sample2Size, const Double_t* sample2 )
133 : fDist(kUndefined),
134 fSamples(std::vector<std::vector<Double_t> >(2)),
135 fTestSampleFromH0(kFALSE) {
136 Bool_t badSampleArg = sample1 == 0 || sample1Size == 0;
137 if (badSampleArg) {
138 std::string msg = "'sample1";
139 msg += !sample1Size ? "Size' cannot be zero" : "' cannot be zero-length";
140 MATH_ERROR_MSG("GoFTest", msg.c_str());
141 assert(!badSampleArg);
142 }
143 badSampleArg = sample2 == 0 || sample2Size == 0;
144 if (badSampleArg) {
145 std::string msg = "'sample2";
146 msg += !sample2Size ? "Size' cannot be zero" : "' cannot be zero-length";
147 MATH_ERROR_MSG("GoFTest", msg.c_str());
148 assert(!badSampleArg);
149 }
150 std::vector<const Double_t*> samples(2);
151 std::vector<UInt_t> samplesSizes(2);
152 samples[0] = sample1;
153 samples[1] = sample2;
154 samplesSizes[0] = sample1Size;
155 samplesSizes[1] = sample2Size;
156 SetSamples(samples, samplesSizes);
158 }
159
161 : fDist(dist),
162 fSamples(std::vector<std::vector<Double_t> >(1)),
163 fTestSampleFromH0(kTRUE) {
164 Bool_t badSampleArg = sample == 0 || sampleSize == 0;
165 if (badSampleArg) {
166 std::string msg = "'sample";
167 msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
168 MATH_ERROR_MSG("GoFTest", msg.c_str());
169 assert(!badSampleArg);
170 }
171 std::vector<const Double_t*> samples(1, sample);
172 std::vector<UInt_t> samplesSizes(1, sampleSize);
173 SetSamples(samples, samplesSizes);
175 SetCDF();
176 }
177
179
180 void GoFTest::SetSamples(std::vector<const Double_t*> samples, const std::vector<UInt_t> samplesSizes) {
181 fCombinedSamples.assign(std::accumulate(samplesSizes.begin(), samplesSizes.end(), 0u), 0.0);
182 UInt_t combinedSamplesSize = 0;
183 for (UInt_t i = 0; i < samples.size(); ++i) {
184 fSamples[i].assign(samples[i], samples[i] + samplesSizes[i]);
185 std::sort(fSamples[i].begin(), fSamples[i].end());
186 for (UInt_t j = 0; j < samplesSizes[i]; ++j) {
187 fCombinedSamples[combinedSamplesSize + j] = samples[i][j];
188 }
189 combinedSamplesSize += samplesSizes[i];
190 }
191 std::sort(fCombinedSamples.begin(), fCombinedSamples.end());
192
193 Bool_t degenerateSamples = *(fCombinedSamples.begin()) == *(fCombinedSamples.end() - 1);
194 if (degenerateSamples) {
195 std::string msg = "Degenerate sample";
196 msg += samplesSizes.size() > 1 ? "s!" : "!";
197 msg += " Sampling values all identical.";
198 MATH_ERROR_MSG("SetSamples", msg.c_str());
199 assert(!degenerateSamples);
200 }
201 }
202
204 fMean = std::accumulate(fSamples[0].begin(), fSamples[0].end(), 0.0) / fSamples[0].size();
205 fSigma = TMath::Sqrt(1. / (fSamples[0].size() - 1) * (std::inner_product(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), 0.0) - fSamples[0].size() * TMath::Power(fMean, 2)));
206 }
207
208 void GoFTest::operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const {
209 switch (test) {
210 default:
211 case kAD:
212 AndersonDarlingTest(pvalue, testStat);
213 break;
214 case kAD2s:
215 AndersonDarling2SamplesTest(pvalue, testStat);
216 break;
217 case kKS:
218 KolmogorovSmirnovTest(pvalue, testStat);
219 break;
220 case kKS2s:
221 KolmogorovSmirnov2SamplesTest(pvalue, testStat);
222 }
223 }
224
226 Double_t result = 0.0;
227 switch (test) {
228 default:
229 case kAD:
230 result = AndersonDarlingTest(option);
231 break;
232 case kAD2s:
233 result = AndersonDarling2SamplesTest(option);
234 break;
235 case kKS:
236 result = KolmogorovSmirnovTest(option);
237 break;
238 case kKS2s:
239 result = KolmogorovSmirnov2SamplesTest(option);
240 }
241 return result;
242 }
243
244 void GoFTest::SetCDF() { // Setting parameter-free distributions
245 IGenFunction* cdf = 0;
246 switch (fDist) {
247 case kLogNormal:
248 LogSample();
249 /* fall through */
250 case kGaussian :
252 break;
253 case kExponential:
255 break;
256 case kUserDefined:
257 case kUndefined:
258 default:
259 break;
260 }
261 fCDF.reset(cdf);
262 }
263
265 if (fDist > kUserDefined) {
266 MATH_WARN_MSG("SetDistributionFunction","Distribution type is changed to user defined");
267 }
269 // function will be cloned inside the wrapper PDFIntegral of CDFWrapper classes
270 if (isPDF)
271 fCDF.reset(new PDFIntegral(f, xmin, xmax) );
272 else
273 fCDF.reset(new CDFWrapper(f, xmin, xmax) );
274 }
275
276 void GoFTest::Instantiate(const Double_t* sample, UInt_t sampleSize) {
277 // initialization function for the template constructors
278 Bool_t badSampleArg = sample == 0 || sampleSize == 0;
279 if (badSampleArg) {
280 std::string msg = "'sample";
281 msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
282 MATH_ERROR_MSG("GoFTest", msg.c_str());
283 assert(!badSampleArg);
284 }
285 fCDF.reset((IGenFunction*)0);
287 fMean = 0;
288 fSigma = 0;
289 fSamples = std::vector<std::vector<Double_t> >(1);
291 SetSamples(std::vector<const Double_t*>(1, sample), std::vector<UInt_t>(1, sampleSize));
292 }
293
296 }
297
299 return ROOT::Math::exponential_cdf(x, 1.0 / fMean);
300 }
301
303 transform(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(),
306 }
307
308/*
309 Taken from (1)
310*/
311 Double_t GoFTest::GetSigmaN(const std::vector<UInt_t> & ns, UInt_t N) {
312 // compute moments of AD distribution (from Scholz-Stephen paper, paragraph 3)
313
314 Double_t sigmaN = 0.0, h = 0.0, H = 0.0, g = 0.0, a, b, c, d, k = ns.size();
315
316 for (UInt_t i = 0; i < ns.size(); ++i) {
317 H += 1.0 / double( ns[i] );
318 }
319
320 // use approximate formulas for large N
321 // cache Sum( 1 / i)
322 if (N < 2000) {
323 std::vector<double> invI(N);
324 for (UInt_t i = 1; i <= N - 1; ++i) {
325 invI[i] = 1.0 / i;
326 h += invI[i];
327 }
328 for (UInt_t i = 1; i <= N - 2; ++i) {
329 double tmp = invI[N-i];
330 for (UInt_t j = i + 1; j <= N - 1; ++j) {
331 g += tmp * invI[j];
332 }
333 }
334 }
335 else {
336 // for N larger than 2000 error difference in g is ~ 5 10^-3 while in h is at the level of 10^-5
337 const double emc = 0.5772156649015328606065120900824024; // Euler-Mascheroni constant
338 h = std::log(double(N-1) ) + emc;
339 g = (M_PI)*(M_PI)/6.0;
340 }
341 double k2 = std::pow(k,2);
342 a = (4 * g - 6) * k + (10 - 6 * g) * H - 4 * g + 6;
343 b = (2 * g - 4) * k2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
344 c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 *g + 6) * k + (2 * h - 6) * H + 4 * h;
345 d = (2 * h + 6) * k2 - 4 * h * k;
346 sigmaN += a * std::pow(double(N),3) + b * std::pow(double(N),2) + c * N + d;
347 sigmaN /= ( double(N - 1) * double(N - 2) * double(N - 3) );
348 sigmaN = TMath::Sqrt(sigmaN);
349 return sigmaN;
350 }
351
352
354
355 /*
356 Computation of p-values according to
357 "K-Sample Anderson-Darling Tests" by F.W. Scholz
358 and M.A. Stephens (1987), Journal of the American Statistical Association,
359 Vol 82, No. 399, pp 918-924.
360 Code from kSamples package from R (author F. Scholtz)
361
362 This function uses the upper T_m quantiles as obtained via simulation of
363 the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500
364 for each sample, and after standardization, in order to emulate the Table 1
365 values given in the above reference. However, here we estimate p-quantiles
366 for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,
367 .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999,
368 .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999
369 First the appropriate p-quantiles are determined from those simulated
370 for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m.
371 Since we use only m=2 we avoid this interpolation.
372
373 Next linear inetrpolation to find the observed p value given the observed test statistic value.
374 We use interpolation in the test statistic -> log((1-p)/p) domain
375 and we extrapolatelinearly) beyond p = .00001 and .99999.
376 */
377
378 // sample values
379 //double ms[] = { 1, 2, 3, 4, 6, 8, 10, TMath::Infinity() };
380 //int ns = ms.size();
381 const int ns = 8;
382 double ts[ ] = { -1.1954, -1.5806, -1.8172,
383 -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394,
384 -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166,
385 -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719,
386 -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348,
387 -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637,
388 -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761,
389 -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355,
390 -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251,
391 -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187,
392 -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491,
393 -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598,
394 -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816,
395 -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496,
396 -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987,
397 -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834,
398 -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844,
399 -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596,
400 0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348,
401 0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246,
402 0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254,
403 1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663,
404 1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975,
405 1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451,
406 2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423,
407 3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651,
408 3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714,
409 3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791,
410 3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657,
411 4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749,
412 4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642,
413 4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499,
414 5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806,
415 5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731,
416 6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567,
417 6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501,
418 6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283,
419 7.4418, 6.9524, 6.6195, 4.2649 };
420
421
422
423
424
425 // p values bins
426 double p[] = { .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,.1,.2,.3,.4,.5,.6,.7,.8,.9,
427 .925,.95,.975,.99,.9925,.995,.9975,.999,.99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 };
428
429 //int nbins = p.size();
430 const int nbins = 35;
431 //assert ( nbins*ns == ts.size() );
432
433 // get ts values for nsamples = 2
434 // corresponding value is for m=nsamples-1
435 int offset = 0; // for m = 1 (i.e. for nsamples = 2)
436 if (nsamples != 2) {
437 MATH_ERROR_MSG("InterpolatePValues", "Interpolation not implemented for nsamples not equal to 2");
438 return 0;
439 }
440 std::vector<double> ts2(nbins); // ts values for nsamples = 2
441 std::vector<double> lp(nbins); // log ( p / (1-p) )
442 for (int i = 0; i < nbins; ++i)
443 {
444 ts2[i] = ts[offset+ i * ns];
445 p[i] = 1.-p[i];
446 lp[i] = std::log( p[i]/(1.-p[i] ) );
447 }
448 // do linear interpolation to find right lp value for given observed test staistic value
449 //auto it = std::lower_bound(ts2.begin(), ts2.end(), tx );
450 int i1 = std::distance(ts2.begin(), std::lower_bound(ts2.begin(), ts2.end(), tx ) ) - 1;
451 int i2 = i1+1;
452 // if tx is before min of tabluated data
453 if (i1 < 0) {
454 i1 = 0;
455 i2 = 1;
456 }
457 // if tx is after max of tabulated data
458 if (i2 >= int(ts2.size()) ) {
459 i1 = ts2.size()-2;
460 i2 = ts2.size()-1;
461 }
462
463 //std::cout << i1 << " , " << i2 << std::endl;
464 assert(i1 < (int) lp.size() && i2 < (int) lp.size() );
465 double lp1 = lp[i1];
466 double lp2 = lp[i2];
467 double tx1 = ts2[i1];
468 double tx2 = ts2[i2];
469
470 //std::cout << " tx1,2 " << tx1 << " " << tx2 << std::endl;
471 /// find interpolated (or extrapolated value)(
472 double lp0 = (lp1-lp2) * (tx - tx2)/ ( tx1-tx2) + lp2;
473
474
475 double p0 = exp(lp0)/(1. + exp(lp0) );
476 return p0;
477
478 }
479
480
481/*
482 Taken from (2)
484 Double_t pvalue = 0.0;
485 if (A2 <= 0.0) {
486 return pvalue;
487 } else if (A2 < 2.) {
488 pvalue = std::pow(A2, -0.5) * std::exp(-1.2337141 / A2) * (2.00012 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * A2) * A2) * A2) * A2) * A2);
489 } else {
490 pvalue = std::exp(-1. * std::exp(1.0776 - (2.30695 - (0.43424 - (.082433 - (0.008056 - 0.0003146 * A2) * A2) * A2) * A2) * A2));
491 }
492 return 1. - pvalue;
493 }
494
495
496// code from kSamples (R) F. Scholz
497
498/* computes the k-sample Anderson-Darling test statistics in both original
499 and alternative versions for the nonparametric (rank) test described in
500 Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests,
501 Journal of the American Statistical Association, Vol 82, No. 399,
502 pp. 918-924
503
504 Arguments:
505 adk: double array with length 2, stores AkN2 and AakN2
506 k: integer, number of samples being compared
507 x: double array storing the concatenated samples in the same order as ns
508 ns: integer array storing the k sample sizes, corresponding to x
509 zstar: double array storing the l distinct ordered observations in the
510 pooled sample
511 l: integer, length of zstar
512
513 Outputs:
514 when the computation ends, AkN2 and AakN2 are stored in the given memory
515 pointed by adk
516*/
517
518/* counts and returns the number of occurrence of a given number
519 in a double array */
520int getCount(double z, const double *dat, int n) {
521 int i;
522 int count = 0;
523
524 for (i = 0; i < n; i++) {
525 if (dat[i] == z) {
526 count++;
527 }
528 }
529
530 return(count);
531}
532
533/* computes and returns the sum of elements in a given integer array */
534int getSum(const int *x, int n) {
535 int i;
536 int sum = 0;
537
538 for (i = 0; i < n; i++) {
539 sum += x[i];
540 }
541
542 return(sum);
543}
544
545
546void adkTestStat(double *adk, const std::vector<std::vector<double> > & samples, const std::vector<double> & zstar) {
547
548 int i;
549 int j;
550
551 int nsum; /* total sample size = n_1 + ... + n_k */
552 int k = samples.size();
553 int l = zstar.size();
554
555 /* fij records the number of observations in the ith sample coinciding
556 with zstar[j], where i = 1, ..., k, and j = 1, ..., l */
557 std::vector<int> fij (k*l);
558 /* lvec is an integer vector with length l,
559 whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity
560 of zstar[j] */
561 std::vector<int> lvec(l);
562
563 /* for computation */
564 double mij;
565 double maij;
566 double innerSum;
567 double aInnerSum;
568 double bj;
569 double baj;
570 double tmp;
571
572 /* samples is a two-dimensional double array with length k;
573 it stores an array of k pointers to double arrays which are
574 the k samples beeing compared */
575// double **samples;
576
577 /* dynamically allocate memory */
578 //std::vector< std::vector<double> > samples(k);
579 std::vector<int> ns(k);
580 nsum = 0;
581 for (i = 0; i < k; i++) {
582 ns[i] = samples[i].size();
583 nsum += ns[i];
584 }
585
586 /* fij: k*l integer matrix, where l is the length of zstar and
587 k is the number of samples being compared
588 lvec: integer vector of length l, records the multiplicity of
589 each element of zstar */
590 for (j = 0; j < l; j++) {
591 lvec[j] = 0;
592 for (i = 0; i < k; i++) {
593 fij[i + j*k] = getCount(zstar[j], &samples[i][0], ns[i]);
594 lvec[j] += fij[i + j*k];
595 }
596 }
597
598 // loop on samples to compute the adk's
599 // Formula (6) and (7) of the paper
600 adk[0] = adk[1] = 0;
601 for (i = 0; i < k; i++) {
602 mij = 0;
603 maij = 0;
604 innerSum = 0;
605 aInnerSum = 0;
606
607 for (j = 0; j < l; j++) {
608 mij += fij[i + j*k];
609 maij = mij - (double) fij[i + j*k] / 2.0;
610 bj = getSum(&lvec[0], j + 1);
611 baj = bj - (double) lvec[j] / 2.0;
612
613 if (j < l - 1) {
614 tmp = (double) nsum * mij - (double) ns[i] * bj;
615 innerSum = innerSum + (double) lvec[j] * tmp * tmp /
616 (bj * ((double) nsum - bj));
617 }
618
619 tmp = (double) nsum * maij - (double) ns[i] * baj;
620 aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp /
621 (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0);
622 }
623
624 adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/
625 adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */
626 }
627
628 /* k-sample Anderson-Darling test statistics in both original and
629 alternative versions, AkN2 and AakN2, are stored in the given
630 double array adk */
631 adk[0] = adk[0] / (double) nsum; /* AkN2*/
632 adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */
633
634 // /* free pointers */
635 // for (i = 0; i < k; i++) {
636 // free(samples[i]);
637 // }
638 // free(samples);
639
640}
641
642
643/*
644 Taken from (1) -- Named for 2 samples but implemented for K. Restricted to K = 2 by the class's constructors
645*/
647 pvalue = -1;
648 testStat = -1;
649 if (fTestSampleFromH0) {
650 MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
651 return;
652 }
653 std::vector<Double_t> z(fCombinedSamples);
654 // unique removes all consecutives duplicates elements. This is exactly what we wants
655 // for example unique of v={1,2,2,3,1,2,3,3} results in {1,2,3,1,2,3} which is exactly what we wants
656 std::vector<Double_t>::iterator endUnique = std::unique(z.begin(), z.end()); //z_j's in (1)
657 z.erase(endUnique, z.end() );
658 std::vector<UInt_t> h; // h_j's in (1)
659 std::vector<Double_t> H; // H_j's in (1)
660 UInt_t N = fCombinedSamples.size();
661 Double_t A2 = 0.0; // Anderson-Darling A^2 Test Statistic
662
663#ifdef USE_OLDIMPL
664
665 TStopwatch w; w.Start();
666
667 unsigned int nSamples = fSamples.size();
668
669 // old implementation
670 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
671 UInt_t n = std::count(fCombinedSamples.begin(), fCombinedSamples.end(), *data);
672 h.push_back(n);
673 H.push_back(std::count_if(fCombinedSamples.begin(), fCombinedSamples.end(),
674 std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
675 }
676 std::cout << "time for H";
677 w.Print();
678 w.Reset(); w.Start();
679 std::vector<std::vector<Double_t> > F(nSamples); // F_ij's in (1)
680 for (UInt_t i = 0; i < nSamples; ++i) {
681 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
682 UInt_t n = std::count(fSamples[i].begin(), fSamples[i].end(), *data);
683 F[i].push_back(std::count_if(fSamples[i].begin(), fSamples[i].end(),
684 std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
685 }
686 }
687 std::cout << "time for F";
688 w.Print();
689 for (UInt_t i = 0; i < nSamples; ++i) {
690 Double_t sum_result = 0.0;
691 UInt_t j = 0;
692 w.Reset(); w.Start();
693 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
694 sum_result += h[j] * TMath::Power(N * F[i][j]- fSamples[i].size() * H[j], 2) / (H[j] * (N - H[j]) - N * h[j] / 4.0);
695 ++j;
696 }
697 std::cout << "time for sum_resut";
698 w.Print();
699 std::cout << "sum_result " << sum_result << std::endl;
700 A2 += 1.0 / fSamples[i].size() * sum_result;
701 }
702 A2 *= (N - 1) / (TMath::Power(N, 2)); // A2_akN in (1)
703
704 std::cout << "A2 - old Bartolomeo code " << A2 << std::endl;
705#endif
706 // w.Reset();
707 // w.Start();
708
709 double adk[2] = {0,0};
710
711 //debug
712 // std::cout << "combined samples\n";
713 // for (int i = 0; i < fCombinedSamples.size(); ++i)
714 // std::cout << fCombinedSamples[i] << " ,";
715 // std::cout << std::endl;
716 // std::cout << ns[0] << " " << ns[1] << std::endl;
717 // std::cout << "Z\n";
718 // for (int i = 0; i < z.size(); ++i)
719 // std::cout << z[i] << " ,";
720 // std::cout << std::endl;
721
722 // use function from kSamples code
723 adkTestStat(adk, fSamples, z );
724 // w.Print();
725 // std::cout << "A2 - new kSamples code " << adk[0] << " " << adk[1] << std::endl;
726
727 A2 = adk[0];
728
729 // compute the normalized test statistic
730
731 std::vector<UInt_t> ns(fSamples.size());
732 for (unsigned int k = 0; k < ns.size(); ++k) ns[k] = fSamples[k].size();
733 Double_t sigmaN = GetSigmaN(ns, N);
734 A2 -= fSamples.size() - 1;
735 A2 /= sigmaN; // standartized test statistic
736
737 pvalue = PValueADKSamples(2,A2);
738 testStat = A2;
739 return;
740 }
741
742
743/*
744 Compute Anderson Darling test for two binned data set.
745 A binned data set can be seen as many identical observation happening at the center of the bin
746 In this way it is trivial to apply the formula (6) in the paper of W. Scholz, M. Stephens, "K-Sample Anderson-Darling Tests"
747 to the case of histograms. See also http://arxiv.org/pdf/0804.0380v1.pdf paragraph 3.3.5
748 It is importat that empty bins are not present
749*/
751 pvalue = -1;
752 testStat = -1;
753 //
754 // compute cumulative sum of bin counts
755 // std::vector<double> sum1(data1.Size() );
756 // std::vector<double> sum2(data2.Size() );
757 // std::vector<double> sumAll(data1.Size() + data2.Size() );
758
759 if (data1.NDim() != 1 && data2.NDim() != 1) {
760 MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Bin Data set must be one-dimensional ");
761 return;
762 }
763 unsigned int n1 = data1.Size();
764 unsigned int n2 = data2.Size();
765 double ntot1 = 0;
766 double ntot2 = 0;
767
768
769 // make a combined data set and sort it
770 std::vector<double> xdata(n1+n2);
771 for (unsigned int i = 0; i < n1; ++i) {
772 double value = 0;
773 const double * x = data1.GetPoint(i, value);
774 xdata[i] = *x;
775 ntot1 += value;
776 }
777 for (unsigned int i = 0; i < n2; ++i) {
778 double value = 0;
779 const double * x = data2.GetPoint(i, value);
780 xdata[n1+i] = *x;
781 ntot2 += value;
782 }
783 double nall = ntot1+ntot2;
784 // sort the combined data
785 std::vector<unsigned int> index(n1+n2);
786 TMath::Sort(n1+n2, &xdata[0], &index[0], false );
787
788 // now compute the sums for the tests
789 double sum1 = 0;
790 double sum2 = 0;
791 double sumAll = 0;
792 double adsum = 0;
793 unsigned int j = 0;
794
795 while( j < n1+n2 ) {
796// for (unsigned int j = 0; j < n1+n2; ++j) {
797 // skip equal observations
798 double x = xdata[ index[j] ];
799 unsigned int k = j;
800 // loop on the bins with the same center value
801 double t = 0;
802 do {
803 unsigned int i = index[k];
804 double value = 0;
805 if (i < n1 ) {
806 value = data1.Value(i);
807 sum1 += value;
808 }
809 else {
810 // from data2
811 i -= n1;
812 assert(i < n2);
813 value = data2.Value(i);
814 sum2 += value;
815 }
816 sumAll += value;
817 t += value;
818 //std::cout << "j " << j << " k " << k << " data " << x << " index " << index[k] << " value " << value << std::endl;
819 k++;
820 } while ( k < n1+n2 && xdata[ index[k] ] == x );
821
822
823 j = k;
824 // skip last point
825 if (j < n1+n2) {
826 double tmp1 = ( nall * sum1 - ntot1 * sumAll );
827 double tmp2 = ( nall * sum2 - ntot2 * sumAll );
828 adsum += t * (tmp1*tmp1/ntot1 + tmp2*tmp2/ntot2) / ( sumAll * (nall - sumAll) ) ;
829
830 //std::cout << "comp sum " << adsum << " " << t << " " << sumAll << " s1 " << sum1 << " s2 " << sum2 << " tmp1 " << tmp1 << " tmp2 " << tmp2 << std::endl;
831 }
832 }
833 double A2 = adsum / nall;
834
835 // compute the normalized test statistic
836 std::vector<unsigned int> ns(2);
837 ns[0] = ntot1;
838 ns[1] = ntot2;
839 //std::cout << " ad2 = " << A2 << " nall " << nall;
840
841 Double_t sigmaN = GetSigmaN(ns,nall);
842 A2 -= 1;
843 A2 /= sigmaN; // standartized test statistic
844
845 //std::cout << " sigmaN " << sigmaN << " new A2 " << A2;
846
847 pvalue = PValueADKSamples(2,A2);
848 //std::cout << " pvalue = " << pvalue << std::endl;
849 testStat = A2;
850 return;
851 }
852
853
855 Double_t pvalue, testStat;
856 AndersonDarling2SamplesTest(pvalue, testStat);
857 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
858 }
859
860/*
861 Taken from (3)
862*/ void GoFTest::AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const {
863 pvalue = -1;
864 testStat = -1;
865 if (!fTestSampleFromH0) {
866 MATH_ERROR_MSG("AndersonDarlingTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
867 return;
868 }
869 if (fDist == kUndefined) {
870 MATH_ERROR_MSG("AndersonDarlingTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
871 return;
872 }
873 Double_t A2 = 0.0;
874 Int_t n = fSamples[0].size();
875 for (Int_t i = 0; i < n ; ++i) {
876 Double_t x1 = fSamples[0][i];
877 Double_t w1 = (*fCDF)(x1);
878 Double_t result = (2 * (i + 1) - 1) * TMath::Log(w1) + (2 * (n - (i + 1)) + 1) * TMath::Log(1 - w1);
879 A2 += result;
880 }
881 (A2 /= -n) -= n;
882 if (A2 != A2) {
883 MATH_ERROR_MSG("AndersonDarlingTest", "Cannot compute p-value: data below or above the distribution's thresholds. Check sample consistency.");
884 return;
885 }
886 pvalue = PValueAD1Sample(A2);
887 testStat = A2;
888 }
889
891 Double_t pvalue, testStat;
892 AndersonDarlingTest(pvalue, testStat);
893 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
894 }
895
897 pvalue = -1;
898 testStat = -1;
899 if (fTestSampleFromH0) {
900 MATH_ERROR_MSG("KolmogorovSmirnov2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
901 return;
902 }
903 const UInt_t na = fSamples[0].size();
904 const UInt_t nb = fSamples[1].size();
905 std::vector<Double_t> a(na);
906 std::vector<Double_t> b(nb);
907 std::copy(fSamples[0].begin(), fSamples[0].end(), a.begin());
908 std::copy(fSamples[1].begin(), fSamples[1].end(), b.begin());
909 pvalue = TMath::KolmogorovTest(na, a.data(), nb, b.data(), 0);
910 testStat = TMath::KolmogorovTest(na, a.data(), nb, b.data(), "M");
911 }
912
914 Double_t pvalue, testStat;
915 KolmogorovSmirnov2SamplesTest(pvalue, testStat);
916 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
917 }
918
919/*
920 Algorithm taken from (3) in page 737
921*/ void GoFTest::KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const {
922 pvalue = -1;
923 testStat = -1;
924 if (!fTestSampleFromH0) {
925 MATH_ERROR_MSG("KolmogorovSmirnovTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
926 return;
927 }
928 if (fDist == kUndefined) {
929 MATH_ERROR_MSG("KolmogorovSmirnovTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
930 return;
931 }
932 Double_t Fo = 0.0, Dn = 0.0;
933 UInt_t n = fSamples[0].size();
934 for (UInt_t i = 0; i < n; ++i) {
935 Double_t Fn = (i + 1.0) / n;
936 Double_t F = (*fCDF)(fSamples[0][i]);
937 Double_t result = std::max(TMath::Abs(Fn - F), TMath::Abs(Fo - Fn));
938 if (result > Dn) Dn = result;
939 Fo = Fn;
940 }
941 pvalue = TMath::KolmogorovProb(Dn * (TMath::Sqrt(n) + 0.12 + 0.11 / TMath::Sqrt(n)));
942 testStat = Dn;
943 }
944
946 Double_t pvalue, testStat;
947 KolmogorovSmirnovTest(pvalue, testStat);
948 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
949 }
950
951
952
953
954
955} // ROOT namespace
956} // Math namespace
957
#define MATH_ERROR_MSG(loc, str)
Definition: Error.h:82
#define MATH_WARN_MSG(loc, str)
Definition: Error.h:79
#define d(i)
Definition: RSha256.hxx:102
#define b(i)
Definition: RSha256.hxx:100
#define f(i)
Definition: RSha256.hxx:104
#define c(i)
Definition: RSha256.hxx:101
#define g(i)
Definition: RSha256.hxx:105
#define h(i)
Definition: RSha256.hxx:106
static const double x1[5]
#define M_PI
Definition: Rotated.cxx:105
int Int_t
Definition: RtypesCore.h:41
char Char_t
Definition: RtypesCore.h:29
unsigned int UInt_t
Definition: RtypesCore.h:42
const Bool_t kFALSE
Definition: RtypesCore.h:88
bool Bool_t
Definition: RtypesCore.h:59
double Double_t
Definition: RtypesCore.h:55
const Bool_t kTRUE
Definition: RtypesCore.h:87
#define N
float xmin
Definition: THbookFile.cxx:93
float xmax
Definition: THbookFile.cxx:93
double pow(double, double)
double exp(double)
double log(double)
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition: BinData.h:53
const double * GetPoint(unsigned int ipoint, double &value) const
retrieve at the same time a pointer to the coordinate data and the fit value More efficient than call...
Definition: BinData.h:370
double Value(unsigned int ipoint) const
return the value for the given fit point
Definition: BinData.h:217
unsigned int Size() const
return number of fit points
Definition: FitData.h:303
unsigned int NDim() const
return coordinate data dimension
Definition: FitData.h:311
static Double_t PValueADKSamples(UInt_t nsamples, Double_t A2)
Definition: GoFTest.cxx:353
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:208
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition: GoFTest.cxx:264
std::unique_ptr< IGenFunction > fCDF
Definition: GoFTest.h:181
Bool_t fTestSampleFromH0
Definition: GoFTest.h:193
EDistribution fDist
Definition: GoFTest.h:184
void SetSamples(std::vector< const Double_t * > samples, const std::vector< UInt_t > samplesSizes)
Definition: GoFTest.cxx:180
Double_t fSigma
Definition: GoFTest.h:187
std::vector< Double_t > fCombinedSamples
Definition: GoFTest.h:189
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:921
void SetDistribution(EDistribution dist)
Definition: GoFTest.cxx:123
void Instantiate(const Double_t *sample, UInt_t sampleSize)
Definition: GoFTest.cxx:276
virtual ~GoFTest()
Definition: GoFTest.cxx:178
Double_t GaussianCDF(Double_t x) const
Definition: GoFTest.cxx:294
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:646
static Double_t GetSigmaN(const std::vector< UInt_t > &ns, UInt_t N)
Definition: GoFTest.cxx:311
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:896
std::vector< std::vector< Double_t > > fSamples
Definition: GoFTest.h:191
Double_t PValueAD1Sample(Double_t A2) const
Definition: GoFTest.cxx:483
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:862
Double_t ExponentialCDF(Double_t x) const
Definition: GoFTest.cxx:298
Double_t fMean
Definition: GoFTest.h:186
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition: IFunction.h:135
Template class to wrap any member function of a class taking a double and returning a double in a 1D ...
Stopwatch class.
Definition: TStopwatch.h:28
void Start(Bool_t reset=kTRUE)
Start the stopwatch.
Definition: TStopwatch.cxx:58
void Reset()
Definition: TStopwatch.h:52
void Print(Option_t *option="") const
Print the real and cpu time passed between the start and stop events.
Definition: TStopwatch.cxx:219
double normal_cdf(double x, double sigma=1, double x0=0)
Cumulative distribution function of the normal (Gaussian) distribution (lower tail).
double exponential_cdf(double x, double lambda, double x0=0)
Cumulative distribution function of the exponential distribution (lower tail).
Double_t x[n]
Definition: legend1.C:17
const Int_t n
Definition: legend1.C:16
#define F(x, y, z)
#define H(x, y, z)
Namespace for new Math classes and functions.
double dist(Rotation3D const &r1, Rotation3D const &r2)
Definition: 3DDistances.cxx:48
void adkTestStat(double *adk, const std::vector< std::vector< double > > &samples, const std::vector< double > &zstar)
Definition: GoFTest.cxx:546
int getCount(double z, const double *dat, int n)
Definition: GoFTest.cxx:520
int getSum(const int *x, int n)
Definition: GoFTest.cxx:534
IBaseFunctionOneDim IGenFunction
Definition: IFunctionfwd.h:37
double inner_product(const LAVector &, const LAVector &)
void function(const Char_t *name_, T fun, const Char_t *docstring=0)
Definition: RExports.h:151
Namespace for new ROOT classes and functions.
Definition: StringConv.hxx:21
static constexpr double ns
Double_t KolmogorovTest(Int_t na, const Double_t *a, Int_t nb, const Double_t *b, Option_t *option)
Statistical test whether two one-dimensional sets of points are compatible with coming from the same ...
Definition: TMath.cxx:789
Double_t Log(Double_t x)
Definition: TMath.h:748
Double_t Sqrt(Double_t x)
Definition: TMath.h:679
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Definition: TMath.h:723
Double_t KolmogorovProb(Double_t z)
Calculates the Kolmogorov distribution function,.
Definition: TMath.cxx:663
void Sort(Index n, const Element *a, Index *index, Bool_t down=kTRUE)
Definition: TMathBase.h:362
Short_t Abs(Short_t d)
Definition: TMathBase.h:120
STL namespace.
Definition: test.py:1
auto * l
Definition: textangle.C:4
auto * a
Definition: textangle.C:12
static long int sum(long int i)
Definition: Factory.cxx:2258