Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
GoFTest.cxx
Go to the documentation of this file.
1// @(#)root/mathcore:$Id$
2// Authors: Bartolomeu Rabacal 05/2010
3/**********************************************************************
4 * *
5 * Copyright (c) 2006 , LCG ROOT MathLib Team *
6 * *
7 * *
8 **********************************************************************/
9// implementation file for GoFTest
10
11
12#include <algorithm>
13#include <functional>
14#include <iostream>
15#include <map>
16#include <numeric>
17#include <string.h>
18#include <cassert>
19
20#include "Math/Error.h"
21#include "Math/Math.h"
22#include "Math/IFunction.h"
23#include "Math/IFunctionfwd.h"
24#include "Math/Integrator.h"
27
28#include "Math/GoFTest.h"
29
30#include "Fit/BinData.h"
31
32#include "TStopwatch.h"
33
34/* Note: The references mentioned here are stated in GoFTest.h */
35
36namespace ROOT {
37namespace Math {
38
39 struct CDFWrapper : public IGenFunction {
40 // wrapper around a cdf funciton to re-scale for the range
41 Double_t fXmin; // lower range for x
42 Double_t fXmax; // lower range for x
43 Double_t fNorm; // normalization
44 const IGenFunction* fCDF; // cdf pointer (owned by the class)
45
46
47 virtual ~CDFWrapper() { if (fCDF) delete fCDF; }
48
50 fCDF(cdf.Clone())
51 {
52 if (xmin >= xmax) {
53 fNorm = 1;
54 fXmin = -std::numeric_limits<double>::infinity();
55 fXmax = std::numeric_limits<double>::infinity();
56 }
57 else {
58 fNorm = cdf(xmax) - cdf(xmin);
59 fXmin = xmin;
60 fXmax = xmax;
61 }
62 }
63
65 if (x <= fXmin) return 0;
66 if (x >= fXmax) return 1.0;
67 return (*fCDF)(x)/fNorm;
68 }
69
71 return new CDFWrapper(*fCDF,fXmin,fXmax);
72 }
73 };
74
75
76 class PDFIntegral : public IGenFunction {
77 Double_t fXmin; // lower range for x
78 Double_t fXmax; // lower range for x
79 Double_t fNorm; // normalization
81 const IGenFunction* fPDF; // pdf pointer (owned by the class)
82 public:
83
84 virtual ~PDFIntegral() { if (fPDF) delete fPDF; }
85
87 fXmin(xmin),
88 fXmax(xmax),
89 fNorm(1),
90 fPDF(pdf.Clone())
91 {
92 // compute normalization
93 fIntegral.SetFunction(*fPDF); // N.B. must be fPDF (the cloned copy) and not pdf which can disappear
94 if (fXmin >= fXmax) {
95 fXmin = -std::numeric_limits<double>::infinity();
96 fXmax = std::numeric_limits<double>::infinity();
97 }
98 if (fXmin == -std::numeric_limits<double>::infinity() && fXmax == std::numeric_limits<double>::infinity() ) {
100 }
101 else if (fXmin == -std::numeric_limits<double>::infinity() )
103 else if (fXmax == std::numeric_limits<double>::infinity() )
105 else
107 }
108
110 if (x <= fXmin) return 0;
111 if (x >= fXmax) return 1.0;
112 if (fXmin == -std::numeric_limits<double>::infinity() )
114 else
116 }
117
119 return new PDFIntegral(*fPDF, fXmin, fXmax);
120 }
121 };
122
123 void GoFTest::SetDistribution(EDistribution dist, const std::vector<double> & distParams ) {
124 if (!(kGaussian <= dist && dist <= kExponential)) {
125 MATH_ERROR_MSG("SetDistribution", "Cannot set distribution type! Distribution type option must be enabled.");
126 return;
127 }
128 fDist = dist;
129 SetParameters(distParams);
130 SetCDF();
131 }
132
133 GoFTest::GoFTest( UInt_t sample1Size, const Double_t* sample1, UInt_t sample2Size, const Double_t* sample2 )
134 : fDist(kUndefined),
135 fSamples(std::vector<std::vector<Double_t> >(2)),
136 fTestSampleFromH0(kFALSE) {
137 Bool_t badSampleArg = sample1 == 0 || sample1Size == 0;
138 if (badSampleArg) {
139 std::string msg = "'sample1";
140 msg += !sample1Size ? "Size' cannot be zero" : "' cannot be zero-length";
141 MATH_ERROR_MSG("GoFTest", msg.c_str());
142 assert(!badSampleArg);
143 }
144 badSampleArg = sample2 == 0 || sample2Size == 0;
145 if (badSampleArg) {
146 std::string msg = "'sample2";
147 msg += !sample2Size ? "Size' cannot be zero" : "' cannot be zero-length";
148 MATH_ERROR_MSG("GoFTest", msg.c_str());
149 assert(!badSampleArg);
150 }
151 std::vector<const Double_t*> samples(2);
152 std::vector<UInt_t> samplesSizes(2);
153 samples[0] = sample1;
154 samples[1] = sample2;
155 samplesSizes[0] = sample1Size;
156 samplesSizes[1] = sample2Size;
157 SetSamples(samples, samplesSizes);
158 }
159
160 GoFTest::GoFTest(UInt_t sampleSize, const Double_t* sample, EDistribution dist, const std::vector<double> & distParams)
161 : fDist(dist),
162 fSamples(std::vector<std::vector<Double_t> >(1)),
163 fTestSampleFromH0(kTRUE) {
164 Bool_t badSampleArg = sample == 0 || sampleSize == 0;
165 if (badSampleArg) {
166 std::string msg = "'sample";
167 msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
168 MATH_ERROR_MSG("GoFTest", msg.c_str());
169 assert(!badSampleArg);
170 }
171 std::vector<const Double_t*> samples(1, sample);
172 std::vector<UInt_t> samplesSizes(1, sampleSize);
173 SetSamples(samples, samplesSizes);
174 SetParameters(distParams);
175 SetCDF();
176 }
177
179
180 void GoFTest::SetSamples(std::vector<const Double_t*> samples, const std::vector<UInt_t> samplesSizes) {
181 fCombinedSamples.assign(std::accumulate(samplesSizes.begin(), samplesSizes.end(), 0u), 0.0);
182 UInt_t combinedSamplesSize = 0;
183 for (UInt_t i = 0; i < samples.size(); ++i) {
184 fSamples[i].assign(samples[i], samples[i] + samplesSizes[i]);
185 std::sort(fSamples[i].begin(), fSamples[i].end());
186 for (UInt_t j = 0; j < samplesSizes[i]; ++j) {
187 fCombinedSamples[combinedSamplesSize + j] = samples[i][j];
188 }
189 combinedSamplesSize += samplesSizes[i];
190 }
191 std::sort(fCombinedSamples.begin(), fCombinedSamples.end());
192
193 Bool_t degenerateSamples = *(fCombinedSamples.begin()) == *(fCombinedSamples.end() - 1);
194 if (degenerateSamples) {
195 std::string msg = "Degenerate sample";
196 msg += samplesSizes.size() > 1 ? "s!" : "!";
197 msg += " Sampling values all identical.";
198 MATH_ERROR_MSG("SetSamples", msg.c_str());
199 assert(!degenerateSamples);
200 }
201 }
202
203 void GoFTest::SetParameters(const std::vector<double> & distParams) {
204 fParams = distParams;
205 }
206
207 void GoFTest::operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const {
208 switch (test) {
209 default:
210 case kAD:
211 AndersonDarlingTest(pvalue, testStat);
212 break;
213 case kAD2s:
214 AndersonDarling2SamplesTest(pvalue, testStat);
215 break;
216 case kKS:
217 KolmogorovSmirnovTest(pvalue, testStat);
218 break;
219 case kKS2s:
220 KolmogorovSmirnov2SamplesTest(pvalue, testStat);
221 }
222 }
223
225 Double_t result = 0.0;
226 switch (test) {
227 default:
228 case kAD:
229 result = AndersonDarlingTest(option);
230 break;
231 case kAD2s:
232 result = AndersonDarling2SamplesTest(option);
233 break;
234 case kKS:
235 result = KolmogorovSmirnovTest(option);
236 break;
237 case kKS2s:
238 result = KolmogorovSmirnov2SamplesTest(option);
239 }
240 return result;
241 }
242
243 void GoFTest::SetCDF() { // Setting parameter-free distributions
244 IGenFunction* cdf = 0;
245 switch (fDist) {
246 case kLogNormal:
247 LogSample();
248 if (fParams.empty()) fParams = {0,1};
249 /* fall through */
250 case kGaussian :
252 if (fParams.empty()) fParams = {0,1};
253 break;
254 case kExponential:
256 if (fParams.empty()) fParams = {1};
257 break;
258 case kUserDefined:
259 case kUndefined:
260 default:
261 break;
262 }
263 fCDF.reset(cdf);
264 }
265
267 if (fDist > kUserDefined) {
268 MATH_WARN_MSG("SetDistributionFunction","Distribution type is changed to user defined");
269 }
271 // function will be cloned inside the wrapper PDFIntegral of CDFWrapper classes
272 if (isPDF)
273 fCDF.reset(new PDFIntegral(f, xmin, xmax) );
274 else
275 fCDF.reset(new CDFWrapper(f, xmin, xmax) );
276 }
277
278 void GoFTest::Instantiate(const Double_t* sample, UInt_t sampleSize) {
279 // initialization function for the template constructors
280 Bool_t badSampleArg = sample == 0 || sampleSize == 0;
281 if (badSampleArg) {
282 std::string msg = "'sample";
283 msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
284 MATH_ERROR_MSG("GoFTest", msg.c_str());
285 assert(!badSampleArg);
286 }
287 fCDF.reset((IGenFunction*)0);
289 fSamples = std::vector<std::vector<Double_t> >(1);
291 SetSamples(std::vector<const Double_t*>(1, sample), std::vector<UInt_t>(1, sampleSize));
292 }
293
295 return ROOT::Math::normal_cdf(x, fParams[1], fParams[0]);
296 }
297
300 }
301
303 transform(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(),
304 std::function<Double_t(Double_t)>(TMath::Log));
305 }
306
307/*
308 Taken from (1)
309*/
310 Double_t GoFTest::GetSigmaN(const std::vector<UInt_t> & ns, UInt_t N) {
311 // compute moments of AD distribution (from Scholz-Stephen paper, paragraph 3)
312
313 Double_t sigmaN = 0.0, h = 0.0, H = 0.0, g = 0.0, a, b, c, d, k = ns.size();
314
315 for (UInt_t i = 0; i < ns.size(); ++i) {
316 H += 1.0 / double( ns[i] );
317 }
318
319 // use approximate formulas for large N
320 // cache Sum( 1 / i)
321 if (N < 2000) {
322 std::vector<double> invI(N);
323 for (UInt_t i = 1; i <= N - 1; ++i) {
324 invI[i] = 1.0 / i;
325 h += invI[i];
326 }
327 for (UInt_t i = 1; i <= N - 2; ++i) {
328 double tmp = invI[N-i];
329 for (UInt_t j = i + 1; j <= N - 1; ++j) {
330 g += tmp * invI[j];
331 }
332 }
333 }
334 else {
335 // for N larger than 2000 error difference in g is ~ 5 10^-3 while in h is at the level of 10^-5
336 const double emc = 0.5772156649015328606065120900824024; // Euler-Mascheroni constant
337 h = std::log(double(N-1) ) + emc;
338 g = (M_PI)*(M_PI)/6.0;
339 }
340 double k2 = std::pow(k,2);
341 a = (4 * g - 6) * k + (10 - 6 * g) * H - 4 * g + 6;
342 b = (2 * g - 4) * k2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
343 c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 *g + 6) * k + (2 * h - 6) * H + 4 * h;
344 d = (2 * h + 6) * k2 - 4 * h * k;
345 sigmaN += a * std::pow(double(N),3) + b * std::pow(double(N),2) + c * N + d;
346 sigmaN /= ( double(N - 1) * double(N - 2) * double(N - 3) );
347 sigmaN = TMath::Sqrt(sigmaN);
348 return sigmaN;
349 }
350
351
353
354 /*
355 Computation of p-values according to
356 "K-Sample Anderson-Darling Tests" by F.W. Scholz
357 and M.A. Stephens (1987), Journal of the American Statistical Association,
358 Vol 82, No. 399, pp 918-924.
359 Code from kSamples package from R (author F. Scholtz)
360
361 This function uses the upper T_m quantiles as obtained via simulation of
362 the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500
363 for each sample, and after standardization, in order to emulate the Table 1
364 values given in the above reference. However, here we estimate p-quantiles
365 for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,
366 .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999,
367 .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999
368 First the appropriate p-quantiles are determined from those simulated
369 for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m.
370 Since we use only m=2 we avoid this interpolation.
371
372 Next linear inetrpolation to find the observed p value given the observed test statistic value.
373 We use interpolation in the test statistic -> log((1-p)/p) domain
374 and we extrapolatelinearly) beyond p = .00001 and .99999.
375 */
376
377 // sample values
378 //double ms[] = { 1, 2, 3, 4, 6, 8, 10, TMath::Infinity() };
379 //int ns = ms.size();
380 const int ns = 8;
381 double ts[ ] = { -1.1954, -1.5806, -1.8172,
382 -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394,
383 -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166,
384 -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719,
385 -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348,
386 -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637,
387 -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761,
388 -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355,
389 -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251,
390 -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187,
391 -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491,
392 -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598,
393 -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816,
394 -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496,
395 -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987,
396 -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834,
397 -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844,
398 -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596,
399 0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348,
400 0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246,
401 0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254,
402 1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663,
403 1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975,
404 1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451,
405 2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423,
406 3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651,
407 3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714,
408 3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791,
409 3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657,
410 4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749,
411 4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642,
412 4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499,
413 5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806,
414 5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731,
415 6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567,
416 6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501,
417 6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283,
418 7.4418, 6.9524, 6.6195, 4.2649 };
419
420
421
422
423
424 // p values bins
425 double p[] = { .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,.1,.2,.3,.4,.5,.6,.7,.8,.9,
426 .925,.95,.975,.99,.9925,.995,.9975,.999,.99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 };
427
428 //int nbins = p.size();
429 const int nbins = 35;
430 //assert ( nbins*ns == ts.size() );
431
432 // get ts values for nsamples = 2
433 // corresponding value is for m=nsamples-1
434 int offset = 0; // for m = 1 (i.e. for nsamples = 2)
435 if (nsamples != 2) {
436 MATH_ERROR_MSG("InterpolatePValues", "Interpolation not implemented for nsamples not equal to 2");
437 return 0;
438 }
439 std::vector<double> ts2(nbins); // ts values for nsamples = 2
440 std::vector<double> lp(nbins); // log ( p / (1-p) )
441 for (int i = 0; i < nbins; ++i)
442 {
443 ts2[i] = ts[offset+ i * ns];
444 p[i] = 1.-p[i];
445 lp[i] = std::log( p[i]/(1.-p[i] ) );
446 }
447 // do linear interpolation to find right lp value for given observed test staistic value
448 //auto it = std::lower_bound(ts2.begin(), ts2.end(), tx );
449 int i1 = std::distance(ts2.begin(), std::lower_bound(ts2.begin(), ts2.end(), tx ) ) - 1;
450 int i2 = i1+1;
451 // if tx is before min of tabluated data
452 if (i1 < 0) {
453 i1 = 0;
454 i2 = 1;
455 }
456 // if tx is after max of tabulated data
457 if (i2 >= int(ts2.size()) ) {
458 i1 = ts2.size()-2;
459 i2 = ts2.size()-1;
460 }
461
462 //std::cout << i1 << " , " << i2 << std::endl;
463 assert(i1 < (int) lp.size() && i2 < (int) lp.size() );
464 double lp1 = lp[i1];
465 double lp2 = lp[i2];
466 double tx1 = ts2[i1];
467 double tx2 = ts2[i2];
468
469 //std::cout << " tx1,2 " << tx1 << " " << tx2 << std::endl;
470 /// find interpolated (or extrapolated value)(
471 double lp0 = (lp1-lp2) * (tx - tx2)/ ( tx1-tx2) + lp2;
472
473
474 double p0 = exp(lp0)/(1. + exp(lp0) );
475 return p0;
476
477 }
478
479
480/*
481 Taken from (2)
483 Double_t pvalue = 0.0;
484 if (A2 <= 0.0) {
485 return pvalue;
486 } else if (A2 < 2.) {
487 pvalue = std::pow(A2, -0.5) * std::exp(-1.2337141 / A2) * (2.00012 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * A2) * A2) * A2) * A2) * A2);
488 } else {
489 pvalue = std::exp(-1. * std::exp(1.0776 - (2.30695 - (0.43424 - (.082433 - (0.008056 - 0.0003146 * A2) * A2) * A2) * A2) * A2));
490 }
491 return 1. - pvalue;
492 }
493
494
495// code from kSamples (R) F. Scholz
496
497/* computes the k-sample Anderson-Darling test statistics in both original
498 and alternative versions for the nonparametric (rank) test described in
499 Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests,
500 Journal of the American Statistical Association, Vol 82, No. 399,
501 pp. 918-924
502
503 Arguments:
504 adk: double array with length 2, stores AkN2 and AakN2
505 k: integer, number of samples being compared
506 x: double array storing the concatenated samples in the same order as ns
507 ns: integer array storing the k sample sizes, corresponding to x
508 zstar: double array storing the l distinct ordered observations in the
509 pooled sample
510 l: integer, length of zstar
511
512 Outputs:
513 when the computation ends, AkN2 and AakN2 are stored in the given memory
514 pointed by adk
515*/
516
517/* counts and returns the number of occurrence of a given number
518 in a double array */
519int getCount(double z, const double *dat, int n) {
520 int i;
521 int count = 0;
522
523 for (i = 0; i < n; i++) {
524 if (dat[i] == z) {
525 count++;
526 }
527 }
528
529 return(count);
530}
531
532/* computes and returns the sum of elements in a given integer array */
533int getSum(const int *x, int n) {
534 int i;
535 int sum = 0;
536
537 for (i = 0; i < n; i++) {
538 sum += x[i];
539 }
540
541 return(sum);
542}
543
544
545void adkTestStat(double *adk, const std::vector<std::vector<double> > & samples, const std::vector<double> & zstar) {
546
547 int i;
548 int j;
549
550 int nsum; /* total sample size = n_1 + ... + n_k */
551 int k = samples.size();
552 int l = zstar.size();
553
554 /* fij records the number of observations in the ith sample coinciding
555 with zstar[j], where i = 1, ..., k, and j = 1, ..., l */
556 std::vector<int> fij (k*l);
557 /* lvec is an integer vector with length l,
558 whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity
559 of zstar[j] */
560 std::vector<int> lvec(l);
561
562 /* for computation */
563 double mij;
564 double maij;
565 double innerSum;
566 double aInnerSum;
567 double bj;
568 double baj;
569 double tmp;
570
571 /* samples is a two-dimensional double array with length k;
572 it stores an array of k pointers to double arrays which are
573 the k samples beeing compared */
574// double **samples;
575
576 /* dynamically allocate memory */
577 //std::vector< std::vector<double> > samples(k);
578 std::vector<int> ns(k);
579 nsum = 0;
580 for (i = 0; i < k; i++) {
581 ns[i] = samples[i].size();
582 nsum += ns[i];
583 }
584
585 /* fij: k*l integer matrix, where l is the length of zstar and
586 k is the number of samples being compared
587 lvec: integer vector of length l, records the multiplicity of
588 each element of zstar */
589 for (j = 0; j < l; j++) {
590 lvec[j] = 0;
591 for (i = 0; i < k; i++) {
592 fij[i + j*k] = getCount(zstar[j], &samples[i][0], ns[i]);
593 lvec[j] += fij[i + j*k];
594 }
595 }
596
597 // loop on samples to compute the adk's
598 // Formula (6) and (7) of the paper
599 adk[0] = adk[1] = 0;
600 for (i = 0; i < k; i++) {
601 mij = 0;
602 maij = 0;
603 innerSum = 0;
604 aInnerSum = 0;
605
606 for (j = 0; j < l; j++) {
607 mij += fij[i + j*k];
608 maij = mij - (double) fij[i + j*k] / 2.0;
609 bj = getSum(&lvec[0], j + 1);
610 baj = bj - (double) lvec[j] / 2.0;
611
612 if (j < l - 1) {
613 tmp = (double) nsum * mij - (double) ns[i] * bj;
614 innerSum = innerSum + (double) lvec[j] * tmp * tmp /
615 (bj * ((double) nsum - bj));
616 }
617
618 tmp = (double) nsum * maij - (double) ns[i] * baj;
619 aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp /
620 (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0);
621 }
622
623 adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/
624 adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */
625 }
626
627 /* k-sample Anderson-Darling test statistics in both original and
628 alternative versions, AkN2 and AakN2, are stored in the given
629 double array adk */
630 adk[0] = adk[0] / (double) nsum; /* AkN2*/
631 adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */
632
633 // /* free pointers */
634 // for (i = 0; i < k; i++) {
635 // free(samples[i]);
636 // }
637 // free(samples);
638
639}
640
641
642/*
643 Taken from (1) -- Named for 2 samples but implemented for K. Restricted to K = 2 by the class's constructors
644*/
646 pvalue = -1;
647 testStat = -1;
648 if (fTestSampleFromH0) {
649 MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
650 return;
651 }
652 std::vector<Double_t> z(fCombinedSamples);
653 // unique removes all consecutives duplicates elements. This is exactly what we wants
654 // for example unique of v={1,2,2,3,1,2,3,3} results in {1,2,3,1,2,3} which is exactly what we wants
655 std::vector<Double_t>::iterator endUnique = std::unique(z.begin(), z.end()); //z_j's in (1)
656 z.erase(endUnique, z.end() );
657 std::vector<UInt_t> h; // h_j's in (1)
658 std::vector<Double_t> H; // H_j's in (1)
659 UInt_t N = fCombinedSamples.size();
660 Double_t A2 = 0.0; // Anderson-Darling A^2 Test Statistic
661
662#ifdef USE_OLDIMPL
663
664 TStopwatch w; w.Start();
665
666 unsigned int nSamples = fSamples.size();
667
668 // old implementation
669 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
670 UInt_t n = std::count(fCombinedSamples.begin(), fCombinedSamples.end(), *data);
671 h.push_back(n);
672 H.push_back(std::count_if(fCombinedSamples.begin(), fCombinedSamples.end(),
673 std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
674 }
675 std::cout << "time for H";
676 w.Print();
677 w.Reset(); w.Start();
678 std::vector<std::vector<Double_t> > F(nSamples); // F_ij's in (1)
679 for (UInt_t i = 0; i < nSamples; ++i) {
680 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
681 UInt_t n = std::count(fSamples[i].begin(), fSamples[i].end(), *data);
682 F[i].push_back(std::count_if(fSamples[i].begin(), fSamples[i].end(),
683 std::bind(std::less<Double_t>(), std::placeholders::_1, *data)) + n / 2.);
684 }
685 }
686 std::cout << "time for F";
687 w.Print();
688 for (UInt_t i = 0; i < nSamples; ++i) {
689 Double_t sum_result = 0.0;
690 UInt_t j = 0;
691 w.Reset(); w.Start();
692 for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
693 sum_result += h[j] * TMath::Power(N * F[i][j]- fSamples[i].size() * H[j], 2) / (H[j] * (N - H[j]) - N * h[j] / 4.0);
694 ++j;
695 }
696 std::cout << "time for sum_resut";
697 w.Print();
698 std::cout << "sum_result " << sum_result << std::endl;
699 A2 += 1.0 / fSamples[i].size() * sum_result;
700 }
701 A2 *= (N - 1) / (TMath::Power(N, 2)); // A2_akN in (1)
702
703 std::cout << "A2 - old Bartolomeo code " << A2 << std::endl;
704#endif
705 // w.Reset();
706 // w.Start();
707
708 double adk[2] = {0,0};
709
710 //debug
711 // std::cout << "combined samples\n";
712 // for (int i = 0; i < fCombinedSamples.size(); ++i)
713 // std::cout << fCombinedSamples[i] << " ,";
714 // std::cout << std::endl;
715 // std::cout << ns[0] << " " << ns[1] << std::endl;
716 // std::cout << "Z\n";
717 // for (int i = 0; i < z.size(); ++i)
718 // std::cout << z[i] << " ,";
719 // std::cout << std::endl;
720
721 // use function from kSamples code
722 adkTestStat(adk, fSamples, z );
723 // w.Print();
724 // std::cout << "A2 - new kSamples code " << adk[0] << " " << adk[1] << std::endl;
725
726 A2 = adk[0];
727
728 // compute the normalized test statistic
729
730 std::vector<UInt_t> ns(fSamples.size());
731 for (unsigned int k = 0; k < ns.size(); ++k) ns[k] = fSamples[k].size();
732 Double_t sigmaN = GetSigmaN(ns, N);
733 A2 -= fSamples.size() - 1;
734 A2 /= sigmaN; // standartized test statistic
735
736 pvalue = PValueADKSamples(2,A2);
737 testStat = A2;
738 return;
739 }
740
741
742/*
743 Compute Anderson Darling test for two binned data set.
744 A binned data set can be seen as many identical observation happening at the center of the bin
745 In this way it is trivial to apply the formula (6) in the paper of W. Scholz, M. Stephens, "K-Sample Anderson-Darling Tests"
746 to the case of histograms. See also http://arxiv.org/pdf/0804.0380v1.pdf paragraph 3.3.5
747 It is importat that empty bins are not present
748*/
750 pvalue = -1;
751 testStat = -1;
752 //
753 // compute cumulative sum of bin counts
754 // std::vector<double> sum1(data1.Size() );
755 // std::vector<double> sum2(data2.Size() );
756 // std::vector<double> sumAll(data1.Size() + data2.Size() );
757
758 if (data1.NDim() != 1 && data2.NDim() != 1) {
759 MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Bin Data set must be one-dimensional ");
760 return;
761 }
762 unsigned int n1 = data1.Size();
763 unsigned int n2 = data2.Size();
764 double ntot1 = 0;
765 double ntot2 = 0;
766
767
768 // make a combined data set and sort it
769 std::vector<double> xdata(n1+n2);
770 for (unsigned int i = 0; i < n1; ++i) {
771 double value = 0;
772 const double * x = data1.GetPoint(i, value);
773 xdata[i] = *x;
774 ntot1 += value;
775 }
776 for (unsigned int i = 0; i < n2; ++i) {
777 double value = 0;
778 const double * x = data2.GetPoint(i, value);
779 xdata[n1+i] = *x;
780 ntot2 += value;
781 }
782 double nall = ntot1+ntot2;
783 // sort the combined data
784 std::vector<unsigned int> index(n1+n2);
785 TMath::Sort(n1+n2, &xdata[0], &index[0], false );
786
787 // now compute the sums for the tests
788 double sum1 = 0;
789 double sum2 = 0;
790 double sumAll = 0;
791 double adsum = 0;
792 unsigned int j = 0;
793
794 while( j < n1+n2 ) {
795// for (unsigned int j = 0; j < n1+n2; ++j) {
796 // skip equal observations
797 double x = xdata[ index[j] ];
798 unsigned int k = j;
799 // loop on the bins with the same center value
800 double t = 0;
801 do {
802 unsigned int i = index[k];
803 double value = 0;
804 if (i < n1 ) {
805 value = data1.Value(i);
806 sum1 += value;
807 }
808 else {
809 // from data2
810 i -= n1;
811 assert(i < n2);
812 value = data2.Value(i);
813 sum2 += value;
814 }
815 sumAll += value;
816 t += value;
817 //std::cout << "j " << j << " k " << k << " data " << x << " index " << index[k] << " value " << value << std::endl;
818 k++;
819 } while ( k < n1+n2 && xdata[ index[k] ] == x );
820
821
822 j = k;
823 // skip last point
824 if (j < n1+n2) {
825 double tmp1 = ( nall * sum1 - ntot1 * sumAll );
826 double tmp2 = ( nall * sum2 - ntot2 * sumAll );
827 adsum += t * (tmp1*tmp1/ntot1 + tmp2*tmp2/ntot2) / ( sumAll * (nall - sumAll) ) ;
828
829 //std::cout << "comp sum " << adsum << " " << t << " " << sumAll << " s1 " << sum1 << " s2 " << sum2 << " tmp1 " << tmp1 << " tmp2 " << tmp2 << std::endl;
830 }
831 }
832 double A2 = adsum / nall;
833
834 // compute the normalized test statistic
835 std::vector<unsigned int> ns(2);
836 ns[0] = ntot1;
837 ns[1] = ntot2;
838 //std::cout << " ad2 = " << A2 << " nall " << nall;
839
840 Double_t sigmaN = GetSigmaN(ns,nall);
841 A2 -= 1;
842 A2 /= sigmaN; // standartized test statistic
843
844 //std::cout << " sigmaN " << sigmaN << " new A2 " << A2;
845
846 pvalue = PValueADKSamples(2,A2);
847 //std::cout << " pvalue = " << pvalue << std::endl;
848 testStat = A2;
849 return;
850 }
851
852
854 Double_t pvalue, testStat;
855 AndersonDarling2SamplesTest(pvalue, testStat);
856 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
857 }
858
859/*
860 Taken from (3)
861*/ void GoFTest::AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const {
862 pvalue = -1;
863 testStat = -1;
864 if (!fTestSampleFromH0) {
865 MATH_ERROR_MSG("AndersonDarlingTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
866 return;
867 }
868 if (fDist == kUndefined) {
869 MATH_ERROR_MSG("AndersonDarlingTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
870 return;
871 }
872 Double_t A2 = 0.0;
873 Int_t n = fSamples[0].size();
874 for (Int_t i = 0; i < n ; ++i) {
875 Double_t x1 = fSamples[0][i];
876 Double_t w1 = (*fCDF)(x1);
877 Double_t result = (2 * (i + 1) - 1) * TMath::Log(w1) + (2 * (n - (i + 1)) + 1) * TMath::Log(1 - w1);
878 A2 += result;
879 }
880 (A2 /= -n) -= n;
881 if (A2 != A2) {
882 MATH_ERROR_MSG("AndersonDarlingTest", "Cannot compute p-value: data below or above the distribution's thresholds. Check sample consistency.");
883 return;
884 }
885 pvalue = PValueAD1Sample(A2);
886 testStat = A2;
887 }
888
890 Double_t pvalue, testStat;
891 AndersonDarlingTest(pvalue, testStat);
892 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
893 }
894
896 pvalue = -1;
897 testStat = -1;
898 if (fTestSampleFromH0) {
899 MATH_ERROR_MSG("KolmogorovSmirnov2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
900 return;
901 }
902 const UInt_t na = fSamples[0].size();
903 const UInt_t nb = fSamples[1].size();
904 std::vector<Double_t> a(na);
905 std::vector<Double_t> b(nb);
906 std::copy(fSamples[0].begin(), fSamples[0].end(), a.begin());
907 std::copy(fSamples[1].begin(), fSamples[1].end(), b.begin());
908 pvalue = TMath::KolmogorovTest(na, a.data(), nb, b.data(), 0);
909 testStat = TMath::KolmogorovTest(na, a.data(), nb, b.data(), "M");
910 }
911
913 Double_t pvalue, testStat;
914 KolmogorovSmirnov2SamplesTest(pvalue, testStat);
915 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
916 }
917
918/*
919 Algorithm taken from (3) in page 737
920*/ void GoFTest::KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const {
921 pvalue = -1;
922 testStat = -1;
923 if (!fTestSampleFromH0) {
924 MATH_ERROR_MSG("KolmogorovSmirnovTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
925 return;
926 }
927 if (fDist == kUndefined) {
928 MATH_ERROR_MSG("KolmogorovSmirnovTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
929 return;
930 }
931 Double_t Fo = 0.0, Dn = 0.0;
932 UInt_t n = fSamples[0].size();
933 for (UInt_t i = 0; i < n; ++i) {
934 Double_t Fn = (i + 1.0) / n;
935 Double_t F = (*fCDF)(fSamples[0][i]);
936 Double_t result = std::max(TMath::Abs(Fn - F), TMath::Abs(Fo - F));
937 if (result > Dn) Dn = result;
938 Fo = Fn;
939 }
940 pvalue = TMath::KolmogorovProb(Dn * (TMath::Sqrt(n) + 0.12 + 0.11 / TMath::Sqrt(n)));
941 testStat = Dn;
942 }
943
945 Double_t pvalue, testStat;
946 KolmogorovSmirnovTest(pvalue, testStat);
947 return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
948 }
949
950
951
952
953
954} // ROOT namespace
955} // Math namespace
956
double
#define MATH_ERROR_MSG(loc, str)
Definition Error.h:83
#define MATH_WARN_MSG(loc, str)
Definition Error.h:80
#define d(i)
Definition RSha256.hxx:102
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define c(i)
Definition RSha256.hxx:101
#define g(i)
Definition RSha256.hxx:105
#define a(i)
Definition RSha256.hxx:99
#define h(i)
Definition RSha256.hxx:106
static const double x1[5]
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
#define M_PI
Definition Rotated.cxx:105
char Char_t
Definition RtypesCore.h:37
const Bool_t kFALSE
Definition RtypesCore.h:101
double Double_t
Definition RtypesCore.h:59
const Bool_t kTRUE
Definition RtypesCore.h:100
#define N
float xmin
float xmax
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition BinData.h:52
const double * GetPoint(unsigned int ipoint, double &value) const
retrieve at the same time a pointer to the coordinate data and the fit value More efficient than call...
Definition BinData.h:369
double Value(unsigned int ipoint) const
return the value for the given fit point
Definition BinData.h:216
unsigned int Size() const
return number of fit points
Definition FitData.h:303
unsigned int NDim() const
return coordinate data dimension
Definition FitData.h:311
static Double_t PValueADKSamples(UInt_t nsamples, Double_t A2)
Definition GoFTest.cxx:352
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:207
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition GoFTest.cxx:266
std::unique_ptr< IGenFunction > fCDF
Definition GoFTest.h:182
Bool_t fTestSampleFromH0
Definition GoFTest.h:192
EDistribution fDist
Definition GoFTest.h:185
void SetSamples(std::vector< const Double_t * > samples, const std::vector< UInt_t > samplesSizes)
Definition GoFTest.cxx:180
std::vector< Double_t > fCombinedSamples
The distribution parameters (e.g. fParams[0] = mean, fParams[1] = sigma for a Gaussian)
Definition GoFTest.h:188
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:920
std::vector< Double_t > fParams
Type of distribution.
Definition GoFTest.h:186
void Instantiate(const Double_t *sample, UInt_t sampleSize)
Definition GoFTest.cxx:278
void SetDistribution(EDistribution dist, const std::vector< double > &distParams={})
Definition GoFTest.cxx:123
Double_t GaussianCDF(Double_t x) const
Definition GoFTest.cxx:294
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:645
static Double_t GetSigmaN(const std::vector< UInt_t > &ns, UInt_t N)
Definition GoFTest.cxx:310
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:895
std::vector< std::vector< Double_t > > fSamples
Definition GoFTest.h:190
Double_t PValueAD1Sample(Double_t A2) const
Definition GoFTest.cxx:482
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Definition GoFTest.cxx:861
Double_t ExponentialCDF(Double_t x) const
Definition GoFTest.cxx:298
void SetParameters(const std::vector< double > &params)
Definition GoFTest.cxx:203
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition IFunction.h:135
User Class for performing numerical integration of a function in one dimension.
Definition Integrator.h:98
double IntegralUp(const IGenFunction &f, double a)
evaluate the Integral of a function f over the semi-infinite interval (a,+inf)
Definition Integrator.h:278
void SetFunction(Function &f)
method to set the a generic integration function
Definition Integrator.h:493
double Integral(Function &f, double a, double b)
evaluate the Integral of a function f over the defined interval (a,b)
Definition Integrator.h:500
double IntegralLow(const IGenFunction &f, double b)
evaluate the Integral of a function f over the over the semi-infinite interval (-inf,...
Definition Integrator.h:296
PDFIntegral(const IGenFunction &pdf, Double_t xmin=0, Double_t xmax=-1)
Definition GoFTest.cxx:86
Double_t DoEval(Double_t x) const
implementation of the evaluation function. Must be implemented by derived classes
Definition GoFTest.cxx:109
IGenFunction * Clone() const
Clone a function.
Definition GoFTest.cxx:118
const IGenFunction * fPDF
Definition GoFTest.cxx:81
IntegratorOneDim fIntegral
Definition GoFTest.cxx:80
Template class to wrap any member function of a class taking a double and returning a double in a 1D ...
Stopwatch class.
Definition TStopwatch.h:28
void Start(Bool_t reset=kTRUE)
Start the stopwatch.
void Reset()
Definition TStopwatch.h:52
void Print(Option_t *option="") const
Print the real and cpu time passed between the start and stop events.
double normal_cdf(double x, double sigma=1, double x0=0)
Cumulative distribution function of the normal (Gaussian) distribution (lower tail).
double exponential_cdf(double x, double lambda, double x0=0)
Cumulative distribution function of the exponential distribution (lower tail).
Double_t x[n]
Definition legend1.C:17
const Int_t n
Definition legend1.C:16
#define F(x, y, z)
#define H(x, y, z)
Namespace for new Math classes and functions.
void adkTestStat(double *adk, const std::vector< std::vector< double > > &samples, const std::vector< double > &zstar)
Definition GoFTest.cxx:545
int getCount(double z, const double *dat, int n)
Definition GoFTest.cxx:519
int getSum(const int *x, int n)
Definition GoFTest.cxx:533
tbb::task_arena is an alias of tbb::interface7::task_arena, which doesn't allow to forward declare tb...
Double_t KolmogorovTest(Int_t na, const Double_t *a, Int_t nb, const Double_t *b, Option_t *option)
Statistical test whether two one-dimensional sets of points are compatible with coming from the same ...
Definition TMath.cxx:782
Double_t Log(Double_t x)
Definition TMath.h:710
Double_t Sqrt(Double_t x)
Definition TMath.h:641
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Definition TMath.h:685
Double_t KolmogorovProb(Double_t z)
Calculates the Kolmogorov distribution function,.
Definition TMath.cxx:656
void Sort(Index n, const Element *a, Index *index, Bool_t down=kTRUE)
Definition TMathBase.h:358
Short_t Abs(Short_t d)
Definition TMathBase.h:120
Definition test.py:1
IGenFunction * Clone() const
Clone a function.
Definition GoFTest.cxx:70
Double_t DoEval(Double_t x) const
implementation of the evaluation function. Must be implemented by derived classes
Definition GoFTest.cxx:64
CDFWrapper(const IGenFunction &cdf, Double_t xmin=0, Double_t xmax=-1)
Definition GoFTest.cxx:49
const IGenFunction * fCDF
Definition GoFTest.cxx:44
auto * l
Definition textangle.C:4
static uint64_t sum(uint64_t i)
Definition Factory.cxx:2345