Logo ROOT   6.08/07
Reference Guide
GoFTest.cxx
Go to the documentation of this file.
1 // @(#)root/mathcore:$Id$
2 // Authors: Bartolomeu Rabacal 05/2010
3 /**********************************************************************
4  * *
5  * Copyright (c) 2006 , LCG ROOT MathLib Team *
6  * *
7  * *
8  **********************************************************************/
9 // implementation file for GoFTest
10 
11 
12 #include <algorithm>
13 #include <functional>
14 #include <iostream>
15 #include <map>
16 #include <numeric>
17 #include <string.h>
18 #include <cassert>
19 
20 #include "Math/Error.h"
21 #include "Math/Math.h"
22 #include "Math/IFunction.h"
23 #include "Math/IFunctionfwd.h"
24 #include "Math/Integrator.h"
25 #include "Math/ProbFuncMathCore.h"
26 #include "Math/WrappedFunction.h"
27 
28 #include "Math/GoFTest.h"
29 
30 #include "Fit/BinData.h"
31 
32 #include "TStopwatch.h"
33 
34 /* Note: The references mentioned here are stated in GoFTest.h */
35 
36 namespace ROOT {
37 namespace Math {
38 
39  struct CDFWrapper : public IGenFunction {
40  // wrapper around a cdf funciton to re-scale for the range
41  Double_t fXmin; // lower range for x
42  Double_t fXmax; // lower range for x
43  Double_t fNorm; // normalization
44  const IGenFunction* fCDF; // cdf pointer (owned by the class)
45 
46 
47  virtual ~CDFWrapper() { if (fCDF) delete fCDF; }
48 
49  CDFWrapper(const IGenFunction& cdf, Double_t xmin=0, Double_t xmax=-1) :
50  fCDF(cdf.Clone())
51  {
52  if (xmin >= xmax) {
53  fNorm = 1;
54  fXmin = -std::numeric_limits<double>::infinity();
55  fXmax = std::numeric_limits<double>::infinity();
56  }
57  else {
58  fNorm = cdf(xmax) - cdf(xmin);
59  fXmin = xmin;
60  fXmax = xmax;
61  }
62  }
63 
64  Double_t DoEval(Double_t x) const {
65  if (x <= fXmin) return 0;
66  if (x >= fXmax) return 1.0;
67  return (*fCDF)(x)/fNorm;
68  }
69 
70  IGenFunction* Clone() const {
71  return new CDFWrapper(*fCDF,fXmin,fXmax);
72  }
73  };
74 
75 
76  class PDFIntegral : public IGenFunction {
77  Double_t fXmin; // lower range for x
78  Double_t fXmax; // lower range for x
79  Double_t fNorm; // normalization
80  mutable IntegratorOneDim fIntegral;
81  const IGenFunction* fPDF; // pdf pointer (owned by the class)
82  public:
83 
84  virtual ~PDFIntegral() { if (fPDF) delete fPDF; }
85 
86  PDFIntegral(const IGenFunction& pdf, Double_t xmin = 0, Double_t xmax = -1) :
87  fXmin(xmin),
88  fXmax(xmax),
89  fNorm(1),
90  fPDF(pdf.Clone())
91  {
92  // compute normalization
93  fIntegral.SetFunction(*fPDF); // N.B. must be fPDF (the cloned copy) and not pdf which can disappear
94  if (fXmin >= fXmax) {
95  fXmin = -std::numeric_limits<double>::infinity();
96  fXmax = std::numeric_limits<double>::infinity();
97  }
98  if (fXmin == -std::numeric_limits<double>::infinity() && fXmax == std::numeric_limits<double>::infinity() ) {
99  fNorm = fIntegral.Integral();
100  }
101  else if (fXmin == -std::numeric_limits<double>::infinity() )
102  fNorm = fIntegral.IntegralLow(fXmax);
103  else if (fXmax == std::numeric_limits<double>::infinity() )
104  fNorm = fIntegral.IntegralUp(fXmin);
105  else
106  fNorm = fIntegral.Integral(fXmin, fXmax);
107  }
108 
109  Double_t DoEval(Double_t x) const {
110  if (x <= fXmin) return 0;
111  if (x >= fXmax) return 1.0;
112  if (fXmin == -std::numeric_limits<double>::infinity() )
113  return fIntegral.IntegralLow(x)/fNorm;
114  else
115  return fIntegral.Integral(fXmin,x)/fNorm;
116  }
117 
118  IGenFunction* Clone() const {
119  return new PDFIntegral(*fPDF, fXmin, fXmax);
120  }
121  };
122 
124  if (!(kGaussian <= dist && dist <= kExponential)) {
125  MATH_ERROR_MSG("SetDistribution", "Cannot set distribution type! Distribution type option must be ennabled.");
126  return;
127  }
128  fDist = dist;
129  SetCDF();
130  }
131 
132  GoFTest::GoFTest( UInt_t sample1Size, const Double_t* sample1, UInt_t sample2Size, const Double_t* sample2 )
133  : fDist(kUndefined),
134  fSamples(std::vector<std::vector<Double_t> >(2)),
135  fTestSampleFromH0(kFALSE) {
136  Bool_t badSampleArg = sample1 == 0 || sample1Size == 0;
137  if (badSampleArg) {
138  std::string msg = "'sample1";
139  msg += !sample1Size ? "Size' cannot be zero" : "' cannot be zero-length";
140  MATH_ERROR_MSG("GoFTest", msg.c_str());
141  assert(!badSampleArg);
142  }
143  badSampleArg = sample2 == 0 || sample2Size == 0;
144  if (badSampleArg) {
145  std::string msg = "'sample2";
146  msg += !sample2Size ? "Size' cannot be zero" : "' cannot be zero-length";
147  MATH_ERROR_MSG("GoFTest", msg.c_str());
148  assert(!badSampleArg);
149  }
150  std::vector<const Double_t*> samples(2);
151  std::vector<UInt_t> samplesSizes(2);
152  samples[0] = sample1;
153  samples[1] = sample2;
154  samplesSizes[0] = sample1Size;
155  samplesSizes[1] = sample2Size;
156  SetSamples(samples, samplesSizes);
157  SetParameters();
158  }
159 
161  : fDist(dist),
162  fSamples(std::vector<std::vector<Double_t> >(1)),
164  Bool_t badSampleArg = sample == 0 || sampleSize == 0;
165  if (badSampleArg) {
166  std::string msg = "'sample";
167  msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
168  MATH_ERROR_MSG("GoFTest", msg.c_str());
169  assert(!badSampleArg);
170  }
171  std::vector<const Double_t*> samples(1, sample);
172  std::vector<UInt_t> samplesSizes(1, sampleSize);
173  SetSamples(samples, samplesSizes);
174  SetParameters();
175  SetCDF();
176  }
177 
179 
180  void GoFTest::SetSamples(std::vector<const Double_t*> samples, const std::vector<UInt_t> samplesSizes) {
181  fCombinedSamples.assign(std::accumulate(samplesSizes.begin(), samplesSizes.end(), 0), 0.0);
182  UInt_t combinedSamplesSize = 0;
183  for (UInt_t i = 0; i < samples.size(); ++i) {
184  fSamples[i].assign(samples[i], samples[i] + samplesSizes[i]);
185  std::sort(fSamples[i].begin(), fSamples[i].end());
186  for (UInt_t j = 0; j < samplesSizes[i]; ++j) {
187  fCombinedSamples[combinedSamplesSize + j] = samples[i][j];
188  }
189  combinedSamplesSize += samplesSizes[i];
190  }
191  std::sort(fCombinedSamples.begin(), fCombinedSamples.end());
192 
193  Bool_t degenerateSamples = *(fCombinedSamples.begin()) == *(fCombinedSamples.end() - 1);
194  if (degenerateSamples) {
195  std::string msg = "Degenerate sample";
196  msg += samplesSizes.size() > 1 ? "s!" : "!";
197  msg += " Sampling values all identical.";
198  MATH_ERROR_MSG("SetSamples", msg.c_str());
199  assert(!degenerateSamples);
200  }
201  }
202 
204  fMean = std::accumulate(fSamples[0].begin(), fSamples[0].end(), 0.0) / fSamples[0].size();
205  fSigma = TMath::Sqrt(1. / (fSamples[0].size() - 1) * (std::inner_product(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), 0.0) - fSamples[0].size() * TMath::Power(fMean, 2)));
206  }
207 
208  void GoFTest::operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const {
209  switch (test) {
210  default:
211  case kAD:
212  AndersonDarlingTest(pvalue, testStat);
213  break;
214  case kAD2s:
215  AndersonDarling2SamplesTest(pvalue, testStat);
216  break;
217  case kKS:
218  KolmogorovSmirnovTest(pvalue, testStat);
219  break;
220  case kKS2s:
221  KolmogorovSmirnov2SamplesTest(pvalue, testStat);
222  }
223  }
224 
226  Double_t result = 0.0;
227  switch (test) {
228  default:
229  case kAD:
230  result = AndersonDarlingTest(option);
231  break;
232  case kAD2s:
233  result = AndersonDarling2SamplesTest(option);
234  break;
235  case kKS:
236  result = KolmogorovSmirnovTest(option);
237  break;
238  case kKS2s:
239  result = KolmogorovSmirnov2SamplesTest(option);
240  }
241  return result;
242  }
243 
244  void GoFTest::SetCDF() { // Setting parameter-free distributions
245  IGenFunction* cdf = 0;
246  switch (fDist) {
247  case kLogNormal:
248  LogSample();
249  case kGaussian :
251  break;
252  case kExponential:
254  break;
255  case kUserDefined:
256  case kUndefined:
257  default:
258  break;
259  }
260  fCDF.reset(cdf);
261  }
262 
264  if (fDist > kUserDefined) {
265  MATH_WARN_MSG("SetDistributionFunction","Distribution type is changed to user defined");
266  }
268  // function will be cloned inside the wrapper PDFIntegral of CDFWrapper classes
269  if (isPDF)
270  fCDF.reset(new PDFIntegral(f, xmin, xmax) );
271  else
272  fCDF.reset(new CDFWrapper(f, xmin, xmax) );
273  }
274 
276  // initialization function for the template constructors
277  Bool_t badSampleArg = sample == 0 || sampleSize == 0;
278  if (badSampleArg) {
279  std::string msg = "'sample";
280  msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
281  MATH_ERROR_MSG("GoFTest", msg.c_str());
282  assert(!badSampleArg);
283  }
284  fCDF.reset((IGenFunction*)0);
286  fMean = 0;
287  fSigma = 0;
288  fSamples = std::vector<std::vector<Double_t> >(1);
290  SetSamples(std::vector<const Double_t*>(1, sample), std::vector<UInt_t>(1, sampleSize));
291  }
292 
294  return ROOT::Math::normal_cdf(x, fSigma, fMean);
295  }
296 
298  return ROOT::Math::exponential_cdf(x, 1.0 / fMean);
299  }
300 
302  transform(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), std::ptr_fun<Double_t, Double_t>(TMath::Log));
303  SetParameters();
304  }
305 
306 /*
307  Taken from (1)
308 */
309  Double_t GoFTest::GetSigmaN(const std::vector<UInt_t> & ns, UInt_t N) {
310  // compute moments of AD distribution (from Scholz-Stephen paper, paragraph 3)
311 
312  Double_t sigmaN = 0.0, h = 0.0, H = 0.0, g = 0.0, a, b, c, d, k = ns.size();
313 
314  for (UInt_t i = 0; i < ns.size(); ++i) {
315  H += 1.0 / double( ns[i] );
316  }
317 
318  // use approximate formulas for large N
319  // cache Sum( 1 / i)
320  if (N < 2000) {
321  std::vector<double> invI(N);
322  for (UInt_t i = 1; i <= N - 1; ++i) {
323  invI[i] = 1.0 / i;
324  h += invI[i];
325  }
326  for (UInt_t i = 1; i <= N - 2; ++i) {
327  double tmp = invI[N-i];
328  for (UInt_t j = i + 1; j <= N - 1; ++j) {
329  g += tmp * invI[j];
330  }
331  }
332  }
333  else {
334  // for N larger than 2000 error difference in g is ~ 5 10^-3 while in h is at the level of 10^-5
335  const double emc = 0.5772156649015328606065120900824024; // Euler-Mascheroni constant
336  h = std::log(double(N-1) ) + emc;
337  g = (M_PI)*(M_PI)/6.0;
338  }
339  double k2 = std::pow(k,2);
340  a = (4 * g - 6) * k + (10 - 6 * g) * H - 4 * g + 6;
341  b = (2 * g - 4) * k2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
342  c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 *g + 6) * k + (2 * h - 6) * H + 4 * h;
343  d = (2 * h + 6) * k2 - 4 * h * k;
344  sigmaN += a * std::pow(double(N),3) + b * std::pow(double(N),2) + c * N + d;
345  sigmaN /= ( double(N - 1) * double(N - 2) * double(N - 3) );
346  sigmaN = TMath::Sqrt(sigmaN);
347  return sigmaN;
348  }
349 
350 
352 
353  /*
354  Computation of p-values according to
355  "K-Sample Anderson-Darling Tests" by F.W. Scholz
356  and M.A. Stephens (1987), Journal of the American Statistical Association,
357  Vol 82, No. 399, pp 918-924.
358  Code from kSamples package from R (author F. Scholtz)
359 
360  This function uses the upper T_m quantiles as obtained via simulation of
361  the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500
362  for each sample, and after standardization, in order to emulate the Table 1
363  values given in the above reference. However, here we estimate p-quantiles
364  for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,
365  .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999,
366  .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999
367  First the appropriate p-quantiles are determined from those simulated
368  for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m.
369  Since we use only m=2 we avoid this interpolation.
370 
371  Next linear inetrpolation to find the observed p value given the observed test statistic value.
372  We use interpolation in the test statistic -> log((1-p)/p) domain
373  and we extrapolatelinearly) beyond p = .00001 and .99999.
374  */
375 
376  // sample values
377  //double ms[] = { 1, 2, 3, 4, 6, 8, 10, TMath::Infinity() };
378  //int ns = ms.size();
379  const int ns = 8;
380  double ts[ ] = { -1.1954, -1.5806, -1.8172,
381  -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394,
382  -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166,
383  -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719,
384  -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348,
385  -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637,
386  -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761,
387  -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355,
388  -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251,
389  -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187,
390  -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491,
391  -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598,
392  -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816,
393  -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496,
394  -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987,
395  -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834,
396  -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844,
397  -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596,
398  0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348,
399  0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246,
400  0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254,
401  1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663,
402  1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975,
403  1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451,
404  2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423,
405  3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651,
406  3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714,
407  3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791,
408  3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657,
409  4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749,
410  4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642,
411  4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499,
412  5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806,
413  5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731,
414  6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567,
415  6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501,
416  6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283,
417  7.4418, 6.9524, 6.6195, 4.2649 };
418 
419 
420 
421 
422 
423  // p values bins
424  double p[] = { .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,.1,.2,.3,.4,.5,.6,.7,.8,.9,
425  .925,.95,.975,.99,.9925,.995,.9975,.999,.99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 };
426 
427  //int nbins = p.size();
428  const int nbins = 35;
429  //assert ( nbins*ns == ts.size() );
430 
431  // get ts values for nsamples = 2
432  // corresponding value is for m=nsamples-1
433  int offset = 0; // for m = 1 (i.e. for nsamples = 2)
434  if (nsamples != 2) {
435  MATH_ERROR_MSG("InterpolatePValues", "Interpolation not implemented for nsamples not equal to 2");
436  return 0;
437  }
438  std::vector<double> ts2(nbins); // ts values for nsamples = 2
439  std::vector<double> lp(nbins); // log ( p / (1-p) )
440  for (int i = 0; i < nbins; ++i)
441  {
442  ts2[i] = ts[offset+ i * ns];
443  p[i] = 1.-p[i];
444  lp[i] = std::log( p[i]/(1.-p[i] ) );
445  }
446  // do linear interpolation to find right lp value for given observed test staistic value
447  //auto it = std::lower_bound(ts2.begin(), ts2.end(), tx );
448  int i1 = std::distance(ts2.begin(), std::lower_bound(ts2.begin(), ts2.end(), tx ) ) - 1;
449  int i2 = i1+1;
450  // if tx is before min of tabluated data
451  if (i1 < 0) {
452  i1 = 0;
453  i2 = 1;
454  }
455  // if tx is after max of tabulated data
456  if (i2 >= int(ts2.size()) ) {
457  i1 = ts2.size()-2;
458  i2 = ts2.size()-1;
459  }
460 
461  //std::cout << i1 << " , " << i2 << std::endl;
462  assert(i1 < (int) lp.size() && i2 < (int) lp.size() );
463  double lp1 = lp[i1];
464  double lp2 = lp[i2];
465  double tx1 = ts2[i1];
466  double tx2 = ts2[i2];
467 
468  //std::cout << " tx1,2 " << tx1 << " " << tx2 << std::endl;
469  /// find interpolated (or extrapolated value)(
470  double lp0 = (lp1-lp2) * (tx - tx2)/ ( tx1-tx2) + lp2;
471 
472 
473  double p0 = exp(lp0)/(1. + exp(lp0) );
474  return p0;
475 
476  }
477 
478 
479 /*
480  Taken from (2)
482  Double_t pvalue = 0.0;
483  if (A2 <= 0.0) {
484  return pvalue;
485  } else if (A2 < 2.) {
486  pvalue = std::pow(A2, -0.5) * std::exp(-1.2337141 / A2) * (2.00012 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * A2) * A2) * A2) * A2) * A2);
487  } else {
488  pvalue = std::exp(-1. * std::exp(1.0776 - (2.30695 - (0.43424 - (.082433 - (0.008056 - 0.0003146 * A2) * A2) * A2) * A2) * A2));
489  }
490  return 1. - pvalue;
491  }
492 
493 
494 // code from kSamples (R) F. Scholz
495 
496 /* computes the k-sample Anderson-Darling test statistics in both original
497  and alternative versions for the nonparametric (rank) test described in
498  Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests,
499  Journal of the American Statistical Association, Vol 82, No. 399,
500  pp. 918-924
501 
502  Arguments:
503  adk: double array with length 2, stores AkN2 and AakN2
504  k: integer, number of samples being compared
505  x: double array storing the concatenated samples in the same order as ns
506  ns: integer array storing the k sample sizes, corresponding to x
507  zstar: double array storing the l distinct ordered observations in the
508  pooled sample
509  l: integer, length of zstar
510 
511  Outputs:
512  when the computation ends, AkN2 and AakN2 are stored in the given memory
513  pointed by adk
514 */
515 
516 /* counts and returns the number of occurrence of a given number
517  in a double array */
518 int getCount(double z, const double *dat, int n) {
519  int i;
520  int count = 0;
521 
522  for (i = 0; i < n; i++) {
523  if (dat[i] == z) {
524  count++;
525  }
526  }
527 
528  return(count);
529 }
530 
531 /* computes and returns the sum of elements in a given integer array */
532 int getSum(const int *x, int n) {
533  int i;
534  int sum = 0;
535 
536  for (i = 0; i < n; i++) {
537  sum += x[i];
538  }
539 
540  return(sum);
541 }
542 
543 
544 void adkTestStat(double *adk, const std::vector<std::vector<double> > & samples, const std::vector<double> & zstar) {
545 
546  int i;
547  int j;
548 
549  int nsum; /* total sample size = n_1 + ... + n_k */
550  int k = samples.size();
551  int l = zstar.size();
552 
553  /* fij records the number of observations in the ith sample coinciding
554  with zstar[j], where i = 1, ..., k, and j = 1, ..., l */
555  std::vector<int> fij (k*l);
556  /* lvec is an integer vector with length l,
557  whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity
558  of zstar[j] */
559  std::vector<int> lvec(l);
560 
561  /* for computation */
562  double mij;
563  double maij;
564  double innerSum;
565  double aInnerSum;
566  double bj;
567  double baj;
568  double tmp;
569 
570  /* samples is a two-dimensional double array with length k;
571  it stores an array of k pointers to double arrays which are
572  the k samples beeing compared */
573 // double **samples;
574 
575  /* dynamically allocate memory */
576  //std::vector< std::vector<double> > samples(k);
577  std::vector<int> ns(k);
578  nsum = 0;
579  for (i = 0; i < k; i++) {
580  ns[i] = samples[i].size();
581  nsum += ns[i];
582  }
583 
584  /* fij: k*l integer matrix, where l is the length of zstar and
585  k is the number of samples being compared
586  lvec: integer vector of length l, records the multiplicity of
587  each element of zstar */
588  for (j = 0; j < l; j++) {
589  lvec[j] = 0;
590  for (i = 0; i < k; i++) {
591  fij[i + j*k] = getCount(zstar[j], &samples[i][0], ns[i]);
592  lvec[j] += fij[i + j*k];
593  }
594  }
595 
596  // loop on samples to compute the adk's
597  // Formula (6) and (7) of the paper
598  adk[0] = adk[1] = 0;
599  for (i = 0; i < k; i++) {
600  mij = 0;
601  maij = 0;
602  innerSum = 0;
603  aInnerSum = 0;
604 
605  for (j = 0; j < l; j++) {
606  mij += fij[i + j*k];
607  maij = mij - (double) fij[i + j*k] / 2.0;
608  bj = getSum(&lvec[0], j + 1);
609  baj = bj - (double) lvec[j] / 2.0;
610 
611  if (j < l - 1) {
612  tmp = (double) nsum * mij - (double) ns[i] * bj;
613  innerSum = innerSum + (double) lvec[j] * tmp * tmp /
614  (bj * ((double) nsum - bj));
615  }
616 
617  tmp = (double) nsum * maij - (double) ns[i] * baj;
618  aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp /
619  (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0);
620  }
621 
622  adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/
623  adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */
624  }
625 
626  /* k-sample Anderson-Darling test statistics in both original and
627  alternative versions, AkN2 and AakN2, are stored in the given
628  double array adk */
629  adk[0] = adk[0] / (double) nsum; /* AkN2*/
630  adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */
631 
632  // /* free pointers */
633  // for (i = 0; i < k; i++) {
634  // free(samples[i]);
635  // }
636  // free(samples);
637 
638 }
639 
640 
641 /*
642  Taken from (1) -- Named for 2 samples but implemented for K. Restricted to K = 2 by the class's constructors
643 */
645  pvalue = -1;
646  testStat = -1;
647  if (fTestSampleFromH0) {
648  MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
649  return;
650  }
651  std::vector<Double_t> z(fCombinedSamples);
652  // unique removes all consecutives duplicates elements. This is exactly what we wants
653  // for example unique of v={1,2,2,3,1,2,3,3} results in {1,2,3,1,2,3} which is exactly what we wants
654  std::vector<Double_t>::iterator endUnique = std::unique(z.begin(), z.end()); //z_j's in (1)
655  z.erase(endUnique, z.end() );
656  std::vector<UInt_t> h; // h_j's in (1)
657  std::vector<Double_t> H; // H_j's in (1)
658  UInt_t N = fCombinedSamples.size();
659  Double_t A2 = 0.0; // Anderson-Darling A^2 Test Statistic
660 
661 #ifdef USE_OLDIMPL
662 
663  TStopwatch w; w.Start();
664 
665  unsigned int nSamples = fSamples.size();
666 
667  // old implementation
668  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
669  UInt_t n = std::count(fCombinedSamples.begin(), fCombinedSamples.end(), *data);
670  h.push_back(n);
671  H.push_back(std::count_if(fCombinedSamples.begin(), fCombinedSamples.end(), bind2nd(std::less<Double_t>(), *data)) + n / 2.);
672  }
673  std::cout << "time for H";
674  w.Print();
675  w.Reset(); w.Start();
676  std::vector<std::vector<Double_t> > F(nSamples); // F_ij's in (1)
677  for (UInt_t i = 0; i < nSamples; ++i) {
678  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
679  UInt_t n = std::count(fSamples[i].begin(), fSamples[i].end(), *data);
680  F[i].push_back(std::count_if(fSamples[i].begin(), fSamples[i].end(), bind2nd(std::less<Double_t>(), *data)) + n / 2.);
681  }
682  }
683  std::cout << "time for F";
684  w.Print();
685  for (UInt_t i = 0; i < nSamples; ++i) {
686  Double_t sum_result = 0.0;
687  UInt_t j = 0;
688  w.Reset(); w.Start();
689  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
690  sum_result += h[j] * TMath::Power(N * F[i][j]- fSamples[i].size() * H[j], 2) / (H[j] * (N - H[j]) - N * h[j] / 4.0);
691  ++j;
692  }
693  std::cout << "time for sum_resut";
694  w.Print();
695  std::cout << "sum_result " << sum_result << std::endl;
696  A2 += 1.0 / fSamples[i].size() * sum_result;
697  }
698  A2 *= (N - 1) / (TMath::Power(N, 2)); // A2_akN in (1)
699 
700  std::cout << "A2 - old Bartolomeo code " << A2 << std::endl;
701 #endif
702  // w.Reset();
703  // w.Start();
704 
705  double adk[2] = {0,0};
706 
707  //debug
708  // std::cout << "combined samples\n";
709  // for (int i = 0; i < fCombinedSamples.size(); ++i)
710  // std::cout << fCombinedSamples[i] << " ,";
711  // std::cout << std::endl;
712  // std::cout << ns[0] << " " << ns[1] << std::endl;
713  // std::cout << "Z\n";
714  // for (int i = 0; i < z.size(); ++i)
715  // std::cout << z[i] << " ,";
716  // std::cout << std::endl;
717 
718  // use function from kSamples code
719  adkTestStat(adk, fSamples, z );
720  // w.Print();
721  // std::cout << "A2 - new kSamples code " << adk[0] << " " << adk[1] << std::endl;
722 
723  A2 = adk[0];
724 
725  // compute the normalized test statistic
726 
727  std::vector<UInt_t> ns(fSamples.size());
728  for (unsigned int k = 0; k < ns.size(); ++k) ns[k] = fSamples[k].size();
729  Double_t sigmaN = GetSigmaN(ns, N);
730  A2 -= fSamples.size() - 1;
731  A2 /= sigmaN; // standartized test statistic
732 
733  pvalue = PValueADKSamples(2,A2);
734  testStat = A2;
735  return;
736  }
737 
738 
739 /*
740  Compute Anderson Darling test for two binned data set.
741  A binned data set can be seen as many identical observation happening at the center of the bin
742  In this way it is trivial to apply the formula (6) in the paper of W. Scholz, M. Stephens, "K-Sample Anderson-Darling Tests"
743  to the case of histograms. See also http://arxiv.org/pdf/0804.0380v1.pdf paragraph 3.3.5
744  It is importat that empty bins are not present
745 */
746  void GoFTest::AndersonDarling2SamplesTest(const ROOT::Fit::BinData &data1, const ROOT::Fit::BinData & data2, Double_t& pvalue, Double_t& testStat) {
747  pvalue = -1;
748  testStat = -1;
749  //
750  // compute cumulative sum of bin counts
751  // std::vector<double> sum1(data1.Size() );
752  // std::vector<double> sum2(data2.Size() );
753  // std::vector<double> sumAll(data1.Size() + data2.Size() );
754 
755  if (data1.NDim() != 1 && data2.NDim() != 1) {
756  MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Bin Data set must be one-dimensional ");
757  return;
758  }
759  unsigned int n1 = data1.Size();
760  unsigned int n2 = data2.Size();
761  double ntot1 = 0;
762  double ntot2 = 0;
763 
764 
765  // make a combined data set and sort it
766  std::vector<double> xdata(n1+n2);
767  for (unsigned int i = 0; i < n1; ++i) {
768  double value = 0;
769  const double * x = data1.GetPoint(i, value);
770  xdata[i] = *x;
771  ntot1 += value;
772  }
773  for (unsigned int i = 0; i < n2; ++i) {
774  double value = 0;
775  const double * x = data2.GetPoint(i, value);
776  xdata[n1+i] = *x;
777  ntot2 += value;
778  }
779  double nall = ntot1+ntot2;
780  // sort the combined data
781  std::vector<unsigned int> index(n1+n2);
782  TMath::Sort(n1+n2, &xdata[0], &index[0], false );
783 
784  // now compute the sums for the tests
785  double sum1 = 0;
786  double sum2 = 0;
787  double sumAll = 0;
788  double adsum = 0;
789  unsigned int j = 0;
790 
791  while( j < n1+n2 ) {
792 // for (unsigned int j = 0; j < n1+n2; ++j) {
793  // skip equal observations
794  double x = xdata[ index[j] ];
795  unsigned int k = j;
796  // loop on the bins with the same center value
797  double t = 0;
798  do {
799  unsigned int i = index[k];
800  double value = 0;
801  if (i < n1 ) {
802  value = data1.Value(i);
803  sum1 += value;
804  }
805  else {
806  // from data2
807  i -= n1;
808  assert(i < n2);
809  value = data2.Value(i);
810  sum2 += value;
811  }
812  sumAll += value;
813  t += value;
814  //std::cout << "j " << j << " k " << k << " data " << x << " index " << index[k] << " value " << value << std::endl;
815  k++;
816  } while ( k < n1+n2 && xdata[ index[k] ] == x );
817 
818 
819  j = k;
820  // skip last point
821  if (j < n1+n2) {
822  double tmp1 = ( nall * sum1 - ntot1 * sumAll );
823  double tmp2 = ( nall * sum2 - ntot2 * sumAll );
824  adsum += t * (tmp1*tmp1/ntot1 + tmp2*tmp2/ntot2) / ( sumAll * (nall - sumAll) ) ;
825 
826  //std::cout << "comp sum " << adsum << " " << t << " " << sumAll << " s1 " << sum1 << " s2 " << sum2 << " tmp1 " << tmp1 << " tmp2 " << tmp2 << std::endl;
827  }
828  }
829  double A2 = adsum / nall;
830 
831  // compute the normalized test statistic
832  std::vector<unsigned int> ns(2);
833  ns[0] = ntot1;
834  ns[1] = ntot2;
835  //std::cout << " ad2 = " << A2 << " nall " << nall;
836 
837  Double_t sigmaN = GetSigmaN(ns,nall);
838  A2 -= 1;
839  A2 /= sigmaN; // standartized test statistic
840 
841  //std::cout << " sigmaN " << sigmaN << " new A2 " << A2;
842 
843  pvalue = PValueADKSamples(2,A2);
844  //std::cout << " pvalue = " << pvalue << std::endl;
845  testStat = A2;
846  return;
847  }
848 
849 
851  Double_t pvalue, testStat;
852  AndersonDarling2SamplesTest(pvalue, testStat);
853  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
854  }
855 
856 /*
857  Taken from (3)
858 */ void GoFTest::AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const {
859  pvalue = -1;
860  testStat = -1;
861  if (!fTestSampleFromH0) {
862  MATH_ERROR_MSG("AndersonDarlingTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
863  return;
864  }
865  if (fDist == kUndefined) {
866  MATH_ERROR_MSG("AndersonDarlingTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
867  return;
868  }
869  Double_t A2 = 0.0;
870  Int_t n = fSamples[0].size();
871  for (Int_t i = 0; i < n ; ++i) {
872  Double_t x1 = fSamples[0][i];
873  Double_t w1 = (*fCDF)(x1);
874  Double_t result = (2 * (i + 1) - 1) * TMath::Log(w1) + (2 * (n - (i + 1)) + 1) * TMath::Log(1 - w1);
875  A2 += result;
876  }
877  (A2 /= -n) -= n;
878  if (A2 != A2) {
879  MATH_ERROR_MSG("AndersonDarlingTest", "Cannot compute p-value: data below or above the distribution's thresholds. Check sample consistency.");
880  return;
881  }
882  pvalue = PValueAD1Sample(A2);
883  testStat = A2;
884  }
885 
887  Double_t pvalue, testStat;
888  AndersonDarlingTest(pvalue, testStat);
889  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
890  }
891 
893  pvalue = -1;
894  testStat = -1;
895  if (fTestSampleFromH0) {
896  MATH_ERROR_MSG("KolmogorovSmirnov2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
897  return;
898  }
899  const UInt_t na = fSamples[0].size();
900  const UInt_t nb = fSamples[1].size();
901  std::vector<Double_t> a(na);
902  std::vector<Double_t> b(nb);
903  std::copy(fSamples[0].begin(), fSamples[0].end(), a.begin());
904  std::copy(fSamples[1].begin(), fSamples[1].end(), b.begin());
905  pvalue = TMath::KolmogorovTest(na, a.data(), nb, b.data(), 0);
906  testStat = TMath::KolmogorovTest(na, a.data(), nb, b.data(), "M");
907  }
908 
910  Double_t pvalue, testStat;
911  KolmogorovSmirnov2SamplesTest(pvalue, testStat);
912  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
913  }
914 
915 /*
916  Algorithm taken from (3) in page 737
917 */ void GoFTest::KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const {
918  pvalue = -1;
919  testStat = -1;
920  if (!fTestSampleFromH0) {
921  MATH_ERROR_MSG("KolmogorovSmirnovTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
922  return;
923  }
924  if (fDist == kUndefined) {
925  MATH_ERROR_MSG("KolmogorovSmirnovTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
926  return;
927  }
928  Double_t Fo = 0.0, Dn = 0.0;
929  UInt_t n = fSamples[0].size();
930  for (UInt_t i = 0; i < n; ++i) {
931  Double_t Fn = (i + 1.0) / n;
932  Double_t F = (*fCDF)(fSamples[0][i]);
933  Double_t result = std::max(TMath::Abs(Fn - F), TMath::Abs(Fo - Fn));
934  if (result > Dn) Dn = result;
935  Fo = Fn;
936  }
937  pvalue = TMath::KolmogorovProb(Dn * (TMath::Sqrt(n) + 0.12 + 0.11 / TMath::Sqrt(n)));
938  testStat = Dn;
939  }
940 
942  Double_t pvalue, testStat;
943  KolmogorovSmirnovTest(pvalue, testStat);
944  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
945  }
946 
947 
948 
949 
950 
951 } // ROOT namespace
952 } // Math namespace
953 
unsigned int Size() const
return number of fit points
Definition: BinData.h:447
void SetDistribution(EDistribution dist)
Definition: GoFTest.cxx:123
double dist(Rotation3D const &r1, Rotation3D const &r2)
Definition: 3DDistances.cxx:48
Bool_t fTestSampleFromH0
Definition: GoFTest.h:193
static long int sum(long int i)
Definition: Factory.cxx:1786
int getCount(double z, const double *dat, int n)
Definition: GoFTest.cxx:518
Double_t fMean
Definition: GoFTest.h:186
float xmin
Definition: THbookFile.cxx:93
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition: IFunction.h:133
void Start(Bool_t reset=kTRUE)
Start the stopwatch.
Definition: TStopwatch.cxx:58
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:208
This namespace contains pre-defined functions to be used in conjuction with TExecutor::Map and TExecu...
Definition: StringConv.hxx:21
Double_t Log(Double_t x)
Definition: TMath.h:526
void SetSamples(std::vector< const Double_t *> samples, const std::vector< UInt_t > samplesSizes)
Definition: GoFTest.cxx:180
void Print(Option_t *option="") const
Print the real and cpu time passed between the start and stop events.
Definition: TStopwatch.cxx:219
return c
Double_t KolmogorovProb(Double_t z)
Calculates the Kolmogorov distribution function,.
Definition: TMath.cxx:665
std::vector< Double_t > fCombinedSamples
Definition: GoFTest.h:189
TH1 * h
Definition: legend2.C:5
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:858
#define N
#define H(x, y, z)
Definition: test.py:1
double inner_product(const LAVector &, const LAVector &)
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
TArc * a
Definition: textangle.C:12
const Bool_t kFALSE
Definition: Rtypes.h:92
int nbins[3]
STL namespace.
#define MATH_WARN_MSG(loc, str)
Definition: Error.h:47
Short_t Abs(Short_t d)
Definition: TMathBase.h:110
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Definition: TMath.h:501
const double * GetPoint(unsigned int ipoint, double &value) const
retrieve at the same time a pointer to the coordinate data and the fit value More efficient than call...
Definition: BinData.h:304
int sampleSize
Definition: unuranDistr.cxx:34
Double_t x[n]
Definition: legend1.C:17
Double_t GaussianCDF(Double_t x) const
Definition: GoFTest.cxx:293
double cdf(double *x, double *p)
Definition: unuranDistr.cxx:44
double normal_cdf(double x, double sigma=1, double x0=0)
Cumulative distribution function of the normal (Gaussian) distribution (lower tail).
static Double_t PValueADKSamples(UInt_t nsamples, Double_t A2)
Definition: GoFTest.cxx:351
EDistribution fDist
Definition: GoFTest.h:184
double pow(double, double)
void Sort(Index n, const Element *a, Index *index, Bool_t down=kTRUE)
Definition: TMath.h:989
static Double_t GetSigmaN(const std::vector< UInt_t > &ns, UInt_t N)
Definition: GoFTest.cxx:309
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:917
#define MATH_ERROR_MSG(loc, str)
Definition: Error.h:50
int getSum(const int *x, int n)
Definition: GoFTest.cxx:532
#define F(x, y, z)
#define M_PI
Definition: Rotated.cxx:105
IBaseFunctionOneDim IGenFunction
Definition: IFunctionfwd.h:24
unsigned int UInt_t
Definition: RtypesCore.h:42
Double_t KolmogorovTest(Int_t na, const Double_t *a, Int_t nb, const Double_t *b, Option_t *option)
Statistical test whether two one-dimensional sets of points are compatible with coming from the same ...
Definition: TMath.cxx:788
TLine * l
Definition: textangle.C:4
virtual ~GoFTest()
Definition: GoFTest.cxx:178
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition: BinData.h:61
Double_t fSigma
Definition: GoFTest.h:187
float xmax
Definition: THbookFile.cxx:93
std::vector< std::vector< Double_t > > fSamples
Definition: GoFTest.h:191
static const double x1[5]
double f(double x)
double Double_t
Definition: RtypesCore.h:55
void Instantiate(const Double_t *sample, UInt_t sampleSize)
Definition: GoFTest.cxx:275
double Value(unsigned int ipoint) const
return the value for the given fit point
Definition: BinData.h:236
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:892
Namespace for new Math classes and functions.
Double_t PValueAD1Sample(Double_t A2) const
Definition: GoFTest.cxx:481
you should not use this method at all Int_t Int_t z
Definition: TRolke.cxx:630
unsigned int NDim() const
return coordinate data dimension
Definition: BinData.h:452
char Char_t
Definition: RtypesCore.h:29
Double_t ExponentialCDF(Double_t x) const
Definition: GoFTest.cxx:297
you should not use this method at all Int_t Int_t Double_t Double_t Double_t Int_t Double_t Double_t Double_t Double_t b
Definition: TRolke.cxx:630
void Reset()
Definition: TStopwatch.h:54
double result[121]
Double_t Sqrt(Double_t x)
Definition: TMath.h:464
double exp(double)
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition: GoFTest.cxx:263
const Bool_t kTRUE
Definition: Rtypes.h:91
const Int_t n
Definition: legend1.C:16
Template class to wrap any member function of a class taking a double and returning a double in a 1D ...
double log(double)
void adkTestStat(double *adk, const std::vector< std::vector< double > > &samples, const std::vector< double > &zstar)
Definition: GoFTest.cxx:544
double exponential_cdf(double x, double lambda, double x0=0)
Cumulative distribution function of the exponential distribution (lower tail).
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:644
Stopwatch class.
Definition: TStopwatch.h:30