ROOT  6.07/01
Reference Guide
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
GoFTest.cxx
Go to the documentation of this file.
1 // @(#)root/mathcore:$Id$
2 // Authors: Bartolomeu Rabacal 05/2010
3 /**********************************************************************
4  * *
5  * Copyright (c) 2006 , LCG ROOT MathLib Team *
6  * *
7  * *
8  **********************************************************************/
9 // implementation file for GoFTest
10 
11 
12 #include <algorithm>
13 #include <functional>
14 #include <iostream>
15 #include <map>
16 #include <numeric>
17 #include <string.h>
18 #include <cassert>
19 
20 #include "Math/Error.h"
21 #include "Math/Math.h"
22 #include "Math/IFunction.h"
23 #include "Math/IFunctionfwd.h"
24 #include "Math/Integrator.h"
25 #include "Math/ProbFuncMathCore.h"
26 #include "Math/WrappedFunction.h"
27 
28 #include "Math/GoFTest.h"
29 
30 #include "Fit/BinData.h"
31 
32 #include "TStopwatch.h"
33 
34 /* Note: The references mentioned here are stated in GoFTest.h */
35 
36 namespace ROOT {
37 namespace Math {
38 
39  struct CDFWrapper : public IGenFunction {
40  // wrapper around a cdf funciton to re-scale for the range
41  Double_t fXmin; // lower range for x
42  Double_t fXmax; // lower range for x
43  Double_t fNorm; // normalization
44  const IGenFunction* fCDF; // cdf pointer (owned by the class)
45 
46 
47  virtual ~CDFWrapper() { if (fCDF) delete fCDF; }
48 
49  CDFWrapper(const IGenFunction& cdf, Double_t xmin=0, Double_t xmax=-1) :
50  fCDF(cdf.Clone())
51  {
52  if (xmin >= xmax) {
53  fNorm = 1;
56  }
57  else {
58  fNorm = cdf(xmax) - cdf(xmin);
59  fXmin = xmin;
60  fXmax = xmax;
61  }
62  }
63 
64  Double_t DoEval(Double_t x) const {
65  if (x <= fXmin) return 0;
66  if (x >= fXmax) return 1.0;
67  return (*fCDF)(x)/fNorm;
68  }
69 
70  IGenFunction* Clone() const {
71  return new CDFWrapper(*fCDF,fXmin,fXmax);
72  }
73  };
74 
75 
76  class PDFIntegral : public IGenFunction {
77  Double_t fXmin; // lower range for x
78  Double_t fXmax; // lower range for x
79  Double_t fNorm; // normalization
80  mutable IntegratorOneDim fIntegral;
81  const IGenFunction* fPDF; // pdf pointer (owned by the class)
82  public:
83 
84  virtual ~PDFIntegral() { if (fPDF) delete fPDF; }
85 
86  PDFIntegral(const IGenFunction& pdf, Double_t xmin = 0, Double_t xmax = -1) :
87  fXmin(xmin),
88  fXmax(xmax),
89  fNorm(1),
90  fPDF(pdf.Clone())
91  {
92  // compute normalization
93  fIntegral.SetFunction(*fPDF); // N.B. must be fPDF (the cloned copy) and not pdf which can disappear
94  if (fXmin >= fXmax) {
97  }
99  fNorm = fIntegral.Integral();
100  }
101  else if (fXmin == -std::numeric_limits<double>::infinity() )
102  fNorm = fIntegral.IntegralLow(fXmax);
103  else if (fXmax == std::numeric_limits<double>::infinity() )
104  fNorm = fIntegral.IntegralUp(fXmin);
105  else
106  fNorm = fIntegral.Integral(fXmin, fXmax);
107  }
108 
109  Double_t DoEval(Double_t x) const {
110  if (x <= fXmin) return 0;
111  if (x >= fXmax) return 1.0;
112  if (fXmin == -std::numeric_limits<double>::infinity() )
113  return fIntegral.IntegralLow(x)/fNorm;
114  else
115  return fIntegral.Integral(fXmin,x)/fNorm;
116  }
117 
118  IGenFunction* Clone() const {
119  return new PDFIntegral(*fPDF, fXmin, fXmax);
120  }
121  };
122 
124  if (!(kGaussian <= dist && dist <= kExponential)) {
125  MATH_ERROR_MSG("SetDistribution", "Cannot set distribution type! Distribution type option must be ennabled.");
126  return;
127  }
128  fDist = dist;
129  SetCDF();
130  }
131 
132  GoFTest::GoFTest( UInt_t sample1Size, const Double_t* sample1, UInt_t sample2Size, const Double_t* sample2 )
133  : fCDF(std::auto_ptr<IGenFunction>((IGenFunction*)0)),
134  fDist(kUndefined),
135  fSamples(std::vector<std::vector<Double_t> >(2)),
136  fTestSampleFromH0(kFALSE) {
137  Bool_t badSampleArg = sample1 == 0 || sample1Size == 0;
138  if (badSampleArg) {
139  std::string msg = "'sample1";
140  msg += !sample1Size ? "Size' cannot be zero" : "' cannot be zero-length";
141  MATH_ERROR_MSG("GoFTest", msg.c_str());
142  assert(!badSampleArg);
143  }
144  badSampleArg = sample2 == 0 || sample2Size == 0;
145  if (badSampleArg) {
146  std::string msg = "'sample2";
147  msg += !sample2Size ? "Size' cannot be zero" : "' cannot be zero-length";
148  MATH_ERROR_MSG("GoFTest", msg.c_str());
149  assert(!badSampleArg);
150  }
151  std::vector<const Double_t*> samples(2);
152  std::vector<UInt_t> samplesSizes(2);
153  samples[0] = sample1;
154  samples[1] = sample2;
155  samplesSizes[0] = sample1Size;
156  samplesSizes[1] = sample2Size;
157  SetSamples(samples, samplesSizes);
158  SetParameters();
159  }
160 
162  : fCDF(std::auto_ptr<IGenFunction>((IGenFunction*)0)),
163  fDist(dist),
164  fSamples(std::vector<std::vector<Double_t> >(1)),
165  fTestSampleFromH0(kTRUE) {
166  Bool_t badSampleArg = sample == 0 || sampleSize == 0;
167  if (badSampleArg) {
168  std::string msg = "'sample";
169  msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
170  MATH_ERROR_MSG("GoFTest", msg.c_str());
171  assert(!badSampleArg);
172  }
173  std::vector<const Double_t*> samples(1, sample);
174  std::vector<UInt_t> samplesSizes(1, sampleSize);
175  SetSamples(samples, samplesSizes);
176  SetParameters();
177  SetCDF();
178  }
179 
181 
182  void GoFTest::SetSamples(std::vector<const Double_t*> samples, const std::vector<UInt_t> samplesSizes) {
183  fCombinedSamples.assign(std::accumulate(samplesSizes.begin(), samplesSizes.end(), 0), 0.0);
184  UInt_t combinedSamplesSize = 0;
185  for (UInt_t i = 0; i < samples.size(); ++i) {
186  fSamples[i].assign(samples[i], samples[i] + samplesSizes[i]);
187  std::sort(fSamples[i].begin(), fSamples[i].end());
188  for (UInt_t j = 0; j < samplesSizes[i]; ++j) {
189  fCombinedSamples[combinedSamplesSize + j] = samples[i][j];
190  }
191  combinedSamplesSize += samplesSizes[i];
192  }
193  std::sort(fCombinedSamples.begin(), fCombinedSamples.end());
194 
195  Bool_t degenerateSamples = *(fCombinedSamples.begin()) == *(fCombinedSamples.end() - 1);
196  if (degenerateSamples) {
197  std::string msg = "Degenerate sample";
198  msg += samplesSizes.size() > 1 ? "s!" : "!";
199  msg += " Sampling values all identical.";
200  MATH_ERROR_MSG("SetSamples", msg.c_str());
201  assert(!degenerateSamples);
202  }
203  }
204 
206  fMean = std::accumulate(fSamples[0].begin(), fSamples[0].end(), 0.0) / fSamples[0].size();
207  fSigma = TMath::Sqrt(1. / (fSamples[0].size() - 1) * (std::inner_product(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), 0.0) - fSamples[0].size() * TMath::Power(fMean, 2)));
208  }
209 
210  void GoFTest::operator()(ETestType test, Double_t& pvalue, Double_t& testStat) const {
211  switch (test) {
212  default:
213  case kAD:
214  AndersonDarlingTest(pvalue, testStat);
215  break;
216  case kAD2s:
217  AndersonDarling2SamplesTest(pvalue, testStat);
218  break;
219  case kKS:
220  KolmogorovSmirnovTest(pvalue, testStat);
221  break;
222  case kKS2s:
223  KolmogorovSmirnov2SamplesTest(pvalue, testStat);
224  }
225  }
226 
227  Double_t GoFTest::operator()(ETestType test, const Char_t* option) const {
228  Double_t result = 0.0;
229  switch (test) {
230  default:
231  case kAD:
232  result = AndersonDarlingTest(option);
233  break;
234  case kAD2s:
235  result = AndersonDarling2SamplesTest(option);
236  break;
237  case kKS:
238  result = KolmogorovSmirnovTest(option);
239  break;
240  case kKS2s:
241  result = KolmogorovSmirnov2SamplesTest(option);
242  }
243  return result;
244  }
245 
246  void GoFTest::SetCDF() { // Setting parameter-free distributions
247  IGenFunction* cdf = 0;
248  switch (fDist) {
249  case kLogNormal:
250  LogSample();
251  case kGaussian :
253  break;
254  case kExponential:
256  break;
257  case kUserDefined:
258  case kUndefined:
259  default:
260  break;
261  }
262  fCDF = std::auto_ptr<IGenFunction>(cdf);
263  }
264 
266  if (fDist > kUserDefined) {
267  MATH_WARN_MSG("SetDistributionFunction","Distribution type is changed to user defined");
268  }
270  // function will be cloned inside the wrapper PDFIntegral of CDFWrapper classes
271  if (isPDF)
272  fCDF = std::auto_ptr<IGenFunction>(new PDFIntegral(f, xmin, xmax) );
273  else
274  fCDF = std::auto_ptr<IGenFunction>(new CDFWrapper(f, xmin, xmax) );
275  }
276 
278  // initialization function for the template constructors
279  Bool_t badSampleArg = sample == 0 || sampleSize == 0;
280  if (badSampleArg) {
281  std::string msg = "'sample";
282  msg += !sampleSize ? "Size' cannot be zero" : "' cannot be zero-length";
283  MATH_ERROR_MSG("GoFTest", msg.c_str());
284  assert(!badSampleArg);
285  }
286  fCDF = std::auto_ptr<IGenFunction>((IGenFunction*)0);
288  fMean = 0;
289  fSigma = 0;
290  fSamples = std::vector<std::vector<Double_t> >(1);
292  SetSamples(std::vector<const Double_t*>(1, sample), std::vector<UInt_t>(1, sampleSize));
293  }
294 
296  return ROOT::Math::normal_cdf(x, fSigma, fMean);
297  }
298 
300  return ROOT::Math::exponential_cdf(x, 1.0 / fMean);
301  }
302 
304  transform(fSamples[0].begin(), fSamples[0].end(), fSamples[0].begin(), std::ptr_fun<Double_t, Double_t>(TMath::Log));
305  SetParameters();
306  }
307 
308 /*
309  Taken from (1)
310 */
311  Double_t GoFTest::GetSigmaN(const std::vector<UInt_t> & ns, UInt_t N) {
312  // compute moments of AD distribution (from Scholz-Stephen paper, paragraph 3)
313 
314  Double_t sigmaN = 0.0, h = 0.0, H = 0.0, g = 0.0, a, b, c, d, k = ns.size();
315 
316  for (UInt_t i = 0; i < ns.size(); ++i) {
317  H += 1.0 / double( ns[i] );
318  }
319 
320  // use approximate formulas for large N
321  // cache Sum( 1 / i)
322  if (N < 2000) {
323  std::vector<double> invI(N);
324  for (UInt_t i = 1; i <= N - 1; ++i) {
325  invI[i] = 1.0 / i;
326  h += invI[i];
327  }
328  for (UInt_t i = 1; i <= N - 2; ++i) {
329  double tmp = invI[N-i];
330  for (UInt_t j = i + 1; j <= N - 1; ++j) {
331  g += tmp * invI[j];
332  }
333  }
334  }
335  else {
336  // for N larger than 2000 error difference in g is ~ 5 10^-3 while in h is at the level of 10^-5
337  const double emc = 0.5772156649015328606065120900824024; // Euler-Mascheroni constant
338  h = std::log(double(N-1) ) + emc;
339  g = (M_PI)*(M_PI)/6.0;
340  }
341  double k2 = std::pow(k,2);
342  a = (4 * g - 6) * k + (10 - 6 * g) * H - 4 * g + 6;
343  b = (2 * g - 4) * k2 + 8 * h * k + (2 * g - 14 * h - 4) * H - 8 * h + 4 * g - 6;
344  c = (6 * h + 2 * g - 2) * k2 + (4 * h - 4 *g + 6) * k + (2 * h - 6) * H + 4 * h;
345  d = (2 * h + 6) * k2 - 4 * h * k;
346  sigmaN += a * std::pow(double(N),3) + b * std::pow(double(N),2) + c * N + d;
347  sigmaN /= ( double(N - 1) * double(N - 2) * double(N - 3) );
348  sigmaN = TMath::Sqrt(sigmaN);
349  return sigmaN;
350  }
351 
352 
354 
355  /*
356  Computation of p-values according to
357  "K-Sample Anderson-Darling Tests" by F.W. Scholz
358  and M.A. Stephens (1987), Journal of the American Statistical Association,
359  Vol 82, No. 399, pp 918-924.
360  Code from kSamples package from R (author F. Scholtz)
361 
362  This function uses the upper T_m quantiles as obtained via simulation of
363  the Anderson-Darling test statistics (Nsim = 2*10^6) with sample sizes n=500
364  for each sample, and after standardization, in order to emulate the Table 1
365  values given in the above reference. However, here we estimate p-quantiles
366  for p = .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,
367  .1,.2,.3,.4,.5,.6,.7,.8,.9,.925,.95,.975,.99,.9925,.995,.9975,.999,
368  .99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999
369  First the appropriate p-quantiles are determined from those simulated
370  for ms = 1,2,3,4,6,8,10, Inf, interpolating to the given value of m.
371  Since we use only m=2 we avoid this interpolation.
372 
373  Next linear inetrpolation to find the observed p value given the observed test statistic value.
374  We use interpolation in the test statistic -> log((1-p)/p) domain
375  and we extrapolatelinearly) beyond p = .00001 and .99999.
376  */
377 
378  // sample values
379  //double ms[] = { 1, 2, 3, 4, 6, 8, 10, TMath::Infinity() };
380  //int ns = ms.size();
381  const int ns = 8;
382  double ts[ ] = { -1.1954, -1.5806, -1.8172,
383  -2.0032, -2.2526, -2.4204, -2.5283, -4.2649, -1.1786, -1.5394,
384  -1.7728, -1.9426, -2.1685, -2.3288, -2.4374, -3.8906, -1.166,
385  -1.5193, -1.7462, -1.9067, -2.126, -2.2818, -2.3926, -3.719,
386  -1.1407, -1.4659, -1.671, -1.8105, -2.0048, -2.1356, -2.2348,
387  -3.2905, -1.1253, -1.4371, -1.6314, -1.7619, -1.9396, -2.0637,
388  -2.1521, -3.0902, -1.0777, -1.3503, -1.5102, -1.6177, -1.761,
389  -1.8537, -1.9178, -2.5758, -1.0489, -1.2984, -1.4415, -1.5355,
390  -1.6625, -1.738, -1.7936, -2.3263, -0.9978, -1.2098, -1.3251,
391  -1.4007, -1.4977, -1.5555, -1.5941, -1.96, -0.9417, -1.1187,
392  -1.209, -1.2671, -1.3382, -1.379, -1.405, -1.6449, -0.8981, -1.0491,
393  -1.1235, -1.1692, -1.2249, -1.2552, -1.2755, -1.4395, -0.8598,
394  -0.9904, -1.0513, -1.0879, -1.1317, -1.155, -1.1694, -1.2816,
395  -0.7258, -0.7938, -0.8188, -0.8312, -0.8435, -0.8471, -0.8496,
396  -0.8416, -0.5966, -0.617, -0.6177, -0.6139, -0.6073, -0.5987,
397  -0.5941, -0.5244, -0.4572, -0.4383, -0.419, -0.4033, -0.3834,
398  -0.3676, -0.3587, -0.2533, -0.2966, -0.2428, -0.2078, -0.1844,
399  -0.1548, -0.1346, -0.1224, 0, -0.1009, -0.0169, 0.0304, 0.0596,
400  0.0933, 0.1156, 0.1294, 0.2533, 0.1571, 0.2635, 0.3169, 0.348,
401  0.3823, 0.4038, 0.4166, 0.5244, 0.5357, 0.6496, 0.6992, 0.7246,
402  0.7528, 0.7683, 0.7771, 0.8416, 1.2255, 1.2989, 1.3202, 1.3254,
403  1.3305, 1.3286, 1.3257, 1.2816, 1.5262, 1.5677, 1.5709, 1.5663,
404  1.5561, 1.5449, 1.5356, 1.4395, 1.9633, 1.943, 1.919, 1.8975,
405  1.8641, 1.8389, 1.8212, 1.6449, 2.7314, 2.5899, 2.5, 2.4451,
406  2.3664, 2.3155, 2.2823, 1.96, 3.7825, 3.4425, 3.2582, 3.1423,
407  3.0036, 2.9101, 2.8579, 2.3263, 4.1241, 3.716, 3.4984, 3.3651,
408  3.2003, 3.0928, 3.0311, 2.4324, 4.6044, 4.0847, 3.8348, 3.6714,
409  3.4721, 3.3453, 3.2777, 2.5758, 5.409, 4.7223, 4.4022, 4.1791,
410  3.9357, 3.7809, 3.6963, 2.807, 6.4954, 5.5823, 5.1456, 4.8657,
411  4.5506, 4.3275, 4.2228, 3.0902, 6.8279, 5.8282, 5.3658, 5.0749,
412  4.7318, 4.4923, 4.3642, 3.1747, 7.2755, 6.197, 5.6715, 5.3642,
413  4.9991, 4.7135, 4.5945, 3.2905, 8.1885, 6.8537, 6.2077, 5.8499,
414  5.4246, 5.1137, 4.9555, 3.4808, 9.3061, 7.6592, 6.85, 6.4806,
415  5.9919, 5.6122, 5.5136, 3.719, 9.6132, 7.9234, 7.1025, 6.6731,
416  6.1549, 5.8217, 5.7345, 3.7911, 10.0989, 8.2395, 7.4326, 6.9567,
417  6.3908, 6.011, 5.9566, 3.8906, 10.8825, 8.8994, 7.8934, 7.4501,
418  6.9009, 6.4538, 6.2705, 4.0556, 11.8537, 9.5482, 8.5568, 8.0283,
419  7.4418, 6.9524, 6.6195, 4.2649 };
420 
421 
422 
423 
424 
425  // p values bins
426  double p[] = { .00001,.00005,.0001,.0005,.001,.005,.01,.025,.05,.075,.1,.2,.3,.4,.5,.6,.7,.8,.9,
427  .925,.95,.975,.99,.9925,.995,.9975,.999,.99925,.9995,.99975,.9999,.999925,.99995,.999975,.99999 };
428 
429  //int nbins = p.size();
430  const int nbins = 35;
431  //assert ( nbins*ns == ts.size() );
432 
433  // get ts values for nsamples = 2
434  // corresponding value is for m=nsamples-1
435  int offset = 0; // for m = 1 (i.e. for nsamples = 2)
436  if (nsamples != 2) {
437  MATH_ERROR_MSG("InterpolatePValues", "Interpolation not implemented for nsamples not equal to 2");
438  return 0;
439  }
440  std::vector<double> ts2(nbins); // ts values for nsamples = 2
441  std::vector<double> lp(nbins); // log ( p / (1-p) )
442  for (int i = 0; i < nbins; ++i)
443  {
444  ts2[i] = ts[offset+ i * ns];
445  p[i] = 1.-p[i];
446  lp[i] = std::log( p[i]/(1.-p[i] ) );
447  }
448  // do linear interpolation to find right lp value for given observed test staistic value
449  //auto it = std::lower_bound(ts2.begin(), ts2.end(), tx );
450  int i1 = std::distance(ts2.begin(), std::lower_bound(ts2.begin(), ts2.end(), tx ) ) - 1;
451  int i2 = i1+1;
452  // if tx is before min of tabluated data
453  if (i1 < 0) {
454  i1 = 0;
455  i2 = 1;
456  }
457  // if tx is after max of tabulated data
458  if (i2 >= int(ts2.size()) ) {
459  i1 = ts2.size()-2;
460  i2 = ts2.size()-1;
461  }
462 
463  //std::cout << i1 << " , " << i2 << std::endl;
464  assert(i1 < (int) lp.size() && i2 < (int) lp.size() );
465  double lp1 = lp[i1];
466  double lp2 = lp[i2];
467  double tx1 = ts2[i1];
468  double tx2 = ts2[i2];
469 
470  //std::cout << " tx1,2 " << tx1 << " " << tx2 << std::endl;
471  /// find interpolated (or extrapolated value)(
472  double lp0 = (lp1-lp2) * (tx - tx2)/ ( tx1-tx2) + lp2;
473 
474 
475  double p0 = exp(lp0)/(1. + exp(lp0) );
476  return p0;
477 
478  }
479 
480 
481 /*
482  Taken from (2)
484  Double_t pvalue = 0.0;
485  if (A2 <= 0.0) {
486  return pvalue;
487  } else if (A2 < 2.) {
488  pvalue = std::pow(A2, -0.5) * std::exp(-1.2337141 / A2) * (2.00012 + (0.247105 - (0.0649821 - (0.0347962 - (0.011672 - 0.00168691 * A2) * A2) * A2) * A2) * A2);
489  } else {
490  pvalue = std::exp(-1. * std::exp(1.0776 - (2.30695 - (0.43424 - (.082433 - (0.008056 - 0.0003146 * A2) * A2) * A2) * A2) * A2));
491  }
492  return 1. - pvalue;
493  }
494 
495 
496 // code from kSamples (R) F. Scholz
497 
498 /* computes the k-sample Anderson-Darling test statistics in both original
499  and alternative versions for the nonparametric (rank) test described in
500  Scholz F.W. and Stephens M.A. (1987), K-sample Anderson-Darling Tests,
501  Journal of the American Statistical Association, Vol 82, No. 399,
502  pp. 918-924
503 
504  Arguments:
505  adk: double array with length 2, stores AkN2 and AakN2
506  k: integer, number of samples being compared
507  x: double array storing the concatenated samples in the same order as ns
508  ns: integer array storing the k sample sizes, corresponding to x
509  zstar: double array storing the l distinct ordered observations in the
510  pooled sample
511  l: integer, length of zstar
512 
513  Outputs:
514  when the computation ends, AkN2 and AakN2 are stored in the given memory
515  pointed by adk
516 */
517 
518 /* counts and returns the number of occurrence of a given number
519  in a double array */
520 int getCount(double z, const double *dat, int n) {
521  int i;
522  int count = 0;
523 
524  for (i = 0; i < n; i++) {
525  if (dat[i] == z) {
526  count++;
527  }
528  }
529 
530  return(count);
531 }
532 
533 /* computes and returns the sum of elements in a given integer array */
534 int getSum(const int *x, int n) {
535  int i;
536  int sum = 0;
537 
538  for (i = 0; i < n; i++) {
539  sum += x[i];
540  }
541 
542  return(sum);
543 }
544 
545 
546  void adkTestStat(double *adk, const std::vector<std::vector<double> > & samples, const std::vector<double> & zstar) {
547 
548  int i;
549  int j;
550 
551  int nsum; /* total sample size = n_1 + ... + n_k */
552  int k = samples.size();
553  int l = zstar.size();
554 
555  /* fij records the number of observations in the ith sample coinciding
556  with zstar[j], where i = 1, ..., k, and j = 1, ..., l */
557  std::vector<int> fij (k*l);
558  /* lvec is an integer vector with length l,
559  whose jth entry = \sum_{i=1}^{k} f_{ij}, i.e., the multiplicity
560  of zstar[j] */
561  std::vector<int> lvec(l);
562 
563  /* for computation */
564  double mij;
565  double maij;
566  double innerSum;
567  double aInnerSum;
568  double bj;
569  double baj;
570  double tmp;
571 
572  /* samples is a two-dimensional double array with length k;
573  it stores an array of k pointers to double arrays which are
574  the k samples beeing compared */
575 // double **samples;
576 
577  /* dynamically allocate memory */
578  //std::vector< std::vector<double> > samples(k);
579  std::vector<int> ns(k);
580  nsum = 0;
581  for (i = 0; i < k; i++) {
582  ns[i] = samples[i].size();
583  nsum += ns[i];
584  }
585 
586  /* fij: k*l integer matrix, where l is the length of zstar and
587  k is the number of samples being compared
588  lvec: integer vector of length l, records the multiplicity of
589  each element of zstar */
590  for (j = 0; j < l; j++) {
591  lvec[j] = 0;
592  for (i = 0; i < k; i++) {
593  fij[i + j*k] = getCount(zstar[j], &samples[i][0], ns[i]);
594  lvec[j] += fij[i + j*k];
595  }
596  }
597 
598  // loop on samples to compute the adk's
599  // Formula (6) and (7) of the paper
600  adk[0] = adk[1] = 0;
601  for (i = 0; i < k; i++) {
602  mij = 0;
603  maij = 0;
604  innerSum = 0;
605  aInnerSum = 0;
606 
607  for (j = 0; j < l; j++) {
608  mij += fij[i + j*k];
609  maij = mij - (double) fij[i + j*k] / 2.0;
610  bj = getSum(&lvec[0], j + 1);
611  baj = bj - (double) lvec[j] / 2.0;
612 
613  if (j < l - 1) {
614  tmp = (double) nsum * mij - (double) ns[i] * bj;
615  innerSum = innerSum + (double) lvec[j] * tmp * tmp /
616  (bj * ((double) nsum - bj));
617  }
618 
619  tmp = (double) nsum * maij - (double) ns[i] * baj;
620  aInnerSum = aInnerSum + (double) lvec[j] * tmp * tmp /
621  (baj * (nsum - baj) - nsum * (double) lvec[j] / 4.0);
622  }
623 
624  adk[0] = adk[0] + innerSum / ns[i]; /* AkN2*/
625  adk[1] = adk[1] + aInnerSum / ns[i]; /* AakN2 */
626  }
627 
628  /* k-sample Anderson-Darling test statistics in both original and
629  alternative versions, AkN2 and AakN2, are stored in the given
630  double array adk */
631  adk[0] = adk[0] / (double) nsum; /* AkN2*/
632  adk[1] = (nsum - 1) * adk[1] / ((double) nsum * (double) nsum); /* AakN2 */
633 
634  // /* free pointers */
635  // for (i = 0; i < k; i++) {
636  // free(samples[i]);
637  // }
638  // free(samples);
639 
640 }
641 
642 
643 /*
644  Taken from (1) -- Named for 2 samples but implemented for K. Restricted to K = 2 by the class's constructors
645 */
647  pvalue = -1;
648  testStat = -1;
649  if (fTestSampleFromH0) {
650  MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
651  return;
652  }
653  std::vector<Double_t> z(fCombinedSamples);
654  // unique removes all consecutives duplicates elements. This is exactly what we wants
655  // for example unique of v={1,2,2,3,1,2,3,3} results in {1,2,3,1,2,3} which is exactly what we wants
656  std::vector<Double_t>::iterator endUnique = std::unique(z.begin(), z.end()); //z_j's in (1)
657  z.erase(endUnique, z.end() );
658  std::vector<UInt_t> h; // h_j's in (1)
659  std::vector<Double_t> H; // H_j's in (1)
660  UInt_t N = fCombinedSamples.size();
661  Double_t A2 = 0.0; // Anderson-Darling A^2 Test Statistic
662 
663 #ifdef USE_OLDIMPL
664 
665  TStopwatch w; w.Start();
666 
667  unsigned int nSamples = fSamples.size();
668 
669  // old implementation
670  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
671  UInt_t n = std::count(fCombinedSamples.begin(), fCombinedSamples.end(), *data);
672  h.push_back(n);
673  H.push_back(std::count_if(fCombinedSamples.begin(), fCombinedSamples.end(), bind2nd(std::less<Double_t>(), *data)) + n / 2.);
674  }
675  std::cout << "time for H";
676  w.Print();
677  w.Reset(); w.Start();
678  std::vector<std::vector<Double_t> > F(nSamples); // F_ij's in (1)
679  for (UInt_t i = 0; i < nSamples; ++i) {
680  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
681  UInt_t n = std::count(fSamples[i].begin(), fSamples[i].end(), *data);
682  F[i].push_back(std::count_if(fSamples[i].begin(), fSamples[i].end(), bind2nd(std::less<Double_t>(), *data)) + n / 2.);
683  }
684  }
685  std::cout << "time for F";
686  w.Print();
687  for (UInt_t i = 0; i < nSamples; ++i) {
688  Double_t sum_result = 0.0;
689  UInt_t j = 0;
690  w.Reset(); w.Start();
691  for (std::vector<Double_t>::iterator data = z.begin(); data != endUnique; ++data) {
692  sum_result += h[j] * TMath::Power(N * F[i][j]- fSamples[i].size() * H[j], 2) / (H[j] * (N - H[j]) - N * h[j] / 4.0);
693  ++j;
694  }
695  std::cout << "time for sum_resut";
696  w.Print();
697  std::cout << "sum_result " << sum_result << std::endl;
698  A2 += 1.0 / fSamples[i].size() * sum_result;
699  }
700  A2 *= (N - 1) / (TMath::Power(N, 2)); // A2_akN in (1)
701 
702  std::cout << "A2 - old Bartolomeo code " << A2 << std::endl;
703 #endif
704  // w.Reset();
705  // w.Start();
706 
707  double adk[2] = {0,0};
708 
709  //debug
710  // std::cout << "combined samples\n";
711  // for (int i = 0; i < fCombinedSamples.size(); ++i)
712  // std::cout << fCombinedSamples[i] << " ,";
713  // std::cout << std::endl;
714  // std::cout << ns[0] << " " << ns[1] << std::endl;
715  // std::cout << "Z\n";
716  // for (int i = 0; i < z.size(); ++i)
717  // std::cout << z[i] << " ,";
718  // std::cout << std::endl;
719 
720  // use function from kSamples code
721  adkTestStat(adk, fSamples, z );
722  // w.Print();
723  // std::cout << "A2 - new kSamples code " << adk[0] << " " << adk[1] << std::endl;
724 
725  A2 = adk[0];
726 
727  // compute the normalized test statistic
728 
729  std::vector<UInt_t> ns(fSamples.size());
730  for (unsigned int k = 0; k < ns.size(); ++k) ns[k] = fSamples[k].size();
731  Double_t sigmaN = GetSigmaN(ns, N);
732  A2 -= fSamples.size() - 1;
733  A2 /= sigmaN; // standartized test statistic
734 
735  pvalue = PValueADKSamples(2,A2);
736  testStat = A2;
737  return;
738  }
739 
740 
741 /*
742  Compute Anderson Darling test for two binned data set.
743  A binned data set can be seen as many identical observation happening at the center of the bin
744  In this way it is trivial to apply the formula (6) in the paper of W. Scholz, M. Stephens, "K-Sample Anderson-Darling Tests"
745  to the case of histograms. See also http://arxiv.org/pdf/0804.0380v1.pdf paragraph 3.3.5
746  It is importat that empty bins are not present
747 */
748  void GoFTest::AndersonDarling2SamplesTest(const ROOT::Fit::BinData &data1, const ROOT::Fit::BinData & data2, Double_t& pvalue, Double_t& testStat) {
749  pvalue = -1;
750  testStat = -1;
751  //
752  // compute cumulative sum of bin counts
753  // std::vector<double> sum1(data1.Size() );
754  // std::vector<double> sum2(data2.Size() );
755  // std::vector<double> sumAll(data1.Size() + data2.Size() );
756 
757  if (data1.NDim() != 1 && data2.NDim() != 1) {
758  MATH_ERROR_MSG("AndersonDarling2SamplesTest", "Bin Data set must be one-dimensional ");
759  return;
760  }
761  unsigned int n1 = data1.Size();
762  unsigned int n2 = data2.Size();
763  double ntot1 = 0;
764  double ntot2 = 0;
765 
766 
767  // make a combined data set and sort it
768  std::vector<double> xdata(n1+n2);
769  for (unsigned int i = 0; i < n1; ++i) {
770  double value = 0;
771  const double * x = data1.GetPoint(i, value);
772  xdata[i] = *x;
773  ntot1 += value;
774  }
775  for (unsigned int i = 0; i < n2; ++i) {
776  double value = 0;
777  const double * x = data2.GetPoint(i, value);
778  xdata[n1+i] = *x;
779  ntot2 += value;
780  }
781  double nall = ntot1+ntot2;
782  // sort the combined data
783  std::vector<unsigned int> index(n1+n2);
784  TMath::Sort(n1+n2, &xdata[0], &index[0], false );
785 
786  // now compute the sums for the tests
787  double sum1 = 0;
788  double sum2 = 0;
789  double sumAll = 0;
790  double adsum = 0;
791  unsigned int j = 0;
792 
793  while( j < n1+n2 ) {
794 // for (unsigned int j = 0; j < n1+n2; ++j) {
795  // skip equal observations
796  double x = xdata[ index[j] ];
797  unsigned int k = j;
798  // loop on the bins with the same center value
799  double t = 0;
800  do {
801  unsigned int i = index[k];
802  double value = 0;
803  if (i < n1 ) {
804  value = data1.Value(i);
805  sum1 += value;
806  }
807  else {
808  // from data2
809  i -= n1;
810  assert(i < n2);
811  value = data2.Value(i);
812  sum2 += value;
813  }
814  sumAll += value;
815  t += value;
816  //std::cout << "j " << j << " k " << k << " data " << x << " index " << index[k] << " value " << value << std::endl;
817  k++;
818  } while ( k < n1+n2 && xdata[ index[k] ] == x );
819 
820 
821  j = k;
822  // skip last point
823  if (j < n1+n2) {
824  double tmp1 = ( nall * sum1 - ntot1 * sumAll );
825  double tmp2 = ( nall * sum2 - ntot2 * sumAll );
826  adsum += t * (tmp1*tmp1/ntot1 + tmp2*tmp2/ntot2) / ( sumAll * (nall - sumAll) ) ;
827 
828  //std::cout << "comp sum " << adsum << " " << t << " " << sumAll << " s1 " << sum1 << " s2 " << sum2 << " tmp1 " << tmp1 << " tmp2 " << tmp2 << std::endl;
829  }
830  }
831  double A2 = adsum / nall;
832 
833  // compute the normalized test statistic
834  std::vector<unsigned int> ns(2);
835  ns[0] = ntot1;
836  ns[1] = ntot2;
837  //std::cout << " ad2 = " << A2 << " nall " << nall;
838 
839  Double_t sigmaN = GetSigmaN(ns,nall);
840  A2 -= 1;
841  A2 /= sigmaN; // standartized test statistic
842 
843  //std::cout << " sigmaN " << sigmaN << " new A2 " << A2;
844 
845  pvalue = PValueADKSamples(2,A2);
846  //std::cout << " pvalue = " << pvalue << std::endl;
847  testStat = A2;
848  return;
849  }
850 
851 
853  Double_t pvalue, testStat;
854  AndersonDarling2SamplesTest(pvalue, testStat);
855  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
856  }
857 
858 /*
859  Taken from (3)
860 */ void GoFTest::AndersonDarlingTest(Double_t& pvalue, Double_t& testStat) const {
861  pvalue = -1;
862  testStat = -1;
863  if (!fTestSampleFromH0) {
864  MATH_ERROR_MSG("AndersonDarlingTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
865  return;
866  }
867  if (fDist == kUndefined) {
868  MATH_ERROR_MSG("AndersonDarlingTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
869  return;
870  }
871  Double_t A2 = 0.0;
872  Int_t n = fSamples[0].size();
873  for (Int_t i = 0; i < n ; ++i) {
874  Double_t x1 = fSamples[0][i];
875  Double_t w1 = (*fCDF)(x1);
876  Double_t result = (2 * (i + 1) - 1) * TMath::Log(w1) + (2 * (n - (i + 1)) + 1) * TMath::Log(1 - w1);
877  A2 += result;
878  }
879  (A2 /= -n) -= n;
880  if (A2 != A2) {
881  MATH_ERROR_MSG("AndersonDarlingTest", "Cannot compute p-value: data below or above the distribution's thresholds. Check sample consistency.");
882  return;
883  }
884  pvalue = PValueAD1Sample(A2);
885  testStat = A2;
886  }
887 
889  Double_t pvalue, testStat;
890  AndersonDarlingTest(pvalue, testStat);
891  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
892  }
893 
895  pvalue = -1;
896  testStat = -1;
897  if (fTestSampleFromH0) {
898  MATH_ERROR_MSG("KolmogorovSmirnov2SamplesTest", "Only 1-sample tests can be issued with a 1-sample constructed GoFTest object!");
899  return;
900  }
901  const UInt_t na = fSamples[0].size();
902  const UInt_t nb = fSamples[1].size();
903  std::vector<Double_t> a(na);
904  std::vector<Double_t> b(nb);
905  std::copy(fSamples[0].begin(), fSamples[0].end(), a.begin());
906  std::copy(fSamples[1].begin(), fSamples[1].end(), b.begin());
907  pvalue = TMath::KolmogorovTest(na, a.data(), nb, b.data(), 0);
908  testStat = TMath::KolmogorovTest(na, a.data(), nb, b.data(), "M");
909  }
910 
912  Double_t pvalue, testStat;
913  KolmogorovSmirnov2SamplesTest(pvalue, testStat);
914  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
915  }
916 
917 /*
918  Algorithm taken from (3) in page 737
919 */ void GoFTest::KolmogorovSmirnovTest(Double_t& pvalue, Double_t& testStat) const {
920  pvalue = -1;
921  testStat = -1;
922  if (!fTestSampleFromH0) {
923  MATH_ERROR_MSG("KolmogorovSmirnovTest", "Only 2-sample tests can be issued with a 2-sample constructed GoFTest object!");
924  return;
925  }
926  if (fDist == kUndefined) {
927  MATH_ERROR_MSG("KolmogorovSmirnovTest", "Distribution type is undefined! Please use SetDistribution(GoFTest::EDistribution).");
928  return;
929  }
930  Double_t Fo = 0.0, Dn = 0.0;
931  UInt_t n = fSamples[0].size();
932  for (UInt_t i = 0; i < n; ++i) {
933  Double_t Fn = (i + 1.0) / n;
934  Double_t F = (*fCDF)(fSamples[0][i]);
935  Double_t result = std::max(TMath::Abs(Fn - F), TMath::Abs(Fo - Fn));
936  if (result > Dn) Dn = result;
937  Fo = Fn;
938  }
939  pvalue = TMath::KolmogorovProb(Dn * (TMath::Sqrt(n) + 0.12 + 0.11 / TMath::Sqrt(n)));
940  testStat = Dn;
941  }
942 
944  Double_t pvalue, testStat;
945  KolmogorovSmirnovTest(pvalue, testStat);
946  return (strncmp(option, "p", 1) == 0 || strncmp(option, "t", 1) != 0) ? pvalue : testStat;
947  }
948 
949 
950 
951 
952 
953 } // ROOT namespace
954 } // Math namespace
955 
void SetDistribution(EDistribution dist)
Definition: GoFTest.cxx:123
Double_t PValueAD1Sample(Double_t A2) const
Definition: GoFTest.cxx:483
double dist(Rotation3D const &r1, Rotation3D const &r2)
Definition: 3DDistances.cxx:48
Bool_t fTestSampleFromH0
Definition: GoFTest.h:197
void Print(Option_t *option="") const
Print the real and cpu time passed between the start and stop events.
Definition: TStopwatch.cxx:217
int getCount(double z, const double *dat, int n)
Definition: GoFTest.cxx:520
Double_t fMean
Definition: GoFTest.h:190
float xmin
Definition: THbookFile.cxx:93
Interface (abstract class) for generic functions objects of one-dimension Provides a method to evalua...
Definition: IFunction.h:133
void Start(Bool_t reset=kTRUE)
Start the stopwatch.
Definition: TStopwatch.cxx:56
Double_t Log(Double_t x)
Definition: TMath.h:526
return c
Double_t KolmogorovProb(Double_t z)
Calculates the Kolmogorov distribution function, Begin_Html.
Definition: TMath.cxx:635
tuple offset
Definition: tree.py:93
std::vector< Double_t > fCombinedSamples
Definition: GoFTest.h:193
#define assert(cond)
Definition: unittest.h:542
Double_t distance(const TPoint2 &p1, const TPoint2 &p2)
Definition: CsgOps.cxx:467
TH1 * h
Definition: legend2.C:5
std::auto_ptr< IGenFunction > fCDF
Definition: GoFTest.h:185
#define N
#define H(x, y, z)
double inner_product(const LAVector &, const LAVector &)
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
TArc * a
Definition: textangle.C:12
const Bool_t kFALSE
Definition: Rtypes.h:92
int nbins[3]
void KolmogorovSmirnovTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:919
#define MATH_WARN_MSG(loc, str)
Definition: Error.h:47
Short_t Abs(Short_t d)
Definition: TMathBase.h:110
TFile * f
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Definition: TMath.h:501
void operator()(ETestType test, Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:210
unsigned int Size() const
return number of fit points
Definition: BinData.h:447
int sampleSize
Definition: unuranDistr.cxx:34
Double_t x[n]
Definition: legend1.C:17
double cdf(double *x, double *p)
Definition: unuranDistr.cxx:44
double normal_cdf(double x, double sigma=1, double x0=0)
Cumulative distribution function of the normal (Gaussian) distribution (lower tail).
int d
Definition: tornado.py:11
static Double_t PValueADKSamples(UInt_t nsamples, Double_t A2)
Definition: GoFTest.cxx:353
EDistribution fDist
Definition: GoFTest.h:188
double pow(double, double)
void Sort(Index n, const Element *a, Index *index, Bool_t down=kTRUE)
Definition: TMath.h:1002
static Double_t GetSigmaN(const std::vector< UInt_t > &ns, UInt_t N)
Definition: GoFTest.cxx:311
Float_t z[5]
Definition: Ifit.C:16
#define MATH_ERROR_MSG(loc, str)
Definition: Error.h:50
void AndersonDarling2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:646
int getSum(const int *x, int n)
Definition: GoFTest.cxx:534
#define F(x, y, z)
TThread * t[5]
Definition: threadsh1.C:13
#define M_PI
Definition: Rotated.cxx:105
IBaseFunctionOneDim IGenFunction
Definition: IFunctionfwd.h:24
Double_t GaussianCDF(Double_t x) const
Definition: GoFTest.cxx:295
unsigned int UInt_t
Definition: RtypesCore.h:42
Double_t KolmogorovTest(Int_t na, const Double_t *a, Int_t nb, const Double_t *b, Option_t *option)
Statistical test whether two one-dimensional sets of points are compatible with coming from the same ...
Definition: TMath.cxx:784
tuple w
Definition: qtexample.py:51
TLine * l
Definition: textangle.C:4
virtual ~GoFTest()
Definition: GoFTest.cxx:180
Class describing the binned data sets : vectors of x coordinates, y values and optionally error on y ...
Definition: BinData.h:61
Double_t fSigma
Definition: GoFTest.h:191
float xmax
Definition: THbookFile.cxx:93
const Double_t infinity
Definition: CsgOps.cxx:85
std::vector< std::vector< Double_t > > fSamples
Definition: GoFTest.h:195
static const double x1[5]
double Double_t
Definition: RtypesCore.h:55
void Instantiate(const Double_t *sample, UInt_t sampleSize)
Definition: GoFTest.cxx:277
double Value(unsigned int ipoint) const
return the value for the given fit point
Definition: BinData.h:236
static Vc_ALWAYS_INLINE int_v max(const int_v &x, const int_v &y)
Definition: vector.h:440
char Char_t
Definition: RtypesCore.h:29
void SetSamples(std::vector< const Double_t * > samples, const std::vector< UInt_t > samplesSizes)
Definition: GoFTest.cxx:182
void Reset()
Definition: TStopwatch.h:54
double result[121]
const double * GetPoint(unsigned int ipoint, double &value) const
retrieve at the same time a pointer to the coordinate data and the fit value More efficient than call...
Definition: BinData.h:304
Double_t Sqrt(Double_t x)
Definition: TMath.h:464
double exp(double)
void SetDistributionFunction(const IGenFunction &cdf, Bool_t isPDF, Double_t xmin, Double_t xmax)
Definition: GoFTest.cxx:265
const Bool_t kTRUE
Definition: Rtypes.h:91
float value
Definition: math.cpp:443
const Int_t n
Definition: legend1.C:16
void AndersonDarlingTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:860
Template class to wrap any member function of a class taking a double and returning a double in a 1D ...
double log(double)
void KolmogorovSmirnov2SamplesTest(Double_t &pvalue, Double_t &testStat) const
Definition: GoFTest.cxx:894
void adkTestStat(double *adk, const std::vector< std::vector< double > > &samples, const std::vector< double > &zstar)
Definition: GoFTest.cxx:546
unsigned int NDim() const
return coordinate data dimension
Definition: BinData.h:452
double exponential_cdf(double x, double lambda, double x0=0)
Cumulative distribution function of the exponential distribution (lower tail).
Stopwatch class.
Definition: TStopwatch.h:30
Double_t ExponentialCDF(Double_t x) const
Definition: GoFTest.cxx:299