Logo ROOT   6.10/09
Reference Guide
SamplingDistribution.cxx
Go to the documentation of this file.
1 // @(#)root/roostats:$Id$
2 
3 /*************************************************************************
4  * Project: RooStats *
5  * Package: RooFit/RooStats *
6  * Authors: *
7  * Kyle Cranmer, Lorenzo Moneta, Gregory Schott, Wouter Verkerke *
8  *************************************************************************
9  * Copyright (C) 1995-2008, Rene Brun and Fons Rademakers. *
10  * All rights reserved. *
11  * *
12  * For the licensing terms see $ROOTSYS/LICENSE. *
13  * For the list of contributors see $ROOTSYS/README/CREDITS. *
14  *************************************************************************/
15 
16 /** \class RooStats::SamplingDistribution
17  \ingroup Roostats
18 
19 This class simply holds a sampling distribution of some test statistic.
20 The distribution can either be an empirical distribution (eg. the samples themselves) or
21 a weighted set of points (eg. for the FFT method).
22 The class supports merging.
23 */
24 
25 #include "RooMsgService.h"
26 
28 #include "RooNumber.h"
29 #include "TMath.h"
30 #include <algorithm>
31 #include <iostream>
32 #include <cmath>
33 #include <limits>
34 using namespace std ;
35 
37 
38 using namespace RooStats;
39 
40 ////////////////////////////////////////////////////////////////////////////////
41 /// SamplingDistribution constructor
42 
43 SamplingDistribution::SamplingDistribution( const char *name, const char *title,
44  std::vector<Double_t>& samplingDist, const char * varName) :
45  TNamed(name,title)
46 {
47  fSamplingDist = samplingDist;
48  // need to check STL stuff here. Will this = operator work as wanted, or do we need:
49  // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
50 
51  // WVE must fill sampleWeights vector here otherwise append behavior potentially undefined
52  fSampleWeights.resize(fSamplingDist.size(),1.0) ;
53 
54  fVarName = varName;
55 }
56 
57 ////////////////////////////////////////////////////////////////////////////////
58 /// SamplingDistribution constructor
59 
60 SamplingDistribution::SamplingDistribution( const char *name, const char *title,
61  std::vector<Double_t>& samplingDist, std::vector<Double_t>& sampleWeights, const char * varName) :
62  TNamed(name,title)
63 {
64  fSamplingDist = samplingDist;
65  fSampleWeights = sampleWeights;
66  // need to check STL stuff here. Will this = operator work as wanted, or do we need:
67  // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
68 
69  fVarName = varName;
70 }
71 
72 ////////////////////////////////////////////////////////////////////////////////
73 /// SamplingDistribution constructor (with name and title)
74 
75 SamplingDistribution::SamplingDistribution( const char *name, const char *title, const char * varName) :
76  TNamed(name,title)
77 {
78  fVarName = varName;
79 }
80 
81 ////////////////////////////////////////////////////////////////////////////////
82 /// Creates a SamplingDistribution from a RooDataSet for debugging
83 /// purposes; e.g. if you need a Gaussian type SamplingDistribution
84 /// you can generate it from a Gaussian pdf and use the resulting
85 /// RooDataSet with this constructor.
86 ///
87 /// The result is the projected distribution onto varName
88 /// marginalizing the other variables.
89 ///
90 /// If varName is not given, the first variable will be used.
91 /// This is useful mostly for RooDataSets with only one observable.
92 
94  const char *name,
95  const char *title,
96  RooDataSet& dataSet,
97  const char * _columnName,
98  const char * varName
99 ) : TNamed(name, title) {
100 
101 
102  // check there are any meaningful entries in the given dataset
103  if( dataSet.numEntries() == 0 || !dataSet.get()->first() ) {
104  if( varName ) fVarName = varName;
105  return;
106  }
107 
108  TString columnName( _columnName );
109 
110  if( !columnName.Length() ) {
111  columnName.Form( "%s_TS0", name );
112  if( !dataSet.get()->find(columnName) ) {
113  columnName = dataSet.get()->first()->GetName();
114  }
115  }
116 
117  if( !varName ) {
118  // no leak. none of these transfers ownership.
119  fVarName = (*dataSet.get())[columnName].GetTitle();
120  }else{
121  fVarName = varName;
122  }
123 
124  for(Int_t i=0; i < dataSet.numEntries(); i++) {
125  fSamplingDist.push_back(dataSet.get(i)->getRealValue(columnName));
126  fSampleWeights.push_back(dataSet.weight());
127  }
128 }
129 
130 
131 ////////////////////////////////////////////////////////////////////////////////
132 /// SamplingDistribution default constructor
133 
135  TNamed("SamplingDistribution_DefaultName","SamplingDistribution")
136 {
137 }
138 
139 ////////////////////////////////////////////////////////////////////////////////
140 /// SamplingDistribution destructor
141 
143 {
144  fSamplingDist.clear();
145  fSampleWeights.clear();
146 }
147 
148 ////////////////////////////////////////////////////////////////////////////////
149 /// Merge SamplingDistributions (does nothing if NULL is given).
150 /// If variable name was not set before, it is copied from the added
151 /// SamplingDistribution.
152 
154 {
155  if(!other) return;
156 
157  std::vector<double> newSamplingDist = other->fSamplingDist;
158  std::vector<double> newSampleWeights = other->fSampleWeights;
159  // need to check STL stuff here. Will this = operator work as wanted, or do we need:
160  // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
161  // need to look into STL, do it the easy way for now
162 
163  // reserve memory
164  fSamplingDist.reserve(fSamplingDist.size()+newSamplingDist.size());
165  fSampleWeights.reserve(fSampleWeights.size()+newSampleWeights.size());
166 
167  // push back elements
168  for(unsigned int i=0; i<newSamplingDist.size(); ++i){
169  fSamplingDist.push_back(newSamplingDist[i]);
170  fSampleWeights.push_back(newSampleWeights[i]);
171  }
172 
173 
174  if(GetVarName().Length() == 0 && other->GetVarName().Length() > 0)
175  fVarName = other->GetVarName();
176 
177  if(strlen(GetName()) == 0 && strlen(other->GetName()) > 0)
178  SetName(other->GetName());
179  if(strlen(GetTitle()) == 0 && strlen(other->GetTitle()) > 0)
180  SetTitle(other->GetTitle());
181 
182 }
183 
184 ////////////////////////////////////////////////////////////////////////////////
185 /// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
186 /// Normalization can be turned off.
187 
189  highClosed) const
190 {
191  double error = 0;
192  return IntegralAndError(error, low,high, normalize, lowClosed, highClosed);
193 }
194 
195 ////////////////////////////////////////////////////////////////////////////////
196 /// first need to sort the values and then compute the
197 /// running sum of the weights and of the weight square
198 /// needed later for computing the integral
199 
201 
202  unsigned int n = fSamplingDist.size();
203  std::vector<unsigned int> index(n);
204  TMath::SortItr(fSamplingDist.begin(), fSamplingDist.end(), index.begin(), false );
205 
206  // compute the empirical CDF and cache in a vector
207  fSumW = std::vector<double>( n );
208  fSumW2 = std::vector<double>( n );
209 
210  std::vector<double> sortedDist( n);
211  std::vector<double> sortedWeights( n);
212 
213  for(unsigned int i=0; i <n; i++) {
214  unsigned int j = index[i];
215  if (i > 0) {
216  fSumW[i] += fSumW[i-1];
217  fSumW2[i] += fSumW2[i-1];
218  }
219  fSumW[i] += fSampleWeights[j];
220  fSumW2[i] += fSampleWeights[j]*fSampleWeights[j];
221  // sort also the sampling distribution and the weights
222  sortedDist[i] = fSamplingDist[ j] ;
223  sortedWeights[i] = fSampleWeights[ j] ;
224  }
225 
226  // save the sorted distribution
227  fSamplingDist = sortedDist;
228  fSampleWeights = sortedWeights;
229 
230 
231 }
232 
233 ////////////////////////////////////////////////////////////////////////////////
234 /// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
235 /// Normalization can be turned off.
236 /// compute also the error on the integral
237 
239  highClosed) const
240 {
241  int n = fSamplingDist.size();
242  if( n == 0 ) {
243  error = numeric_limits<Double_t>::infinity();
244  return 0.0;
245  }
246 
247  if (int(fSumW.size()) != n)
248  SortValues();
249 
250 
251  // use std::upper_bounds returns lower index value
252  int indexLow = -1;
253  int indexHigh = -1;
254  if (lowClosed) {
255  // case of closed intervals want to include lower part
256  indexLow = std::lower_bound( fSamplingDist.begin(), fSamplingDist.end() , low) - fSamplingDist.begin() -1;
257  }
258  else {
259  // case of open intervals
260  indexLow = std::upper_bound( fSamplingDist.begin(), fSamplingDist.end() , low) - fSamplingDist.begin() - 1;
261  }
262 
263 
264  if (highClosed) {
265  indexHigh = std::upper_bound( fSamplingDist.begin(), fSamplingDist.end() , high) - fSamplingDist.begin() -1;
266  }
267  else {
268  indexHigh = std::lower_bound( fSamplingDist.begin(), fSamplingDist.end() , high) - fSamplingDist.begin() -1;
269 
270  }
271 
272 
273  assert(indexLow < n && indexHigh < n);
274 
275  double sum = 0;
276  double sum2 = 0;
277 
278  if (indexHigh >= 0) {
279  sum = fSumW[indexHigh];
280  sum2 = fSumW2[indexHigh];
281 
282  if (indexLow >= 0) {
283  sum -= fSumW[indexLow];
284  sum2 -= fSumW2[indexLow];
285  }
286  }
287 
288  if(normalize) {
289 
290  double norm = fSumW.back();
291  double norm2 = fSumW2.back();
292 
293  sum /= norm;
294 
295  // use formula for binomial error in case of weighted events
296  // expression can be derived using a MLE for a weighted binomial likelihood
297  error = std::sqrt( sum2 * (1. - 2. * sum) + norm2 * sum * sum ) / norm;
298  }
299  else {
300  error = std::sqrt(sum2);
301  }
302 
303 
304  return sum;
305 }
306 
307 ////////////////////////////////////////////////////////////////////////////////
308 /// returns the closed integral [-inf,x]
309 
311  return Integral(-RooNumber::infinity(), x, kTRUE, kTRUE, kTRUE);
312 }
313 
314 ////////////////////////////////////////////////////////////////////////////////
315 /// returns the inverse of the cumulative distribution function
316 
318 {
319  Double_t dummy=0;
320  return InverseCDF(pvalue,0,dummy);
321 }
322 
323 ////////////////////////////////////////////////////////////////////////////////
324 /// returns the inverse of the cumulative distribution function, with variations depending on number of samples
325 
327  Double_t sigmaVariation,
328  Double_t& inverseWithVariation)
329 {
330  if (fSumW.size() != fSamplingDist.size())
331  SortValues();
332 
333  if (!TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
334  Warning("InverseCDF","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported");
335 
336 
337  // Acceptance regions are meant to be inclusive of (1-\alpha) of the probability
338  // so the returned values of the CDF should make this easy.
339  // in particular:
340  // if finding the critical value for a lower bound
341  // when p_i < p < p_j, one should return the value associated with i
342  // if i=0, then one should return -infinity
343  // if finding the critical value for an upper bound
344  // when p_i < p < p_j, one should return the value associated with j
345  // if i = size-1, then one should return +infinity
346  // use pvalue < 0.5 to indicate a lower bound is requested
347 
348  // casting will round down, eg. give i
349  int nominal = (unsigned int) (pvalue*fSamplingDist.size());
350 
351  if(nominal <= 0) {
352  inverseWithVariation = -1.*RooNumber::infinity();
353  return -1.*RooNumber::infinity();
354  }
355  else if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
356  inverseWithVariation = RooNumber::infinity();
357  return RooNumber::infinity();
358  }
359  else if(pvalue < 0.5){
360  int delta = (int)(sigmaVariation*sqrt(1.0*nominal)); // note sqrt(small fraction)
361  int variation = nominal+delta;
362 
363  if(variation>=(Int_t)fSamplingDist.size()-1)
364  inverseWithVariation = RooNumber::infinity();
365  else if(variation<=0)
366  inverseWithVariation = -1.*RooNumber::infinity();
367  else
368  inverseWithVariation = fSamplingDist[ variation ];
369 
370  return fSamplingDist[nominal];
371  }
372  else if(pvalue >= 0.5){
373  int delta = (int)(sigmaVariation*sqrt(1.0*fSamplingDist.size()- nominal)); // note sqrt(small fraction)
374  int variation = nominal+delta;
375 
376 
377  if(variation>=(Int_t)fSamplingDist.size()-1)
378  inverseWithVariation = RooNumber::infinity();
379 
380  else if(variation<=0)
381  inverseWithVariation = -1.*RooNumber::infinity();
382  else
383  inverseWithVariation = fSamplingDist[ variation+1 ];
384 
385 
386  /*
387  std::cout << "dgb SamplingDistribution::InverseCDF. variation = " << variation
388  << " size = " << fSamplingDist.size()
389  << " value = " << inverseWithVariation << std::endl;
390  */
391 
392  return fSamplingDist[nominal+1];
393  }
394  else{
395  std::cout << "problem in SamplingDistribution::InverseCDF" << std::endl;
396  }
397  inverseWithVariation = RooNumber::infinity();
398  return RooNumber::infinity();
399 
400 }
401 
402 
403 ////////////////////////////////////////////////////////////////////////////////
404 /// returns the inverse of the cumulative distribution function
405 
407 {
408  if (fSumW.size() != fSamplingDist.size())
409  SortValues();
410 
411  if (!TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
412  Warning("InverseCDFInterpolate","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported.");
413 
414  // casting will round down, eg. give i
415  int nominal = (unsigned int) (pvalue*fSamplingDist.size());
416 
417  if(nominal <= 0) {
418  return -1.*RooNumber::infinity();
419  }
420  if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
421  return RooNumber::infinity();
422  }
423  Double_t upperX = fSamplingDist[nominal+1];
424  Double_t upperY = ((Double_t) (nominal+1))/fSamplingDist.size();
425  Double_t lowerX = fSamplingDist[nominal];
426  Double_t lowerY = ((Double_t) nominal)/fSamplingDist.size();
427 
428  // std::cout << upperX << " " << upperY << " " << lowerX << " " << lowerY << std::endl;
429 
430  return (upperX-lowerX)/(upperY-lowerY)*(pvalue-lowerY)+lowerX;
431 
432 }
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47
static long int sum(long int i)
Definition: Factory.cxx:2162
std::vector< Double_t > fSumW2
Cached vector with sum of the weight used to compute integral.
TString fVarName
vector of weights for the samples
virtual void SetName(const char *name)
Set the name of the TNamed.
Definition: TNamed.cxx:131
Double_t InverseCDF(Double_t pvalue)
get the inverse of the Cumulative distribution function
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
STL namespace.
std::vector< Double_t > fSampleWeights
vector of points for the sampling distribution
Float_t delta
double sqrt(double)
Double_t x[n]
Definition: legend1.C:17
The TNamed class is the base class for all named ROOT classes.
Definition: TNamed.h:29
virtual Double_t weight() const
Return event weight of current event.
Double_t InverseCDFInterpolate(Double_t pvalue)
get the inverse of the Cumulative distribution function
Bool_t AreEqualRel(Double_t af, Double_t bf, Double_t relPrec)
Definition: TMath.h:322
Double_t Integral(Double_t low, Double_t high, Bool_t normalize=kTRUE, Bool_t lowClosed=kTRUE, Bool_t highClosed=kFALSE) const
numerical integral in these limits
void SortValues() const
Cached vector with sum of the weight used to compute integral error.
std::vector< Double_t > fSamplingDist
RooAbsArg * first() const
Double_t CDF(Double_t x) const
calculate CDF as a special case of Integral(...) with lower limit equal to -inf
virtual ~SamplingDistribution()
Destructor of SamplingDistribution.
static Double_t infinity()
Return internal infinity representation.
Definition: RooNumber.cxx:49
void Add(const SamplingDistribution *other)
merge two sampling distributions
RooDataSet is a container class to hold unbinned data.
Definition: RooDataSet.h:29
This class simply holds a sampling distribution of some test statistic.
virtual const RooArgSet * get(Int_t index) const
Return RooArgSet with coordinates of event &#39;index&#39;.
Namespace for the RooStats classes.
Definition: Asimov.h:20
#define ClassImp(name)
Definition: Rtypes.h:336
RooAbsArg * find(const char *name) const
Find object with given name in list.
double Double_t
Definition: RtypesCore.h:55
static RooMathCoreReg dummy
Double_t IntegralAndError(Double_t &error, Double_t low, Double_t high, Bool_t normalize=kTRUE, Bool_t lowClosed=kTRUE, Bool_t highClosed=kFALSE) const
numerical integral in these limits including error estimation
SamplingDistribution()
Default constructor for SamplingDistribution.
Double_t getRealValue(const char *name, Double_t defVal=0, Bool_t verbose=kFALSE) const
Get value of a RooAbsReal stored in set with given name.
Definition: RooArgSet.cxx:527
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition: TNamed.cxx:155
const Bool_t kTRUE
Definition: RtypesCore.h:91
double norm(double *x, double *p)
Definition: unuranDistr.cxx:40
const Int_t n
Definition: legend1.C:16
virtual void Warning(const char *method, const char *msgfmt,...) const
Issue warning message.
Definition: TObject.cxx:859
virtual Int_t numEntries() const
Definition: RooAbsData.cxx:269
virtual const char * GetTitle() const
Returns title of object.
Definition: TNamed.h:48
void SortItr(Iterator first, Iterator last, IndexIterator index, Bool_t down=kTRUE)
Definition: TMath.h:1126