Logo ROOT   6.07/09
Reference Guide
SamplingDistribution.cxx
Go to the documentation of this file.
1 // @(#)root/roostats:$Id$
2 
3 /*************************************************************************
4  * Project: RooStats *
5  * Package: RooFit/RooStats *
6  * Authors: *
7  * Kyle Cranmer, Lorenzo Moneta, Gregory Schott, Wouter Verkerke *
8  *************************************************************************
9  * Copyright (C) 1995-2008, Rene Brun and Fons Rademakers. *
10  * All rights reserved. *
11  * *
12  * For the licensing terms see $ROOTSYS/LICENSE. *
13  * For the list of contributors see $ROOTSYS/README/CREDITS. *
14  *************************************************************************/
15 
16 ////////////////////////////////////////////////////////////////////////////////
17 
18 
19 #ifndef ROO_MSG_SERVICE
20 #include "RooMsgService.h"
21 #endif
22 
24 #include "RooNumber.h"
25 #include "TMath.h"
26 #include <algorithm>
27 #include <iostream>
28 #include <cmath>
29 #include <limits>
30 using namespace std ;
31 
32 /// ClassImp for building the THtml documentation of the class
34 
35 using namespace RooStats;
36 
37 ////////////////////////////////////////////////////////////////////////////////
38 /// SamplingDistribution constructor
39 
40 SamplingDistribution::SamplingDistribution( const char *name, const char *title,
41  std::vector<Double_t>& samplingDist, const char * varName) :
42  TNamed(name,title)
43 {
44  fSamplingDist = samplingDist;
45  // need to check STL stuff here. Will this = operator work as wanted, or do we need:
46  // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
47 
48  // WVE must fill sampleWeights vector here otherwise append behavior potentially undefined
49  fSampleWeights.resize(fSamplingDist.size(),1.0) ;
50 
51  fVarName = varName;
52 }
53 
54 ////////////////////////////////////////////////////////////////////////////////
55 /// SamplingDistribution constructor
56 
57 SamplingDistribution::SamplingDistribution( const char *name, const char *title,
58  std::vector<Double_t>& samplingDist, std::vector<Double_t>& sampleWeights, const char * varName) :
59  TNamed(name,title)
60 {
61  fSamplingDist = samplingDist;
62  fSampleWeights = sampleWeights;
63  // need to check STL stuff here. Will this = operator work as wanted, or do we need:
64  // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
65 
66  fVarName = varName;
67 }
68 
69 ////////////////////////////////////////////////////////////////////////////////
70 /// SamplingDistribution constructor (with name and title)
71 
72 SamplingDistribution::SamplingDistribution( const char *name, const char *title, const char * varName) :
73  TNamed(name,title)
74 {
75  fVarName = varName;
76 }
77 
78 
80  const char *name,
81  const char *title,
82  RooDataSet& dataSet,
83  const char * _columnName,
84  const char * varName
85 ) : TNamed(name, title) {
86  // Creates a SamplingDistribution from a RooDataSet for debugging
87  // purposes; e.g. if you need a Gaussian type SamplingDistribution
88  // you can generate it from a Gaussian pdf and use the resulting
89  // RooDataSet with this constructor.
90  //
91  // The result is the projected distribution onto varName
92  // marginalizing the other variables.
93  //
94  // If varName is not given, the first variable will be used.
95  // This is useful mostly for RooDataSets with only one observable.
96 
97  // check there are any meaningful entries in the given dataset
98  if( dataSet.numEntries() == 0 || !dataSet.get()->first() ) {
99  if( varName ) fVarName = varName;
100  return;
101  }
102 
103  TString columnName( _columnName );
104 
105  if( !columnName.Length() ) {
106  columnName.Form( "%s_TS0", name );
107  if( !dataSet.get()->find(columnName) ) {
108  columnName = dataSet.get()->first()->GetName();
109  }
110  }
111 
112  if( !varName ) {
113  // no leak. none of these transfers ownership.
114  fVarName = (*dataSet.get())[columnName].GetTitle();
115  }else{
116  fVarName = varName;
117  }
118 
119  for(Int_t i=0; i < dataSet.numEntries(); i++) {
120  fSamplingDist.push_back(dataSet.get(i)->getRealValue(columnName));
121  fSampleWeights.push_back(dataSet.weight());
122  }
123 }
124 
125 
126 ////////////////////////////////////////////////////////////////////////////////
127 /// SamplingDistribution default constructor
128 
130  TNamed("SamplingDistribution_DefaultName","SamplingDistribution")
131 {
132 }
133 
134 ////////////////////////////////////////////////////////////////////////////////
135 /// SamplingDistribution destructor
136 
138 {
139  fSamplingDist.clear();
140  fSampleWeights.clear();
141 }
142 
143 
144 ////////////////////////////////////////////////////////////////////////////////
145 /// Merge SamplingDistributions (does nothing if NULL is given).
146 /// If variable name was not set before, it is copied from the added
147 /// SamplingDistribution.
148 
150 {
151  if(!other) return;
152 
153  std::vector<double> newSamplingDist = other->fSamplingDist;
154  std::vector<double> newSampleWeights = other->fSampleWeights;
155  // need to check STL stuff here. Will this = operator work as wanted, or do we need:
156  // std::copy(samplingDist.begin(), samplingDist.end(), fSamplingDist.begin());
157  // need to look into STL, do it the easy way for now
158 
159  // reserve memory
160  fSamplingDist.reserve(fSamplingDist.size()+newSamplingDist.size());
161  fSampleWeights.reserve(fSampleWeights.size()+newSampleWeights.size());
162 
163  // push back elements
164  for(unsigned int i=0; i<newSamplingDist.size(); ++i){
165  fSamplingDist.push_back(newSamplingDist[i]);
166  fSampleWeights.push_back(newSampleWeights[i]);
167  }
168 
169 
170  if(GetVarName().Length() == 0 && other->GetVarName().Length() > 0)
171  fVarName = other->GetVarName();
172 
173  if(strlen(GetName()) == 0 && strlen(other->GetName()) > 0)
174  SetName(other->GetName());
175  if(strlen(GetTitle()) == 0 && strlen(other->GetTitle()) > 0)
176  SetTitle(other->GetTitle());
177 
178 }
179 
180 
181 ////////////////////////////////////////////////////////////////////////////////
182 /// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
183 /// Normalization can be turned off.
184 
186  highClosed) const
187 {
188  double error = 0;
189  return IntegralAndError(error, low,high, normalize, lowClosed, highClosed);
190 }
191 
192 ////////////////////////////////////////////////////////////////////////////////
193 
195  // first need to sort the values and then compute the
196  // running sum of the weights and of the weight square
197  // needed later for computing the integral
198 
199  unsigned int n = fSamplingDist.size();
200  std::vector<unsigned int> index(n);
201  TMath::SortItr(fSamplingDist.begin(), fSamplingDist.end(), index.begin(), false );
202 
203  // compute the empirical CDF and cache in a vector
204  fSumW = std::vector<double>( n );
205  fSumW2 = std::vector<double>( n );
206 
207  std::vector<double> sortedDist( n);
208  std::vector<double> sortedWeights( n);
209 
210  for(unsigned int i=0; i <n; i++) {
211  unsigned int j = index[i];
212  if (i > 0) {
213  fSumW[i] += fSumW[i-1];
214  fSumW2[i] += fSumW2[i-1];
215  }
216  fSumW[i] += fSampleWeights[j];
217  fSumW2[i] += fSampleWeights[j]*fSampleWeights[j];
218  // sort also the sampling distribution and the weights
219  sortedDist[i] = fSamplingDist[ j] ;
220  sortedWeights[i] = fSampleWeights[ j] ;
221  }
222 
223  // save the sorted distribution
224  fSamplingDist = sortedDist;
225  fSampleWeights = sortedWeights;
226 
227 
228 }
229 
230 ////////////////////////////////////////////////////////////////////////////////
231 /// Returns the integral in the open/closed/mixed interval. Default is [low,high) interval.
232 /// Normalization can be turned off.
233 /// compute also the error on the integral
234 
236  highClosed) const
237 {
238  int n = fSamplingDist.size();
239  if( n == 0 ) {
240  error = numeric_limits<Double_t>::infinity();
241  return 0.0;
242  }
243 
244  if (int(fSumW.size()) != n)
245  SortValues();
246 
247 
248  // use std::upper_bounds returns lower index value
249  int indexLow = -1;
250  int indexHigh = -1;
251  if (lowClosed) {
252  // case of closed intervals want to include lower part
253  indexLow = std::lower_bound( fSamplingDist.begin(), fSamplingDist.end() , low) - fSamplingDist.begin() -1;
254  }
255  else {
256  // case of open intervals
257  indexLow = std::upper_bound( fSamplingDist.begin(), fSamplingDist.end() , low) - fSamplingDist.begin() - 1;
258  }
259 
260 
261  if (highClosed) {
262  indexHigh = std::upper_bound( fSamplingDist.begin(), fSamplingDist.end() , high) - fSamplingDist.begin() -1;
263  }
264  else {
265  indexHigh = std::lower_bound( fSamplingDist.begin(), fSamplingDist.end() , high) - fSamplingDist.begin() -1;
266 
267  }
268 
269 
270  assert(indexLow < n && indexHigh < n);
271 
272  double sum = 0;
273  double sum2 = 0;
274 
275  if (indexHigh >= 0) {
276  sum = fSumW[indexHigh];
277  sum2 = fSumW2[indexHigh];
278 
279  if (indexLow >= 0) {
280  sum -= fSumW[indexLow];
281  sum2 -= fSumW2[indexLow];
282  }
283  }
284 
285  if(normalize) {
286 
287  double norm = fSumW.back();
288  double norm2 = fSumW2.back();
289 
290  sum /= norm;
291 
292  // use formula for binomial error in case of weighted events
293  // expression can be derived using a MLE for a weighted binomial likelihood
294  error = std::sqrt( sum2 * (1. - 2. * sum) + norm2 * sum * sum ) / norm;
295  }
296  else {
297  error = std::sqrt(sum2);
298  }
299 
300 
301  return sum;
302 }
303 
304 
305 
306 ////////////////////////////////////////////////////////////////////////////////
307 /// returns the closed integral [-inf,x]
308 
310  return Integral(-RooNumber::infinity(), x, kTRUE, kTRUE, kTRUE);
311 }
312 
313 
314 
315 ////////////////////////////////////////////////////////////////////////////////
316 /// returns the inverse of the cumulative distribution function
317 
319 {
320  Double_t dummy=0;
321  return InverseCDF(pvalue,0,dummy);
322 }
323 ////////////////////////////////////////////////////////////////////////////////
324 /// returns the inverse of the cumulative distribution function, with variations depending on number of samples
325 
327  Double_t sigmaVariation,
328  Double_t& inverseWithVariation)
329 {
330  if (fSumW.size() != fSamplingDist.size())
331  SortValues();
332 
333  if (!TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
334  Warning("InverseCDF","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported");
335 
336 
337  // Acceptance regions are meant to be inclusive of (1-\alpha) of the probability
338  // so the returned values of the CDF should make this easy.
339  // in particular:
340  // if finding the critical value for a lower bound
341  // when p_i < p < p_j, one should return the value associated with i
342  // if i=0, then one should return -infinity
343  // if finding the critical value for an upper bound
344  // when p_i < p < p_j, one should return the value associated with j
345  // if i = size-1, then one should return +infinity
346  // use pvalue < 0.5 to indicate a lower bound is requested
347 
348  // casting will round down, eg. give i
349  int nominal = (unsigned int) (pvalue*fSamplingDist.size());
350 
351  if(nominal <= 0) {
352  inverseWithVariation = -1.*RooNumber::infinity();
353  return -1.*RooNumber::infinity();
354  }
355  else if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
356  inverseWithVariation = RooNumber::infinity();
357  return RooNumber::infinity();
358  }
359  else if(pvalue < 0.5){
360  int delta = (int)(sigmaVariation*sqrt(1.0*nominal)); // note sqrt(small fraction)
361  int variation = nominal+delta;
362 
363  if(variation>=(Int_t)fSamplingDist.size()-1)
364  inverseWithVariation = RooNumber::infinity();
365  else if(variation<=0)
366  inverseWithVariation = -1.*RooNumber::infinity();
367  else
368  inverseWithVariation = fSamplingDist[ variation ];
369 
370  return fSamplingDist[nominal];
371  }
372  else if(pvalue >= 0.5){
373  int delta = (int)(sigmaVariation*sqrt(1.0*fSamplingDist.size()- nominal)); // note sqrt(small fraction)
374  int variation = nominal+delta;
375 
376 
377  if(variation>=(Int_t)fSamplingDist.size()-1)
378  inverseWithVariation = RooNumber::infinity();
379 
380  else if(variation<=0)
381  inverseWithVariation = -1.*RooNumber::infinity();
382  else
383  inverseWithVariation = fSamplingDist[ variation+1 ];
384 
385 
386  /*
387  std::cout << "dgb SamplingDistribution::InverseCDF. variation = " << variation
388  << " size = " << fSamplingDist.size()
389  << " value = " << inverseWithVariation << std::endl;
390  */
391 
392  return fSamplingDist[nominal+1];
393  }
394  else{
395  std::cout << "problem in SamplingDistribution::InverseCDF" << std::endl;
396  }
397  inverseWithVariation = RooNumber::infinity();
398  return RooNumber::infinity();
399 
400 }
401 
402 
403 ////////////////////////////////////////////////////////////////////////////////
404 /// returns the inverse of the cumulative distribution function
405 
407 {
408  if (fSumW.size() != fSamplingDist.size())
409  SortValues();
410 
411  if (!TMath::AreEqualRel(fSumW.back(), fSumW2.back(), 1.E-6) )
412  Warning("InverseCDFInterpolate","Estimation of Quantiles (InverseCDF) for weighted events is not yet supported.");
413 
414  // casting will round down, eg. give i
415  int nominal = (unsigned int) (pvalue*fSamplingDist.size());
416 
417  if(nominal <= 0) {
418  return -1.*RooNumber::infinity();
419  }
420  if(nominal >= (Int_t)fSamplingDist.size()-1 ) {
421  return RooNumber::infinity();
422  }
423  Double_t upperX = fSamplingDist[nominal+1];
424  Double_t upperY = ((Double_t) (nominal+1))/fSamplingDist.size();
425  Double_t lowerX = fSamplingDist[nominal];
426  Double_t lowerY = ((Double_t) nominal)/fSamplingDist.size();
427 
428  // std::cout << upperX << " " << upperY << " " << lowerX << " " << lowerY << std::endl;
429 
430  return (upperX-lowerX)/(upperY-lowerY)*(pvalue-lowerY)+lowerX;
431 
432 }
virtual const char * GetTitle() const
Returns title of object.
Definition: TNamed.h:52
static long int sum(long int i)
Definition: Factory.cxx:1785
void SortValues() const
Chached vector with sum of the weight used to compute integral error.
Double_t IntegralAndError(Double_t &error, Double_t low, Double_t high, Bool_t normalize=kTRUE, Bool_t lowClosed=kTRUE, Bool_t highClosed=kFALSE) const
numerical integral in these limits including error estimation
Ssiz_t Length() const
Definition: TString.h:390
std::vector< Double_t > fSumW2
Chached vector with sum of the weight used to compute integral.
TString fVarName
vector of weights for the samples
virtual void SetName(const char *name)
Set the name of the TNamed.
Definition: TNamed.cxx:131
Double_t InverseCDF(Double_t pvalue)
get the inverse of the Cumulative distribution function
Basic string class.
Definition: TString.h:137
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
STL namespace.
std::vector< Double_t > fSampleWeights
vector of points for the sampling distribution
const TString GetVarName() const
RooAbsArg * first() const
double sqrt(double)
Double_t x[n]
Definition: legend1.C:17
The TNamed class is the base class for all named ROOT classes.
Definition: TNamed.h:33
Double_t InverseCDFInterpolate(Double_t pvalue)
get the inverse of the Cumulative distribution function
Bool_t AreEqualRel(Double_t af, Double_t bf, Double_t relPrec)
Definition: TMath.h:196
RooAbsArg * find(const char *name) const
Find object with given name in list.
virtual Double_t weight() const
Return event weight of current event.
std::vector< Double_t > fSamplingDist
virtual ~SamplingDistribution()
Destructor of SamplingDistribution.
virtual Int_t numEntries() const
Definition: RooAbsData.cxx:269
void Form(const char *fmt,...)
Formats a string using a printf style format descriptor.
Definition: TString.cxx:2322
static Double_t infinity()
Return internal infinity representation.
Definition: RooNumber.cxx:49
Double_t CDF(Double_t x) const
calculate CDF as a special case of Integral(...) with lower limit equal to -inf
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:51
void Add(const SamplingDistribution *other)
merge two sampling distributions
RooDataSet is a container class to hold unbinned data.
Definition: RooDataSet.h:29
This class simply holds a sampling distribution of some test statistic.
Namespace for the RooStats classes.
Definition: Asimov.h:20
#define ClassImp(name)
Definition: Rtypes.h:279
double Double_t
Definition: RtypesCore.h:55
static RooMathCoreReg dummy
SamplingDistribution()
Default constructor for SamplingDistribution.
Double_t Integral(Double_t low, Double_t high, Bool_t normalize=kTRUE, Bool_t lowClosed=kTRUE, Bool_t highClosed=kFALSE) const
numerical integral in these limits
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition: TNamed.cxx:155
double norm(double *x, double *p)
Definition: unuranDistr.cxx:40
virtual const RooArgSet * get(Int_t index) const
Return RooArgSet with coordinates of event &#39;index&#39;.
const Int_t n
Definition: legend1.C:16
char name[80]
Definition: TGX11.cxx:109
void SortItr(Iterator first, Iterator last, IndexIterator index, Bool_t down=kTRUE)
Definition: TMath.h:964
Double_t getRealValue(const char *name, Double_t defVal=0, Bool_t verbose=kFALSE) const
Get value of a RooAbsReal stored in set with given name.
Definition: RooArgSet.cxx:527
virtual void Warning(const char *method, const char *msgfmt,...) const
Issue warning message.
Definition: TObject.cxx:911