ROOT  6.06/09
Reference Guide
RuleFitParams.h
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Joerg Stelzer, Fredrik Tegenfeldt, Helge Voss
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : RuleFitParams *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * A class doing the actual fitting of a linear model using rules as *
12  * base functions. *
13  * Reference paper: 1.Gradient Directed Regularization *
14  * Friedman, Popescu, 2004 *
15  * 2.Predictive Learning with Rule Ensembles *
16  * Friedman, Popescu, 2005 *
17  * *
18  * *
19  * Authors (alphabetical): *
20  * Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA *
21  * Helge Voss <Helge.Voss@cern.ch> - MPI-KP Heidelberg, Ger. *
22  * *
23  * Copyright (c) 2005: *
24  * CERN, Switzerland *
25  * Iowa State U. *
26  * MPI-K Heidelberg, Germany *
27  * *
28  * Redistribution and use in source and binary forms, with or without *
29  * modification, are permitted according to the terms listed in LICENSE *
30  * (http://tmva.sourceforge.net/LICENSE) *
31  **********************************************************************************/
32 
33 #ifndef ROOT_TMVA_RuleFitParams
34 #define ROOT_TMVA_RuleFitParams
35 
36 // #if ROOT_VERSION_CODE >= 364802
37 #ifndef ROOT_TMathBase
38 #include "TMathBase.h"
39 #endif
40 // #else
41 // #ifndef ROOT_TMath
42 // #include "TMath.h"
43 // #endif
44 // #endif
45 
46 #ifndef ROOT_TMVA_Event
47 #include "TMVA/Event.h"
48 #endif
49 
50 class TTree;
51 
52 namespace TMVA {
53 
54  class RuleEnsemble;
55  class MsgLogger;
56  class RuleFit;
57  class RuleFitParams {
58 
59  public:
60 
61  RuleFitParams();
62  virtual ~RuleFitParams();
63 
64  void Init();
65 
66  // set message type
67  void SetMsgType( EMsgType t );
68 
69  // set RuleFit ptr
70  void SetRuleFit( RuleFit *rf ) { fRuleFit = rf; }
71  //
72  // GD path: set N(path steps)
73  void SetGDNPathSteps( Int_t np ) { fGDNPathSteps = np; }
74 
75  // GD path: set path step size
76  void SetGDPathStep( Double_t s ) { fGDPathStep = s; }
77 
78  // GD path: set tau search range
80  {
81  fGDTauMin = (t0>1.0 ? 1.0:(t0<0.0 ? 0.0:t0));
82  fGDTauMax = (t1>1.0 ? 1.0:(t1<0.0 ? 0.0:t1));
84  }
85 
86  // GD path: set number of steps in tau search range
87  void SetGDTauScan( UInt_t n ) { fGDTauScan = n; }
88 
89  // GD path: set tau
90  void SetGDTau( Double_t t ) { fGDTau = t; }
91 
92 
93  void SetGDErrScale( Double_t s ) { fGDErrScale = s; }
95 
96  // return type such that +1 = signal and -1 = background
97  Int_t Type( const Event * e ) const; // return (fRuleFit->GetMethodRuleFit()->DataInfo().IsSignal(e) ? 1:-1); }
98  //
99  UInt_t GetPathIdx1() const { return fPathIdx1; }
100  UInt_t GetPathIdx2() const { return fPathIdx2; }
101  UInt_t GetPerfIdx1() const { return fPerfIdx1; }
102  UInt_t GetPerfIdx2() const { return fPerfIdx2; }
103 
104  // Loss function; Huber loss eq 33
105  Double_t LossFunction( const Event& e ) const;
106 
107  // same but using evt idx (faster)
108  Double_t LossFunction( UInt_t evtidx ) const;
109  Double_t LossFunction( UInt_t evtidx, UInt_t itau ) const;
110 
111  // Empirical risk
112  Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff) const;
113  Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff, UInt_t itau) const;
114 
115  // Risk evaluation for fPathIdx and fPerfInd
118  Double_t RiskPerf( UInt_t itau ) const { return Risk(fPerfIdx1,fPerfIdx2,fNEveEffPerf,itau); }
119 
120  // Risk evaluation for all tau
122 
123  // Penalty function; Lasso function (eq 8)
124  Double_t Penalty() const;
125 
126  // initialize GD path
127  void InitGD();
128 
129  // find best tau and return the number of scan steps used
130  Int_t FindGDTau();
131 
132  // make path for binary classification (squared-error ramp, sect 6 in ref 1)
133  void MakeGDPath();
134 
135  protected:
136 
137  // typedef of an Event const iterator
138  typedef std::vector<const TMVA::Event *>::const_iterator EventItr;
139 
140  // init ntuple
141  void InitNtuple();
142 
143  // calculate N(tau) in scan - limit to 100000.
144  void CalcGDNTau() { fGDNTau = static_cast<UInt_t>(1.0/fGDTauPrec)+1; if (fGDNTau>100000) fGDNTau=100000; }
145 
146  // fill ntuple with coefficient info
147  void FillCoefficients();
148 
149  // estimate the optimum scoring function
150  void CalcFStar();
151 
152  // estimate of binary error rate
154 
155  // estimate of scale average error rate
157 
158  // estimate 1-area under ROC
159  Double_t ErrorRateRocRaw( std::vector<Double_t> & sFsig, std::vector<Double_t> & sFbkg );
161  void ErrorRateRocTst();
162 
163  // estimate optimism
164  Double_t Optimism();
165 
166  // make gradient vector (eq 44 in ref 1)
167  void MakeGradientVector();
168 
169  // Calculate the direction in parameter space (eq 25, ref 1) and update coeffs (eq 22, ref 1)
170  void UpdateCoefficients();
171 
172  // calculate average of responses of F
175 
176  // calculate average of true response (initial estimate of a0)
178 
179  // calculate the average of each variable over the range
180  void EvaluateAverage(UInt_t ind1, UInt_t ind2,
181  std::vector<Double_t> &avsel,
182  std::vector<Double_t> &avrul);
183 
184  // evaluate using fPathIdx1,2
186 
187  // evaluate using fPerfIdx1,2
189 
190  // the same as above but for the various tau
191  void MakeTstGradientVector();
192  void UpdateTstCoefficients();
193  void CalcTstAverageResponse();
194 
195 
196  RuleFit * fRuleFit; // rule fit
197  RuleEnsemble * fRuleEnsemble; // rule ensemble
198  //
199  UInt_t fNRules; // number of rules
200  UInt_t fNLinear; // number of linear terms
201  //
202  // Event indecis for path/validation - TODO: should let the user decide
203  // Now it is just a simple one-fold cross validation.
204  //
205  UInt_t fPathIdx1; // first event index for path search
206  UInt_t fPathIdx2; // last event index for path search
207  UInt_t fPerfIdx1; // first event index for performance evaluation
208  UInt_t fPerfIdx2; // last event index for performance evaluation
209  Double_t fNEveEffPath; // sum of weights for Path events
210  Double_t fNEveEffPerf; // idem for Perf events
211 
212  std::vector<Double_t> fAverageSelectorPath; // average of each variable over the range fPathIdx1,2
213  std::vector<Double_t> fAverageRulePath; // average of each rule, same range
214  std::vector<Double_t> fAverageSelectorPerf; // average of each variable over the range fPerfIdx1,2
215  std::vector<Double_t> fAverageRulePerf; // average of each rule, same range
216 
217  std::vector<Double_t> fGradVec; // gradient vector - dimension = number of rules in ensemble
218  std::vector<Double_t> fGradVecLin; // gradient vector - dimension = number of variables
219 
220  std::vector< std::vector<Double_t> > fGradVecTst; // gradient vector - one per tau
221  std::vector< std::vector<Double_t> > fGradVecLinTst; // gradient vector, linear terms - one per tau
222  //
223  std::vector<Double_t> fGDErrTst; // error rates per tau
224  std::vector<Char_t> fGDErrTstOK; // error rate is sufficiently low <--- stores boolean
225  std::vector< std::vector<Double_t> > fGDCoefTst; // rule coeffs - one per tau
226  std::vector< std::vector<Double_t> > fGDCoefLinTst; // linear coeffs - one per tau
227  std::vector<Double_t> fGDOfsTst; // offset per tau
228  std::vector< Double_t > fGDTauVec; // the tau's
229  UInt_t fGDNTauTstOK; // number of tau in the test-phase that are ok
230  UInt_t fGDNTau; // number of tau-paths - calculated in SetGDTauPrec
231  Double_t fGDTauPrec; // precision in tau
232  UInt_t fGDTauScan; // number scan for tau-paths
233  Double_t fGDTauMin; // min threshold parameter (tau in eq 26, ref 1)
234  Double_t fGDTauMax; // max threshold parameter (tau in eq 26, ref 1)
235  Double_t fGDTau; // selected threshold parameter (tau in eq 26, ref 1)
236  Double_t fGDPathStep; // step size along path (delta nu in eq 22, ref 1)
237  Int_t fGDNPathSteps; // number of path steps
238  Double_t fGDErrScale; // stop scan at error = scale*errmin
239  //
240  Double_t fAverageTruth; // average truth, ie sum(y)/N, y=+-1
241  //
242  std::vector<Double_t> fFstar; // vector of F*() - filled in CalcFStar()
243  Double_t fFstarMedian; // median value of F*() using
244  //
245  TTree *fGDNtuple; // Gradient path ntuple, contains params for each step along the path
246  Double_t fNTRisk; // GD path: risk
247  Double_t fNTErrorRate; // GD path: error rate (or performance)
248  Double_t fNTNuval; // GD path: value of nu
249  Double_t fNTCoefRad; // GD path: 'radius' of all rulecoeffs
250  Double_t fNTOffset; // GD path: model offset
251  Double_t *fNTCoeff; // GD path: rule coefficients
252  Double_t *fNTLinCoeff; // GD path: linear coefficients
253 
254  Double_t fsigave; // Sigma of current signal score function F(sig)
255  Double_t fsigrms; // Rms of F(sig)
256  Double_t fbkgave; // Average of F(bkg)
257  Double_t fbkgrms; // Rms of F(bkg)
258 
259  private:
260 
261  mutable MsgLogger* fLogger; //! message logger
262  MsgLogger& Log() const { return *fLogger; }
263 
264  };
265 
266  // --------------------------------------------------------
267 
268  class AbsValue {
269 
270  public:
271 
272  Bool_t operator()( Double_t first, Double_t second ) const { return TMath::Abs(first) < TMath::Abs(second); }
273  };
274 }
275 
276 
277 #endif
Bool_t operator()(Double_t first, Double_t second) const
MsgLogger & Log() const
message logger
Double_t * fNTLinCoeff
std::vector< std::vector< Double_t > > fGDCoefLinTst
std::vector< Double_t > fAverageSelectorPath
UInt_t GetPathIdx2() const
void MakeGradientVector()
make gradient vector
void SetGDTau(Double_t t)
Definition: RuleFitParams.h:90
void FillCoefficients()
helper function to store the rule coefficients in local arrays
Double_t Penalty() const
This is the "lasso" penalty To be used for regression.
std::vector< std::vector< Double_t > > fGradVecTst
std::vector< Double_t > fAverageRulePath
void SetGDTauPrec(Double_t p)
Definition: RuleFitParams.h:94
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
std::vector< Double_t > fGDTauVec
std::vector< Double_t > fFstar
TLatex * t1
Definition: textangle.C:20
void ErrorRateRocTst()
Estimates the error rate with the current set of parameters.
Short_t Abs(Short_t d)
Definition: TMathBase.h:110
void MakeTstGradientVector()
make test gradient vector for all tau same algorithm as MakeGradientVector()
std::vector< std::vector< Double_t > > fGDCoefTst
Int_t FindGDTau()
This finds the cutoff parameter tau by scanning several different paths.
std::vector< Double_t > fAverageSelectorPerf
Double_t CalcAverageTruth()
calulate the average truth
void SetGDErrScale(Double_t s)
Definition: RuleFitParams.h:93
Double_t ErrorRateBin()
Estimates the error rate with the current set of parameters It uses a binary estimate of (y-F*(x)) (y...
std::vector< Double_t > fAverageRulePerf
std::vector< const TMVA::Event * >::const_iterator EventItr
void SetMsgType(EMsgType t)
void SetGDNPathSteps(Int_t np)
Definition: RuleFitParams.h:73
void SetGDTauScan(UInt_t n)
Definition: RuleFitParams.h:87
Int_t Type(const Event *e) const
std::vector< Char_t > fGDErrTstOK
Double_t ErrorRateReg()
Estimates the error rate with the current set of parameters This code is pretty messy at the moment...
Double_t ErrorRateRoc()
Estimates the error rate with the current set of parameters.
EMsgType
Definition: Types.h:61
std::vector< Double_t > fGDOfsTst
Double_t Optimism()
implementation of eq.
Double_t RiskPath() const
unsigned int UInt_t
Definition: RtypesCore.h:42
Double_t LossFunction(const Event &e) const
Implementation of squared-error ramp loss function (eq 39,40 in ref 1) This is used for binary Classi...
void InitNtuple()
initializes the ntuple
double Double_t
Definition: RtypesCore.h:55
Double_t RiskPerf() const
Double_t CalcAverageResponseOLD()
void CalcFStar()
Estimates F* (optimum scoring function) for all events for the given sets.
virtual ~RuleFitParams()
destructor
UInt_t GetPathIdx1() const
Definition: RuleFitParams.h:99
std::vector< std::vector< Double_t > > fGradVecLinTst
void UpdateCoefficients()
Establish maximum gradient for rules, linear terms and the offset.
Double_t ErrorRateRocRaw(std::vector< Double_t > &sFsig, std::vector< Double_t > &sFbkg)
void SetGDTauRange(Double_t t0, Double_t t1)
Definition: RuleFitParams.h:79
RuleFitParams()
constructor
std::vector< Double_t > fGradVecLin
Abstract ClassifierFactory template that handles arbitrary types.
std::vector< Double_t > fGradVec
UInt_t GetPerfIdx1() const
void Init()
Initializes all parameters using the RuleEnsemble and the training tree.
Double_t CalcAverageResponse()
calulate the average response - TODO : rewrite bad dependancy on EvaluateAverage() ! ...
RuleEnsemble * fRuleEnsemble
Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff) const
risk asessment
UInt_t GetPerfIdx2() const
void InitGD()
Initialize GD path search.
A TTree object has a header with a name and a title.
Definition: TTree.h:94
Double_t RiskPerf(UInt_t itau) const
void UpdateTstCoefficients()
Establish maximum gradient for rules, linear terms and the offset for all taus TODO: do not need inde...
void SetRuleFit(RuleFit *rf)
Definition: RuleFitParams.h:70
UInt_t RiskPerfTst()
Estimates the error rate with the current set of parameters.
std::vector< Double_t > fGDErrTst
void EvaluateAverage(UInt_t ind1, UInt_t ind2, std::vector< Double_t > &avsel, std::vector< Double_t > &avrul)
evaluate the average of each variable and f(x) in the given range
void MakeGDPath()
The following finds the gradient directed path in parameter space.
const Int_t n
Definition: legend1.C:16
void SetGDPathStep(Double_t s)
Definition: RuleFitParams.h:76
void CalcTstAverageResponse()
calc average response for all test paths - TODO: see comment under CalcAverageResponse() note that 0 ...