Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RuleFitParams.h
Go to the documentation of this file.
1// @(#)root/tmva $Id$
2// Author: Andreas Hoecker, Joerg Stelzer, Fredrik Tegenfeldt, Helge Voss
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : RuleFitParams *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * A class doing the actual fitting of a linear model using rules as *
12 * base functions. *
13 * Reference paper: 1.Gradient Directed Regularization *
14 * Friedman, Popescu, 2004 *
15 * 2.Predictive Learning with Rule Ensembles *
16 * Friedman, Popescu, 2005 *
17 * *
18 * *
19 * Authors (alphabetical): *
20 * Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA *
21 * Helge Voss <Helge.Voss@cern.ch> - MPI-KP Heidelberg, Ger. *
22 * *
23 * Copyright (c) 2005: *
24 * CERN, Switzerland *
25 * Iowa State U. *
26 * MPI-K Heidelberg, Germany *
27 * *
28 * Redistribution and use in source and binary forms, with or without *
29 * modification, are permitted according to the terms listed in LICENSE *
30 * (http://tmva.sourceforge.net/LICENSE) *
31 **********************************************************************************/
32
33#ifndef ROOT_TMVA_RuleFitParams
34#define ROOT_TMVA_RuleFitParams
35
36#include "TMathBase.h"
37
38#include "TMVA/Event.h"
39
40#include <vector>
41
42class TTree;
43
44namespace TMVA {
45
46 class RuleEnsemble;
47 class MsgLogger;
48 class RuleFit;
50
51 public:
52
54 virtual ~RuleFitParams();
55
56 void Init();
57
58 // set message type
59 void SetMsgType( EMsgType t );
60
61 // set RuleFit ptr
62 void SetRuleFit( RuleFit *rf ) { fRuleFit = rf; }
63 //
64 // GD path: set N(path steps)
66
67 // GD path: set path step size
69
70 // GD path: set tau search range
72 {
73 fGDTauMin = (t0>1.0 ? 1.0:(t0<0.0 ? 0.0:t0));
74 fGDTauMax = (t1>1.0 ? 1.0:(t1<0.0 ? 0.0:t1));
76 }
77
78 // GD path: set number of steps in tau search range
80
81 // GD path: set tau
82 void SetGDTau( Double_t t ) { fGDTau = t; }
83
84
87
88 // return type such that +1 = signal and -1 = background
89 Int_t Type( const Event * e ) const; // return (fRuleFit->GetMethodRuleFit()->DataInfo().IsSignal(e) ? 1:-1); }
90 //
91 UInt_t GetPathIdx1() const { return fPathIdx1; }
92 UInt_t GetPathIdx2() const { return fPathIdx2; }
93 UInt_t GetPerfIdx1() const { return fPerfIdx1; }
94 UInt_t GetPerfIdx2() const { return fPerfIdx2; }
95
96 // Loss function; Huber loss eq 33
97 Double_t LossFunction( const Event& e ) const;
98
99 // same but using evt idx (faster)
100 Double_t LossFunction( UInt_t evtidx ) const;
101 Double_t LossFunction( UInt_t evtidx, UInt_t itau ) const;
102
103 // Empirical risk
104 Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff) const;
105 Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff, UInt_t itau) const;
106
107 // Risk evaluation for fPathIdx and fPerfInd
111
112 // Risk evaluation for all tau
114
115 // Penalty function; Lasso function (eq 8)
116 Double_t Penalty() const;
117
118 // initialize GD path
119 void InitGD();
120
121 // find best tau and return the number of scan steps used
123
124 // make path for binary classification (squared-error ramp, sect 6 in ref 1)
125 void MakeGDPath();
126
127 protected:
128
129 // typedef of an Event const iterator
130 typedef std::vector<const TMVA::Event *>::const_iterator EventItr;
131
132 // init ntuple
133 void InitNtuple();
134
135 // calculate N(tau) in scan - limit to 100000.
136 void CalcGDNTau() { fGDNTau = static_cast<UInt_t>(1.0/fGDTauPrec)+1; if (fGDNTau>100000) fGDNTau=100000; }
137
138 // fill ntuple with coefficient info
139 void FillCoefficients();
140
141 // estimate the optimum scoring function
142 void CalcFStar();
143
144 // estimate of binary error rate
146
147 // estimate of scale average error rate
149
150 // estimate 1-area under ROC
151 Double_t ErrorRateRocRaw( std::vector<Double_t> & sFsig, std::vector<Double_t> & sFbkg );
153 void ErrorRateRocTst();
154
155 // estimate optimism
157
158 // make gradient vector (eq 44 in ref 1)
159 void MakeGradientVector();
160
161 // Calculate the direction in parameter space (eq 25, ref 1) and update coeffs (eq 22, ref 1)
162 void UpdateCoefficients();
163
164 // calculate average of responses of F
167
168 // calculate average of true response (initial estimate of a0)
170
171 // calculate the average of each variable over the range
172 void EvaluateAverage(UInt_t ind1, UInt_t ind2,
173 std::vector<Double_t> &avsel,
174 std::vector<Double_t> &avrul);
175
176 // evaluate using fPathIdx1,2
178
179 // evaluate using fPerfIdx1,2
181
182 // the same as above but for the various tau
186
187
188 RuleFit * fRuleFit; ///< rule fit
189 RuleEnsemble * fRuleEnsemble; ///< rule ensemble
190 //
191 UInt_t fNRules; ///< number of rules
192 UInt_t fNLinear; ///< number of linear terms
193 //
194 // Event indices for path/validation - TODO: should let the user decide
195 // Now it is just a simple one-fold cross validation.
196 //
197 UInt_t fPathIdx1; ///< first event index for path search
198 UInt_t fPathIdx2; ///< last event index for path search
199 UInt_t fPerfIdx1; ///< first event index for performance evaluation
200 UInt_t fPerfIdx2; ///< last event index for performance evaluation
201 Double_t fNEveEffPath; ///< sum of weights for Path events
202 Double_t fNEveEffPerf; ///< idem for Perf events
203
204 std::vector<Double_t> fAverageSelectorPath; ///< average of each variable over the range fPathIdx1,2
205 std::vector<Double_t> fAverageRulePath; ///< average of each rule, same range
206 std::vector<Double_t> fAverageSelectorPerf; ///< average of each variable over the range fPerfIdx1,2
207 std::vector<Double_t> fAverageRulePerf; ///< average of each rule, same range
208
209 std::vector<Double_t> fGradVec; ///< gradient vector - dimension = number of rules in ensemble
210 std::vector<Double_t> fGradVecLin; ///< gradient vector - dimension = number of variables
211
212 std::vector< std::vector<Double_t> > fGradVecTst; ///< gradient vector - one per tau
213 std::vector< std::vector<Double_t> > fGradVecLinTst; ///< gradient vector, linear terms - one per tau
214 //
215 std::vector<Double_t> fGDErrTst; ///< error rates per tau
216 std::vector<Char_t> fGDErrTstOK; ///< error rate is sufficiently low <--- stores boolean
217 std::vector< std::vector<Double_t> > fGDCoefTst; ///< rule coeffs - one per tau
218 std::vector< std::vector<Double_t> > fGDCoefLinTst; ///< linear coeffs - one per tau
219 std::vector<Double_t> fGDOfsTst; ///< offset per tau
220 std::vector< Double_t > fGDTauVec; ///< the tau's
221 UInt_t fGDNTauTstOK; ///< number of tau in the test-phase that are ok
222 UInt_t fGDNTau; ///< number of tau-paths - calculated in SetGDTauPrec
223 Double_t fGDTauPrec; ///< precision in tau
224 UInt_t fGDTauScan; ///< number scan for tau-paths
225 Double_t fGDTauMin; ///< min threshold parameter (tau in eq 26, ref 1)
226 Double_t fGDTauMax; ///< max threshold parameter (tau in eq 26, ref 1)
227 Double_t fGDTau; ///< selected threshold parameter (tau in eq 26, ref 1)
228 Double_t fGDPathStep; ///< step size along path (delta nu in eq 22, ref 1)
229 Int_t fGDNPathSteps; ///< number of path steps
230 Double_t fGDErrScale; ///< stop scan at error = scale*errmin
231 //
232 Double_t fAverageTruth; ///< average truth, ie sum(y)/N, y=+-1
233 //
234 std::vector<Double_t> fFstar; ///< vector of F*() - filled in CalcFStar()
235 Double_t fFstarMedian; ///< median value of F*() using
236 //
237 TTree *fGDNtuple; ///< Gradient path ntuple, contains params for each step along the path
238 Double_t fNTRisk; ///< GD path: risk
239 Double_t fNTErrorRate; ///< GD path: error rate (or performance)
240 Double_t fNTNuval; ///< GD path: value of nu
241 Double_t fNTCoefRad; ///< GD path: 'radius' of all rulecoeffs
242 Double_t fNTOffset; ///< GD path: model offset
243 Double_t *fNTCoeff; ///< GD path: rule coefficients
244 Double_t *fNTLinCoeff; ///< GD path: linear coefficients
245
246 Double_t fsigave; ///< Sigma of current signal score function F(sig)
247 Double_t fsigrms; ///< Rms of F(sig)
248 Double_t fbkgave; ///< Average of F(bkg)
249 Double_t fbkgrms; ///< Rms of F(bkg)
250
251 private:
252
253 mutable MsgLogger* fLogger; ///<! message logger
254 MsgLogger& Log() const { return *fLogger; }
255
256 };
257
258 // --------------------------------------------------------
259
260 class AbsValue {
261
262 public:
263
264 Bool_t operator()( Double_t first, Double_t second ) const { return TMath::Abs(first) < TMath::Abs(second); }
265 };
266}
267
268
269#endif
#define e(i)
Definition RSha256.hxx:103
unsigned int UInt_t
Definition RtypesCore.h:46
double Double_t
Definition RtypesCore.h:59
winID h TVirtualViewer3D TVirtualGLPainter p
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t np
Bool_t operator()(Double_t first, Double_t second) const
ostringstream derivative to redirect and format output
Definition MsgLogger.h:57
A class doing the actual fitting of a linear model using rules as base functions.
void CalcTstAverageResponse()
calc average response for all test paths - TODO: see comment under CalcAverageResponse() note that 0 ...
void MakeGDPath()
The following finds the gradient directed path in parameter space.
std::vector< std::vector< Double_t > > fGDCoefLinTst
linear coeffs - one per tau
void EvaluateAverage(UInt_t ind1, UInt_t ind2, std::vector< Double_t > &avsel, std::vector< Double_t > &avrul)
evaluate the average of each variable and f(x) in the given range
RuleFit * fRuleFit
rule fit
Double_t fGDTau
selected threshold parameter (tau in eq 26, ref 1)
Double_t fNTCoefRad
GD path: 'radius' of all rulecoeffs.
Double_t * fNTLinCoeff
GD path: linear coefficients.
UInt_t RiskPerfTst()
Estimates the error rate with the current set of parameters.
Int_t fGDNPathSteps
number of path steps
Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff) const
risk assessment
Double_t RiskPerf(UInt_t itau) const
Double_t Optimism()
implementation of eq.
Int_t FindGDTau()
This finds the cutoff parameter tau by scanning several different paths.
UInt_t fPerfIdx2
last event index for performance evaluation
void SetGDPathStep(Double_t s)
std::vector< Double_t > fAverageSelectorPath
average of each variable over the range fPathIdx1,2
UInt_t fGDTauScan
number scan for tau-paths
std::vector< Double_t > fGradVecLin
gradient vector - dimension = number of variables
Double_t fsigrms
Rms of F(sig)
Double_t fNTRisk
GD path: risk.
virtual ~RuleFitParams()
destructor
RuleFitParams()
constructor
void SetRuleFit(RuleFit *rf)
Double_t * fNTCoeff
GD path: rule coefficients.
std::vector< constTMVA::Event * >::const_iterator EventItr
void Init()
Initializes all parameters using the RuleEnsemble and the training tree.
Double_t CalcAverageResponse()
calculate the average response - TODO : rewrite bad dependancy on EvaluateAverage() !
std::vector< Double_t > fAverageRulePerf
average of each rule, same range
Double_t fNEveEffPerf
idem for Perf events
void SetMsgType(EMsgType t)
Double_t Penalty() const
This is the "lasso" penalty To be used for regression.
std::vector< std::vector< Double_t > > fGradVecLinTst
gradient vector, linear terms - one per tau
void SetGDTauRange(Double_t t0, Double_t t1)
Double_t fNEveEffPath
sum of weights for Path events
std::vector< std::vector< Double_t > > fGDCoefTst
rule coeffs - one per tau
Double_t RiskPath() const
MsgLogger * fLogger
! message logger
void FillCoefficients()
helper function to store the rule coefficients in local arrays
std::vector< Double_t > fGDTauVec
the tau's
Double_t fsigave
Sigma of current signal score function F(sig)
void SetGDErrScale(Double_t s)
UInt_t fPathIdx2
last event index for path search
void InitGD()
Initialize GD path search.
UInt_t fPathIdx1
first event index for path search
Double_t fNTNuval
GD path: value of nu.
TTree * fGDNtuple
Gradient path ntuple, contains params for each step along the path.
std::vector< Double_t > fGDErrTst
error rates per tau
void ErrorRateRocTst()
Estimates the error rate with the current set of parameters.
void SetGDTauPrec(Double_t p)
std::vector< Double_t > fGradVec
gradient vector - dimension = number of rules in ensemble
std::vector< Double_t > fAverageSelectorPerf
average of each variable over the range fPerfIdx1,2
Double_t fGDTauMax
max threshold parameter (tau in eq 26, ref 1)
std::vector< std::vector< Double_t > > fGradVecTst
gradient vector - one per tau
std::vector< Char_t > fGDErrTstOK
error rate is sufficiently low <— stores boolean
RuleEnsemble * fRuleEnsemble
rule ensemble
void CalcFStar()
Estimates F* (optimum scoring function) for all events for the given sets.
MsgLogger & Log() const
UInt_t GetPathIdx2() const
void MakeTstGradientVector()
make test gradient vector for all tau same algorithm as MakeGradientVector()
UInt_t GetPerfIdx2() const
Double_t fbkgave
Average of F(bkg)
Double_t CalcAverageTruth()
calculate the average truth
void SetGDTauScan(UInt_t n)
Double_t fFstarMedian
median value of F*() using
Double_t fbkgrms
Rms of F(bkg)
Double_t fAverageTruth
average truth, ie sum(y)/N, y=+-1
std::vector< Double_t > fFstar
vector of F*() - filled in CalcFStar()
void UpdateTstCoefficients()
Establish maximum gradient for rules, linear terms and the offset for all taus TODO: do not need inde...
Double_t fGDErrScale
stop scan at error = scale*errmin
Double_t RiskPerf() const
void MakeGradientVector()
make gradient vector
void UpdateCoefficients()
Establish maximum gradient for rules, linear terms and the offset.
Double_t fGDTauMin
min threshold parameter (tau in eq 26, ref 1)
UInt_t fNLinear
number of linear terms
UInt_t GetPerfIdx1() const
void InitNtuple()
initializes the ntuple
UInt_t fPerfIdx1
first event index for performance evaluation
Double_t CalcAverageResponseOLD()
Double_t ErrorRateBin()
Estimates the error rate with the current set of parameters It uses a binary estimate of (y-F*(x)) (y...
void SetGDTau(Double_t t)
Double_t fGDTauPrec
precision in tau
Double_t ErrorRateReg()
Estimates the error rate with the current set of parameters This code is pretty messy at the moment.
std::vector< Double_t > fGDOfsTst
offset per tau
Double_t fNTOffset
GD path: model offset.
UInt_t fGDNTau
number of tau-paths - calculated in SetGDTauPrec
UInt_t GetPathIdx1() const
Double_t fGDPathStep
step size along path (delta nu in eq 22, ref 1)
Double_t ErrorRateRoc()
Estimates the error rate with the current set of parameters.
UInt_t fNRules
number of rules
Double_t ErrorRateRocRaw(std::vector< Double_t > &sFsig, std::vector< Double_t > &sFbkg)
Estimates the error rate with the current set of parameters.
UInt_t fGDNTauTstOK
number of tau in the test-phase that are ok
Double_t fNTErrorRate
GD path: error rate (or performance)
std::vector< Double_t > fAverageRulePath
average of each rule, same range
void SetGDNPathSteps(Int_t np)
A class implementing various fits of rule ensembles.
Definition RuleFit.h:46
A TTree represents a columnar dataset.
Definition TTree.h:79
const Int_t n
Definition legend1.C:16
create variable transformations
Short_t Abs(Short_t d)
Returns the absolute value of parameter Short_t d.
Definition TMathBase.h:123
Definition first.py:1
auto * t1
Definition textangle.C:20