ROOT  6.06/09
Reference Guide
MethodRuleFit.h
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Fredrik Tegenfeldt
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodRuleFit *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Friedman's RuleFit method *
12  * *
13  * Authors (alphabetical): *
14  * Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA *
15  * *
16  * Copyright (c) 2005: *
17  * CERN, Switzerland *
18  * Iowa State U. *
19  * MPI-K Heidelberg, Germany *
20  * *
21  * Redistribution and use in source and binary forms, with or without *
22  * modification, are permitted according to the terms listed in LICENSE *
23  * *
24  **********************************************************************************/
25 
26 #ifndef ROOT_TMVA_MethodRuleFit
27 #define ROOT_TMVA_MethodRuleFit
28 
29 //////////////////////////////////////////////////////////////////////////
30 // //
31 // MethodRuleFit //
32 // //
33 // J Friedman's RuleFit method //
34 // //
35 //////////////////////////////////////////////////////////////////////////
36 
37 #ifndef ROOT_TMVA_MethodBase
38 #include "TMVA/MethodBase.h"
39 #endif
40 #ifndef ROOT_TMatrixDfwd
41 #include "TMatrixDfwd.h"
42 #endif
43 #ifndef ROOT_TVectorD
44 #include "TVectorD.h"
45 #endif
46 #ifndef ROOT_TMVA_DecisionTree
47 #include "TMVA/DecisionTree.h"
48 #endif
49 #ifndef ROOT_TMVA_RuleFit
50 #include "TMVA/RuleFit.h"
51 #endif
52 
53 namespace TMVA {
54 
55  class SeparationBase;
56 
// TMVA method class for Friedman's RuleFit method; derives from MethodBase.
// It holds a RuleFit instance, the training event sample, the forest of
// decision trees, the monitoring-ntuple variables and the option values below.
// NOTE(review): the embedded listing numbers skip 78, 105, 107-111, 117-118 and
// 120, so a few declarations appear to be missing from this copy of the header;
// confirm against the original file before relying on it.
57  class MethodRuleFit : public MethodBase {
58 
59  public:
60 
 // constructor taking job name, method title, data set info, option string and target directory
61  MethodRuleFit( const TString& jobName,
62  const TString& methodTitle,
63  DataSetInfo& theData,
64  const TString& theOption = "",
65  TDirectory* theTargetDir = 0 );
66 
 // constructor taking data set info, a weight file (theWeightFile) and target directory
67  MethodRuleFit( DataSetInfo& theData,
68  const TString& theWeightFile,
69  TDirectory* theTargetDir = NULL );
70 
 // destructor
71  virtual ~MethodRuleFit( void );
72 
 // reports which analysis types are supported; per the reference text below,
 // RuleFit can handle classification with 2 classes (third argument is unused)
73  virtual Bool_t HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ );
74 
75  // training method
76  void Train( void );
77 
79 
80  // write weights to file: add the rules to the XML node 'parent'
81  void AddWeightsXMLTo ( void* parent ) const;
82 
83  // read weights from file: rules are read from a std::istream or from an XML node
84  void ReadWeightsFromStream( std::istream& istr );
85  void ReadWeightsFromXML ( void* wghtnode );
86 
87  // calculate the MVA value (err and errUpper are optional output pointers, default 0)
88  Double_t GetMvaValue( Double_t* err = 0, Double_t* errUpper = 0 );
89 
90  // write method specific histos to target file
91  void WriteMonitoringHistosToFile( void ) const;
92 
93  // ranking of input variables
94  const Ranking* CreateRanking();
95 
 // true if boosted events are used for forest generation
96  Bool_t UseBoost() const { return fUseBoost; }
97 
98  // accessors: each one simply returns (a pointer or reference to) the named data member
99  RuleFit* GetRuleFitPtr() { return &fRuleFit; }
100  const RuleFit* GetRuleFitConstPtr() const { return &fRuleFit; }
101  TDirectory* GetMethodBaseDir() const { return BaseDir(); } // forwards to BaseDir()
102  const std::vector<TMVA::Event*>& GetTrainingEvents() const { return fEventSample; }
103  const std::vector<TMVA::DecisionTree*>& GetForest() const { return fForest; }
104  Int_t GetNTrees() const { return fNTrees; }
106  const SeparationBase* GetSeparationBaseConst() const { return fSepType; }
112  Int_t GetNCuts() const { return fNCuts; }
113  //
114  Int_t GetGDNPathSteps() const { return fGDNPathSteps; }
115  Double_t GetGDPathStep() const { return fGDPathStep; }
116  Double_t GetGDErrScale() const { return fGDErrScale; }
119  //
121 
 // accessors for the options marked "only Friedmans module"; GetRFWorkDir returns a copy of the string
122  const TString GetRFWorkDir() const { return fRFWorkDir; }
123  Int_t GetRFNrules() const { return fRFNrules; }
124  Int_t GetRFNendnodes() const { return fRFNendnodes; }
125 
126  protected:
127 
128  // make ROOT-independent C++ class for classifier response (classifier-specific implementation)
129  void MakeClassSpecific( std::ostream&, const TString& ) const;
130 
 // print out the rule cuts to the given stream
131  void MakeClassRuleCuts( std::ostream& ) const;
132 
 // print out the linear terms to the given stream
133  void MakeClassLinear( std::ostream& ) const;
134 
135  // get help message text
136  void GetHelpMessage() const;
137 
138  // initialize rulefit
139  void Init( void );
140 
141  // copy all training events into a std::vector
142  void InitEventSample( void );
143 
144  // initialize monitor ntuple
145  void InitMonitorNtuple();
146 
 // training of rules using the TMVA implementation / Jerome Friedman's implementation
147  void TrainTMVARuleFit();
148  void TrainJFRuleFit();
149 
150  private:
151 
152  // check variable range and set var to lower or upper if out of range;
 // returns true when var had to be modified, and reports the change through mlog
153  template<typename T>
154  inline Bool_t VerifyRange( MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax );
155 
 // same check, but an out-of-range var is reset to the default vdef instead of the nearest limit
156  template<typename T>
157  inline Bool_t VerifyRange( MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax, const T& vdef );
158 
 // pure range test: returns +1 if var is above vmax, -1 if below vmin, 0 if inside
159  template<typename T>
160  inline Int_t VerifyRange( const T& var, const T& vmin, const T& vmax );
161 
162  // the option handling methods
163  void DeclareOptions();
164  void ProcessOptions();
165 
166  RuleFit fRuleFit; // RuleFit instance
167  std::vector<TMVA::Event *> fEventSample; // the complete training sample
168  Double_t fSignalFraction; // scale factor for bkg events to modify initial s/b fraction in training data
169 
170  // ntuple
171  TTree *fMonitorNtuple; // pointer to monitor rule ntuple
172  Double_t fNTImportance; // ntuple: rule importance
173  Double_t fNTCoefficient; // ntuple: rule coefficient
174  Double_t fNTSupport; // ntuple: rule support
175  Int_t fNTNcuts; // ntuple: rule number of cuts
176  Int_t fNTNvars; // ntuple: rule number of vars
177  Double_t fNTPtag; // ntuple: rule P(tag)
178  Double_t fNTPss; // ntuple: rule P(tag s, true s)
179  Double_t fNTPsb; // ntuple: rule P(tag s, true b)
180  Double_t fNTPbs; // ntuple: rule P(tag b, true s)
181  Double_t fNTPbb; // ntuple: rule P(tag b, true b)
182  Double_t fNTSSB; // ntuple: rule S/(S+B)
183  Int_t fNTType; // ntuple: rule type (+1->signal, -1->bkg)
184 
185  // options
186  TString fRuleFitModuleS;// which rulefit module to use
187  Bool_t fUseRuleFitJF; // if true interface with J.Friedmans RuleFit module
188  TString fRFWorkDir; // working directory from Friedmans module
189  Int_t fRFNrules; // max number of rules (only Friedmans module)
190  Int_t fRFNendnodes; // presumably max number of end nodes (only Friedmans module) - NOTE(review): original comment repeated the fRFNrules text; confirm
191  std::vector<DecisionTree *> fForest; // the forest
192  Int_t fNTrees; // number of trees in forest
193  Double_t fTreeEveFrac; // fraction of events used for training each tree
194  SeparationBase *fSepType; // the separation used in node splitting
195  Double_t fMinFracNEve; // min fraction of number events
196  Double_t fMaxFracNEve; // max fraction of number events
197  Int_t fNCuts; // grid used in cut applied in node splitting
198  TString fSepTypeS; // forest generation: separation type - see DecisionTree
199  TString fPruneMethodS; // forest generation: prune method - see DecisionTree
200  TMVA::DecisionTree::EPruneMethod fPruneMethod; // forest generation: method used for pruning - see DecisionTree
201  Double_t fPruneStrength; // forest generation: prune strength - see DecisionTree
202  TString fForestTypeS; // forest generation: how the trees are generated
203  Bool_t fUseBoost; // use boosted events for forest generation
204  //
205  Double_t fGDPathEveFrac; // GD path: fraction of subsamples used for the fitting
206  Double_t fGDValidEveFrac; // GD path: presumably fraction of subsamples used for validation - NOTE(review): original comment repeated the fGDPathEveFrac text; confirm
207  Double_t fGDTau; // GD path: def threshold fraction [0..1]
208  Double_t fGDTauPrec; // GD path: precision of estimated tau
209  Double_t fGDTauMin; // GD path: min threshold fraction [0..1]
210  Double_t fGDTauMax; // GD path: max threshold fraction [0..1]
211  UInt_t fGDTauScan; // GD path: number of points to scan
212  Double_t fGDPathStep; // GD path: step size in path
213  Int_t fGDNPathSteps; // GD path: number of steps
214  Double_t fGDErrScale; // GD path: stop (presumably the error scale used as stopping criterion - TODO confirm)
215  Double_t fMinimp; // rule/linear: minimum importance
216  //
217  TString fModelTypeS; // rule ensemble: which model (rule,linear or both)
218  Double_t fRuleMinDist; // rule min distance - see RuleEnsemble
219  Double_t fLinQuantile; // quantile cut to remove outliers - see RuleEnsemble
220 
221  ClassDef(MethodRuleFit,0) // Friedman's RuleFit method
222  };
223 
224 } // namespace TMVA
225 
226 
227 //_______________________________________________________________________
228 template<typename T>
229 inline Int_t TMVA::MethodRuleFit::VerifyRange( const T& var, const T& vmin, const T& vmax )
230 {
231  // check range and return +1 if above, -1 if below or 0 if inside
232  if (var>vmax) return 1;
233  if (var<vmin) return -1;
234  return 0;
235 }
236 
237 //_______________________________________________________________________
238 template<typename T>
239 inline Bool_t TMVA::MethodRuleFit::VerifyRange( TMVA::MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax )
240 {
241  // verify range and print out message
242  // if outside range, set to closest limit
243  Int_t dir = TMVA::MethodRuleFit::VerifyRange(var,vmin,vmax);
244  Bool_t modif=kFALSE;
245  if (dir==1) {
246  modif = kTRUE;
247  var=vmax;
248  }
249  if (dir==-1) {
250  modif = kTRUE;
251  var=vmin;
252  }
253  if (modif) {
254  mlog << kWARNING << "Option <" << varstr << "> " << (dir==1 ? "above":"below") << " allowed range. Reset to new value = " << var << Endl;
255  }
256  return modif;
257 }
258 
259 //_______________________________________________________________________
260 template<typename T>
261 inline Bool_t TMVA::MethodRuleFit::VerifyRange( TMVA::MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax, const T& vdef )
262 {
263  // verify range and print out message
264  // if outside range, set to given default value
265  Int_t dir = TMVA::MethodRuleFit::VerifyRange(var,vmin,vmax);
266  Bool_t modif=kFALSE;
267  if (dir!=0) {
268  modif = kTRUE;
269  var=vdef;
270  }
271  if (modif) {
272  mlog << kWARNING << "Option <" << varstr << "> " << (dir==1 ? "above":"below") << " allowed range. Reset to default value = " << var << Endl;
273  }
274  return modif;
275 }
276 
277 
278 #endif // ROOT_TMVA_MethodRuleFit
void DeclareOptions()
define the options (their keywords) that can be set in the option string (the known options).
void Init(void)
default initialization
void ReadWeightsFromXML(void *wghtnode)
read rules from XML node
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
Double_t GetGDValidEveFrac() const
void ReadWeightsFromStream(std::istream &istr)
read rules from an std::istream
double T(double x)
Definition: ChebyshevPol.h:34
void InitMonitorNtuple()
initialize the monitoring ntuple
const std::vector< TMVA::DecisionTree * > & GetForest() const
EAnalysisType
Definition: Types.h:124
Int_t GetNTrees() const
const TString GetRFWorkDir() const
Int_t GetNCuts() const
void WriteMonitoringHistosToFile(void) const
write special monitoring histograms to file (here ntuple)
Basic string class.
Definition: TString.h:137
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
const Ranking * CreateRanking()
computes ranking of input variables
void TrainJFRuleFit()
training of rules using Jerome Friedman's implementation
TDirectory * GetMethodBaseDir() const
Int_t GetRFNendnodes() const
Int_t GetGDNPathSteps() const
Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
returns MVA value for given event
const std::vector< TMVA::Event * > & GetTrainingEvents() const
TMVA::DecisionTree::EPruneMethod fPruneMethod
#define ClassDef(name, id)
Definition: Rtypes.h:254
void ProcessOptions()
process the options specified by the user
void MakeClassLinear(std::ostream &) const
print out the linear terms
Double_t GetGDPathStep() const
Double_t GetMinFracNEve() const
std::vector< TMVA::Event * > fEventSample
void TrainTMVARuleFit()
training of rules using TMVA implementation
const RuleFit * GetRuleFitConstPtr() const
SeparationBase * fSepType
void AddWeightsXMLTo(void *parent) const
add the rules to XML node
Double_t GetGDErrScale() const
unsigned int UInt_t
Definition: RtypesCore.h:42
Double_t GetMaxFracNEve() const
RuleFit * GetRuleFitPtr()
Definition: MethodRuleFit.h:99
Bool_t VerifyRange(MsgLogger &mlog, const char *varstr, T &var, const T &vmin, const T &vmax)
void GetHelpMessage() const
get help message text
double Double_t
Definition: RtypesCore.h:55
Describe directory structure in memory.
Definition: TDirectory.h:41
int type
Definition: TGX11.cxx:120
TDirectory * BaseDir() const
returns the ROOT directory where info/histograms etc. of the corresponding MVA method instance are stored
Double_t GetLinQuantile() const
TMVA::DecisionTree::EPruneMethod GetPruneMethod() const
const SeparationBase * GetSeparationBaseConst() const
Abstract ClassifierFactory template that handles arbitrary types.
Bool_t UseBoost() const
Definition: MethodRuleFit.h:96
Double_t GetTreeEveFrac() const
SeparationBase * GetSeparationBase() const
#define NULL
Definition: Rtypes.h:82
void MakeClassSpecific(std::ostream &, const TString &) const
write specific classifier response
A TTree object has a header with a name and a title.
Definition: TTree.h:94
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t)
RuleFit can handle classification with 2 classes.
Int_t GetRFNrules() const
virtual ~MethodRuleFit(void)
destructor
Double_t GetPruneStrength() const
virtual void ReadWeightsFromStream(std::istream &)=0
const Bool_t kTRUE
Definition: Rtypes.h:91
std::vector< DecisionTree * > fForest
Double_t GetGDPathEveFrac() const
MethodRuleFit(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="", TDirectory *theTargetDir=0)
void InitEventSample(void)
write all Events from the Tree into a vector of Events, which is more easily manipulated.
void MakeClassRuleCuts(std::ostream &) const
print out the rule cuts