Logo ROOT   6.10/09
Reference Guide
MethodRuleFit.h
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Fredrik Tegenfeldt
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodRuleFit *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Friedman's RuleFit method *
12  * *
13  * Authors (alphabetical): *
14  * Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA *
15  * *
16  * Copyright (c) 2005: *
17  * CERN, Switzerland *
18  * Iowa State U. *
19  * MPI-K Heidelberg, Germany *
20  * *
21  * Redistribution and use in source and binary forms, with or without *
22  * modification, are permitted according to the terms listed in LICENSE *
23  * *
24  **********************************************************************************/
25 
26 #ifndef ROOT_TMVA_MethodRuleFit
27 #define ROOT_TMVA_MethodRuleFit
28 
29 //////////////////////////////////////////////////////////////////////////
30 // //
31 // MethodRuleFit //
32 // //
33 // J Friedman's RuleFit method //
34 // //
35 //////////////////////////////////////////////////////////////////////////
36 
37 #include "TMVA/MethodBase.h"
38 #include "TMatrixDfwd.h"
39 #include "TVectorD.h"
40 #include "TMVA/DecisionTree.h"
41 #include "TMVA/RuleFit.h"
42 
43 namespace TMVA {
44 
45  class SeparationBase;
46 
47  class MethodRuleFit : public MethodBase {
48 
49  public:
50 
51  MethodRuleFit( const TString& jobName,
52  const TString& methodTitle,
53  DataSetInfo& theData,
54  const TString& theOption = "");
55 
56  MethodRuleFit( DataSetInfo& theData,
57  const TString& theWeightFile);
58 
59  virtual ~MethodRuleFit( void );
60 
61  virtual Bool_t HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ );
62 
63  // training method
64  void Train( void );
65 
67 
68  // write weights to file
69  void AddWeightsXMLTo ( void* parent ) const;
70 
71  // read weights from file
72  void ReadWeightsFromStream( std::istream& istr );
73  void ReadWeightsFromXML ( void* wghtnode );
74 
75  // calculate the MVA value
76  Double_t GetMvaValue( Double_t* err = 0, Double_t* errUpper = 0 );
77 
78  // write method specific histos to target file
79  void WriteMonitoringHistosToFile( void ) const;
80 
81  // ranking of input variables
82  const Ranking* CreateRanking();
83 
84  Bool_t UseBoost() const { return fUseBoost; }
85 
86  // accessors
87  RuleFit* GetRuleFitPtr() { return &fRuleFit; }
88  const RuleFit* GetRuleFitConstPtr() const { return &fRuleFit; }
89  TDirectory* GetMethodBaseDir() const { return BaseDir(); }
90  const std::vector<TMVA::Event*>& GetTrainingEvents() const { return fEventSample; }
91  const std::vector<TMVA::DecisionTree*>& GetForest() const { return fForest; }
92  Int_t GetNTrees() const { return fNTrees; }
93  Double_t GetTreeEveFrac() const { return fTreeEveFrac; }
94  const SeparationBase* GetSeparationBaseConst() const { return fSepType; }
98  Double_t GetMinFracNEve() const { return fMinFracNEve; }
99  Double_t GetMaxFracNEve() const { return fMaxFracNEve; }
100  Int_t GetNCuts() const { return fNCuts; }
101  //
102  Int_t GetGDNPathSteps() const { return fGDNPathSteps; }
103  Double_t GetGDPathStep() const { return fGDPathStep; }
104  Double_t GetGDErrScale() const { return fGDErrScale; }
107  //
109 
110  const TString GetRFWorkDir() const { return fRFWorkDir; }
111  Int_t GetRFNrules() const { return fRFNrules; }
112  Int_t GetRFNendnodes() const { return fRFNendnodes; }
113 
114  protected:
115 
116  // make ROOT-independent C++ class for classifier response (classifier-specific implementation)
117  void MakeClassSpecific( std::ostream&, const TString& ) const;
118 
119  void MakeClassRuleCuts( std::ostream& ) const;
120 
121  void MakeClassLinear( std::ostream& ) const;
122 
123  // get help message text
124  void GetHelpMessage() const;
125 
126  // initialize rulefit
127  void Init( void );
128 
129  // copy all training events into a stl::vector
130  void InitEventSample( void );
131 
132  // initialize monitor ntuple
133  void InitMonitorNtuple();
134 
135  void TrainTMVARuleFit();
136  void TrainJFRuleFit();
137 
138  private:
139 
140  // check variable range and set var to lower or upper if out of range
141  template<typename T>
142  inline Bool_t VerifyRange( MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax );
143 
144  template<typename T>
145  inline Bool_t VerifyRange( MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax, const T& vdef );
146 
147  template<typename T>
148  inline Int_t VerifyRange( const T& var, const T& vmin, const T& vmax );
149 
150  // the option handling methods
151  void DeclareOptions();
152  void ProcessOptions();
153 
154  RuleFit fRuleFit; // RuleFit instance
155  std::vector<TMVA::Event *> fEventSample; // the complete training sample
156  Double_t fSignalFraction; // scalefactor for bkg events to modify initial s/b fraction in training data
157 
158  // ntuple
159  TTree *fMonitorNtuple; // pointer to monitor rule ntuple
160  Double_t fNTImportance; // ntuple: rule importance
161  Double_t fNTCoefficient; // ntuple: rule coefficient
162  Double_t fNTSupport; // ntuple: rule support
163  Int_t fNTNcuts; // ntuple: rule number of cuts
164  Int_t fNTNvars; // ntuple: rule number of vars
165  Double_t fNTPtag; // ntuple: rule P(tag)
166  Double_t fNTPss; // ntuple: rule P(tag s, true s)
167  Double_t fNTPsb; // ntuple: rule P(tag s, true b)
168  Double_t fNTPbs; // ntuple: rule P(tag b, true s)
169  Double_t fNTPbb; // ntuple: rule P(tag b, true b)
170  Double_t fNTSSB; // ntuple: rule S/(S+B)
171  Int_t fNTType; // ntuple: rule type (+1->signal, -1->bkg)
172 
173  // options
174  TString fRuleFitModuleS;// which rulefit module to use
175  Bool_t fUseRuleFitJF; // if true interface with J.Friedmans RuleFit module
176  TString fRFWorkDir; // working directory from Friedmans module
177  Int_t fRFNrules; // max number of rules (only Friedmans module)
178  Int_t fRFNendnodes; // max number of rules (only Friedmans module)
179  std::vector<DecisionTree *> fForest; // the forest
180  Int_t fNTrees; // number of trees in forest
181  Double_t fTreeEveFrac; // fraction of events used for training each tree
182  SeparationBase *fSepType; // the separation used in node splitting
183  Double_t fMinFracNEve; // min fraction of number events
184  Double_t fMaxFracNEve; // ditto max
185  Int_t fNCuts; // grid used in cut applied in node splitting
186  TString fSepTypeS; // forest generation: separation type - see DecisionTree
187  TString fPruneMethodS; // forest generation: prune method - see DecisionTree
188  TMVA::DecisionTree::EPruneMethod fPruneMethod; // forest generation: method used for pruning - see DecisionTree
189  Double_t fPruneStrength; // forest generation: prune strength - see DecisionTree
190  TString fForestTypeS; // forest generation: how the trees are generated
191  Bool_t fUseBoost; // use boosted events for forest generation
192  //
193  Double_t fGDPathEveFrac; // GD path: fraction of subsamples used for the fitting
194  Double_t fGDValidEveFrac; // GD path: fraction of subsamples used for the fitting
195  Double_t fGDTau; // GD path: def threshold fraction [0..1]
196  Double_t fGDTauPrec; // GD path: precision of estimated tau
197  Double_t fGDTauMin; // GD path: min threshold fraction [0..1]
198  Double_t fGDTauMax; // GD path: max threshold fraction [0..1]
199  UInt_t fGDTauScan; // GD path: number of points to scan
200  Double_t fGDPathStep; // GD path: step size in path
201  Int_t fGDNPathSteps; // GD path: number of steps
202  Double_t fGDErrScale; // GD path: stop
203  Double_t fMinimp; // rule/linear: minimum importance
204  //
205  TString fModelTypeS; // rule ensemble: which model (rule,linear or both)
206  Double_t fRuleMinDist; // rule min distance - see RuleEnsemble
207  Double_t fLinQuantile; // quantile cut to remove outliers - see RuleEnsemble
208 
209  ClassDef(MethodRuleFit,0); // Friedman's RuleFit method
210  };
211 
212 } // namespace TMVA
213 
214 
215 //_______________________________________________________________________
216 template<typename T>
217 inline Int_t TMVA::MethodRuleFit::VerifyRange( const T& var, const T& vmin, const T& vmax )
218 {
219  // check range and return +1 if above, -1 if below or 0 if inside
220  if (var>vmax) return 1;
221  if (var<vmin) return -1;
222  return 0;
223 }
224 
225 //_______________________________________________________________________
226 template<typename T>
227 inline Bool_t TMVA::MethodRuleFit::VerifyRange( TMVA::MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax )
228 {
229  // verify range and print out message
230  // if outside range, set to closest limit
231  Int_t dir = TMVA::MethodRuleFit::VerifyRange(var,vmin,vmax);
232  Bool_t modif=kFALSE;
233  if (dir==1) {
234  modif = kTRUE;
235  var=vmax;
236  }
237  if (dir==-1) {
238  modif = kTRUE;
239  var=vmin;
240  }
241  if (modif) {
242  mlog << kWARNING << "Option <" << varstr << "> " << (dir==1 ? "above":"below") << " allowed range. Reset to new value = " << var << Endl;
243  }
244  return modif;
245 }
246 
247 //_______________________________________________________________________
248 template<typename T>
249 inline Bool_t TMVA::MethodRuleFit::VerifyRange( TMVA::MsgLogger& mlog, const char *varstr, T& var, const T& vmin, const T& vmax, const T& vdef )
250 {
251  // verify range and print out message
252  // if outside range, set to given default value
253  Int_t dir = TMVA::MethodRuleFit::VerifyRange(var,vmin,vmax);
254  Bool_t modif=kFALSE;
255  if (dir!=0) {
256  modif = kTRUE;
257  var=vdef;
258  }
259  if (modif) {
260  mlog << kWARNING << "Option <" << varstr << "> " << (dir==1 ? "above":"below") << " allowed range. Reset to default value = " << var << Endl;
261  }
262  return modif;
263 }
264 
265 
266 #endif // MethodRuleFit_H
const std::vector< TMVA::Event * > & GetTrainingEvents() const
Definition: MethodRuleFit.h:90
void DeclareOptions()
define the options (their key words) that can be set in the option string know options.
void Init(void)
default initialization
J Friedman&#39;s RuleFit method.
Definition: MethodRuleFit.h:47
void ReadWeightsFromXML(void *wghtnode)
read rules from XML node
Double_t GetTreeEveFrac() const
Definition: MethodRuleFit.h:93
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
Double_t GetGDErrScale() const
void WriteMonitoringHistosToFile(void) const
write special monitoring histograms to file (here ntuple)
void ReadWeightsFromStream(std::istream &istr)
read rules from an std::istream
A class implementing various fits of rule ensembles.
Definition: RuleFit.h:44
double T(double x)
Definition: ChebyshevPol.h:34
void InitMonitorNtuple()
initialize the monitoring ntuple
EAnalysisType
Definition: Types.h:125
Virtual base Class for all MVA method.
Definition: MethodBase.h:106
Basic string class.
Definition: TString.h:129
Ranking for variables in method (implementation)
Definition: Ranking.h:48
Double_t GetGDValidEveFrac() const
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
const Ranking * CreateRanking()
computes ranking of input variables
void TrainJFRuleFit()
training of rules using Jerome Friedmans implementation
MethodRuleFit(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
standard constructor
const std::vector< TMVA::DecisionTree * > & GetForest() const
Definition: MethodRuleFit.h:91
Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
returns MVA value for given event
Double_t GetGDPathStep() const
TMVA::DecisionTree::EPruneMethod fPruneMethod
#define ClassDef(name, id)
Definition: Rtypes.h:297
void ProcessOptions()
process the options specified by the user
TMVA::DecisionTree::EPruneMethod GetPruneMethod() const
Definition: MethodRuleFit.h:96
void MakeClassSpecific(std::ostream &, const TString &) const
write specific classifier response
void MakeClassLinear(std::ostream &) const
print out the linear terms
std::vector< TMVA::Event * > fEventSample
SeparationBase * GetSeparationBase() const
Definition: MethodRuleFit.h:95
Class that contains all the data information.
Definition: DataSetInfo.h:60
void TrainTMVARuleFit()
training of rules using TMVA implementation
Double_t GetPruneStrength() const
Definition: MethodRuleFit.h:97
SeparationBase * fSepType
Double_t GetGDPathEveFrac() const
Int_t GetRFNrules() const
Double_t GetLinQuantile() const
unsigned int UInt_t
Definition: RtypesCore.h:42
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
RuleFit * GetRuleFitPtr()
Definition: MethodRuleFit.h:87
const SeparationBase * GetSeparationBaseConst() const
Definition: MethodRuleFit.h:94
Int_t GetRFNendnodes() const
const Bool_t kFALSE
Definition: RtypesCore.h:92
void GetHelpMessage() const
get help message text
Bool_t VerifyRange(MsgLogger &mlog, const char *varstr, T &var, const T &vmin, const T &vmax)
const TString GetRFWorkDir() const
Int_t GetNCuts() const
double Double_t
Definition: RtypesCore.h:55
const RuleFit * GetRuleFitConstPtr() const
Definition: MethodRuleFit.h:88
Int_t GetGDNPathSteps() const
Describe directory structure in memory.
Definition: TDirectory.h:34
int type
Definition: TGX11.cxx:120
Int_t GetNTrees() const
Definition: MethodRuleFit.h:92
Bool_t UseBoost() const
Definition: MethodRuleFit.h:84
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
Abstract ClassifierFactory template that handles arbitrary types.
TDirectory * BaseDir() const
returns the ROOT directory where info/histograms etc of the corresponding MVA method instance are sto...
TDirectory * GetMethodBaseDir() const
Definition: MethodRuleFit.h:89
A TTree object has a header with a name and a title.
Definition: TTree.h:78
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t)
RuleFit can handle classification with 2 classes.
virtual ~MethodRuleFit(void)
destructor
void MakeClassRuleCuts(std::ostream &) const
print out the rule cuts
virtual void ReadWeightsFromStream(std::istream &)=0
const Bool_t kTRUE
Definition: RtypesCore.h:91
std::vector< DecisionTree * > fForest
void AddWeightsXMLTo(void *parent) const
add the rules to XML node
Double_t GetMaxFracNEve() const
Definition: MethodRuleFit.h:99
void InitEventSample(void)
write all Events from the Tree into a vector of Events, that are more easily manipulated.
Double_t GetMinFracNEve() const
Definition: MethodRuleFit.h:98