121 const TString& theWeightFile) :
269 Log() << kFATAL <<
"Mechanism to ignore events with negative weights in training not yet available for method: "
271 <<
" --> please remove \"IgnoreNegWeightsInTraining\" option from booking string."
309 Int_t nevents =
Data()->GetNTrainingEvents();
342 Log() << kINFO <<
"--------------------------------------" <<
Endl;
343 Log() << kINFO <<
"Friedmans RuleFit module is selected." <<
Endl;
344 Log() << kINFO <<
"Only the following options are used:" <<
Endl;
353 Log() << kINFO <<
"--------------------------------------" <<
Endl;
423 if (
Data()->
GetNEvents()==0)
Log() << kFATAL <<
"<Init> Data().TrainingTree() is zero pointer" <<
Endl;
426 for (
Int_t ievt=0; ievt<nevents; ievt++){
459 fRuleFit.GetRuleEnsemblePtr()->ClearRuleMap();
469 if (
IsNormalised())
Log() << kFATAL <<
"\"Normalise\" option cannot be used with RuleFit; "
470 <<
"please remove the option from the configuration string, or "
471 <<
"use \"!Normalise\""
493 Log() << kDEBUG <<
"Fitting rule coefficients ..." <<
Endl;
497 Log() << kDEBUG <<
"Computing rule and variable importance" <<
Endl;
501 fRuleFit.GetRuleEnsemblePtr()->Print();
505 Log() << kDEBUG <<
"Filling rule ntuple" <<
Endl;
506 UInt_t nrules =
fRuleFit.GetRuleEnsemble().GetRulesConst().size();
509 rule =
fRuleFit.GetRuleEnsemble().GetRulesConst(
i);
528 Log() << kDEBUG <<
"Training done" <<
Endl;
539 UInt_t nevents =
Data()->GetNTrainingEvents();
540 std::vector<const TMVA::Event*> tmp;
541 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
543 tmp.push_back(event);
554 Log() << kINFO <<
"Training ..." <<
Endl;
557 Log() << kDEBUG <<
"reading model summary from rf_go.exe output" <<
Endl;
562 Log() << kDEBUG <<
"calculating rule and variable importance" <<
Endl;
566 fRuleFit.GetRuleEnsemblePtr()->Print();
572 Log() << kDEBUG <<
"done training" <<
Endl;
595 fRuleFit.GetRuleEnsemble().AddXMLTo( parent );
603 fRuleFit.GetRuleEnsemblePtr()->ReadRaw( istr );
611 fRuleFit.GetRuleEnsemblePtr()->ReadFromXML( wghtnode );
631 Log() << kINFO <<
"Write monitoring ntuple to file: " <<
BaseDir()->GetPath() <<
Endl;
640 Int_t dp = fout.precision();
641 fout <<
" // not implemented for class: \"" << className <<
"\"" << std::endl;
642 fout <<
"};" << std::endl;
643 fout <<
"void " << className <<
"::Initialize(){}" << std::endl;
644 fout <<
"void " << className <<
"::Clear(){}" << std::endl;
645 fout <<
"double " << className <<
"::GetMvaValue__( const std::vector<double>& inputValues ) const {" << std::endl;
646 fout <<
" double rval=" << std::setprecision(10) <<
fRuleFit.GetRuleEnsemble().GetOffset() <<
";" << std::endl;
649 fout <<
" return rval;" << std::endl;
650 fout <<
"}" << std::endl;
651 fout << std::setprecision(dp);
659 Int_t dp = fout.precision();
660 if (!
fRuleFit.GetRuleEnsemble().DoRules()) {
661 fout <<
" //" << std::endl;
662 fout <<
" // ==> MODEL CONTAINS NO RULES <==" << std::endl;
663 fout <<
" //" << std::endl;
667 const std::vector< Rule* > *rules = &(rens->
GetRulesConst());
670 std::list< std::pair<Double_t,Int_t> > sortedRules;
671 for (
UInt_t ir=0; ir<rules->size(); ir++) {
672 sortedRules.push_back( std::pair<Double_t,Int_t>( (*rules)[ir]->GetImportance()/rens->
GetImportanceRef(),ir ) );
676 fout <<
" //" << std::endl;
677 fout <<
" // here follows all rules ordered in importance (most important first)" << std::endl;
678 fout <<
" // at the end of each line, the relative importance of the rule is given" << std::endl;
679 fout <<
" //" << std::endl;
681 for ( std::list< std::pair<double,int> >::reverse_iterator itpair = sortedRules.rbegin();
682 itpair != sortedRules.rend(); ++itpair ) {
683 UInt_t ir = itpair->second;
685 ruleCut = (*rules)[ir]->GetRuleCut();
686 if (impr<rens->GetImportanceCut()) fout <<
" //" << std::endl;
687 fout <<
" if (" << std::flush;
695 if (ic>0) fout <<
"&&" << std::flush;
697 fout <<
"(" << std::setprecision(10) << valmin << std::flush;
698 fout <<
"<inputValues[" <<
sel <<
"])" << std::flush;
701 if (domin) fout <<
"&&" << std::flush;
702 fout <<
"(inputValues[" <<
sel <<
"]" << std::flush;
703 fout <<
"<" << std::setprecision(10) << valmax <<
")" <<std::flush;
706 fout <<
") rval+=" << std::setprecision(10) << (*rules)[ir]->GetCoefficient() <<
";" << std::flush;
707 fout <<
" // importance = " <<
TString::Format(
"%3.3f",impr) << std::endl;
709 fout << std::setprecision(dp);
717 if (!
fRuleFit.GetRuleEnsemble().DoLinear()) {
718 fout <<
" //" << std::endl;
719 fout <<
" // ==> MODEL CONTAINS NO LINEAR TERMS <==" << std::endl;
720 fout <<
" //" << std::endl;
723 fout <<
" //" << std::endl;
724 fout <<
" // here follows all linear terms" << std::endl;
725 fout <<
" // at the end of each line, the relative importance of the term is given" << std::endl;
726 fout <<
" //" << std::endl;
729 for (
UInt_t il=0; il<nlin; il++) {
737 <<
"*std::min( double(" << std::setprecision(10) << rens->
GetLinDP(il)
738 <<
"), std::max( double(inputValues[" << il <<
"]), double(" << std::setprecision(10) << rens->
GetLinDM(il) <<
")));"
740 fout <<
" // importance = " <<
TString::Format(
"%3.3f",imp) << std::endl;
758 Log() << col <<
"--- Short description:" << colres <<
Endl;
760 Log() <<
"This method uses a collection of so called rules to create a" <<
Endl;
761 Log() <<
"discriminating scoring function. Each rule consists of a series" <<
Endl;
762 Log() <<
"of cuts in parameter space. The ensemble of rules are created" <<
Endl;
763 Log() <<
"from a forest of decision trees, trained using the training data." <<
Endl;
764 Log() <<
"Each node (apart from the root) corresponds to one rule." <<
Endl;
765 Log() <<
"The scoring function is then obtained by linearly combining" <<
Endl;
766 Log() <<
"the rules. A fitting procedure is applied to find the optimum" <<
Endl;
767 Log() <<
"set of coefficients. The goal is to find a model with few rules" <<
Endl;
768 Log() <<
"but with a strong discriminating power." <<
Endl;
770 Log() << col <<
"--- Performance optimisation:" << colres <<
Endl;
772 Log() <<
"There are two important considerations to make when optimising:" <<
Endl;
774 Log() <<
" 1. Topology of the decision tree forest" << brk <<
Endl;
775 Log() <<
" 2. Fitting of the coefficients" <<
Endl;
777 Log() <<
"The maximum complexity of the rules is defined by the size of" <<
Endl;
778 Log() <<
"the trees. Large trees will yield many complex rules and capture" <<
Endl;
779 Log() <<
"higher order correlations. On the other hand, small trees will" <<
Endl;
780 Log() <<
"lead to a smaller ensemble with simple rules, only capable of" <<
Endl;
781 Log() <<
"modeling simple structures." <<
Endl;
782 Log() <<
"Several parameters exists for controlling the complexity of the" <<
Endl;
783 Log() <<
"rule ensemble." <<
Endl;
785 Log() <<
"The fitting procedure searches for a minimum using a gradient" <<
Endl;
786 Log() <<
"directed path. Apart from step size and number of steps, the" <<
Endl;
787 Log() <<
"evolution of the path is defined by a cut-off parameter, tau." <<
Endl;
788 Log() <<
"This parameter is unknown and depends on the training data." <<
Endl;
789 Log() <<
"A large value will tend to give large weights to a few rules." <<
Endl;
790 Log() <<
"Similarly, a small value will lead to a large set of rules" <<
Endl;
791 Log() <<
"with similar weights." <<
Endl;
793 Log() <<
"A final point is the model used; rules and/or linear terms." <<
Endl;
794 Log() <<
"For a given training sample, the result may improve by adding" <<
Endl;
795 Log() <<
"linear terms. If best performance is obtained using only linear" <<
Endl;
796 Log() <<
"terms, it is very likely that the Fisher discriminant would be" <<
Endl;
797 Log() <<
"a better choice. Ideally the fitting procedure should be able to" <<
Endl;
798 Log() <<
"make this choice by giving appropriate weights for either terms." <<
Endl;
800 Log() << col <<
"--- Performance tuning via configuration options:" << colres <<
Endl;
802 Log() <<
"I. TUNING OF RULE ENSEMBLE:" <<
Endl;
804 Log() <<
" " << col <<
"ForestType " << colres
805 <<
": Recommended is to use the default \"AdaBoost\"." << brk <<
Endl;
806 Log() <<
" " << col <<
"nTrees " << colres
807 <<
": More trees leads to more rules but also slow" <<
Endl;
808 Log() <<
" performance. With too few trees the risk is" <<
Endl;
809 Log() <<
" that the rule ensemble becomes too simple." << brk <<
Endl;
810 Log() <<
" " << col <<
"fEventsMin " << colres << brk <<
Endl;
811 Log() <<
" " << col <<
"fEventsMax " << colres
812 <<
": With a lower min, more large trees will be generated" <<
Endl;
813 Log() <<
" leading to more complex rules." <<
Endl;
814 Log() <<
" With a higher max, more small trees will be" <<
Endl;
815 Log() <<
" generated leading to more simple rules." <<
Endl;
816 Log() <<
" By changing this range, the average complexity" <<
Endl;
817 Log() <<
" of the rule ensemble can be controlled." << brk <<
Endl;
818 Log() <<
" " << col <<
"RuleMinDist " << colres
819 <<
": By increasing the minimum distance between" <<
Endl;
820 Log() <<
" rules, fewer and more diverse rules will remain." <<
Endl;
821 Log() <<
" Initially it is a good idea to keep this small" <<
Endl;
822 Log() <<
" or zero and let the fitting do the selection of" <<
Endl;
823 Log() <<
" rules. In order to reduce the ensemble size," <<
Endl;
824 Log() <<
" the value can then be increased." <<
Endl;
827 Log() <<
"II. TUNING OF THE FITTING:" <<
Endl;
829 Log() <<
" " << col <<
"GDPathEveFrac " << colres
830 <<
": fraction of events in path evaluation" <<
Endl;
831 Log() <<
" Increasing this fraction will improve the path" <<
Endl;
832 Log() <<
" finding. However, a too high value will give few" <<
Endl;
833 Log() <<
" unique events available for error estimation." <<
Endl;
834 Log() <<
" It is recommended to use the default = 0.5." << brk <<
Endl;
835 Log() <<
" " << col <<
"GDTau " << colres
836 <<
": cutoff parameter tau" <<
Endl;
837 Log() <<
" By default this value is set to -1.0." <<
Endl;
839 Log() <<
" This means that the cut off parameter is" <<
Endl;
840 Log() <<
" automatically estimated. In most cases" <<
Endl;
841 Log() <<
" this should be fine. However, you may want" <<
Endl;
842 Log() <<
" to fix this value if you already know it" <<
Endl;
843 Log() <<
" and want to reduce on training time." << brk <<
Endl;
844 Log() <<
" " << col <<
"GDTauPrec " << colres
845 <<
": precision of estimated tau" <<
Endl;
846 Log() <<
" Increase this precision to find a more" <<
Endl;
847 Log() <<
" optimum cut-off parameter." << brk <<
Endl;
848 Log() <<
" " << col <<
"GDNStep " << colres
849 <<
": number of steps in path search" <<
Endl;
850 Log() <<
" If the number of steps is too small, then" <<
Endl;
851 Log() <<
" the program will give a warning message." <<
Endl;
853 Log() <<
"III. WARNING MESSAGES" <<
Endl;
855 Log() << col <<
"Risk(i+1)>=Risk(i) in path" << colres << brk <<
Endl;
856 Log() << col <<
"Chaotic behaviour of risk evolution." << colres <<
Endl;
858 Log() <<
" The error rate was still decreasing at the end" <<
Endl;
859 Log() <<
" By construction the Risk should always decrease." <<
Endl;
860 Log() <<
" However, if the training sample is too small or" <<
Endl;
861 Log() <<
" the model is overtrained, such warnings can" <<
Endl;
863 Log() <<
" The warnings can safely be ignored if only a" <<
Endl;
864 Log() <<
" few (<3) occur. If more warnings are generated," <<
Endl;
865 Log() <<
" the fitting fails." <<
Endl;
866 Log() <<
" A remedy may be to increase the value" << brk <<
Endl;
868 << col <<
"GDValidEveFrac" << colres
869 <<
" to 1.0 (or a larger value)." << brk <<
Endl;
870 Log() <<
" In addition, if "
871 << col <<
"GDPathEveFrac" << colres
872 <<
" is too high" <<
Endl;
873 Log() <<
" the same warnings may occur since the events" <<
Endl;
874 Log() <<
" used for error estimation are also used for" <<
Endl;
875 Log() <<
" path estimation." <<
Endl;
876 Log() <<
" Another possibility is to modify the model - " <<
Endl;
877 Log() <<
" See above on tuning the rule ensemble." <<
Endl;
879 Log() << col <<
"The error rate was still decreasing at the end of the path"
881 Log() <<
" Too few steps in path! Increase "
882 << col <<
"GDNSteps" << colres <<
"." <<
Endl;
884 Log() << col <<
"Reached minimum early in the search" << colres <<
Endl;
886 Log() <<
" Minimum was found early in the fitting. This" <<
Endl;
887 Log() <<
" may indicate that the used step size "
888 << col <<
"GDStep" << colres <<
"." <<
Endl;
889 Log() <<
" was too large. Reduce it and rerun." <<
Endl;
890 Log() <<
" If the results still are not OK, modify the" <<
Endl;
891 Log() <<
" model either by modifying the rule ensemble" <<
Endl;
892 Log() <<
" or add/remove linear terms" <<
Endl;
#define REGISTER_METHOD(CLASS)
for example
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t sel
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Bool_t WriteOptionsReference() const
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
void AddPreDefVal(const T &)
Implementation of the CrossEntropy as separation criterion.
Class that contains all the data information.
static void SetIsTraining(bool on)
Implementation of a Decision Tree.
Implementation of the GiniIndex as separation criterion.
MethodBase(const TString &jobName, Types::EMVA methodType, const TString &methodTitle, DataSetInfo &dsi, const TString &theOption="")
standard constructor
Bool_t HasTrainingTree() const
TString GetMethodTypeName() const
const char * GetName() const
Bool_t IgnoreEventsWithNegWeightsInTraining() const
TDirectory * BaseDir() const
returns the ROOT directory where info/histograms etc. of the corresponding MVA method instance are stored
UInt_t GetNEvents() const
const Event * GetEvent() const
Bool_t IsSilentFile() const
void SetSignalReferenceCut(Double_t cut)
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
const TString & GetInputLabel(Int_t i) const
Bool_t IsNormalised() const
J Friedman's RuleFit method.
RuleFit fRuleFit
RuleFit instance.
UInt_t fGDTauScan
GD path: number of points to scan.
Double_t fNTPss
ntuple: rule P(tag s, true s)
TString fForestTypeS
forest generation: how the trees are generated
Double_t fMinimp
rule/linear: minimum importance
TString fRuleFitModuleS
which rulefit module to use
Double_t fLinQuantile
quantile cut to remove outliers - see RuleEnsemble
void MakeClassSpecific(std::ostream &, const TString &) const
write specific classifier response
Double_t fMinFracNEve
min fraction of number events
Int_t fNTType
ntuple: rule type (+1->signal, -1->bkg)
Bool_t fUseRuleFitJF
if true, interface with J. Friedman's RuleFit module
Double_t fGDTauMax
GD path: max threshold fraction [0..1].
Double_t fGDPathEveFrac
GD path: fraction of subsamples used for the fitting.
Bool_t fUseBoost
use boosted events for forest generation
TMVA::DecisionTree::EPruneMethod fPruneMethod
forest generation: method used for pruning - see DecisionTree
std::vector< DecisionTree * > fForest
the forest
Double_t fMaxFracNEve
ditto max
TString fRFWorkDir
working directory for Friedman's module
Double_t fTreeEveFrac
fraction of events used for training each tree
void MakeClassLinear(std::ostream &) const
print out the linear terms
Double_t fNTPsb
ntuple: rule P(tag s, true b)
void GetHelpMessage() const
get help message text
Int_t fNTNvars
ntuple: rule number of vars
Int_t fGDNPathSteps
GD path: number of steps.
std::vector< TMVA::Event * > fEventSample
the complete training sample
Double_t fNTPbb
ntuple: rule P(tag b, true b)
Double_t fNTSSB
ntuple: rule S/(S+B)
void TrainJFRuleFit()
training of rules using Jerome Friedman's implementation
Int_t fNTNcuts
ntuple: rule number of cuts
TString fModelTypeS
rule ensemble: which model (rule,linear or both)
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t)
RuleFit can handle classification with 2 classes.
Double_t fNTImportance
ntuple: rule importance
void ProcessOptions()
process the options specified by the user
void ReadWeightsFromStream(std::istream &istr)
read rules from an std::istream
void AddWeightsXMLTo(void *parent) const
add the rules to XML node
Double_t fGDValidEveFrac
GD path: fraction of subsamples used for validation.
void InitEventSample(void)
write all events from the tree into a vector of events, which is more easily manipulated.
void MakeClassRuleCuts(std::ostream &) const
print out the rule cuts
Double_t fNTCoefficient
ntuple: rule coefficient
TString fSepTypeS
forest generation: separation type - see DecisionTree
void InitMonitorNtuple()
initialize the monitoring ntuple
Int_t fNTrees
number of trees in forest
Double_t fNTPtag
ntuple: rule P(tag)
virtual ~MethodRuleFit(void)
destructor
Int_t fRFNendnodes
number of end nodes (only Friedman's module)
void Init(void)
default initialization
Double_t fNTPbs
ntuple: rule P(tag b, true s)
TTree * fMonitorNtuple
pointer to monitor rule ntuple
void WriteMonitoringHistosToFile(void) const
write special monitoring histograms to file (here ntuple)
MethodRuleFit(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
standard constructor
Double_t fNTSupport
ntuple: rule support
Double_t fGDTauMin
GD path: min threshold fraction [0..1].
void ReadWeightsFromXML(void *wghtnode)
read rules from XML node
Double_t fGDTau
GD path: def threshold fraction [0..1].
SeparationBase * fSepType
the separation used in node splitting
Double_t fGDPathStep
GD path: step size in path.
Double_t fGDTauPrec
GD path: precision of estimated tau.
Double_t fPruneStrength
forest generation: prune strength - see DecisionTree
Int_t fNCuts
grid used in cut applied in node splitting
Double_t fGDErrScale
GD path: stop.
void DeclareOptions()
define the options (their key words) that can be set in the option string.
TString fPruneMethodS
forest generation: prune method - see DecisionTree
Double_t fRuleMinDist
rule min distance - see RuleEnsemble
Int_t fRFNrules
max number of rules (only Friedman's module)
Bool_t VerifyRange(MsgLogger &mlog, const char *varstr, T &var, const T &vmin, const T &vmax)
Double_t GetMvaValue(Double_t *err=nullptr, Double_t *errUpper=nullptr)
returns MVA value for given event
const Ranking * CreateRanking()
computes ranking of input variables
void TrainTMVARuleFit()
training of rules using TMVA implementation
Double_t fSignalFraction
scalefactor for bkg events to modify initial s/b fraction in training data
Implementation of the MisClassificationError as separation criterion.
Ranking for variables in method (implementation)
A class describing a 'rule cut'.
Double_t GetCutMin(Int_t is) const
UInt_t GetSelector(Int_t is) const
Char_t GetCutDoMin(Int_t is) const
Char_t GetCutDoMax(Int_t is) const
UInt_t GetNcuts() const
get number of cuts
Double_t GetCutMax(Int_t is) const
Double_t GetLinDP(int i) const
Double_t GetLinDM(int i) const
const std::vector< Double_t > & GetLinCoefficients() const
Double_t GetImportanceRef() const
const std::vector< Double_t > & GetLinNorm() const
UInt_t GetNLinear() const
const std::vector< TMVA::Rule * > & GetRulesConst() const
const std::vector< Double_t > & GetLinImportance() const
Bool_t IsLinTermOK(int i) const
J Friedman's RuleFit method.
Bool_t ReadModelSum()
read model from rulefit.sum
void WelcomeMessage()
welcome message
Implementation of a rule.
Double_t GetSupport() const
const RuleCut * GetRuleCut() const
Bool_t IsSignalRule() const
Double_t GetCoefficient() const
Double_t GetRelImportance() const
Implementation of the SdivSqrtSplusB as separation criterion.
Timing information for training and evaluation of MVA methods.
Singleton class for Global types used by TMVA.
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
A TTree represents a columnar dataset.
create variable transformations
MsgLogger & Endl(MsgLogger &ml)