const TString& theWeightFile) :
Log() << kFATAL << "Mechanism to ignore events with negative weights in training not yet available for method: "
      << GetMethodTypeName()
      << " --> please remove \"IgnoreNegWeightsInTraining\" option from booking string." << Endl;
Int_t nevents = Data()->GetNTrainingEvents();
Log() << kINFO << "--------------------------------------" << Endl;
Log() << kINFO << "Friedman's RuleFit module is selected." << Endl;
Log() << kINFO << "Only the following options are used:" << Endl;
Log() << kINFO << "--------------------------------------" << Endl;
if (Data()->GetNEvents()==0) Log() << kFATAL << "<Init> Data().TrainingTree() is zero pointer" << Endl;
for (Int_t ievt=0; ievt<nevents; ievt++) {
fRuleFit.GetRuleEnsemblePtr()->ClearRuleMap();
if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with RuleFit; "
                          << "please remove the option from the configuration string, or "
                          << "use \"!Normalise\"" << Endl;
Log() << kDEBUG << "Fitting rule coefficients ..." << Endl;
Log() << kDEBUG << "Computing rule and variable importance" << Endl;
fRuleFit.GetRuleEnsemblePtr()->Print();
Log() << kDEBUG << "Filling rule ntuple" << Endl;
UInt_t nrules = fRuleFit.GetRuleEnsemble().GetRulesConst().size();
for (UInt_t i=0; i<nrules; i++ ) {
   rule = fRuleFit.GetRuleEnsemble().GetRulesConst(i);
Log() << kDEBUG << "Training done" << Endl;
UInt_t nevents = Data()->GetNTrainingEvents();
std::vector<const TMVA::Event*> tmp;
for (Long64_t ievt=0; ievt<nevents; ievt++) {
   const Event *event = GetEvent(ievt);
   tmp.push_back(event);
Log() << kINFO << "Training ..." << Endl;
Log() << kDEBUG << "Reading model summary from rf_go.exe output" << Endl;
Log() << kDEBUG << "Calculating rule and variable importance" << Endl;
fRuleFit.GetRuleEnsemblePtr()->Print();
Log() << kDEBUG << "Done training" << Endl;
fRuleFit.GetRuleEnsemble().AddXMLTo( parent );
fRuleFit.GetRuleEnsemblePtr()->ReadRaw( istr );
fRuleFit.GetRuleEnsemblePtr()->ReadFromXML( wghtnode );
Log() << kINFO << "Write monitoring ntuple to file: " << BaseDir()->GetPath() << Endl;
Int_t dp = fout.precision();
fout << "   // not implemented for class: \"" << className << "\"" << std::endl;
fout << "};" << std::endl;
fout << "void   " << className << "::Initialize(){}" << std::endl;
fout << "void   " << className << "::Clear(){}" << std::endl;
fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const {" << std::endl;
fout << "   double rval=" << std::setprecision(10) << fRuleFit.GetRuleEnsemble().GetOffset() << ";" << std::endl;
MakeClassRuleCuts( fout );
MakeClassLinear( fout );
fout << "   return rval;" << std::endl;
fout << "}" << std::endl;
fout << std::setprecision(dp);
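// For orientation, a sketch of the response-class code streamed above, for a
// hypothetical class name "ReadRuleFit" and an offset of -0.25 (illustrative
// values only; the rule and linear terms are appended by MakeClassRuleCuts
// and MakeClassLinear):
//
//    double ReadRuleFit::GetMvaValue__( const std::vector<double>& inputValues ) const {
//       double rval=-0.25;
//       // ... rule cuts and linear terms appended here ...
//       return rval;
//    }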
Int_t dp = fout.precision();
if (!fRuleFit.GetRuleEnsemble().DoRules()) {
   fout << "   //" << std::endl;
   fout << "   // ==> MODEL CONTAINS NO RULES <==" << std::endl;
   fout << "   //" << std::endl;
const std::vector< Rule* > *rules = &(rens->GetRulesConst());
std::list< std::pair<Double_t,Int_t> > sortedRules;
for (UInt_t ir=0; ir<rules->size(); ir++) {
   sortedRules.push_back( std::pair<Double_t,Int_t>( (*rules)[ir]->GetImportance()/rens->GetImportanceRef(), ir ) );
fout << "   //" << std::endl;
fout << "   // here follow all rules, ordered by importance (most important first)" << std::endl;
fout << "   // at the end of each line, the relative importance of the rule is given" << std::endl;
fout << "   //" << std::endl;
for ( std::list< std::pair<Double_t,Int_t> >::reverse_iterator itpair = sortedRules.rbegin();
      itpair != sortedRules.rend(); ++itpair ) {
   UInt_t ir = itpair->second;
   Double_t impr = itpair->first;
   ruleCut = (*rules)[ir]->GetRuleCut();
   if (impr<rens->GetImportanceCut()) fout << "   //" << std::endl;
   fout << "   if (" << std::flush;
   UInt_t nc = ruleCut->GetNcuts();
   for (UInt_t ic=0; ic<nc; ic++) {
      UInt_t   sel    = ruleCut->GetSelector(ic);
      Double_t valmin = ruleCut->GetCutMin(ic);
      Double_t valmax = ruleCut->GetCutMax(ic);
      Bool_t   domin  = ruleCut->GetCutDoMin(ic);
      Bool_t   domax  = ruleCut->GetCutDoMax(ic);
      if (ic>0) fout << "&&" << std::flush;
      if (domin) {
         fout << "(" << std::setprecision(10) << valmin << std::flush;
         fout << "<inputValues[" << sel << "])" << std::flush;
      }
      if (domax) {
         if (domin) fout << "&&" << std::flush;
         fout << "(inputValues[" << sel << "]" << std::flush;
         fout << "<" << std::setprecision(10) << valmax << ")" << std::flush;
      }
   }
   fout << ") rval+=" << std::setprecision(10) << (*rules)[ir]->GetCoefficient() << ";" << std::flush;
   fout << " // importance = " << TString::Format("%3.3f",impr) << std::endl;
}
fout << std::setprecision(dp);
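// A generated rule line then reads, for example (hypothetical selector,
// cut values, coefficient and importance):
//
//    if ((0.1234567890<inputValues[1])&&(inputValues[1]<2.3456789012)) rval+=0.0123456789; // importance = 0.456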
if (!fRuleFit.GetRuleEnsemble().DoLinear()) {
   fout << "   //" << std::endl;
   fout << "   // ==> MODEL CONTAINS NO LINEAR TERMS <==" << std::endl;
   fout << "   //" << std::endl;
fout << "   //" << std::endl;
fout << "   // here follow all linear terms" << std::endl;
fout << "   // at the end of each line, the relative importance of the term is given" << std::endl;
fout << "   //" << std::endl;
for (UInt_t il=0; il<nlin; il++) {
   Double_t imp = rens->GetLinImportance(il)/rens->GetImportanceRef();
   fout << "   rval+=" << std::setprecision(10) << rens->GetLinNorm(il)*rens->GetLinCoefficients(il)
        << "*std::min( double(" << std::setprecision(10) << rens->GetLinDP(il)
        << "), std::max( double(inputValues[" << il << "]), double(" << std::setprecision(10) << rens->GetLinDM(il) << ")));"
        << std::flush;
   fout << " // importance = " << TString::Format("%3.3f",imp) << std::endl;
Log() << col << "--- Short description:" << colres << Endl;
Log() << "This method uses a collection of so-called rules to create a" << Endl;
Log() << "discriminating scoring function. Each rule consists of a series" << Endl;
Log() << "of cuts in parameter space. The ensemble of rules is created" << Endl;
Log() << "from a forest of decision trees, trained using the training data." << Endl;
Log() << "Each node (apart from the root) corresponds to one rule." << Endl;
Log() << "The scoring function is then obtained by linearly combining" << Endl;
Log() << "the rules. A fitting procedure is applied to find the optimal" << Endl;
Log() << "set of coefficients. The goal is to find a model with few rules" << Endl;
Log() << "but with strong discriminating power." << Endl;
Log() << col << "--- Performance optimisation:" << colres << Endl;
Log() << "There are two important considerations to make when optimising:" << Endl;
Log() << "  1. Topology of the decision tree forest" << brk << Endl;
Log() << "  2. Fitting of the coefficients" << Endl;
Log() << "The maximum complexity of the rules is defined by the size of" << Endl;
Log() << "the trees. Large trees will yield many complex rules and capture" << Endl;
Log() << "higher-order correlations. On the other hand, small trees will" << Endl;
Log() << "lead to a smaller ensemble with simple rules, only capable of" << Endl;
Log() << "modeling simple structures." << Endl;
Log() << "Several parameters exist for controlling the complexity of the" << Endl;
Log() << "rule ensemble." << Endl;
Log() << "The fitting procedure searches for a minimum using a gradient-" << Endl;
Log() << "directed path. Apart from step size and number of steps, the" << Endl;
Log() << "evolution of the path is defined by a cut-off parameter, tau." << Endl;
Log() << "This parameter is unknown and depends on the training data." << Endl;
Log() << "A large value will tend to give large weights to a few rules." << Endl;
Log() << "Similarly, a small value will lead to a large set of rules" << Endl;
Log() << "with similar weights." << Endl;
Log() << "A final point is the model used: rules and/or linear terms." << Endl;
Log() << "For a given training sample, the result may improve by adding" << Endl;
Log() << "linear terms. If the best performance is obtained using only" << Endl;
Log() << "linear terms, it is very likely that the Fisher discriminant" << Endl;
Log() << "would be a better choice. Ideally the fitting procedure should" << Endl;
Log() << "make this choice itself, by giving appropriate weights to either term." << Endl;
Log() << col << "--- Performance tuning via configuration options:" << colres << Endl;
Log() << "I. TUNING OF RULE ENSEMBLE:" << Endl;
Log() << "   " << col << "ForestType    " << colres
      << ": It is recommended to use the default \"AdaBoost\"." << brk << Endl;
Log() << "   " << col << "nTrees        " << colres
      << ": More trees lead to more rules but also slower" << Endl;
Log() << "                   performance. With too few trees the risk is" << Endl;
Log() << "                   that the rule ensemble becomes too simple." << brk << Endl;
Log() << "   " << col << "fEventsMin    " << colres << brk << Endl;
Log() << "   " << col << "fEventsMax    " << colres
      << ": With a lower min, more large trees will be generated," << Endl;
Log() << "                   leading to more complex rules." << Endl;
Log() << "                   With a higher max, more small trees will be" << Endl;
Log() << "                   generated, leading to simpler rules." << Endl;
Log() << "                   By changing this range, the average complexity" << Endl;
Log() << "                   of the rule ensemble can be controlled." << brk << Endl;
Log() << "   " << col << "RuleMinDist   " << colres
      << ": By increasing the minimum distance between" << Endl;
Log() << "                   rules, fewer and more diverse rules will remain." << Endl;
Log() << "                   Initially it is a good idea to keep this small" << Endl;
Log() << "                   or zero and let the fitting do the selection of" << Endl;
Log() << "                   rules. In order to reduce the ensemble size," << Endl;
Log() << "                   the value can then be increased." << Endl;
Log() << "II. TUNING OF THE FITTING:" << Endl;
Log() << "   " << col << "GDPathEveFrac " << colres
      << ": fraction of events used in the path evaluation" << Endl;
Log() << "                   Increasing this fraction will improve the path" << Endl;
Log() << "                   finding. However, too high a value will leave few" << Endl;
Log() << "                   unique events available for error estimation." << Endl;
Log() << "                   It is recommended to use the default = 0.5." << brk << Endl;
Log() << "   " << col << "GDTau         " << colres
      << ": cut-off parameter tau" << Endl;
Log() << "                   By default this value is set to -1.0." << Endl;
Log() << "                   This means that the cut-off parameter is" << Endl;
Log() << "                   automatically estimated. In most cases" << Endl;
Log() << "                   this should be fine. However, you may want" << Endl;
Log() << "                   to fix this value if you already know it" << Endl;
Log() << "                   and want to reduce training time." << brk << Endl;
Log() << "   " << col << "GDTauPrec     " << colres
      << ": precision of the estimated tau" << Endl;
Log() << "                   Increase this precision to find a more" << Endl;
Log() << "                   optimal cut-off parameter." << brk << Endl;
Log() << "   " << col << "GDNSteps      " << colres
      << ": number of steps in the path search" << Endl;
Log() << "                   If the number of steps is too small, the" << Endl;
Log() << "                   program will give a warning message." << Endl;
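// To make the tuning options above concrete, a typical booking string in the
// style of the TMVA tutorials (illustrative values, not a recommendation;
// shown with the factory interface that books methods directly):
//
//    factory->BookMethod( TMVA::Types::kRuleFit, "RuleFit",
//        "H:!V:RuleFitModule=RFTMVA:Model=ModRuleLinear:MinImp=0.001:RuleMinDist=0.001:"
//        "NTrees=20:fEventsMin=0.01:fEventsMax=0.5:GDTau=-1.0:GDTauPrec=0.01:"
//        "GDStep=0.01:GDNSteps=10000:GDErrScale=1.02" );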
Log() << "III. WARNING MESSAGES" << Endl;
Log() << col << "Risk(i+1)>=Risk(i) in path" << colres << brk << Endl;
Log() << col << "Chaotic behaviour of risk evolution." << colres << Endl;
Log() << "    The error rate was still decreasing at the end." << Endl;
Log() << "    By construction the risk should always decrease." << Endl;
Log() << "    However, if the training sample is too small or" << Endl;
Log() << "    the model is overtrained, such warnings can" << Endl;
Log() << "    occur." << Endl;
Log() << "    The warnings can safely be ignored if only a" << Endl;
Log() << "    few (<3) occur. If more warnings are generated," << Endl;
Log() << "    the fitting fails." << Endl;
Log() << "    A remedy may be to increase the value of" << brk << Endl;
Log() << "    "
      << col << "GDValidEveFrac" << colres
      << " to 1.0 (or a larger value)." << brk << Endl;
Log() << "    In addition, if "
      << col << "GDPathEveFrac" << colres
      << " is too high" << Endl;
Log() << "    the same warnings may occur, since the events" << Endl;
Log() << "    used for error estimation are also used for" << Endl;
Log() << "    path estimation." << Endl;
Log() << "    Another possibility is to modify the model -" << Endl;
Log() << "    see above on tuning the rule ensemble." << Endl;
Log() << col << "The error rate was still decreasing at the end of the path" << colres << Endl;
Log() << "    Too few steps in path! Increase "
      << col << "GDNSteps" << colres << "." << Endl;
Log() << col << "Reached minimum early in the search" << colres << Endl;
Log() << "    The minimum was found early in the fitting. This" << Endl;
Log() << "    may indicate that the step size "
      << col << "GDStep" << colres << Endl;
Log() << "    was too large. Reduce it and rerun." << Endl;
Log() << "    If the results are still not OK, modify the" << Endl;
Log() << "    model, either by changing the rule ensemble" << Endl;
Log() << "    or by adding/removing linear terms." << Endl;