66 : fVisHistsUseImp(
kTRUE )
81 , fVisHistsUseImp(
kTRUE)
100 UInt_t neve = fTrainingEvents.size();
103 fNEveEffTrain = CalcWeightSum( &fTrainingEvents );
112 this->SetMethodBase(rfbase);
113 fRuleEnsemble.Initialize(
this );
114 fRuleFitParams.SetRuleFit(
this );
126 UInt_t nevents = fMethodRuleFit->Data()->GetNTrainingEvents();
127 std::vector<const TMVA::Event*> tmp;
128 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
129 const Event *
event = fMethodRuleFit->GetEvent(ievt);
130 tmp.push_back(event);
132 SetTrainingEvents( tmp );
141 fRuleEnsemble.MakeModel();
144 fRuleFitParams.Init();
153 fMethodBase = rfbase;
154 fMethodRuleFit =
dynamic_cast<const MethodRuleFit *
>(rfbase);
178 if (events==0)
return 0.0;
179 if (neve==0) neve=events->size();
182 for (
UInt_t ie=0; ie<neve; ie++) {
183 sumw += ((*events)[ie])->GetWeight();
193 fLogger->SetMinType(t);
194 fRuleEnsemble.SetMsgType(t);
195 fRuleFitParams.SetMsgType(t);
204 if (fMethodRuleFit==0) {
205 Log() << kFATAL <<
"RuleFit::BuildTree() - Attempting to build a tree NOT from a MethodRuleFit" <<
Endl;
207 std::vector<const Event *> evevec;
208 for (
UInt_t ie=0; ie<fNTreeSample; ie++) {
209 evevec.push_back(fTrainingEventsRndm[ie]);
224 if (fMethodRuleFit==0) {
225 Log() << kFATAL <<
"RuleFit::BuildTree() - Attempting to build a tree NOT from a MethodRuleFit" <<
Endl;
227 Log() << kDEBUG <<
"Creating a forest with " << fMethodRuleFit->GetNTrees() <<
" decision trees" <<
Endl;
228 Log() << kDEBUG <<
"Each tree is built using a random subsample with " << fNTreeSample <<
" events" <<
Endl;
230 Timer timer( fMethodRuleFit->GetNTrees(),
"RuleFit" );
241 Bool_t useBoost = fMethodRuleFit->UseBoost();
243 if (useBoost) SaveEventWeights();
245 for (
Int_t i=0; i<fMethodRuleFit->GetNTrees(); i++) {
247 if (!useBoost) ReshuffleEvents();
250 for (
UInt_t ie = 0; ie<fNTreeSample; ie++) {
251 if (fMethodBase->DataInfo().IsSignal(fTrainingEventsRndm[ie])) nsig++;
260 const Int_t ntriesMax=10;
263 frnd = 100*rndGen.
Uniform( fMethodRuleFit->GetMinFracNEve(), 0.5*fMethodRuleFit->GetMaxFracNEve() );
265 Bool_t useRandomisedTree = !useBoost;
266 dt =
new DecisionTree( fMethodRuleFit->GetSeparationBase(), frnd, fMethodRuleFit->GetNCuts(), &(fMethodRuleFit->DataInfo()), iclass, useRandomisedTree);
267 dt->
SetNVars(fMethodBase->GetNvar());
275 tryAgain = ((dt==0) && (ntries<ntriesMax));
278 fForest.push_back(dt);
279 if (useBoost) Boost(dt);
283 Log() << kWARNING <<
"------------------------------------------------------------------" <<
Endl;
284 Log() << kWARNING <<
" Failed growing a tree even after " << ntriesMax <<
" trials" <<
Endl;
285 Log() << kWARNING <<
" Possible solutions: " <<
Endl;
286 Log() << kWARNING <<
" 1. increase the number of training events" <<
Endl;
287 Log() << kWARNING <<
" 2. set a lower min fraction cut (fEventsMin)" <<
Endl;
288 Log() << kWARNING <<
" 3. maybe also decrease the max fraction cut (fEventsMax)" <<
Endl;
289 Log() << kWARNING <<
" If the above warning occurs rarely only, it can be ignored" <<
Endl;
290 Log() << kWARNING <<
"------------------------------------------------------------------" <<
Endl;
293 Log() << kDEBUG <<
"Built tree with minimum cut at N = " << frnd <<
"% events"
294 <<
" => N(nodes) = " << fForest.back()->GetNNodes()
295 <<
" ; n(tries) = " << ntries
300 if (useBoost) RestoreEventWeights();
311 fEventWeights.clear();
312 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
313 Double_t w = (*e)->GetBoostWeight();
314 fEventWeights.push_back(w);
324 if (fEventWeights.size() != fTrainingEvents.size()) {
325 Log() << kERROR <<
"RuleFit::RestoreEventWeights() called without having called SaveEventWeights() before!" <<
Endl;
328 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
329 (*e)->SetBoostWeight(fEventWeights[ie]);
344 std::vector<Char_t> correctSelected;
346 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
351 if (isSignalType == fMethodBase->DataInfo().IsSignal(*
e)) {
352 correctSelected.push_back(
kTRUE);
356 correctSelected.push_back(
kFALSE);
364 Double_t boostWeight = (err>0 ? (1.0-err)/err : 1000.0);
368 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
369 if (!correctSelected[ie])
370 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostWeight);
371 newSumw+=(*e)->GetWeight();
376 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
377 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * scale);
379 Log() << kDEBUG <<
"boostWeight = " << boostWeight <<
" scale = " << scale <<
Endl;
388 UInt_t ntrees = fForest.size();
389 if (ntrees==0)
return;
394 for (
UInt_t i=0; i<ntrees; i++) {
401 Log() << kVERBOSE <<
"Nodes in trees: average & std dev = " << sumn/ntrees <<
" , " << sig <<
Endl;
411 Log() << kVERBOSE <<
"Fitting rule/linear terms" <<
Endl;
412 fRuleFitParams.MakeGDPath();
420 Log() << kVERBOSE <<
"Calculating importance" <<
Endl;
421 fRuleEnsemble.CalcImportance();
422 fRuleEnsemble.CleanupRules();
423 fRuleEnsemble.CleanupLinear();
424 fRuleEnsemble.CalcVarImportance();
425 Log() << kVERBOSE <<
"Filling rule statistics" <<
Endl;
426 fRuleEnsemble.RuleResponseStats();
434 return fRuleEnsemble.EvalEvent(
e );
442 if (fMethodRuleFit==0)
Log() << kFATAL <<
"RuleFit::SetTrainingEvents - MethodRuleFit not initialized" <<
Endl;
444 if (neve==0)
Log() << kWARNING <<
"An empty sample of training events was given" <<
Endl;
447 fTrainingEvents.clear();
448 fTrainingEventsRndm.clear();
449 for (
UInt_t i=0; i<neve; i++) {
450 fTrainingEvents.push_back(
static_cast< const Event *
>(el[i]));
451 fTrainingEventsRndm.push_back(
static_cast< const Event *
>(el[i]));
455 std::shuffle(fTrainingEventsRndm.begin(), fTrainingEventsRndm.end(), fRNGEngine);
458 fNTreeSample =
static_cast<UInt_t>(neve*fMethodRuleFit->GetTreeEveFrac());
459 Log() << kDEBUG <<
"Number of events per tree : " << fNTreeSample
460 <<
" ( N(events) = " << neve <<
" )"
461 <<
" randomly drawn without replacement" <<
Endl;
470 if ((nevents<fTrainingEventsRndm.size()) && (nevents>0)) {
471 evevec.resize(nevents);
472 for (
UInt_t ie=0; ie<nevents; ie++) {
473 evevec[ie] = fTrainingEventsRndm[ie];
477 Log() << kWARNING <<
"GetRndmSampleEvents() : requested sub sample size larger than total size (BUG!).";
488 if (hlist.empty())
return;
495 for (
UInt_t i=0; i<hlist.size(); i++) {
505 if (wm<wmin) wmin=wm;
522 for (
UInt_t i=0; i<hlist.size(); i++) {
541 if (!ruleHasVar)
return;
544 if(firstbin<0) firstbin=0;
552 Double_t fbfrac = (dormin ? ((fbmin+xbinw-rmin)/xbinw):1.0);
553 Double_t lbfrac = (dormax ? ((rmax-lbmax+xbinw)/xbinw):1.0);
558 for (
Int_t bin = binmin; bin<binmax+1; bin++) {
559 fbin = bin-firstbin+1;
563 else if (bin==binmax) {
571 if (fVisHistsUseImp) {
577 h2->
Fill(xc,0.5,val*
f);
587 if (!fRuleEnsemble.DoLinear())
return;
593 if (fVisHistsUseImp) {
594 val = fRuleEnsemble.GetLinImportance(vind);
597 val = fRuleEnsemble.GetLinCoefficients(vind);
599 for (
Int_t bin = firstbin; bin<lastbin+1; bin++) {
601 h2->
Fill(xc,0.5,val);
613 if (fVisHistsUseImp) {
620 Double_t rxmin, rxmax, rymin, rymax;
621 Bool_t dorxmin, dorxmax, dorymin, dorymax;
627 if (!(ruleHasVarX || ruleHasVarY))
return;
646 Double_t fxbinmin = (dorxmin ? ((xbinmin+xbinw-vxmin)/xbinw):1.0);
647 Double_t fxbinmax = (dorxmax ? ((vxmax-xbinmax+xbinw)/xbinw):1.0);
648 Double_t fybinmin = (dorymin ? ((ybinmin+ybinw-vymin)/ybinw):1.0);
649 Double_t fybinmax = (dorymax ? ((vymax-ybinmax+ybinw)/ybinw):1.0);
654 for (
Int_t binx = binxmin; binx<binxmax+1; binx++) {
658 else if (binx==binxmax) {
665 for (
Int_t biny = binymin; biny<binymax+1; biny++) {
669 else if (biny==binymax) {
676 h2->
Fill(xc,yc,val*fx*fy);
686 Int_t nhists = hlist.size();
687 Int_t nvar = fMethodBase->GetNvar();
688 if (nhists!=nvar)
Log() << kFATAL <<
"BUG TRAP: number of hists is not equal the number of variables!" <<
Endl;
690 std::vector<Int_t> vindex;
693 for (
Int_t ih=0; ih<nhists; ih++) {
694 hstr = hlist[ih]->GetTitle();
695 for (
Int_t iv=0; iv<nvar; iv++) {
696 if (fMethodBase->GetInputTitle(iv) == hstr)
697 vindex.push_back(iv);
701 for (
Int_t iv=0; iv<nvar; iv++) {
704 FillCut(hlist[iv],rule,vindex[iv]);
708 FillLin(hlist[iv],vindex[iv]);
719 if (!(ruleimp>0))
return;
720 if (ruleimp<fRuleEnsemble.GetImportanceCut())
return;
722 Int_t nhists = hlist.size();
723 Int_t nvar = fMethodBase->GetNvar();
724 Int_t ncorr = (nvar*(nvar+1)/2)-nvar;
725 if (nhists!=ncorr)
Log() << kERROR <<
"BUG TRAP: number of corr hists is not correct! ncorr = "
726 << ncorr <<
" nvar = " << nvar <<
" nhists = " << nhists <<
Endl;
728 std::vector< std::pair<Int_t,Int_t> > vindex;
732 for (
Int_t ih=0; ih<nhists; ih++) {
733 hstr = hlist[ih]->GetName();
734 if (GetCorrVars( hstr, var1, var2 )) {
735 iv1 = fMethodBase->DataInfo().FindVarIndex( var1 );
736 iv2 = fMethodBase->DataInfo().FindVarIndex( var2 );
737 vindex.push_back( std::pair<Int_t,Int_t>(iv2,iv1) );
740 Log() << kERROR <<
"BUG TRAP: should not be here - failed getting var1 and var2" <<
Endl;
744 for (
Int_t ih=0; ih<nhists; ih++) {
747 FillCorr(hlist[ih],rule,vindex[ih].
first,vindex[ih].
second);
765 var1 = titleCopy(0,splitPos);
766 var2 = titleCopy(splitPos+4, titleCopy.
Length());
779 const TString directories[5] = {
"InputVariables_Id",
780 "InputVariables_Deco",
781 "InputVariables_PCA",
782 "InputVariables_Gauss",
783 "InputVariables_Gauss_Deco" };
785 const TString corrDirName =
"CorrelationPlots";
791 TDirectory* methodDir = fMethodBase->BaseDir();
797 Log() << kWARNING <<
"No basedir - BUG??" <<
Endl;
803 done = ((varDir!=0) || (
type>4));
806 Log() << kWARNING <<
"No input variable directory found - BUG?" <<
Endl;
811 Log() << kWARNING <<
"No correlation directory found" <<
Endl;
812 Log() << kWARNING <<
"Check for other warnings related to correlation histograms" <<
Endl;
816 Log() << kWARNING <<
"No rulefit method directory found - BUG?" <<
Endl;
820 varDirName = varDir->
GetName();
826 Log() << kWARNING <<
"No correlation directory found : " << corrDirName <<
Endl;
832 Log() << kDEBUG <<
"Got number of plots = " << noPlots <<
Endl;
835 std::vector<TH2F *> h1Vector;
836 std::vector<TH2F *> h2CorrVector;
839 while ((key = (
TKey*)next())) {
845 Log() << kDEBUG <<
"Got histogram : " << hname <<
Endl;
859 h1Vector.push_back( newhist );
866 while ((key = (
TKey*)nextCorr())) {
875 Log() << kDEBUG <<
"Got histogram (2D) : " << hname <<
Endl;
883 TH2F *newhist =
new TH2F(newname,htitle,
886 if (GetCorrVars( newname, var1, var2 )) {
887 Int_t iv1 = fMethodBase->DataInfo().FindVarIndex(var1);
888 Int_t iv2 = fMethodBase->DataInfo().FindVarIndex(var2);
903 h2CorrVector.push_back( newhist );
909 UInt_t nrules = fRuleEnsemble.GetNRules();
911 for (
UInt_t i=0; i<nrules; i++) {
912 rule = fRuleEnsemble.GetRulesConst(i);
913 FillVisHistCut(rule, h1Vector);
916 FillVisHistCut(0, h1Vector);
917 NormVisHists(h1Vector);
922 for (
UInt_t i=0; i<nrules; i++) {
923 rule = fRuleEnsemble.GetRulesConst(i);
924 FillVisHistCorr(rule, h2CorrVector);
926 NormVisHists(h2CorrVector);
930 for (
UInt_t i=0; i<h1Vector.size(); i++) h1Vector[i]->Write();
931 for (
UInt_t i=0; i<h2CorrVector.size(); i++) h2CorrVector[i]->Write();
939 TDirectory* methodDir = fMethodBase->BaseDir();
941 Log() << kWARNING <<
"<MakeDebugHists> No rulefit method directory found - bug?" <<
Endl;
946 std::vector<Double_t> distances;
947 std::vector<Double_t> fncuts;
948 std::vector<Double_t> fnvars;
953 UInt_t nrules = fRuleEnsemble.GetNRules();
954 for (
UInt_t i=0; i<nrules; i++) {
955 ruleA = fRuleEnsemble.GetRulesConst(i);
956 for (
UInt_t j=i+1; j<nrules; j++) {
957 ruleB = fRuleEnsemble.GetRulesConst(j);
962 distances.push_back(dAB);
963 fncuts.push_back(
static_cast<Double_t>(nc));
964 fnvars.push_back(
static_cast<Double_t>(nv));
965 if (dAB<dABmin) dABmin=dAB;
966 if (dAB>dABmax) dABmax=dAB;
971 TH1F *histDist =
new TH1F(
"RuleDist",
"Rule distances",100,dABmin,dABmax);
972 TTree *distNtuple =
new TTree(
"RuleDistNtuple",
"RuleDist ntuple");
976 distNtuple->
Branch(
"dist", &ntDist,
"dist/D");
977 distNtuple->
Branch(
"ncuts",&ntNcuts,
"ncuts/D");
978 distNtuple->
Branch(
"nvars",&ntNvars,
"nvars/D");
980 for (
UInt_t i=0; i<distances.size(); i++) {
981 histDist->
Fill(distances[i]);
982 ntDist = distances[i];
virtual Double_t GetBinCenter(Int_t bin) const
Return center of bin.
virtual Int_t FindBin(Double_t x)
Find bin number corresponding to abscissa x.
virtual Double_t GetBinLowEdge(Int_t bin) const
Return low edge of bin.
virtual Double_t GetBinWidth(Int_t bin) const
Return bin width.
TClass instances represent classes, structs and namespaces in the ROOT type system.
Bool_t InheritsFrom(const char *cl) const
Return kTRUE if this class inherits from a class with name "classname".
Describe directory structure in memory.
virtual TObject * Get(const char *namecycle)
Return pointer to object identified by namecycle.
virtual TFile * GetFile() const
virtual TList * GetListOfKeys() const
virtual Bool_t cd(const char *path=nullptr)
Change current directory to "this" directory.
1-D histogram with a float per channel (see TH1 documentation)
virtual Int_t GetNbinsY() const
TAxis * GetXaxis()
Get the behaviour adopted by the object about the statoverflows. See EStatOverflows for more informat...
virtual Double_t GetMaximum(Double_t maxval=FLT_MAX) const
Return maximum value smaller than maxval of bins in the range, unless the value has been overridden b...
virtual Int_t GetNbinsX() const
virtual void SetMaximum(Double_t maximum=-1111)
virtual Int_t Fill(Double_t x)
Increment bin with abscissa X by 1.
virtual void SetMinimum(Double_t minimum=-1111)
virtual void Scale(Double_t c1=1, Option_t *option="")
Multiply this histogram by a constant c1.
virtual Int_t FindBin(Double_t x, Double_t y=0, Double_t z=0)
Return Global bin number corresponding to x,y,z.
virtual Double_t GetMinimum(Double_t minval=-FLT_MAX) const
Return minimum value larger than minval of bins in the range, unless the value has been overridden by...
2-D histogram with a float per channel (see TH1 documentation)
Int_t Fill(Double_t)
Invalid Fill method.
virtual Int_t GetBin(Int_t binx, Int_t biny, Int_t binz=0) const
Return Global bin number corresponding to binx,y,z.
Book space in a file, create I/O buffers, to fill them, (un)compress them.
virtual const char * GetClassName() const
virtual TObject * ReadObj()
To read a TObject* from the file.
Implementation of a Decision Tree.
void SetPruneMethod(EPruneMethod m=kCostComplexityPruning)
void SetPruneStrength(Double_t p)
Double_t CheckEvent(const TMVA::Event *, Bool_t UseYesNoLeaf=kFALSE) const
the event e is put into the decision tree (starting at the root node) and the output is NodeType (sig...
UInt_t BuildTree(const EventConstList &eventSample, DecisionTreeNode *node=NULL)
building the decision tree by recursively calling the splitting of one (root-) node into two daughter...
Double_t PruneTree(const EventConstList *validationSample=NULL)
prune (get rid of internal nodes) the Decision tree to avoid overtraining several different pruning m...
Virtual base Class for all MVA method.
J Friedman's RuleFit method.
ostringstream derivative to redirect and format output
Bool_t GetCutRange(Int_t sel, Double_t &rmin, Double_t &rmax, Bool_t &dormin, Bool_t &dormax) const
get cut range for a given selector
A class implementing various fits of rule ensembles.
void GetRndmSampleEvents(std::vector< const TMVA::Event * > &evevec, UInt_t nevents)
draw a random subsample of the training events without replacement
Double_t EvalEvent(const Event &e)
evaluate single event
void SetMethodBase(const MethodBase *rfbase)
set MethodBase
void InitPtrs(const TMVA::MethodBase *rfbase)
initialize pointers
void Boost(TMVA::DecisionTree *dt)
Boost the events.
void ForestStatistics()
summary of statistics of all trees
static const Int_t randSEED
void CalcImportance()
calculates the importance of each rule
void SetMsgType(EMsgType t)
set the current message type to that of mlog for this class and all other subtools
void Initialize(const TMVA::MethodBase *rfbase)
initialize the parameters of the RuleFit method and make rules
virtual ~RuleFit(void)
destructor
void FillVisHistCorr(const Rule *rule, std::vector< TH2F * > &hlist)
help routine to MakeVisHists() - fills for all correlation plots
std::default_random_engine fRNGEngine
void InitNEveEff()
init effective number of events (using event weights)
void SaveEventWeights()
save event weights - must be done before making the forest
void FillCut(TH2F *h2, const TMVA::Rule *rule, Int_t vind)
Fill cut.
void FillLin(TH2F *h2, Int_t vind)
fill lin
Bool_t GetCorrVars(TString &title, TString &var1, TString &var2)
get first and second variables from title
void MakeForest()
make a forest of decisiontrees
const std::vector< const TMVA::DecisionTree * > & GetForest() const
void FitCoefficients()
Fit the coefficients for the rule ensemble.
const MethodBase * GetMethodBase() const
void FillCorr(TH2F *h2, const TMVA::Rule *rule, Int_t v1, Int_t v2)
fill rule correlation between vx and vy, weighted with either the importance or the coefficient
void NormVisHists(std::vector< TH2F * > &hlist)
normalize rule importance hists
void RestoreEventWeights()
restore the event weights saved by SaveEventWeights() - must be done after boosting
void MakeVisHists()
this will create histograms visualizing the rule ensemble
void FillVisHistCut(const Rule *rule, std::vector< TH2F * > &hlist)
help routine to MakeVisHists() - fills for all variables
void BuildTree(TMVA::DecisionTree *dt)
build the decision tree using fNTreeSample events from fTrainingEventsRndm
const std::vector< const TMVA::Event * > & GetTrainingEvents() const
const MethodRuleFit * GetMethodRuleFit() const
void SetTrainingEvents(const std::vector< const TMVA::Event * > &el)
set the training events randomly
void Copy(const RuleFit &other)
copy method
const RuleEnsemble & GetRuleEnsemble() const
Double_t CalcWeightSum(const std::vector< const TMVA::Event * > *events, UInt_t neve=0)
calculate the sum of weights
RuleFit(void)
default constructor
void MakeDebugHists()
this will create histograms intended for debugging or for the curious user
Implementation of a rule.
Double_t GetSupport() const
UInt_t GetNumVarsUsed() const
const RuleCut * GetRuleCut() const
Double_t GetCoefficient() const
Double_t GetImportance() const
Double_t RuleDist(const Rule &other, Bool_t useCutValue) const
Returns:
Bool_t ContainsVariable(UInt_t iv) const
check if variable in node
Timing information for training and evaluation of MVA methods.
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
virtual const char * GetTitle() const
Returns title of object.
virtual const char * GetName() const
Returns name of object.
Random number generator class based on M.
virtual Double_t Uniform(Double_t x1=1)
Returns a uniform deviate on the interval (0, x1).
TString & ReplaceAll(const TString &s1, const TString &s2)
Bool_t BeginsWith(const char *s, ECaseCompare cmp=kExact) const
TString & Remove(Ssiz_t pos)
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
A TTree represents a columnar dataset.
virtual Int_t Fill()
Fill all branches.
TBranch * Branch(const char *name, T *obj, Int_t bufsize=32000, Int_t splitlevel=99)
Add a new branch, and infer the data type from the type of obj being passed.
virtual Int_t Write(const char *name=0, Int_t option=0, Int_t bufsize=0)
Write this object to the current directory.
static constexpr double second
MsgLogger & Endl(MsgLogger &ml)
Double_t Sqrt(Double_t x)