65 : fVisHistsUseImp(
kTRUE )
80 , fVisHistsUseImp(
kTRUE)
99 UInt_t neve = fTrainingEvents.size();
102 fNEveEffTrain = CalcWeightSum( &fTrainingEvents );
111 this->SetMethodBase(rfbase);
112 fRuleEnsemble.Initialize(
this );
113 fRuleFitParams.SetRuleFit(
this );
125 UInt_t nevents = fMethodRuleFit->Data()->GetNTrainingEvents();
126 std::vector<const TMVA::Event*> tmp;
127 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
128 const Event *
event = fMethodRuleFit->GetEvent(ievt);
129 tmp.push_back(
event);
131 SetTrainingEvents( tmp );
140 fRuleEnsemble.MakeModel();
143 fRuleFitParams.Init();
152 fMethodBase = rfbase;
153 fMethodRuleFit =
dynamic_cast<const MethodRuleFit *
>(rfbase);
177 if (events==0)
return 0.0;
178 if (neve==0) neve=events->size();
181 for (
UInt_t ie=0; ie<neve; ie++) {
182 sumw += ((*events)[ie])->GetWeight();
192 fLogger->SetMinType(t);
193 fRuleEnsemble.SetMsgType(t);
194 fRuleFitParams.SetMsgType(t);
203 if (fMethodRuleFit==0) {
204 Log() << kFATAL <<
"RuleFit::BuildTree() - Attempting to build a tree NOT from a MethodRuleFit" <<
Endl;
206 std::vector<const Event *> evevec;
207 for (
UInt_t ie=0; ie<fNTreeSample; ie++) {
208 evevec.push_back(fTrainingEventsRndm[ie]);
223 if (fMethodRuleFit==0) {
224 Log() << kFATAL <<
"RuleFit::BuildTree() - Attempting to build a tree NOT from a MethodRuleFit" <<
Endl;
226 Log() << kDEBUG <<
"Creating a forest with " << fMethodRuleFit->GetNTrees() <<
" decision trees" <<
Endl;
227 Log() << kDEBUG <<
"Each tree is built using a random subsample with " << fNTreeSample <<
" events" <<
Endl;
229 Timer timer( fMethodRuleFit->GetNTrees(),
"RuleFit" );
240 Bool_t useBoost = fMethodRuleFit->UseBoost();
242 if (useBoost) SaveEventWeights();
244 for (
Int_t i=0; i<fMethodRuleFit->GetNTrees(); i++) {
246 if (!useBoost) ReshuffleEvents();
249 for (
UInt_t ie = 0; ie<fNTreeSample; ie++) {
250 if (fMethodBase->DataInfo().IsSignal(fTrainingEventsRndm[ie])) nsig++;
259 const Int_t ntriesMax=10;
262 frnd = 100*rndGen.
Uniform( fMethodRuleFit->GetMinFracNEve(), 0.5*fMethodRuleFit->GetMaxFracNEve() );
264 Bool_t useRandomisedTree = !useBoost;
265 dt =
new DecisionTree( fMethodRuleFit->GetSeparationBase(), frnd, fMethodRuleFit->GetNCuts(), &(fMethodRuleFit->DataInfo()), iclass, useRandomisedTree);
266 dt->
SetNVars(fMethodBase->GetNvar());
274 tryAgain = ((dt==0) && (ntries<ntriesMax));
277 fForest.push_back(dt);
278 if (useBoost) Boost(dt);
282 Log() << kWARNING <<
"------------------------------------------------------------------" <<
Endl;
283 Log() << kWARNING <<
" Failed growing a tree even after " << ntriesMax <<
" trials" <<
Endl;
284 Log() << kWARNING <<
" Possible solutions: " <<
Endl;
285 Log() << kWARNING <<
" 1. increase the number of training events" <<
Endl;
286 Log() << kWARNING <<
" 2. set a lower min fraction cut (fEventsMin)" <<
Endl;
287 Log() << kWARNING <<
" 3. maybe also decrease the max fraction cut (fEventsMax)" <<
Endl;
288 Log() << kWARNING <<
" If the above warning occurs rarely only, it can be ignored" <<
Endl;
289 Log() << kWARNING <<
"------------------------------------------------------------------" <<
Endl;
292 Log() << kDEBUG <<
"Built tree with minimum cut at N = " << frnd <<
"% events"
293 <<
" => N(nodes) = " << fForest.back()->GetNNodes()
294 <<
" ; n(tries) = " << ntries
299 if (useBoost) RestoreEventWeights();
310 fEventWeights.clear();
311 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
312 Double_t w = (*e)->GetBoostWeight();
313 fEventWeights.push_back(w);
323 if (fEventWeights.size() != fTrainingEvents.size()) {
324 Log() << kERROR <<
"RuleFit::RestoreEventWeights() called without having called SaveEventWeights() before!" <<
Endl;
327 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
328 (*e)->SetBoostWeight(fEventWeights[ie]);
343 std::vector<Char_t> correctSelected;
345 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
350 if (isSignalType == fMethodBase->DataInfo().IsSignal(*
e)) {
351 correctSelected.push_back(
kTRUE);
355 correctSelected.push_back(
kFALSE);
363 Double_t boostWeight = (err>0 ? (1.0-err)/err : 1000.0);
367 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
368 if (!correctSelected[ie])
369 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostWeight);
370 newSumw+=(*e)->GetWeight();
375 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end(); ++
e) {
376 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * scale);
378 Log() << kDEBUG <<
"boostWeight = " << boostWeight <<
" scale = " << scale <<
Endl;
387 UInt_t ntrees = fForest.size();
388 if (ntrees==0)
return;
393 for (
UInt_t i=0; i<ntrees; i++) {
400 Log() << kVERBOSE <<
"Nodes in trees: average & std dev = " << sumn/ntrees <<
" , " << sig <<
Endl;
410 Log() << kVERBOSE <<
"Fitting rule/linear terms" <<
Endl;
411 fRuleFitParams.MakeGDPath();
419 Log() << kVERBOSE <<
"Calculating importance" <<
Endl;
420 fRuleEnsemble.CalcImportance();
421 fRuleEnsemble.CleanupRules();
422 fRuleEnsemble.CleanupLinear();
423 fRuleEnsemble.CalcVarImportance();
424 Log() << kVERBOSE <<
"Filling rule statistics" <<
Endl;
425 fRuleEnsemble.RuleResponseStats();
433 return fRuleEnsemble.EvalEvent(
e );
441 if (fMethodRuleFit==0) Log() << kFATAL <<
"RuleFit::SetTrainingEvents - MethodRuleFit not initialized" <<
Endl;
443 if (neve==0) Log() << kWARNING <<
"An empty sample of training events was given" <<
Endl;
446 fTrainingEvents.clear();
447 fTrainingEventsRndm.clear();
448 for (
UInt_t i=0; i<neve; i++) {
449 fTrainingEvents.push_back(
static_cast< const Event *
>(el[i]));
450 fTrainingEventsRndm.push_back(
static_cast< const Event *
>(el[i]));
454 std::shuffle(fTrainingEventsRndm.begin(), fTrainingEventsRndm.end(), fRNGEngine);
457 fNTreeSample =
static_cast<UInt_t>(neve*fMethodRuleFit->GetTreeEveFrac());
458 Log() << kDEBUG <<
"Number of events per tree : " << fNTreeSample
459 <<
" ( N(events) = " << neve <<
" )"
460 <<
" randomly drawn without replacement" <<
Endl;
469 if ((nevents<fTrainingEventsRndm.size()) && (nevents>0)) {
470 evevec.resize(nevents);
471 for (
UInt_t ie=0; ie<nevents; ie++) {
472 evevec[ie] = fTrainingEventsRndm[ie];
476 Log() << kWARNING <<
"GetRndmSampleEvents() : requested sub sample size larger than total size (BUG!).";
487 if (hlist.empty())
return;
494 for (
UInt_t i=0; i<hlist.size(); i++) {
504 if (wm<wmin) wmin=wm;
521 for (
UInt_t i=0; i<hlist.size(); i++) {
540 if (!ruleHasVar)
return;
543 if(firstbin<0) firstbin=0;
551 Double_t fbfrac = (dormin ? ((fbmin+xbinw-rmin)/xbinw):1.0);
552 Double_t lbfrac = (dormax ? ((rmax-lbmax+xbinw)/xbinw):1.0);
557 for (
Int_t bin = binmin; bin<binmax+1; bin++) {
558 fbin = bin-firstbin+1;
562 else if (bin==binmax) {
570 if (fVisHistsUseImp) {
576 h2->
Fill(xc,0.5,val*
f);
586 if (!fRuleEnsemble.DoLinear())
return;
592 if (fVisHistsUseImp) {
593 val = fRuleEnsemble.GetLinImportance(vind);
596 val = fRuleEnsemble.GetLinCoefficients(vind);
598 for (
Int_t bin = firstbin; bin<lastbin+1; bin++) {
600 h2->
Fill(xc,0.5,val);
612 if (fVisHistsUseImp) {
619 Double_t rxmin, rxmax, rymin, rymax;
620 Bool_t dorxmin, dorxmax, dorymin, dorymax;
626 if (!(ruleHasVarX || ruleHasVarY))
return;
645 Double_t fxbinmin = (dorxmin ? ((xbinmin+xbinw-vxmin)/xbinw):1.0);
646 Double_t fxbinmax = (dorxmax ? ((vxmax-xbinmax+xbinw)/xbinw):1.0);
647 Double_t fybinmin = (dorymin ? ((ybinmin+ybinw-vymin)/ybinw):1.0);
648 Double_t fybinmax = (dorymax ? ((vymax-ybinmax+ybinw)/ybinw):1.0);
653 for (
Int_t binx = binxmin; binx<binxmax+1; binx++) {
657 else if (binx==binxmax) {
664 for (
Int_t biny = binymin; biny<binymax+1; biny++) {
668 else if (biny==binymax) {
675 h2->
Fill(xc,yc,val*fx*fy);
685 Int_t nhists = hlist.size();
686 Int_t nvar = fMethodBase->GetNvar();
687 if (nhists!=nvar) Log() << kFATAL <<
"BUG TRAP: number of hists is not equal the number of variables!" <<
Endl;
689 std::vector<Int_t> vindex;
692 for (
Int_t ih=0; ih<nhists; ih++) {
693 hstr = hlist[ih]->GetTitle();
694 for (
Int_t iv=0; iv<nvar; iv++) {
695 if (fMethodBase->GetInputTitle(iv) == hstr)
696 vindex.push_back(iv);
700 for (
Int_t iv=0; iv<nvar; iv++) {
703 FillCut(hlist[iv],rule,vindex[iv]);
707 FillLin(hlist[iv],vindex[iv]);
718 if (!(ruleimp>0))
return;
719 if (ruleimp<fRuleEnsemble.GetImportanceCut())
return;
721 Int_t nhists = hlist.size();
722 Int_t nvar = fMethodBase->GetNvar();
723 Int_t ncorr = (nvar*(nvar+1)/2)-nvar;
724 if (nhists!=ncorr) Log() << kERROR <<
"BUG TRAP: number of corr hists is not correct! ncorr = "
725 << ncorr <<
" nvar = " << nvar <<
" nhists = " << nhists <<
Endl;
727 std::vector< std::pair<Int_t,Int_t> > vindex;
731 for (
Int_t ih=0; ih<nhists; ih++) {
732 hstr = hlist[ih]->GetName();
733 if (GetCorrVars( hstr, var1, var2 )) {
734 iv1 = fMethodBase->DataInfo().FindVarIndex( var1 );
735 iv2 = fMethodBase->DataInfo().FindVarIndex( var2 );
736 vindex.push_back( std::pair<Int_t,Int_t>(iv2,iv1) );
739 Log() << kERROR <<
"BUG TRAP: should not be here - failed getting var1 and var2" <<
Endl;
743 for (
Int_t ih=0; ih<nhists; ih++) {
746 FillCorr(hlist[ih],rule,vindex[ih].
first,vindex[ih].second);
764 var1 = titleCopy(0,splitPos);
765 var2 = titleCopy(splitPos+4, titleCopy.
Length());
778 const TString directories[5] = {
"InputVariables_Id",
779 "InputVariables_Deco",
780 "InputVariables_PCA",
781 "InputVariables_Gauss",
782 "InputVariables_Gauss_Deco" };
784 const TString corrDirName =
"CorrelationPlots";
790 TDirectory* methodDir = fMethodBase->BaseDir();
796 Log() << kWARNING <<
"No basedir - BUG??" <<
Endl;
802 done = ((varDir!=0) || (
type>4));
805 Log() << kWARNING <<
"No input variable directory found - BUG?" <<
Endl;
810 Log() << kWARNING <<
"No correlation directory found" <<
Endl;
811 Log() << kWARNING <<
"Check for other warnings related to correlation histograms" <<
Endl;
815 Log() << kWARNING <<
"No rulefit method directory found - BUG?" <<
Endl;
819 varDirName = varDir->
GetName();
825 Log() << kWARNING <<
"No correlation directory found : " << corrDirName <<
Endl;
831 Log() << kDEBUG <<
"Got number of plots = " << noPlots <<
Endl;
834 std::vector<TH2F *> h1Vector;
835 std::vector<TH2F *> h2CorrVector;
838 while ((key = (
TKey*)next())) {
844 Log() << kDEBUG <<
"Got histogram : " << hname <<
Endl;
858 h1Vector.push_back( newhist );
865 while ((key = (
TKey*)nextCorr())) {
874 Log() << kDEBUG <<
"Got histogram (2D) : " << hname <<
Endl;
882 TH2F *newhist =
new TH2F(newname,htitle,
885 if (GetCorrVars( newname, var1, var2 )) {
886 Int_t iv1 = fMethodBase->DataInfo().FindVarIndex(var1);
887 Int_t iv2 = fMethodBase->DataInfo().FindVarIndex(var2);
902 h2CorrVector.push_back( newhist );
908 UInt_t nrules = fRuleEnsemble.GetNRules();
910 for (
UInt_t i=0; i<nrules; i++) {
911 rule = fRuleEnsemble.GetRulesConst(i);
912 FillVisHistCut(rule, h1Vector);
915 FillVisHistCut(0, h1Vector);
916 NormVisHists(h1Vector);
921 for (
UInt_t i=0; i<nrules; i++) {
922 rule = fRuleEnsemble.GetRulesConst(i);
923 FillVisHistCorr(rule, h2CorrVector);
925 NormVisHists(h2CorrVector);
929 for (
UInt_t i=0; i<h1Vector.size(); i++) h1Vector[i]->Write();
930 for (
UInt_t i=0; i<h2CorrVector.size(); i++) h2CorrVector[i]->Write();
938 TDirectory* methodDir = fMethodBase->BaseDir();
940 Log() << kWARNING <<
"<MakeDebugHists> No rulefit method directory found - bug?" <<
Endl;
945 std::vector<Double_t> distances;
946 std::vector<Double_t> fncuts;
947 std::vector<Double_t> fnvars;
952 UInt_t nrules = fRuleEnsemble.GetNRules();
953 for (
UInt_t i=0; i<nrules; i++) {
954 ruleA = fRuleEnsemble.GetRulesConst(i);
955 for (
UInt_t j=i+1; j<nrules; j++) {
956 ruleB = fRuleEnsemble.GetRulesConst(j);
961 distances.push_back(dAB);
962 fncuts.push_back(
static_cast<Double_t>(nc));
963 fnvars.push_back(
static_cast<Double_t>(nv));
964 if (dAB<dABmin) dABmin=dAB;
965 if (dAB>dABmax) dABmax=dAB;
970 TH1F *histDist =
new TH1F(
"RuleDist",
"Rule distances",100,dABmin,dABmax);
971 TTree *distNtuple =
new TTree(
"RuleDistNtuple",
"RuleDist ntuple");
975 distNtuple->
Branch(
"dist", &ntDist,
"dist/D");
976 distNtuple->
Branch(
"ncuts",&ntNcuts,
"ncuts/D");
977 distNtuple->
Branch(
"nvars",&ntNvars,
"nvars/D");
979 for (
UInt_t i=0; i<distances.size(); i++) {
980 histDist->
Fill(distances[i]);
981 ntDist = distances[i];
virtual Double_t GetBinCenter(Int_t bin) const
Return center of bin.
virtual Int_t FindBin(Double_t x)
Find bin number corresponding to abscissa x.
virtual Double_t GetBinLowEdge(Int_t bin) const
Return low edge of bin.
virtual Double_t GetBinWidth(Int_t bin) const
Return bin width.
TClass instances represent classes, structs and namespaces in the ROOT type system.
Bool_t InheritsFrom(const char *cl) const
Return kTRUE if this class inherits from a class with name "classname".
static TClass * GetClass(const char *name, Bool_t load=kTRUE, Bool_t silent=kFALSE)
Static method returning pointer to TClass of the specified class name.
Describe directory structure in memory.
virtual TObject * Get(const char *namecycle)
Return pointer to object identified by namecycle.
virtual TFile * GetFile() const
virtual Bool_t cd()
Change current directory to "this" directory.
virtual TList * GetListOfKeys() const
1-D histogram with a float per channel (see TH1 documentation)}
virtual Int_t GetNbinsY() const
TAxis * GetXaxis()
Get the behaviour adopted by the object about the statoverflows. See EStatOverflows for more information.
virtual Double_t GetMaximum(Double_t maxval=FLT_MAX) const
Return maximum value smaller than maxval of bins in the range, unless the value has been overridden by TH1::SetMaximum, in which case it returns that value.
virtual Int_t GetNbinsX() const
virtual void SetMaximum(Double_t maximum=-1111)
virtual Int_t Fill(Double_t x)
Increment bin with abscissa X by 1.
virtual void SetMinimum(Double_t minimum=-1111)
virtual void Scale(Double_t c1=1, Option_t *option="")
Multiply this histogram by a constant c1.
virtual Int_t FindBin(Double_t x, Double_t y=0, Double_t z=0)
Return Global bin number corresponding to x,y,z.
virtual Double_t GetMinimum(Double_t minval=-FLT_MAX) const
Return minimum value larger than minval of bins in the range, unless the value has been overridden by TH1::SetMinimum, in which case it returns that value.
2-D histogram with a float per channel (see TH1 documentation)}
Int_t Fill(Double_t)
Invalid Fill method.
virtual Int_t GetBin(Int_t binx, Int_t biny, Int_t binz=0) const
Return Global bin number corresponding to binx,y,z.
Book space in a file, create I/O buffers, to fill them, (un)compress them.
virtual const char * GetClassName() const
virtual TObject * ReadObj()
To read a TObject* from the file.
Implementation of a Decision Tree.
void SetPruneMethod(EPruneMethod m=kCostComplexityPruning)
void SetPruneStrength(Double_t p)
Double_t CheckEvent(const TMVA::Event *, Bool_t UseYesNoLeaf=kFALSE) const
the event e is put into the decision tree (starting at the root node) and the output is NodeType (signal or background) of the final node (basket) in which the given event ends up.
UInt_t BuildTree(const EventConstList &eventSample, DecisionTreeNode *node=NULL)
building the decision tree by recursively calling the splitting of one (root-) node into two daughter nodes.
Double_t PruneTree(const EventConstList *validationSample=NULL)
prune (get rid of internal nodes) the Decision tree to avoid overtraining; several different pruning methods can be applied as selected via fPruneMethod.
Virtual base Class for all MVA method.
J Friedman's RuleFit method.
ostringstream derivative to redirect and format output
Bool_t GetCutRange(Int_t sel, Double_t &rmin, Double_t &rmax, Bool_t &dormin, Bool_t &dormax) const
get cut range for a given selector
A class implementing various fits of rule ensembles.
void GetRndmSampleEvents(std::vector< const TMVA::Event * > &evevec, UInt_t nevents)
draw a random subsample of the training events without replacement
Double_t EvalEvent(const Event &e)
evaluate single event
void SetMethodBase(const MethodBase *rfbase)
set MethodBase
void InitPtrs(const TMVA::MethodBase *rfbase)
initialize pointers
void Boost(TMVA::DecisionTree *dt)
Boost the events.
void ForestStatistics()
summary of statistics of all trees
static const Int_t randSEED
void CalcImportance()
calculates the importance of each rule
void SetMsgType(EMsgType t)
set the current message type to that of mlog for this class and all other subtools
void Initialize(const TMVA::MethodBase *rfbase)
initialize the parameters of the RuleFit method and make rules
virtual ~RuleFit(void)
destructor
void FillVisHistCorr(const Rule *rule, std::vector< TH2F * > &hlist)
help routine to MakeVisHists() - fills for all correlation plots
std::default_random_engine fRNGEngine
void InitNEveEff()
init effective number of events (using event weights)
void SaveEventWeights()
save event weights - must be done before making the forest
void FillCut(TH2F *h2, const TMVA::Rule *rule, Int_t vind)
Fill cut.
void FillLin(TH2F *h2, Int_t vind)
fill lin
Bool_t GetCorrVars(TString &title, TString &var1, TString &var2)
get first and second variables from title
void MakeForest()
make a forest of decisiontrees
const std::vector< const TMVA::DecisionTree * > & GetForest() const
void FitCoefficients()
Fit the coefficients for the rule ensemble.
const MethodBase * GetMethodBase() const
void FillCorr(TH2F *h2, const TMVA::Rule *rule, Int_t v1, Int_t v2)
fill rule correlation between vx and vy, weighted with either the importance or the coefficient
void NormVisHists(std::vector< TH2F * > &hlist)
normalize rule importance hists
void RestoreEventWeights()
restore the event weights that were stored by SaveEventWeights() - must be done after boosting, once the forest has been made
void MakeVisHists()
this will create histograms visualizing the rule ensemble
void FillVisHistCut(const Rule *rule, std::vector< TH2F * > &hlist)
help routine to MakeVisHists() - fills for all variables
void BuildTree(TMVA::DecisionTree *dt)
build the decision tree using fNTreeSample events from fTrainingEventsRndm
const std::vector< const TMVA::Event * > & GetTrainingEvents() const
const MethodRuleFit * GetMethodRuleFit() const
void SetTrainingEvents(const std::vector< const TMVA::Event * > &el)
set the training events randomly
void Copy(const RuleFit &other)
copy method
const RuleEnsemble & GetRuleEnsemble() const
Double_t CalcWeightSum(const std::vector< const TMVA::Event * > *events, UInt_t neve=0)
calculate the sum of weights
RuleFit(void)
default constructor
void MakeDebugHists()
this will create histograms intended rather for debugging or for the curious user
Implementation of a rule.
Double_t GetSupport() const
UInt_t GetNumVarsUsed() const
const RuleCut * GetRuleCut() const
Double_t GetCoefficient() const
Double_t GetImportance() const
Double_t RuleDist(const Rule &other, Bool_t useCutValue) const
Returns:
Bool_t ContainsVariable(UInt_t iv) const
check if variable in node
Timing information for training and evaluation of MVA methods.
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
virtual const char * GetTitle() const
Returns title of object.
virtual const char * GetName() const
Returns name of object.
Random number generator class based on M.
virtual Double_t Uniform(Double_t x1=1)
Returns a uniform deviate on the interval (0, x1).
TString & ReplaceAll(const TString &s1, const TString &s2)
Bool_t BeginsWith(const char *s, ECaseCompare cmp=kExact) const
TString & Remove(Ssiz_t pos)
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
A TTree represents a columnar dataset.
virtual Int_t Fill()
Fill all branches.
TBranch * Branch(const char *name, T *obj, Int_t bufsize=32000, Int_t splitlevel=99)
Add a new branch, and infer the data type from the type of obj being passed.
virtual Int_t Write(const char *name=0, Int_t option=0, Int_t bufsize=0)
Write this object to the current directory.
MsgLogger & Endl(MsgLogger &ml)
Double_t Sqrt(Double_t x)