125 std::vector<const TMVA::Event*> tmp;
126 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
128 tmp.push_back(event);
176 if (events==0)
return 0.0;
177 if (neve==0) neve=events->size();
180 for (
UInt_t ie=0; ie<neve; ie++) {
181 sumw += ((*events)[ie])->GetWeight();
203 Log() << kFATAL <<
"RuleFit::BuildTree() - Attempting to build a tree NOT from a MethodRuleFit" <<
Endl;
205 std::vector<const Event *> evevec;
223 Log() << kFATAL <<
"RuleFit::BuildTree() - Attempting to build a tree NOT from a MethodRuleFit" <<
Endl;
225 Log() << kDEBUG <<
"Creating a forest with " <<
fMethodRuleFit->GetNTrees() <<
" decision trees" <<
Endl;
226 Log() << kDEBUG <<
"Each tree is built using a random subsample with " <<
fNTreeSample <<
" events" <<
Endl;
248 const Int_t ntriesMax=10;
253 Bool_t useRandomisedTree = !useBoost;
263 tryAgain = ((dt==0) && (ntries<ntriesMax));
267 if (useBoost)
Boost(dt);
271 Log() << kWARNING <<
"------------------------------------------------------------------" <<
Endl;
272 Log() << kWARNING <<
" Failed growing a tree even after " << ntriesMax <<
" trials" <<
Endl;
273 Log() << kWARNING <<
" Possible solutions: " <<
Endl;
274 Log() << kWARNING <<
" 1. increase the number of training events" <<
Endl;
275 Log() << kWARNING <<
" 2. set a lower min fraction cut (fEventsMin)" <<
Endl;
276 Log() << kWARNING <<
" 3. maybe also decrease the max fraction cut (fEventsMax)" <<
Endl;
277 Log() << kWARNING <<
" If the above warning occurs rarely only, it can be ignored" <<
Endl;
278 Log() << kWARNING <<
"------------------------------------------------------------------" <<
Endl;
281 Log() << kDEBUG <<
"Built tree with minimum cut at N = " << frnd <<
"% events"
282 <<
" => N(nodes) = " <<
fForest.back()->GetNNodes()
283 <<
" ; n(tries) = " << ntries
301 Double_t w = (*e)->GetBoostWeight();
313 Log() << kERROR <<
"RuleFit::RestoreEventWeights() called without having called SaveEventWeights() before!" <<
Endl;
332 std::vector<Char_t> correctSelected;
339 if (isSignalType ==
fMethodBase->DataInfo().IsSignal(*
e)) {
340 correctSelected.push_back(
kTRUE);
344 correctSelected.push_back(
kFALSE);
357 if (!correctSelected[ie])
358 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostWeight);
359 newSumw+=(*e)->GetWeight();
365 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * scale);
367 Log() << kDEBUG <<
"boostWeight = " << boostWeight <<
" scale = " << scale <<
Endl;
377 if (ntrees==0)
return;
382 for (
UInt_t i=0; i<ntrees; i++) {
389 Log() << kVERBOSE <<
"Nodes in trees: average & std dev = " << sumn/ntrees <<
" , " << sig <<
Endl;
399 Log() << kVERBOSE <<
"Fitting rule/linear terms" <<
Endl;
408 Log() << kVERBOSE <<
"Calculating importance" <<
Endl;
413 Log() << kVERBOSE <<
"Filling rule statistics" <<
Endl;
430 if (
fMethodRuleFit==0)
Log() << kFATAL <<
"RuleFit::SetTrainingEvents - MethodRuleFit not initialized" <<
Endl;
432 if (neve==0)
Log() << kWARNING <<
"An empty sample of training events was given" <<
Endl;
437 for (
UInt_t i=0; i<neve; i++) {
448 <<
" ( N(events) = " << neve <<
" )"
449 <<
" randomly drawn without replacement" <<
Endl;
459 evevec.resize(nevents);
460 for (
UInt_t ie=0; ie<nevents; ie++) {
465 Log() << kWARNING <<
"GetRndmSampleEvents() : requested sub sample size larger than total size (BUG!).";
476 if (hlist.empty())
return;
483 for (
UInt_t i=0; i<hlist.size(); i++) {
493 if (wm<wmin) wmin=wm;
510 for (
UInt_t i=0; i<hlist.size(); i++) {
529 if (!ruleHasVar)
return;
532 if(firstbin<0) firstbin=0;
540 Double_t fbfrac = (dormin ? ((fbmin+xbinw-rmin)/xbinw):1.0);
541 Double_t lbfrac = (dormax ? ((rmax-lbmax+xbinw)/xbinw):1.0);
546 for (
Int_t bin = binmin; bin<binmax+1; bin++) {
547 fbin = bin-firstbin+1;
551 else if (bin==binmax) {
565 h2->
Fill(xc,0.5,val*
f);
587 for (
Int_t bin = firstbin; bin<lastbin+1; bin++) {
589 h2->
Fill(xc,0.5,val);
608 Double_t rxmin, rxmax, rymin, rymax;
609 Bool_t dorxmin, dorxmax, dorymin, dorymax;
615 if (!(ruleHasVarX || ruleHasVarY))
return;
634 Double_t fxbinmin = (dorxmin ? ((xbinmin+xbinw-vxmin)/xbinw):1.0);
635 Double_t fxbinmax = (dorxmax ? ((vxmax-xbinmax+xbinw)/xbinw):1.0);
636 Double_t fybinmin = (dorymin ? ((ybinmin+ybinw-vymin)/ybinw):1.0);
637 Double_t fybinmax = (dorymax ? ((vymax-ybinmax+ybinw)/ybinw):1.0);
642 for (
Int_t binx = binxmin; binx<binxmax+1; binx++) {
646 else if (binx==binxmax) {
653 for (
Int_t biny = binymin; biny<binymax+1; biny++) {
657 else if (biny==binymax) {
664 h2->
Fill(xc,yc,val*fx*fy);
674 Int_t nhists = hlist.size();
676 if (nhists!=nvar)
Log() << kFATAL <<
"BUG TRAP: number of hists is not equal the number of variables!" <<
Endl;
678 std::vector<Int_t> vindex;
681 for (
Int_t ih=0; ih<nhists; ih++) {
682 hstr = hlist[ih]->GetTitle();
683 for (
Int_t iv=0; iv<nvar; iv++) {
685 vindex.push_back(iv);
689 for (
Int_t iv=0; iv<nvar; iv++) {
692 FillCut(hlist[iv],rule,vindex[iv]);
707 if (!(ruleimp>0))
return;
710 Int_t nhists = hlist.size();
712 Int_t ncorr = (nvar*(nvar+1)/2)-nvar;
713 if (nhists!=ncorr)
Log() << kERROR <<
"BUG TRAP: number of corr hists is not correct! ncorr = "
714 << ncorr <<
" nvar = " << nvar <<
" nhists = " << nhists <<
Endl;
716 std::vector< std::pair<Int_t,Int_t> > vindex;
720 for (
Int_t ih=0; ih<nhists; ih++) {
721 hstr = hlist[ih]->GetName();
723 iv1 =
fMethodBase->DataInfo().FindVarIndex( var1 );
724 iv2 =
fMethodBase->DataInfo().FindVarIndex( var2 );
725 vindex.push_back( std::pair<Int_t,Int_t>(iv2,iv1) );
728 Log() << kERROR <<
"BUG TRAP: should not be here - failed getting var1 and var2" <<
Endl;
732 for (
Int_t ih=0; ih<nhists; ih++) {
735 FillCorr(hlist[ih],rule,vindex[ih].first,vindex[ih].second);
746 if(!title.BeginsWith(
"scat_"))
return kFALSE;
748 TString titleCopy = title(5,title.Length());
753 var1 = titleCopy(0,splitPos);
754 var2 = titleCopy(splitPos+4, titleCopy.
Length());
767 const TString directories[5] = {
"InputVariables_Id",
768 "InputVariables_Deco",
769 "InputVariables_PCA",
770 "InputVariables_Gauss",
771 "InputVariables_Gauss_Deco" };
773 const TString corrDirName =
"CorrelationPlots";
785 Log() << kWARNING <<
"No basedir - BUG??" <<
Endl;
791 done = ((varDir!=0) || (type>4));
794 Log() << kWARNING <<
"No input variable directory found - BUG?" <<
Endl;
799 Log() << kWARNING <<
"No correlation directory found" <<
Endl;
800 Log() << kWARNING <<
"Check for other warnings related to correlation histograms" <<
Endl;
804 Log() << kWARNING <<
"No rulefit method directory found - BUG?" <<
Endl;
808 varDirName = varDir->
GetName();
814 Log() << kWARNING <<
"No correlation directory found : " << corrDirName <<
Endl;
820 Log() << kDEBUG <<
"Got number of plots = " << noPlots <<
Endl;
823 std::vector<TH2F *> h1Vector;
824 std::vector<TH2F *> h2CorrVector;
827 while ((key = (
TKey*)next())) {
833 Log() << kDEBUG <<
"Got histogram : " << hname <<
Endl;
847 h1Vector.push_back( newhist );
854 while ((key = (
TKey*)nextCorr())) {
863 Log() << kDEBUG <<
"Got histogram (2D) : " << hname <<
Endl;
871 TH2F *newhist =
new TH2F(newname,htitle,
891 h2CorrVector.push_back( newhist );
899 for (
UInt_t i=0; i<nrules; i++) {
910 for (
UInt_t i=0; i<nrules; i++) {
918 for (
UInt_t i=0; i<h1Vector.size(); i++) h1Vector[i]->Write();
919 for (
UInt_t i=0; i<h2CorrVector.size(); i++) h2CorrVector[i]->Write();
929 Log() << kWARNING <<
"<MakeDebugHists> No rulefit method directory found - bug?" <<
Endl;
934 std::vector<Double_t> distances;
935 std::vector<Double_t> fncuts;
936 std::vector<Double_t> fnvars;
942 for (
UInt_t i=0; i<nrules; i++) {
944 for (
UInt_t j=i+1; j<nrules; j++) {
950 distances.push_back(dAB);
951 fncuts.push_back(
static_cast<Double_t>(nc));
952 fnvars.push_back(
static_cast<Double_t>(nv));
953 if (dAB<dABmin) dABmin=dAB;
954 if (dAB>dABmax) dABmax=dAB;
959 TH1F *histDist =
new TH1F(
"RuleDist",
"Rule distances",100,dABmin,dABmax);
960 TTree *distNtuple =
new TTree(
"RuleDistNtuple",
"RuleDist ntuple");
964 distNtuple->
Branch(
"dist", &ntDist,
"dist/D");
965 distNtuple->
Branch(
"ncuts",&ntNcuts,
"ncuts/D");
966 distNtuple->
Branch(
"nvars",&ntNvars,
"nvars/D");
968 for (
UInt_t i=0; i<distances.size(); i++) {
969 histDist->
Fill(distances[i]);
970 ntDist = distances[i];
int Int_t
Signed integer 4 bytes (int).
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int).
bool Bool_t
Boolean (0=false, 1=true) (bool).
double Double_t
Double 8 bytes.
long long Long64_t
Portable signed long integer 8 bytes.
virtual Double_t GetBinCenter(Int_t bin) const
Return center of bin.
virtual Int_t FindBin(Double_t x)
Find bin number corresponding to abscissa x.
virtual Double_t GetBinLowEdge(Int_t bin) const
Return low edge of bin.
virtual Double_t GetBinWidth(Int_t bin) const
Return bin width.
TClass instances represent classes, structs and namespaces in the ROOT type system.
Bool_t InheritsFrom(const char *cl) const override
Return kTRUE if this class inherits from a class with name "classname".
Describe directory structure in memory.
virtual TObject * Get(const char *namecycle)
Return pointer to object identified by namecycle.
virtual Bool_t cd()
Change current directory to "this" directory.
virtual TList * GetListOfKeys() const
1-D histogram with a float per channel (see TH1 documentation)
virtual Int_t GetNbinsY() const
virtual Double_t GetMaximum(Double_t maxval=FLT_MAX) const
Return maximum value smaller than maxval of bins in the range, unless the value has been overridden b...
virtual Int_t GetNbinsX() const
virtual void SetMaximum(Double_t maximum=-1111)
virtual Int_t Fill(Double_t x)
Increment bin with abscissa X by 1.
virtual void SetMinimum(Double_t minimum=-1111)
virtual void Scale(Double_t c1=1, Option_t *option="")
Multiply this histogram by a constant c1.
virtual Int_t FindBin(Double_t x, Double_t y=0, Double_t z=0)
Return Global bin number corresponding to x,y,z.
virtual Double_t GetMinimum(Double_t minval=-FLT_MAX) const
Return minimum value larger than minval of bins in the range, unless the value has been overridden by...
2-D histogram with a float per channel (see TH1 documentation)
Int_t GetBin(Int_t binx, Int_t biny, Int_t binz=0) const override
Return Global bin number corresponding to binx,y,z.
Int_t Fill(Double_t) override
Invalid Fill method.
Book space in a file, create I/O buffers, to fill them, (un)compress them.
virtual const char * GetClassName() const
virtual TObject * ReadObj()
To read a TObject* from the file.
Implementation of a Decision Tree.
UInt_t BuildTree(const EventConstList &eventSample, DecisionTreeNode *node=nullptr)
building the decision tree by recursively calling the splitting of one (root-) node into two daughter...
void SetPruneMethod(EPruneMethod m=kCostComplexityPruning)
void SetPruneStrength(Double_t p)
Double_t CheckEvent(const TMVA::Event *, Bool_t UseYesNoLeaf=kFALSE) const
the event e is put into the decision tree (starting at the root node) and the output is NodeType (sig...
Double_t PruneTree(const EventConstList *validationSample=nullptr)
prune (get rid of internal nodes) the Decision tree to avoid overtraining several different pruning m...
Virtual base Class for all MVA method.
J Friedman's RuleFit method.
ostringstream derivative to redirect and format output
Bool_t GetCutRange(Int_t sel, Double_t &rmin, Double_t &rmax, Bool_t &dormin, Bool_t &dormax) const
get cut range for a given selector
void GetRndmSampleEvents(std::vector< const TMVA::Event * > &evevec, UInt_t nevents)
draw a random subsample of the training events without replacement
Double_t EvalEvent(const Event &e)
evaluate single event
UInt_t fNTreeSample
number of events in sub sample = frac*neve
void SetMethodBase(const MethodBase *rfbase)
set MethodBase
void InitPtrs(const TMVA::MethodBase *rfbase)
initialize pointers
void Boost(TMVA::DecisionTree *dt)
Boost the events.
Bool_t fVisHistsUseImp
if true, use importance as weight; else coef in vis hists
void ForestStatistics()
summary of statistics of all trees
static const Int_t randSEED
void CalcImportance()
calculates the importance of each rule
void SetMsgType(EMsgType t)
set the current message type to that of mlog for this class and all other subtools
void Initialize(const TMVA::MethodBase *rfbase)
initialize the parameters of the RuleFit method and make rules
std::vector< const TMVA::Event * > fTrainingEventsRndm
idem, but randomly shuffled
virtual ~RuleFit(void)
destructor
void FillVisHistCorr(const Rule *rule, std::vector< TH2F * > &hlist)
help routine to MakeVisHists() - fills for all correlation plots
std::default_random_engine fRNGEngine
void InitNEveEff()
init effective number of events (using event weights)
std::vector< const TMVA::DecisionTree * > fForest
the input forest of decision trees
const MethodBase * fMethodBase
pointer the method base which initialized this RuleFit instance
std::vector< const TMVA::Event * > fTrainingEvents
all training events
void SaveEventWeights()
save event weights - must be done before making the forest
void FillCut(TH2F *h2, const TMVA::Rule *rule, Int_t vind)
Fill cut.
void FillLin(TH2F *h2, Int_t vind)
fill lin
Bool_t GetCorrVars(TString &title, TString &var1, TString &var2)
get first and second variables from title
void MakeForest()
make a forest of decisiontrees
const std::vector< const TMVA::DecisionTree * > & GetForest() const
void FitCoefficients()
Fit the coefficients for the rule ensemble.
const MethodRuleFit * fMethodRuleFit
pointer the method which initialized this RuleFit instance
const MethodBase * GetMethodBase() const
void FillCorr(TH2F *h2, const TMVA::Rule *rule, Int_t v1, Int_t v2)
fill rule correlation between vx and vy, weighted with either the importance or the coefficient
void NormVisHists(std::vector< TH2F * > &hlist)
normalize rule importance hists
void RestoreEventWeights()
restore the event weights that were previously saved with SaveEventWeights()
RuleFitParams fRuleFitParams
fit rule parameters
void MakeVisHists()
this will create histograms visualizing the rule ensemble
void FillVisHistCut(const Rule *rule, std::vector< TH2F * > &hlist)
help routine to MakeVisHists() - fills for all variables
std::vector< Double_t > fEventWeights
original weights of the events - follows fTrainingEvents
void BuildTree(TMVA::DecisionTree *dt)
build the decision tree using fNTreeSample events from fTrainingEventsRndm
const std::vector< const TMVA::Event * > & GetTrainingEvents() const
const MethodRuleFit * GetMethodRuleFit() const
void SetTrainingEvents(const std::vector< const TMVA::Event * > &el)
set the training events randomly
Double_t fNEveEffTrain
reweighted number of events = sum(wi)
void Copy(const RuleFit &other)
copy method
RuleFit(const TMVA::MethodBase *rfbase)
constructor
RuleEnsemble fRuleEnsemble
the ensemble of rules
const RuleEnsemble & GetRuleEnsemble() const
Double_t CalcWeightSum(const std::vector< const TMVA::Event * > *events, UInt_t neve=0)
calculate the sum of weights
RuleFit(void)
default constructor
MsgLogger * fLogger
! message logger
void MakeDebugHists()
this will create histograms intended rather for debugging or for the curious user
Implementation of a rule.
Double_t GetSupport() const
UInt_t GetNumVarsUsed() const
const RuleCut * GetRuleCut() const
Double_t GetCoefficient() const
Double_t GetImportance() const
Double_t RuleDist(const Rule &other, Bool_t useCutValue) const
Returns:
Bool_t ContainsVariable(UInt_t iv) const
check if variable in node
Timing information for training and evaluation of MVA methods.
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
const char * GetName() const override
Returns name of object.
const char * GetTitle() const override
Returns title of object.
Random number generator class based on M.
virtual Double_t Uniform(Double_t x1=1)
Returns a uniform deviate on the interval (0, x1).
TString & ReplaceAll(const TString &s1, const TString &s2)
TString & Remove(Ssiz_t pos)
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
A TTree represents a columnar dataset.
virtual Int_t Fill()
Fill all branches.
TBranch * Branch(const char *name, T *obj, Int_t bufsize=32000, Int_t splitlevel=99)
Add a new branch, and infer the data type from the type of obj being passed.
Int_t Write(const char *name=nullptr, Int_t option=0, Int_t bufsize=0) override
Write this object to the current directory.
MsgLogger & Endl(MsgLogger &ml)
Double_t Sqrt(Double_t x)
Returns the square root of x.
Short_t Abs(Short_t d)
Returns the absolute value of parameter Short_t d.