124 for ( std::vector<Rule *>::iterator itrRule =
fRules.begin(); itrRule !=
fRules.end(); ++itrRule ) {
147 for (
UInt_t i=0; i<nvars; i++) {
201 if (ncoeffs<1)
return 0;
205 for (
Int_t i=0; i<ncoeffs; i++) {
206 val =
fRules[i]->GetCoefficient();
219 for (
UInt_t i=0; i<nrules; i++) {
220 fRules[i]->SetCoefficient(0.0);
230 if (
v.size()!=nrules) {
231 Log() << kFATAL <<
"<SetCoefficients> - BUG TRAP - input vector wrong size! It is = " <<
v.size()
232 <<
" when it should be = " << nrules <<
Endl;
234 for (
UInt_t i=0; i<nrules; i++) {
235 fRules[i]->SetCoefficient(
v[i]);
246 if (nrules==0)
return;
248 for (
UInt_t i=0; i<nrules; i++) {
249 v[i] = (
fRules[i]->GetCoefficient());
258 return &(
fRuleFit->GetTrainingEvents());
266 return fRuleFit->GetTrainingEvent(i);
278 std::vector< Char_t > removeMe( nrulesIn,
false );
283 for (
UInt_t i=0; i<nrulesIn; i++) {
286 for (
UInt_t k=i+1; k<nrulesIn; k++) {
292 remind = (
r>0.5 ? k:i);
299 if (!removeMe[remind]) {
300 removeMe[remind] =
true;
309 for (
UInt_t i=0; i<nrulesIn; i++) {
319 Log() << kVERBOSE <<
"Removed " << nrulesIn - nrulesOut <<
" out of " << nrulesIn <<
" rules" <<
Endl;
328 if (nrules==0)
return;
336 for (
UInt_t i=0; i<nrules; i++) {
345 Log() << kINFO <<
"Removed " << nrules-ind <<
" out of a total of " << nrules
359 for (
UInt_t i=0; i<nlin; i++) {
369 Log() << kVERBOSE <<
"Evaluating Rule support" <<
Endl;
379 if ((nrules>0) && (events->size()>0)) {
380 for ( std::vector< Rule * >::iterator itrRule=
fRules.begin(); itrRule!=
fRules.end(); ++itrRule ) {
384 for ( std::vector<const Event * >::const_iterator itrEvent=events->begin(); itrEvent!=events->end(); ++itrEvent ) {
385 if ((*itrRule)->EvalEvent( *(*itrEvent) )) {
386 ew = (*itrEvent)->GetWeight();
395 t = (t<0 ? 0:sqrt(t));
399 (*itrRule)->SetSupport(s);
400 (*itrRule)->SetNorm(t);
401 (*itrRule)->SetSSB( ssb );
402 (*itrRule)->SetSSBNeve(
Double_t(ssig+sbkg));
418 Double_t maxImp = (maxRuleImp>maxLinImp ? maxRuleImp : maxLinImp);
428 fRules[i]->SetImportanceRef(impref);
440 for (
int i=0; i<nrules; i++ ) {
441 fRules[i]->CalcImportance();
442 imp =
fRules[i]->GetImportance();
443 if (imp>maxImp) maxImp = imp;
445 for (
Int_t i=0; i<nrules; i++ ) {
446 fRules[i]->SetImportanceRef(maxImp);
470 for (
UInt_t i=0; i<nvars; i++ ) {
473 if (imp>maxImp) maxImp = imp;
483 Log() << kVERBOSE <<
"Compute variable importance" <<
Endl;
486 if (
GetMethodBase()==0)
Log() << kFATAL <<
"RuleEnsemble::CalcVarImportance() - should not be here!" <<
Endl;
493 for (
UInt_t ind=0; ind<nrules; ind++ ) {
494 rimp =
fRules[ind]->GetImportance();
495 nvarsUsed =
fRules[ind]->GetNumVarsUsed();
497 Log() << kFATAL <<
"<CalcVarImportance> Variables for importance calc!!!??? A BUG!" <<
Endl;
498 rimpN = (nvarsUsed > 0 ? rimp/nvarsUsed:0.0);
499 for (
UInt_t iv=0; iv<nvars; iv++ ) {
500 if (
fRules[ind]->ContainsVariable(iv)) {
516 for (
UInt_t iv=0; iv<nvars; iv++ ) {
520 for (
UInt_t iv=0; iv<nvars; iv++ ) {
535 fRules.resize(rules.size());
558 UInt_t ntrees = forest.size();
559 for (
UInt_t ind=0; ind<ntrees; ind++ ) {
563 nendn = (nrules/2) + 1;
565 sumn2 += nendn*nendn;
566 nrulesCheck += nrules;
568 Double_t nmean = (ntrees>0) ? sumnendn/ntrees : 0;
570 Double_t ndev = 2.0*(nmean-2.0-nsigm)/(nmean-2.0+nsigm);
572 Log() << kVERBOSE <<
"Average number of end nodes per tree = " << nmean <<
Endl;
573 if (ntrees>1)
Log() << kVERBOSE <<
"sigma of ditto ( ~= mean-2 ?) = "
576 Log() << kVERBOSE <<
"Deviation from exponential model = " << ndev <<
Endl;
577 Log() << kVERBOSE <<
"Corresponds to L (eq. 13, RuleFit ppr) = " << nmean <<
Endl;
579 if (nrulesCheck !=
static_cast<Int_t>(
fRules.size())) {
581 <<
"BUG! number of generated and possible rules do not match! N(rules) = " <<
fRules.size()
582 <<
" != " << nrulesCheck <<
Endl;
584 Log() << kVERBOSE <<
"Number of generated rules: " <<
fRules.size() <<
Endl;
604 UInt_t neve = events->size();
605 UInt_t nvars = ((*events)[0])->GetNVariables();
607 typedef std::pair< Double_t, Int_t> dataType;
608 typedef std::pair< Double_t, dataType > dataPoint;
610 std::vector< std::vector<dataPoint> > vardata(nvars);
611 std::vector< Double_t > varsum(nvars,0.0);
612 std::vector< Double_t > varsum2(nvars,0.0);
617 for (
UInt_t i=0; i<neve; i++) {
618 ew = ((*events)[i])->GetWeight();
620 val = ((*events)[i])->GetValue(
v);
621 vardata[
v].push_back( dataPoint( val, dataType(ew,((*events)[i])->GetClass()) ) );
651 std::sort( vardata[
v].begin(),vardata[
v].end() );
656 while ( (ie<neve) && (neff<nquant) ) {
657 neff += vardata[
v][ie].second.first;
660 indquantM = (ie==0 ? 0:ie-1);
664 while ( (ie>0) && (neff<nquant) ) {
666 neff += vardata[
v][ie].second.first;
668 indquantP = (ie==neve ? ie=neve-1:ie);
670 fLinDM[
v] = vardata[
v][indquantM].first;
671 fLinDP[
v] = vardata[
v][indquantP].first;
685 for (ie=0; ie<neve; ie++) {
686 val = vardata[
v][ie].first;
687 ew = vardata[
v][ie].second.first;
688 type = vardata[
v][ie].second.second;
691 varsum2[
v] += ew*lx*lx;
728 fstot +=
fLinPDFS[
v]->GetBinContent(bin);
729 fbtot +=
fLinPDFB[
v]->GetBinContent(bin);
731 if (nvars<1)
return 0;
732 ntot = (fstot+fbtot)/
Double_t(nvars);
734 return fstot/(fstot+fbtot);
749 for (
UInt_t ir=0; ir<nrules; ir++) {
761 if (ntot>0)
return nsig/ntot;
793 if ((nlt>0) && (nrt>0)) nt=2.0;
806 const UInt_t neve = events->size();
809 const Event *eveData;
825 std::vector<Int_t> varcnt;
833 varcnt.resize(nvars,0);
837 for (
UInt_t i=0; i<nrules; i++ ) {
839 if (
fRules[i]->ContainsVariable(
v)) varcnt[
v]++;
841 sigRule =
fRules[i]->IsSignalRule();
856 eveData = (*events)[
e];
857 tagged =
fRules[i]->EvalEvent(*eveData);
858 sigTag = (tagged && sigRule);
859 bkgTag = (tagged && (!sigRule));
861 sigTrue = (eveData->
GetClass() == 0);
864 if (sigTag && sigTrue) nss++;
865 if (sigTag && !sigTrue) nsb++;
866 if (bkgTag && sigTrue) nbs++;
867 if (bkgTag && !sigTrue) nbb++;
871 if (ntag>0 && neve > 0) {
895 for (
UInt_t i=0; i<nrules; i++ ) {
913 Log() << kHEADER <<
"-------------------RULE ENSEMBLE SUMMARY------------------------" <<
Endl;
915 if (mrf)
Log() << kINFO <<
"Tree training method : " << (mrf->
UseBoost() ?
"AdaBoost":
"Random") <<
Endl;
916 Log() << kINFO <<
"Number of events per tree : " <<
fRuleFit->GetNTreeSample() <<
Endl;
917 Log() << kINFO <<
"Number of trees : " <<
fRuleFit->GetForest().size() <<
Endl;
919 Log() << kINFO <<
"Idem, after cleanup : " <<
fRules.size() <<
Endl;
923 Log() << kINFO <<
"----------------------------------------------------------------" <<
Endl;
932 const EMsgType kmtype=kINFO;
936 Log() << kmtype <<
"================================================================" <<
Endl;
937 Log() << kmtype <<
" M o d e l " <<
Endl;
938 Log() << kmtype <<
"================================================================" <<
Endl;
950 Log() << kDEBUG <<
"Variable importance:" <<
Endl;
953 << std::resetiosflags(std::ios::right)
962 Log() << kmtype <<
"------------------------------------" <<
Endl;
963 Log() << kmtype <<
"Linear model (weights unnormalised)" <<
Endl;
964 Log() << kmtype <<
"------------------------------------" <<
Endl;
965 Log() << kmtype << std::setw(maxL) <<
"Variable"
966 << std::resetiosflags(std::ios::right) <<
" : "
967 << std::setw(11) <<
" Weights"
968 << std::resetiosflags(std::ios::right) <<
" : "
970 << std::resetiosflags(std::ios::right)
972 Log() << kmtype <<
"------------------------------------" <<
Endl;
974 Log() << kmtype << std::setw(std::max(maxL,8)) <<
GetMethodBase()->GetInputLabel(i);
977 << std::resetiosflags(std::ios::right)
982 Log() << kmtype <<
"-> importance below threshold = "
986 Log() << kmtype <<
"------------------------------------" <<
Endl;
989 else Log() << kmtype <<
"Linear terms were disabled" <<
Endl;
991 if ((!
DoRules()) || (nrules==0)) {
993 Log() << kmtype <<
"Rule terms were disabled" <<
Endl;
996 Log() << kmtype <<
"Even though rules were included in the model, none passed! " << nrules <<
Endl;
1000 Log() << kmtype <<
"Number of rules = " << nrules <<
Endl;
1005 Log() << kmtype <<
"Fraction of rules containing a variable (%):" <<
Endl;
1014 std::list< std::pair<double,int> > sortedImp;
1015 for (
Int_t i=0; i<nrules; i++) {
1016 sortedImp.push_back( std::pair<double,int>(
fRules[i]->GetImportance(),i ) );
1020 Log() << kmtype <<
"Printing the first " << printN <<
" rules, ordered in importance." <<
Endl;
1022 for ( std::list< std::pair<double,int> >::reverse_iterator itpair = sortedImp.rbegin();
1023 itpair != sortedImp.rend(); ++itpair ) {
1024 ind = itpair->second;
1031 if (nrules==printN) {
1032 Log() << kmtype <<
"All rules printed" <<
Endl;
1035 Log() << kmtype <<
"Skipping the next " << nrules-printN <<
" rules" <<
Endl;
1041 Log() << kmtype <<
"================================================================" <<
Endl;
1050 Int_t dp = os.precision();
1058 os <<
"Offset= " <<
fOffset << std::endl;
1059 os <<
"NRules= " << nrules << std::endl;
1060 for (
UInt_t i=0; i<nrules; i++){
1061 os <<
"***Rule " << i << std::endl;
1066 os <<
"NLinear= " <<
fLinTermOK.size() << std::endl;
1067 for (
UInt_t i=0; i<nlinear; i++) {
1068 os <<
"***Linear " << i << std::endl;
1069 os << std::setprecision(10) << (
fLinTermOK[i] ? 1:0) <<
" "
1076 os << std::setprecision(dp);
1098 for (
UInt_t i=0; i<nlinear; i++) {
1118 Int_t iLearningModel;
1133 for (i=0; i<nrules; i++) {
1135 fRules[i]->SetRuleEnsemble(
this );
1136 fRules[i]->ReadFromXML( ch );
1145 fLinDP .resize( nlinear );
1146 fLinDM .resize( nlinear );
1182 istr >> dummy >> nrules;
1188 for (
UInt_t i=0; i<nrules; i++){
1189 istr >> dummy >> idum;
1191 (
fRules.back())->SetRuleEnsemble(
this );
1192 (
fRules.back())->ReadRaw(istr);
1200 istr >> dummy >> nlinear;
1205 fLinDP .resize( nlinear );
1206 fLinDM .resize( nlinear );
1211 for (
UInt_t i=0; i<nlinear; i++) {
1212 istr >> dummy >> idum;
1228 if(
this != &other) {
1255 if (dtree==0)
return 0;
1257 Int_t nendnodes = 0;
1259 return 2*(nendnodes-1);
1267 if (node==0)
return;
1292 if (node==0)
return;
1300 fRules.push_back( rule );
1305 Log() << kFATAL <<
"<AddRule> - ERROR failed in creating a rule! BUG!" <<
Endl;
1319 Log() << kFATAL <<
"<MakeTheRule> Input node is NULL. Should not happen. BUG!" <<
Endl;
1327 std::vector< const Node * > nodeVec;
1328 const Node *parent = node;
1333 nodeVec.push_back( node );
1336 if (!parent)
continue;
1339 nodeVec.insert( nodeVec.begin(), parent );
1342 if (nodeVec.size()<2) {
1343 Log() << kFATAL <<
"<MakeTheRule> BUG! Inconsistent Rule!" <<
Endl;
1346 Rule *rule =
new Rule(
this, nodeVec );
1356 Log() << kVERBOSE <<
"Making Rule map for all events" <<
Endl;
1359 if ((ifirst==0) || (ilast==0) || (ifirst>ilast)) {
1361 ilast = events->size()-1;
1371 Log() << kVERBOSE <<
"<MakeRuleMap> Map is already valid" <<
Endl;
1380 Log() << kVERBOSE <<
"No rules found in MakeRuleMap()" <<
Endl;
1387 std::vector<UInt_t> ruleind;
1389 for (
UInt_t i=ifirst; i<=ilast; i++) {
1399 Log() << kVERBOSE <<
"Made rule map for event# " << ifirst <<
" : " << ilast <<
Endl;
1407 os <<
"DON'T USE THIS - TO BE REMOVED" << std::endl;
int Int_t
Signed integer 4 bytes (int).
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int).
bool Bool_t
Boolean (0=false, 1=true) (bool).
double Double_t
Double 8 bytes.
char * Form(const char *fmt,...)
Formats a string in a circular formatting buffer.
1-D histogram with a float per channel (see TH1 documentation)
Short_t GetSelector() const
return index of variable used for discrimination at this node
Implementation of a Decision Tree.
DecisionTreeNode * GetRoot() const override
Virtual base class for all MVA methods.
Bool_t IsSilentFile() const
J Friedman's RuleFit method.
ostringstream derivative to redirect and format output
Node for the BinarySearch or Decision Trees.
virtual Node * GetLeft() const
virtual Node * GetParent() const
virtual Node * GetRight() const
std::vector< Double_t > fRulePBB
p(tag as B|B)
virtual ~RuleEnsemble()
destructor
Double_t EvalEvent() const
void CalcVarImportance()
Calculates variable importance using eq (35) in the RuleFit paper by Friedman et al.
std::vector< Double_t > fLinImportance
linear term importance
void SetImportanceRef(Double_t impref)
set reference importance
void CalcImportance()
calculate the importance of each rule
void PrintRuleGen() const
print rule generation info
Int_t CalcNRules(const TMVA::DecisionTree *dtree)
calculate the number of rules
std::vector< Double_t > fLinCoefficients
linear coefficients, one per variable
UInt_t fNRulesGenerated
number of rules generated, before cleanup
void ResetCoefficients()
reset all rule coefficients
std::vector< Double_t > fRulePBS
p(tag as B|S)
void SetMsgType(EMsgType t)
std::vector< TMVA::Rule * > fRules
vector of rules
Double_t CalcLinNorm(Double_t stdev)
Double_t fOffset
offset in discriminator function
Double_t GetLinQuantile() const
void ReadRaw(std::istream &istr)
read rule ensemble from stream
std::vector< Double_t > fRulePSS
p(tag as S|S) - tagged as S if rule is SIG and the event is accepted
std::vector< Double_t > fLinDP
delta+ in eq 24, ref 2
const Event * fEvent
current event.
void AddRule(const Node *node)
add a new rule, built from the given node, to the ensemble's rule vector
void ReadFromXML(void *wghtnode)
read rules from XML
Double_t GetImportanceCut() const
const Event * GetTrainingEvent(UInt_t i) const
get the training event from the rule fitter
const std::vector< const TMVA::Event * > * GetTrainingEvents() const
get list of training events from the rule fitter
Double_t GetRuleMinDist() const
void SetRules(const std::vector< TMVA::Rule * > &rules)
set rules
void MakeRules(const std::vector< const TMVA::DecisionTree * > &forest)
Makes rules from the given decision tree.
void RemoveSimilarRules()
remove rules that behave similarly
std::vector< Double_t > fRulePTag
p(tag)
std::vector< TH1F * > fLinPDFB
pdfs for each variable, background
std::vector< Char_t > fEventRuleVal
the rule response of the current event <-- stores boolean
Double_t fRuleFSig
N(sig)/(N(sig)+N(bkg)).
ELearningModel fLearningModel
can be full (rules+linear), rules, linear
Double_t fRuleMinDist
minimum rule distance
void FindNEndNodes(const TMVA::Node *node, Int_t &nendnodes)
find the number of leaf nodes
Bool_t fRuleMapOK
true if MakeRuleMap() has been called
RuleEnsemble()
constructor
const RuleFit * fRuleFit
pointer to rule fit object
UInt_t fRuleMapInd1
last index
const std::vector< Double_t > & GetVarImportance() const
void CleanupRules()
cleanup rules
void Initialize(const RuleFit *rf)
Initializes all member variables with default values.
std::vector< Double_t > fLinDM
delta-
void CleanupLinear()
cleanup linear model
void RuleResponseStats()
calculate various statistics for this rule
std::vector< Double_t > fVarImportance
one importance per input variable
const RuleFit * GetRuleFit() const
void * AddXMLTo(void *parent) const
write rules to XML
std::vector< Double_t > fLinNorm
norm of ditto, see after eq 26 in ref 2
const std::vector< TMVA::Rule * > & GetRulesConst() const
Double_t fLinQuantile
quantile cut to remove outliers
const MethodRuleFit * GetMethodRuleFit() const
Get a pointer to the original MethodRuleFit.
std::vector< Double_t > fEventLinearVal
linear response
void MakeModel()
create model
void RuleStatistics()
calculate various statistics for this rule
void SetCoefficients(const std::vector< Double_t > &v)
set all rule coefficients
void Print() const
print function
Double_t PdfRule(Double_t &nsig, Double_t &ntot) const
This function returns Pr( y = 1 | x ) for rules.
void MakeRuleMap(const std::vector< const TMVA::Event * > *events=nullptr, UInt_t ifirst=0, UInt_t ilast=0)
Makes rule map for all events.
const MethodBase * GetMethodBase() const
Get a pointer to the original MethodRuleFit.
Double_t fAverageSupport
average support (over all rules)
Double_t GetOffset() const
Double_t fRuleNCave
N(cuts) average.
std::vector< Char_t > fLinTermOK
flags linear terms with sufficiently strong importance <-- stores boolean
RuleEnsemble(RuleFit *rf)
constructor
void Copy(RuleEnsemble const &other)
copy function
MsgLogger * fLogger
! message logger
std::vector< std::vector< UInt_t > > fRuleMap
map of rule responses
Double_t CalcLinImportance()
calculate the linear importance for each rule
const std::vector< const TMVA::Event * > * fRuleMapEvents
pointer to vector of events used
void SetAverageRuleSigma(Double_t v)
Double_t CalcRuleImportance()
calculate importance of each rule
std::vector< TH1F * > fLinPDFS
pdfs for each variable, signal
Double_t fImportanceRef
reference importance (max)
void PrintRaw(std::ostream &os) const
write rules to stream
std::vector< Double_t > fRulePSB
p(tag as S|B)
Double_t fAverageRuleSigma
average rule sigma
Bool_t fEventCacheOK
true if rule/linear responses are updated
void CalcRuleSupport()
calculate the support for all rules
Double_t fImportanceCut
minimum importance accepted
ELearningModel GetLearningModel() const
Double_t PdfLinear(Double_t &nsig, Double_t &ntot) const
This function returns Pr( y = 1 | x ) for the linear terms.
Double_t CoefficientRadius()
Calculates sqrt(Sum(a_i^2)), i=1..N (NOTE do not include a0).
void SetEvent(const Event &e)
UInt_t fRuleMapInd0
start index
void MakeRulesFromTree(const DecisionTree *dtree)
create rules from the decision tree structure
Double_t fRuleNCsig
idem sigma
void MakeLinearTerms()
Make the linear terms as in eq 25, ref 2. For this, the b and (1-b) quantiles are needed.
Rule * MakeTheRule(const Node *node)
Make a Rule from a given Node.
std::vector< Double_t > fRuleVarFrac
fraction of rules using a given variable - size of vector = n(variables)
void GetCoefficients(std::vector< Double_t > &v)
Retrieve all rule coefficients.
Double_t FStar() const
We want to estimate F* = argmin Eyx( L(y,F(x)) ), min wrt F(x), where F(x) = FL(x) + FR(x), ...
A class implementing various fits of rule ensembles.
Implementation of a rule.
void SetMsgType(EMsgType t)
Bool_t Equal(const Rule &other, Bool_t useCutValue, Double_t maxdist) const
Compare two rules.
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
std::ostream & operator<<(std::ostream &os, const BinaryTree &tree)
MsgLogger & Endl(MsgLogger &ml)
Short_t Max(Short_t a, Short_t b)
Returns the largest of a and b.
Double_t Sqrt(Double_t x)
Returns the square root of x.
Short_t Min(Short_t a, Short_t b)
Returns the smallest of a and b.
Short_t Abs(Short_t d)
Returns the absolute value of parameter Short_t d.