#include <algorithm>
#include <math.h>
#include <fstream>
#include "Riostream.h"
#include "TRandom3.h"
#include "TMath.h"
#include "TObjString.h"
#include "TGraph.h"
#include "TMVA/ClassifierFactory.h"
#include "TMVA/MethodBDT.h"
#include "TMVA/Tools.h"
#include "TMVA/Timer.h"
#include "TMVA/Ranking.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/BinarySearchTree.h"
#include "TMVA/SeparationBase.h"
#include "TMVA/GiniIndex.h"
#include "TMVA/GiniIndexWithLaplace.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/MisClassificationError.h"
#include "TMVA/Results.h"
#include "TMVA/ResultsMulticlass.h"
#include "TMVA/Interval.h"
#include "TMVA/LogInterval.h"
#include "TMVA/PDF.h"
#include "TMVA/BDTEventWrapper.h"
#include "TMatrixTSym.h"
using std::vector;
using std::make_pair;
REGISTER_METHOD(BDT)
ClassImp(TMVA::MethodBDT)
const Int_t TMVA::MethodBDT::fgDebugLevel = 0;
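////////////////////////////////////////////////////////////////////////////////
/// The standard constructor for the "boosted decision trees": all configurable
/// members are zero-initialised here and set later in DeclareOptions()/ProcessOptions().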
TMVA::MethodBDT::MethodBDT( const TString& jobName,
const TString& methodTitle,
DataSetInfo& theData,
const TString& theOption,
TDirectory* theTargetDir ) :
TMVA::MethodBase( jobName, Types::kBDT, methodTitle, theData, theOption, theTargetDir )
, fTrainSample(0)
, fNTrees(0)
, fSigToBkgFraction(0)
, fAdaBoostBeta(0)
, fTransitionPoint(0)
, fShrinkage(0)
, fBaggedBoost(kFALSE)
, fBaggedGradBoost(kFALSE)
, fSumOfWeights(0)
, fMinNodeEvents(0)
, fMinNodeSize(5)
, fMinNodeSizeS("5%")
, fNCuts(0)
, fUseFisherCuts(0)
, fMinLinCorrForFisher(.8)
, fUseExclusiveVars(0)
, fUseYesNoLeaf(kFALSE)
, fNodePurityLimit(0)
, fNNodesMax(0)
, fMaxDepth(0)
, fPruneMethod(DecisionTree::kNoPruning)
, fPruneStrength(0)
, fFValidationEvents(0)
, fAutomatic(kFALSE)
, fRandomisedTrees(kFALSE)
, fUseNvars(0)
, fUsePoissonNvars(0)
, fUseNTrainEvents(0)
, fBaggedSampleFraction(0)
, fNoNegWeightsInTraining(kFALSE)
, fInverseBoostNegWeights(kFALSE)
, fPairNegWeightsGlobal(kFALSE)
, fTrainWithNegWeights(kFALSE)
, fDoBoostMonitor(kFALSE)
, fITree(0)
, fBoostWeight(0)
, fErrorFraction(0)
, fCss(0)
, fCts_sb(0)
, fCtb_ss(0)
, fCbb(0)
, fDoPreselection(kFALSE)
, fHistoricBool(kFALSE)
{
fMonitorNtuple = NULL;
fSepType = NULL;
}
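////////////////////////////////////////////////////////////////////////////////
/// Constructor used when the method is booked from an existing weight file,
/// i.e. the forest is read back rather than trained.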
TMVA::MethodBDT::MethodBDT( DataSetInfo& theData,
const TString& theWeightFile,
TDirectory* theTargetDir )
: TMVA::MethodBase( Types::kBDT, theData, theWeightFile, theTargetDir )
, fTrainSample(0)
, fNTrees(0)
, fSigToBkgFraction(0)
, fAdaBoostBeta(0)
, fTransitionPoint(0)
, fShrinkage(0)
, fBaggedBoost(kFALSE)
, fBaggedGradBoost(kFALSE)
, fSumOfWeights(0)
, fMinNodeEvents(0)
, fMinNodeSize(5)
, fMinNodeSizeS("5%")
, fNCuts(0)
, fUseFisherCuts(0)
, fMinLinCorrForFisher(.8)
, fUseExclusiveVars(0)
, fUseYesNoLeaf(kFALSE)
, fNodePurityLimit(0)
, fNNodesMax(0)
, fMaxDepth(0)
, fPruneMethod(DecisionTree::kNoPruning)
, fPruneStrength(0)
, fFValidationEvents(0)
, fAutomatic(kFALSE)
, fRandomisedTrees(kFALSE)
, fUseNvars(0)
, fUsePoissonNvars(0)
, fUseNTrainEvents(0)
, fBaggedSampleFraction(0)
, fNoNegWeightsInTraining(kFALSE)
, fInverseBoostNegWeights(kFALSE)
, fPairNegWeightsGlobal(kFALSE)
, fTrainWithNegWeights(kFALSE)
, fDoBoostMonitor(kFALSE)
, fITree(0)
, fBoostWeight(0)
, fErrorFraction(0)
, fCss(0)
, fCts_sb(0)
, fCtb_ss(0)
, fCbb(0)
, fDoPreselection(kFALSE)
, fHistoricBool(kFALSE)
{
fMonitorNtuple = NULL;
fSepType = NULL;
}
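////////////////////////////////////////////////////////////////////////////////
/// BDT can handle two-class classification, multiclass classification, and
/// regression with one target.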
Bool_t TMVA::MethodBDT::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets )
{
if (type == Types::kClassification && numberClasses == 2) return kTRUE;
if (type == Types::kMulticlass ) return kTRUE;
if( type == Types::kRegression && numberTargets == 1 ) return kTRUE;
return kFALSE;
}
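////////////////////////////////////////////////////////////////////////////////
/// Define the options (their key words) that can be set in the option string,
/// together with their defaults and the allowed predefined values.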
void TMVA::MethodBDT::DeclareOptions()
{
DeclareOptionRef(fNTrees, "NTrees", "Number of trees in the forest");
if (DoRegression()) {
DeclareOptionRef(fMaxDepth=50,"MaxDepth","Max depth of the decision tree allowed");
}else{
DeclareOptionRef(fMaxDepth=3,"MaxDepth","Max depth of the decision tree allowed");
}
TString tmp="5%"; if (DoRegression()) tmp="0.2%";
DeclareOptionRef(fMinNodeSizeS=tmp, "MinNodeSize", "Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");
DeclareOptionRef(fNCuts, "nCuts", "Number of grid points in variable range used in finding optimal cut in node splitting");
DeclareOptionRef(fBoostType, "BoostType", "Boosting type for the trees in the forest (note: AdaCost is still experimental)");
AddPreDefVal(TString("AdaBoost"));
AddPreDefVal(TString("RealAdaBoost"));
AddPreDefVal(TString("AdaCost"));
AddPreDefVal(TString("Bagging"));
AddPreDefVal(TString("AdaBoostR2"));
AddPreDefVal(TString("Grad"));
if (DoRegression()) {
fBoostType = "AdaBoostR2";
}else{
fBoostType = "AdaBoost";
}
DeclareOptionRef(fAdaBoostR2Loss="Quadratic", "AdaBoostR2Loss", "Type of Loss function in AdaBoostR2");
AddPreDefVal(TString("Linear"));
AddPreDefVal(TString("Quadratic"));
AddPreDefVal(TString("Exponential"));
DeclareOptionRef(fBaggedBoost=kFALSE, "UseBaggedBoost","Use only a random subsample of all events for growing the trees in each boost iteration.");
DeclareOptionRef(fShrinkage=1.0, "Shrinkage", "Learning rate for GradBoost algorithm");
DeclareOptionRef(fAdaBoostBeta=.5, "AdaBoostBeta", "Learning rate for AdaBoost algorithm");
DeclareOptionRef(fRandomisedTrees,"UseRandomisedTrees","Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");
DeclareOptionRef(fUseNvars,"UseNvars","Size of the subset of variables used with RandomisedTree option");
DeclareOptionRef(fUsePoissonNvars,"UsePoissonNvars", "Interpret \"UseNvars\" not as a fixed number but as the mean of a Poisson distribution in each split with the RandomisedTree option");
DeclareOptionRef(fBaggedSampleFraction=.6,"BaggedSampleFraction","Relative size of bagged event sample to original size of the data sample (used whenever bagging is used, i.e. UseBaggedBoost or Bagging)" );
DeclareOptionRef(fUseYesNoLeaf=kTRUE, "UseYesNoLeaf",
"Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node -> Real-AdaBoost");
if (DoRegression()) {
fUseYesNoLeaf = kFALSE;
}
DeclareOptionRef(fNegWeightTreatment="InverseBoostNegWeights","NegWeightTreatment","How to treat events with negative weights in the BDT training (in particular the boosting): ignore them in training; boost with the inverse boost weight; pair events with negative and positive weights in the training sample and *annihilate* them (experimental!)");
AddPreDefVal(TString("InverseBoostNegWeights"));
AddPreDefVal(TString("IgnoreNegWeightsInTraining"));
AddPreDefVal(TString("NoNegWeightsInTraining"));
AddPreDefVal(TString("PairNegWeightsGlobal"));
AddPreDefVal(TString("Pray"));
DeclareOptionRef(fCss=1., "Css", "AdaCost: cost of true signal selected signal");
DeclareOptionRef(fCts_sb=1.,"Cts_sb","AdaCost: cost of true signal selected bkg");
DeclareOptionRef(fCtb_ss=1.,"Ctb_ss","AdaCost: cost of true bkg selected signal");
DeclareOptionRef(fCbb=1., "Cbb", "AdaCost: cost of true bkg selected bkg ");
DeclareOptionRef(fNodePurityLimit=0.5, "NodePurityLimit", "In boosting/pruning, nodes with purity > NodePurityLimit are signal; background otherwise.");
DeclareOptionRef(fSepTypeS, "SeparationType", "Separation criterion for node splitting");
AddPreDefVal(TString("CrossEntropy"));
AddPreDefVal(TString("GiniIndex"));
AddPreDefVal(TString("GiniIndexWithLaplace"));
AddPreDefVal(TString("MisClassificationError"));
AddPreDefVal(TString("SDivSqrtSPlusB"));
AddPreDefVal(TString("RegressionVariance"));
if (DoRegression()) {
fSepTypeS = "RegressionVariance";
}else{
fSepTypeS = "GiniIndex";
}
DeclareOptionRef(fDoBoostMonitor=kFALSE,"DoBoostMonitor","Create control plot with ROC integral vs tree number");
DeclareOptionRef(fUseFisherCuts=kFALSE, "UseFisherCuts", "Use multivariate splits using the Fisher criterion");
DeclareOptionRef(fMinLinCorrForFisher=.8,"MinLinCorrForFisher", "The minimum linear correlation between two variables demanded for use in Fisher criterion in node splitting");
DeclareOptionRef(fUseExclusiveVars=kFALSE,"UseExclusiveVars","Variables already used in fisher criterion are not anymore analysed individually for node splitting");
DeclareOptionRef(fDoPreselection=kFALSE,"DoPreselection","Apply automatic pre-selection of 100% efficient signal (bkg) cuts prior to training");
DeclareOptionRef(fSigToBkgFraction=1,"SigToBkgFraction","Sig to Bkg ratio used in Training (similar to NodePurityLimit, which cannot be used in real adaboost)");
DeclareOptionRef(fPruneMethodS, "PruneMethod", "Method used for pruning (removal) of statistically insignificant branches (note: for BDTs use small trees, e.g. MaxDepth=3, and NoPruning)");
AddPreDefVal(TString("NoPruning"));
AddPreDefVal(TString("ExpectedError"));
AddPreDefVal(TString("CostComplexity"));
DeclareOptionRef(fPruneStrength, "PruneStrength", "Pruning strength");
DeclareOptionRef(fFValidationEvents=0.5, "PruningValFraction", "Fraction of events to use for optimizing automatic pruning.");
DeclareOptionRef(fMinNodeEvents=0, "nEventsMin", "deprecated: Use MinNodeSize (in % of training events) instead");
DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad","deprecated: Use *UseBaggedBoost* instead: Use only a random subsample of all events for growing the trees in each iteration.");
DeclareOptionRef(fBaggedSampleFraction, "GradBaggingFraction","deprecated: Use *BaggedSampleFraction* instead: Defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE. ");
DeclareOptionRef(fUseNTrainEvents,"UseNTrainEvents","deprecated: Use *BaggedSampleFraction* instead: Number of randomly picked training events used in randomised (and bagged) trees");
DeclareOptionRef(fNNodesMax,"NNodesMax","deprecated: Use MaxDepth instead to limit the tree size" );
}
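////////////////////////////////////////////////////////////////////////////////
/// Options that are used ONLY for the Reader, to keep backward compatibility
/// with option strings from older versions.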
void TMVA::MethodBDT::DeclareCompatibilityOptions() {
MethodBase::DeclareCompatibilityOptions();
DeclareOptionRef(fHistoricBool=kTRUE, "UseWeightedTrees",
"Use weighted trees or simple average in classification from the forest");
DeclareOptionRef(fHistoricBool=kFALSE, "PruneBeforeBoost", "Flag to prune the tree before applying boosting algorithm");
DeclareOptionRef(fHistoricBool=kFALSE,"RenormByClass","Individually re-normalize each event class to the original size after boosting");
AddPreDefVal(TString("NegWeightTreatment"),TString("IgnoreNegWeights"));
}
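////////////////////////////////////////////////////////////////////////////////
/// Decode the option string: translate string-valued options into the
/// corresponding enums/objects and sanity-check (deprecated) settings.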
void TMVA::MethodBDT::ProcessOptions()
{
fSepTypeS.ToLower();
if (fSepTypeS == "misclassificationerror") fSepType = new MisClassificationError();
else if (fSepTypeS == "giniindex") fSepType = new GiniIndex();
else if (fSepTypeS == "giniindexwithlaplace") fSepType = new GiniIndexWithLaplace();
else if (fSepTypeS == "crossentropy") fSepType = new CrossEntropy();
else if (fSepTypeS == "sdivsqrtsplusb") fSepType = new SdivSqrtSplusB();
else if (fSepTypeS == "regressionvariance") fSepType = NULL;
else {
Log() << kINFO << GetOptions() << Endl;
Log() << kFATAL << "<ProcessOptions> unknown Separation Index option " << fSepTypeS << " called" << Endl;
}
fPruneMethodS.ToLower();
if (fPruneMethodS == "expectederror") fPruneMethod = DecisionTree::kExpectedErrorPruning;
else if (fPruneMethodS == "costcomplexity") fPruneMethod = DecisionTree::kCostComplexityPruning;
else if (fPruneMethodS == "nopruning") fPruneMethod = DecisionTree::kNoPruning;
else {
Log() << kINFO << GetOptions() << Endl;
Log() << kFATAL << "<ProcessOptions> unknown PruneMethod " << fPruneMethodS << " option called" << Endl;
}
if (fPruneStrength < 0 && (fPruneMethod != DecisionTree::kNoPruning) && fBoostType!="Grad") fAutomatic = kTRUE;
else fAutomatic = kFALSE;
if (fAutomatic && fPruneMethod==DecisionTree::kExpectedErrorPruning){
Log() << kFATAL
<< "Sorry autmoatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;
}
if (fMinNodeEvents > 0){
fMinNodeSize = Double_t(fMinNodeEvents*100.) / Data()->GetNTrainingEvents();
Log() << kWARNING << "You have explicitly set ** nEventsMin = " << fMinNodeEvents<<" ** the min ablsolut number \n"
<< "of events in a leaf node. This is DEPRECATED, please use the option \n"
<< "*MinNodeSize* giving the relative number as percentage of training \n"
<< "events instead. \n"
<< "nEventsMin="<<fMinNodeEvents<< "--> MinNodeSize="<<fMinNodeSize<<"%"
<< Endl;
Log() << kWARNING << "Note also that explicitly setting *nEventsMin* so far OVERWRITES the option recomeded \n"
<< " *MinNodeSize* = " << fMinNodeSizeS << " option !!" << Endl ;
fMinNodeSizeS = Form("%F3.2",fMinNodeSize);
}else{
SetMinNodeSize(fMinNodeSizeS);
}
fAdaBoostR2Loss.ToLower();
if (fBoostType=="Grad") {
fPruneMethod = DecisionTree::kNoPruning;
if (fNegWeightTreatment=="InverseBoostNegWeights"){
Log() << kWARNING << "the option *InverseBoostNegWeights* does not exist for BoostType=Grad --> change to *IgnoreNegWeightsInTraining*" << Endl;
fNegWeightTreatment="IgnoreNegWeightsInTraining";
fNoNegWeightsInTraining=kTRUE;
}
} else if (fBoostType=="RealAdaBoost"){
fBoostType = "AdaBoost";
fUseYesNoLeaf = kFALSE;
} else if (fBoostType=="AdaCost"){
fUseYesNoLeaf = kFALSE;
}
if (fFValidationEvents < 0.0) fFValidationEvents = 0.0;
if (fAutomatic && fFValidationEvents > 0.5) {
Log() << kWARNING << "You have chosen to use more than half of your training sample "
<< "to optimize the automatic pruning algorithm. This is probably wasteful "
<< "and your overall results will be degraded. Are you sure you want this?"
<< Endl;
}
if (this->Data()->HasNegativeEventWeights()){
Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
<< "That should in principle be fine as long as on average you end up with "
<< "something positive. For this you have to make sure that the minimal number "
<< "of (un-weighted) events demanded for a tree node (currently you use: MinNodeSize="
<< fMinNodeSizeS << " ("<< fMinNodeSize << "%)"
<<", (or the deprecated equivalent nEventsMin) you can set this via the "
<<"BDT option string when booking the "
<< "classifier) is large enough to allow for reasonable averaging!!! "
<< " If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining "
<< "which ignores events with negative weight in the training. " << Endl
<< Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
}
if (DoRegression()) {
if (fUseYesNoLeaf && !IsConstructedFromWeightFile()){
Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;
fUseYesNoLeaf = kFALSE;
}
if (fSepType != NULL){
Log() << kWARNING << "Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" << Endl;
fSepType = NULL;
}
if (fUseFisherCuts){
Log() << kWARNING << "Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" << Endl;
fUseFisherCuts = kFALSE;
}
if (fNCuts < 0) {
Log() << kWARNING << "Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " << Endl;
Log() << kWARNING << "is not implemented for regression analysis ! " << Endl;
Log() << kWARNING << "--> I switch do default nCuts = 20 and use standard node splitting"<<Endl;
fNCuts=20;
}
}
if (fRandomisedTrees){
Log() << kINFO << " Randomised trees use no pruning" << Endl;
fPruneMethod = DecisionTree::kNoPruning;
}
if (fUseFisherCuts) {
Log() << kWARNING << "Sorry, when using the option UseFisherCuts, the other option nCuts<0 (i.e. using" << Endl;
Log() << kWARNING << " a more elaborate node splitting algorithm) is not implemented. I will switch o " << Endl;
Log() << kWARNING << "--> I switch do default nCuts = 20 and use standard node splitting WITH possible Fisher criteria"<<Endl;
fNCuts=20;
}
if (fNTrees==0){
Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
<< " I set it to 1 .. just so that the program does not crash"
<< Endl;
fNTrees = 1;
}
fNegWeightTreatment.ToLower();
if (fNegWeightTreatment == "ignorenegweightsintraining") fNoNegWeightsInTraining = kTRUE;
else if (fNegWeightTreatment == "nonegweightsintraining") fNoNegWeightsInTraining = kTRUE;
else if (fNegWeightTreatment == "inverseboostnegweights") fInverseBoostNegWeights = kTRUE;
else if (fNegWeightTreatment == "pairnegweightsglobal") fPairNegWeightsGlobal = kTRUE;
else if (fNegWeightTreatment == "pray") Log() << kWARNING << "Yes, good luck with praying " << Endl;
else {
Log() << kINFO << GetOptions() << Endl;
Log() << kFATAL << "<ProcessOptions> unknown option for treating negative event weights during training " << fNegWeightTreatment << " requested" << Endl;
}
if (fNegWeightTreatment == "pairnegweightsglobal")
Log() << kWARNING << " you specified the option NegWeightTreatment=PairNegWeightsGlobal : This option is still considered EXPERIMENTAL !! " << Endl;
if (fNNodesMax>0) {
UInt_t tmp=1;
fMaxDepth=0;
while (tmp < fNNodesMax){
tmp+=2*tmp;
fMaxDepth++;
}
Log() << kWARNING << "You have specified a deprecated option *NNodesMax="<<fNNodesMax
<< "* \n this has been translated to MaxDepth="<<fMaxDepth<<Endl;
}
if (fUseNTrainEvents>0){
fBaggedSampleFraction = (Double_t) fUseNTrainEvents/Data()->GetNTrainingEvents();
Log() << kWARNING << "You have specified a deprecated option *UseNTrainEvents="<<fUseNTrainEvents
<< "* \n this has been translated to BaggedSampleFraction="<<fBaggedSampleFraction<<"(%)"<<Endl;
}
if (fBoostType=="Bagging") fBaggedBoost = kTRUE;
if (fBaggedGradBoost){
fBaggedBoost = kTRUE;
Log() << kWARNING << "You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" << Endl;
}
}
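////////////////////////////////////////////////////////////////////////////////
/// Set the minimal node size as a percentage of the training sample; values
/// outside (0, 50) are rejected with a fatal error.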
void TMVA::MethodBDT::SetMinNodeSize(Double_t sizeInPercent){
if (sizeInPercent > 0 && sizeInPercent < 50){
fMinNodeSize=sizeInPercent;
} else {
Log() << kFATAL << "you have demanded a minimal node size of "
<< sizeInPercent << "% of the training events.. \n"
<< " that somehow does not make sense "<<Endl;
}
}
void TMVA::MethodBDT::SetMinNodeSize(TString sizeInPercent){
sizeInPercent.ReplaceAll("%","");
sizeInPercent.ReplaceAll(" ","");
if (sizeInPercent.IsFloat()) SetMinNodeSize(sizeInPercent.Atof());
else {
Log() << kFATAL << "I had problems reading the option MinNodeEvents, which "
<< "after removing a possible % sign now reads " << sizeInPercent << Endl;
}
}
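////////////////////////////////////////////////////////////////////////////////
/// Common initialisation with defaults for the BDT method.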
void TMVA::MethodBDT::Init( void )
{
fNTrees = 800;
if (fAnalysisType == Types::kClassification || fAnalysisType == Types::kMulticlass ) {
fMaxDepth = 3;
fBoostType = "AdaBoost";
if(DataInfo().GetNClasses()!=0)
fMinNodeSize = 5.;
}else {
fMaxDepth = 50;
fBoostType = "AdaBoostR2";
fAdaBoostR2Loss = "Quadratic";
if(DataInfo().GetNClasses()!=0)
fMinNodeSize = .2;
}
fNCuts = 20;
fPruneMethodS = "NoPruning";
fPruneMethod = DecisionTree::kNoPruning;
fPruneStrength = 0;
fAutomatic = kFALSE;
fFValidationEvents = 0.5;
fRandomisedTrees = kFALSE;
fUseNvars = UInt_t(TMath::Sqrt(GetNvar())+0.6);
fUsePoissonNvars = kTRUE;
fShrinkage = 1.0;
fSumOfWeights = 0.0;
SetSignalReferenceCut( 0 );
}
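////////////////////////////////////////////////////////////////////////////////
/// Reset the method, as if it had just been instantiated (forget all training etc.).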
void TMVA::MethodBDT::Reset( void )
{
for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];
fForest.clear();
fBoostWeights.clear();
if (fMonitorNtuple) fMonitorNtuple->Delete(); fMonitorNtuple=NULL;
fVariableImportance.clear();
fResiduals.clear();
if (Data()) Data()->DeleteResults(GetMethodName(), Types::kTraining, GetAnalysisType());
Log() << kDEBUG << " successfully(?) reset the method " << Endl;
}
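////////////////////////////////////////////////////////////////////////////////
/// Destructor. Only the forest is deleted here; fEventSample and
/// fValidationSample are already cleaned up at the end of the training.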
TMVA::MethodBDT::~MethodBDT( void )
{
for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];
}
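////////////////////////////////////////////////////////////////////////////////
/// Initialize the event sample: copy the training events, handle zero/negative
/// weights, optionally split off a pruning-validation sample, and re-normalise
/// signal and background weights for classification.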
void TMVA::MethodBDT::InitEventSample( void )
{
if (!HasTrainingTree()) Log() << kFATAL << "<Init> Data().TrainingTree() is zero pointer" << Endl;
if (fEventSample.size() > 0) {
for (UInt_t iev=0; iev<fEventSample.size(); iev++) fEventSample[iev]->SetBoostWeight(1.);
} else {
Data()->SetCurrentType(Types::kTraining);
UInt_t nevents = Data()->GetNTrainingEvents();
std::vector<const TMVA::Event*> tmpEventSample;
for (Long64_t ievt=0; ievt<nevents; ievt++) {
Event* event = new Event( *GetTrainingEvent(ievt) );
tmpEventSample.push_back(event);
}
if (!DoRegression()) DeterminePreselectionCuts(tmpEventSample);
else fDoPreselection = kFALSE;
for (UInt_t i=0; i<tmpEventSample.size(); i++) delete tmpEventSample[i];
Bool_t firstNegWeight=kTRUE;
Bool_t firstZeroWeight=kTRUE;
for (Long64_t ievt=0; ievt<nevents; ievt++) {
Event* event = new Event( *GetTrainingEvent(ievt) );
if (fDoPreselection){
if (TMath::Abs(ApplyPreselectionCuts(event)) > 0.05) continue;
}
if (event->GetWeight() < 0 && (IgnoreEventsWithNegWeightsInTraining() || fNoNegWeightsInTraining)){
if (firstNegWeight) {
Log() << kWARNING << " Note, you have events with negative event weight in the sample, but you've chosen to ignore them" << Endl;
firstNegWeight=kFALSE;
}
delete event;
}else if (event->GetWeight()==0){
if (firstZeroWeight) {
firstZeroWeight = kFALSE;
Log() << "Events with weight == 0 are going to be simply ignored " << Endl;
}
delete event;
}else{
if (event->GetWeight() < 0) {
fTrainWithNegWeights=kTRUE;
if (firstNegWeight){
firstNegWeight = kFALSE;
if (fPairNegWeightsGlobal){
Log() << kWARNING << "Events with negative event weights are found and "
<< " will be removed prior to the actual BDT training by global "
<< " paring (and subsequent annihilation) with positiv weight events"
<< Endl;
}else{
Log() << kWARNING << "Events with negative event weights are USED during "
<< "the BDT training. This might cause problems with small node sizes "
<< "or with the boosting. Please remove negative events from training "
<< "using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
<< "observe problems with the boosting"
<< Endl;
}
}
}
if (fAutomatic) {
Double_t modulo = 1.0/(fFValidationEvents);
Int_t imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );
if (ievt % imodulo == 0) fValidationSample.push_back( event );
else fEventSample.push_back( event );
}
else {
fEventSample.push_back(event);
}
}
}
if (fAutomatic) {
Log() << kINFO << "<InitEventSample> Internally I use " << fEventSample.size()
<< " for Training and " << fValidationSample.size()
<< " for Pruning Validation (" << ((Float_t)fValidationSample.size())/((Float_t)fEventSample.size()+fValidationSample.size())*100.0
<< "% of training used for validation)" << Endl;
}
if (fPairNegWeightsGlobal) PreProcessNegativeEventWeights();
}
if (!DoRegression()){
Log() << kINFO << "<InitEventSample> For classification trees, "<< Endl;
Log() << kINFO << " the effective number of backgrounds is scaled to match "<<Endl;
Log() << kINFO << " the signal. Othersise the first boosting step would do 'just that'!"<<Endl;
Double_t nevents = fEventSample.size();
Double_t sumSigW=0, sumBkgW=0;
Int_t sumSig=0, sumBkg=0;
for (UInt_t ievt=0; ievt<fEventSample.size(); ievt++) {
if ((DataInfo().IsSignal(fEventSample[ievt])) ) {
sumSigW += fEventSample[ievt]->GetWeight();
sumSig++;
} else {
sumBkgW += fEventSample[ievt]->GetWeight();
sumBkg++;
}
}
if (sumSigW && sumBkgW){
Double_t normSig = nevents/((1+fSigToBkgFraction)*sumSigW)*fSigToBkgFraction;
Double_t normBkg = nevents/((1+fSigToBkgFraction)*sumBkgW);
Log() << kINFO << "re-normlise events such that Sig and Bkg have respective sum of weights = "
<< fSigToBkgFraction << Endl;
Log() << kINFO << " sig->sig*"<<normSig << "ev. bkg->bkg*"<<normBkg << "ev." <<Endl;
Log() << kINFO << "#events: (reweighted) sig: "<< sumSigW*normSig << " bkg: " << sumBkgW*normBkg << Endl;
Log() << kINFO << "#events: (unweighted) sig: "<< sumSig << " bkg: " << sumBkg << Endl;
for (Long64_t ievt=0; ievt<nevents; ievt++) {
if ((DataInfo().IsSignal(fEventSample[ievt])) ) fEventSample[ievt]->SetBoostWeight(normSig);
else fEventSample[ievt]->SetBoostWeight(normBkg);
}
}else{
Log() << kINFO << "--> could not determine scaleing factors as either there are " << Endl;
Log() << kINFO << " no signal events (sumSigW="<<sumSigW<<") or no bkg ev. (sumBkgW="<<sumBkgW<<")"<<Endl;
}
}
fTrainSample = &fEventSample;
if (fBaggedBoost){
GetBaggedSubSample(fEventSample);
fTrainSample = &fSubSample;
}
}
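////////////////////////////////////////////////////////////////////////////////
/// Handle events with negative weights: each such event is paired with its
/// "closest" positive-weight event of the same class (distance measured with
/// the inverse covariance matrix) and the pair is "annihilated", so that only
/// events with positive weights remain in the training sample.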
void TMVA::MethodBDT::PreProcessNegativeEventWeights(){
Double_t totalNegWeights = 0;
Double_t totalPosWeights = 0;
Double_t totalWeights = 0;
std::vector<const Event*> negEvents;
for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
if (fEventSample[iev]->GetWeight() < 0) {
totalNegWeights += fEventSample[iev]->GetWeight();
negEvents.push_back(fEventSample[iev]);
} else {
totalPosWeights += fEventSample[iev]->GetWeight();
}
totalWeights += fEventSample[iev]->GetWeight();
}
if (totalNegWeights == 0 ) {
Log() << kINFO << "no negative event weights found .. no preprocessing necessary" << Endl;
return;
} else {
Log() << kINFO << "found a total of " << totalNegWeights << " of negative event weights which I am going to try to pair with positive events to annihilate them" << Endl;
Log() << kINFO << "found a total of " << totalPosWeights << " of events with positive weights" << Endl;
Log() << kINFO << "--> total sum of weights = " << totalWeights << " = " << totalNegWeights+totalPosWeights << Endl;
}
std::vector<TMatrixDSym*>* cov = gTools().CalcCovarianceMatrices( fEventSample, 2);
TMatrixDSym *invCov;
for (Int_t i=0; i<2; i++){
invCov = ((*cov)[i]);
if ( TMath::Abs(invCov->Determinant()) < 10E-24 ) {
std::cout << "<MethodBDT::PreProcessNeg...> matrix is almost singular with deterninant="
<< TMath::Abs(invCov->Determinant())
<< " did you use the variables that are linear combinations or highly correlated?"
<< std::endl;
}
if ( TMath::Abs(invCov->Determinant()) < 10E-120 ) {
std::cout << "<MethodBDT::PreProcessNeg...> matrix is singular with determinant="
<< TMath::Abs(invCov->Determinant())
<< " did you use the variables that are linear combinations?"
<< std::endl;
}
invCov->Invert();
}
Log() << kINFO << "Found a total of " << totalNegWeights << " in negative weights out of " << fEventSample.size() << " training events " << Endl;
Timer timer(negEvents.size(),"Negative Event paired");
for (UInt_t nev = 0; nev < negEvents.size(); nev++){
timer.DrawProgressBar( nev );
Double_t weight = negEvents[nev]->GetWeight();
UInt_t iClassID = negEvents[nev]->GetClass();
invCov = ((*cov)[iClassID]);
while (weight < 0){
Int_t iMin=-1;
Double_t dist, minDist=10E270;
for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
if (iClassID==fEventSample[iev]->GetClass() && fEventSample[iev]->GetWeight() > 0){
dist=0;
for (UInt_t ivar=0; ivar < GetNvar(); ivar++){
for (UInt_t jvar=0; jvar<GetNvar(); jvar++){
dist += (negEvents[nev]->GetValue(ivar)-fEventSample[iev]->GetValue(ivar))*
(*invCov)[ivar][jvar]*
(negEvents[nev]->GetValue(jvar)-fEventSample[iev]->GetValue(jvar));
}
}
if (dist < minDist) { iMin=iev; minDist=dist;}
}
}
if (iMin > -1) {
Double_t newWeight = (negEvents[nev]->GetWeight() + fEventSample[iMin]->GetWeight());
if (newWeight > 0){
negEvents[nev]->SetBoostWeight( 0 );
fEventSample[iMin]->SetBoostWeight( newWeight/fEventSample[iMin]->GetOriginalWeight() );
} else {
negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() );
fEventSample[iMin]->SetBoostWeight( 0 );
}
} else Log() << kFATAL << "preprocessing didn't find event to pair with the negative weight ... probably a bug" << Endl;
weight = negEvents[nev]->GetWeight();
}
}
Log() << kINFO << "<Negative Event Pairing> took: " << timer.GetElapsedTime()
<< " " << Endl;
totalNegWeights = 0;
totalPosWeights = 0;
totalWeights = 0;
Double_t sigWeight=0;
Double_t bkgWeight=0;
Int_t nSig=0;
Int_t nBkg=0;
std::vector<const Event*> newEventSample;
for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
if (fEventSample[iev]->GetWeight() < 0) {
totalNegWeights += fEventSample[iev]->GetWeight();
totalWeights += fEventSample[iev]->GetWeight();
} else {
totalPosWeights += fEventSample[iev]->GetWeight();
totalWeights += fEventSample[iev]->GetWeight();
}
if (fEventSample[iev]->GetWeight() > 0) {
newEventSample.push_back(new Event(*fEventSample[iev]));
if (fEventSample[iev]->GetClass() == fSignalClass){
sigWeight += fEventSample[iev]->GetWeight();
nSig+=1;
}else{
bkgWeight += fEventSample[iev]->GetWeight();
nBkg+=1;
}
}
}
if (totalNegWeights < 0) Log() << kFATAL << " compenstion of negative event weights with positive ones did not work " << totalNegWeights << Endl;
for (UInt_t i=0; i<fEventSample.size(); i++) delete fEventSample[i];
fEventSample = newEventSample;
Log() << kINFO << " after PreProcessing, the Event sample is left with " << fEventSample.size() << " events (unweighted), all with positive weights, adding up to " << totalWeights << Endl;
Log() << kINFO << " nSig="<<nSig << " sigWeight="<<sigWeight << " nBkg="<<nBkg << " bkgWeight="<<bkgWeight << Endl;
}
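////////////////////////////////////////////////////////////////////////////////
/// Call the Optimizer with the set of parameters and ranges that are meant to
/// be tuned.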
std::map<TString,Double_t> TMVA::MethodBDT::OptimizeTuningParameters(TString fomType, TString fitType)
{
std::map<TString,TMVA::Interval*> tuneParameters;
std::map<TString,Double_t> tunedParameters;
tuneParameters.insert(std::pair<TString,Interval*>("NTrees", new Interval(10,1000,5)));
tuneParameters.insert(std::pair<TString,Interval*>("MaxDepth", new Interval(2,4,3)));
tuneParameters.insert(std::pair<TString,Interval*>("MinNodeSize", new LogInterval(1,30,30)));
if (fBoostType=="AdaBoost"){
tuneParameters.insert(std::pair<TString,Interval*>("AdaBoostBeta", new Interval(.2,1.,5)));
}else if (fBoostType=="Grad"){
tuneParameters.insert(std::pair<TString,Interval*>("Shrinkage", new Interval(0.05,0.50,5)));
}else if (fBoostType=="Bagging" && fRandomisedTrees){
Int_t min_var = TMath::FloorNint( GetNvar() * .25 );
Int_t max_var = TMath::CeilNint( GetNvar() * .75 );
tuneParameters.insert(std::pair<TString,Interval*>("UseNvars", new Interval(min_var,max_var,4)));
}
Log()<<kINFO << " the following BDT parameters will be tuned on the respective *grid*\n"<<Endl;
std::map<TString,TMVA::Interval*>::iterator it;
for(it=tuneParameters.begin(); it!= tuneParameters.end(); it++){
Log() << kWARNING << it->first << Endl;
(it->second)->Print(Log());
Log()<<Endl;
}
OptimizeConfigParameters optimize(this, tuneParameters, fomType, fitType);
tunedParameters=optimize.optimize();
return tunedParameters;
}
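////////////////////////////////////////////////////////////////////////////////
/// Set the tuning parameters according to the argument.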
void TMVA::MethodBDT::SetTuneParameters(std::map<TString,Double_t> tuneParameters)
{
std::map<TString,Double_t>::iterator it;
for(it=tuneParameters.begin(); it!= tuneParameters.end(); it++){
Log() << kWARNING << it->first << " = " << it->second << Endl;
if (it->first == "MaxDepth" ) SetMaxDepth ((Int_t)it->second);
else if (it->first == "MinNodeSize" ) SetMinNodeSize (it->second);
else if (it->first == "NTrees" ) SetNTrees ((Int_t)it->second);
else if (it->first == "NodePurityLimit") SetNodePurityLimit (it->second);
else if (it->first == "AdaBoostBeta" ) SetAdaBoostBeta (it->second);
else if (it->first == "Shrinkage" ) SetShrinkage (it->second);
else if (it->first == "UseNvars" ) SetUseNvars ((Int_t)it->second);
else if (it->first == "BaggedSampleFraction" ) SetBaggedSampleFraction (it->second);
else Log() << kFATAL << " SetParameter for " << it->first << " not yet implemented " <<Endl;
}
}
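////////////////////////////////////////////////////////////////////////////////
/// BDT training: grow fNTrees decision trees, boosting the training sample
/// after each tree, and fill the monitoring histograms/ntuple.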
void TMVA::MethodBDT::Train()
{
TMVA::DecisionTreeNode::fgIsTraining=true;
InitEventSample();
if (fNTrees==0){
Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
<< " I set it to 1 .. just so that the program does not crash"
<< Endl;
fNTrees = 1;
}
if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with BDT; "
<< "please remove the option from the configuration string, or "
<< "use \"!Normalise\""
<< Endl;
Log() << kINFO << "Training "<< fNTrees << " Decision Trees ... patience please" << Endl;
Log() << kDEBUG << "Training with maximal depth = " <<fMaxDepth
<< ", MinNodeEvents=" << fMinNodeEvents
<< ", NTrees="<<fNTrees
<< ", NodePurityLimit="<<fNodePurityLimit
<< ", AdaBoostBeta="<<fAdaBoostBeta
<< Endl;
Int_t nBins;
Double_t xMin,xMax;
TString hname = "AdaBooost weight distribution";
nBins= 100;
xMin = 0;
xMax = 30;
if (DoRegression()) {
nBins= 100;
xMin = 0;
xMax = 1;
hname="Boost event weights distribution";
}
TH1* h = new TH1F("BoostWeight",hname,nBins,xMin,xMax);
TH1* nodesBeforePruningVsTree = new TH1I("NodesBeforePruning","nodes before pruning",fNTrees,0,fNTrees);
TH1* nodesAfterPruningVsTree = new TH1I("NodesAfterPruning","nodes after pruning",fNTrees,0,fNTrees);
if(!DoMulticlass()){
Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());
h->SetXTitle("boost weight");
results->Store(h, "BoostWeights");
if (fDoBoostMonitor){
TH2* boostMonitor = new TH2F("BoostMonitor","ROC Integral Vs iTree",2,0,fNTrees,2,0,1.05);
boostMonitor->SetXTitle("#tree");
boostMonitor->SetYTitle("ROC Integral");
results->Store(boostMonitor, "BoostMonitor");
TGraph *boostMonitorGraph = new TGraph();
boostMonitorGraph->SetName("BoostMonitorGraph");
boostMonitorGraph->SetTitle("ROCIntegralVsNTrees");
results->Store(boostMonitorGraph, "BoostMonitorGraph");
}
h = new TH1F("BoostWeightVsTree","Boost weights vs tree",fNTrees,0,fNTrees);
h->SetXTitle("#tree");
h->SetYTitle("boost weight");
results->Store(h, "BoostWeightsVsTree");
h = new TH1F("ErrFractHist","error fraction vs tree number",fNTrees,0,fNTrees);
h->SetXTitle("#tree");
h->SetYTitle("error fraction");
results->Store(h, "ErrorFrac");
nodesBeforePruningVsTree->SetXTitle("#tree");
nodesBeforePruningVsTree->SetYTitle("#tree nodes");
results->Store(nodesBeforePruningVsTree);
nodesAfterPruningVsTree->SetXTitle("#tree");
nodesAfterPruningVsTree->SetYTitle("#tree nodes");
results->Store(nodesAfterPruningVsTree);
}
fMonitorNtuple= new TTree("MonitorNtuple","BDT variables");
fMonitorNtuple->Branch("iTree",&fITree,"iTree/I");
fMonitorNtuple->Branch("boostWeight",&fBoostWeight,"boostWeight/D");
fMonitorNtuple->Branch("errorFraction",&fErrorFraction,"errorFraction/D");
Timer timer( fNTrees, GetName() );
Int_t nNodesBeforePruningCount = 0;
Int_t nNodesAfterPruningCount = 0;
Int_t nNodesBeforePruning = 0;
Int_t nNodesAfterPruning = 0;
if(fBoostType=="Grad"){
InitGradBoost(fEventSample);
}
Int_t itree=0;
Bool_t continueBoost=kTRUE;
while (itree < fNTrees && continueBoost){
timer.DrawProgressBar( itree );
if(DoMulticlass()){
if (fBoostType!="Grad"){
Log() << kFATAL << "Multiclass is currently only supported by gradient boost. "
<< "Please change boost option accordingly (GradBoost)."
<< Endl;
}
UInt_t nClasses = DataInfo().GetNClasses();
for (UInt_t i=0;i<nClasses;i++){
fForest.push_back( new DecisionTree( fSepType, fMinNodeSize, fNCuts, &(DataInfo()), i,
fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
itree*nClasses+i, fNodePurityLimit, itree*nClasses+1));
fForest.back()->SetNVars(GetNvar());
if (fUseFisherCuts) {
fForest.back()->SetUseFisherCuts();
fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
}
nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);
Double_t bw = this->Boost(*fTrainSample, fForest.back(),i);
if (bw > 0) {
fBoostWeights.push_back(bw);
}else{
fBoostWeights.push_back(0);
Log() << kWARNING << "stopped boosting at itree="<<itree << Endl;
continueBoost=kFALSE;
}
}
}
else{
fForest.push_back( new DecisionTree( fSepType, fMinNodeSize, fNCuts, &(DataInfo()), fSignalClass,
fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
itree, fNodePurityLimit, itree));
fForest.back()->SetNVars(GetNvar());
if (fUseFisherCuts) {
fForest.back()->SetUseFisherCuts();
fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
}
nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);
if (fUseYesNoLeaf && !DoRegression() && fBoostType!="Grad") {
nNodesBeforePruning = fForest.back()->CleanTree();
}
nNodesBeforePruningCount += nNodesBeforePruning;
nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);
fForest.back()->SetPruneMethod(fPruneMethod);
fForest.back()->SetPruneStrength(fPruneStrength);
std::vector<const Event*> * validationSample = NULL;
if(fAutomatic) validationSample = &fValidationSample;
Double_t bw = this->Boost(*fTrainSample, fForest.back());
if (bw > 0) {
fBoostWeights.push_back(bw);
}else{
fBoostWeights.push_back(0);
Log() << kWARNING << "stopped boosting at itree="<<itree << Endl;
continueBoost=kFALSE;
}
if (fPruneMethod != DecisionTree::kNoPruning) fForest.back()->PruneTree(validationSample);
if (fUseYesNoLeaf && !DoRegression() && fBoostType!="Grad"){
fForest.back()->CleanTree();
}
nNodesAfterPruning = fForest.back()->GetNNodes();
nNodesAfterPruningCount += nNodesAfterPruning;
nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);
fITree = itree;
fMonitorNtuple->Fill();
if (fDoBoostMonitor){
if (! DoRegression() ){
if ( itree==fNTrees-1 || (!(itree%500)) ||
(!(itree%250) && itree <1000)||
(!(itree%100) && itree < 500)||
(!(itree%50) && itree < 250)||
(!(itree%25) && itree < 150)||
(!(itree%10) && itree < 50)||
(!(itree%5) && itree < 20)
) BoostMonitor(itree);
}
}
}
itree++;
}
Log() << kINFO << "<Train> elapsed time: " << timer.GetElapsedTime()
<< " " << Endl;
if (fPruneMethod == DecisionTree::kNoPruning) {
Log() << kINFO << "<Train> average number of nodes (w/o pruning) : "
<< nNodesBeforePruningCount/GetNTrees() << Endl;
}
else {
Log() << kINFO << "<Train> average number of nodes before/after pruning : "
<< nNodesBeforePruningCount/GetNTrees() << " / "
<< nNodesAfterPruningCount/GetNTrees()
<< Endl;
}
TMVA::DecisionTreeNode::fgIsTraining=false;
Log() << kDEBUG << "Now I delete the privat data sample"<< Endl;
for (UInt_t i=0; i<fEventSample.size(); i++) delete fEventSample[i];
for (UInt_t i=0; i<fValidationSample.size(); i++) delete fValidationSample[i];
fEventSample.clear();
fValidationSample.clear();
}
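////////////////////////////////////////////////////////////////////////////////
/// Return the MVA value of the gradient-boosted forest: the sum of the tree
/// responses mapped to (-1,1) via 2/(1+exp(-2*sum))-1, i.e. tanh(sum).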
Double_t TMVA::MethodBDT::GetGradBoostMVA(const TMVA::Event* e, UInt_t nTrees)
{
Double_t sum=0;
for (UInt_t itree=0; itree<nTrees; itree++) {
sum += fForest[itree]->CheckEvent(e,kFALSE);
}
return 2.0/(1.0+exp(-2.0*sum))-1;
}
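////////////////////////////////////////////////////////////////////////////////
/// Calculate the residuals for all events and set them as the new targets for
/// the next tree (gradient of the binomial/softmax log-likelihood loss).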
void TMVA::MethodBDT::UpdateTargets(std::vector<const TMVA::Event*>& eventSample, UInt_t cls)
{
if(DoMulticlass()){
UInt_t nClasses = DataInfo().GetNClasses();
for (std::vector<const TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
fResiduals[*e].at(cls)+=fForest.back()->CheckEvent(*e,kFALSE);
if(cls == nClasses-1){
for(UInt_t i=0;i<nClasses;i++){
Double_t norm = 0.0;
for(UInt_t j=0;j<nClasses;j++){
if(i!=j)
norm+=exp(fResiduals[*e].at(j)-fResiduals[*e].at(i));
}
Double_t p_cls = 1.0/(1.0+norm);
Double_t res = ((*e)->GetClass()==i)?(1.0-p_cls):(-p_cls);
const_cast<TMVA::Event*>(*e)->SetTarget(i,res);
}
}
}
}
else{
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
fResiduals[*e].at(0)+=fForest.back()->CheckEvent(*e,kFALSE);
Double_t p_sig=1.0/(1.0+exp(-2.0*fResiduals[*e].at(0)));
Double_t res = (DataInfo().IsSignal(*e)?1:0)-p_sig;
const_cast<TMVA::Event*>(*e)->SetTarget(0,res);
}
}
}
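////////////////////////////////////////////////////////////////////////////////
/// Calculate the current residuals for all events and update the targets for
/// the next regression tree; residuals beyond the weighted 0.7 quantile are
/// clipped (Huber-like loss).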
void TMVA::MethodBDT::UpdateTargetsRegression(std::vector<const TMVA::Event*>& eventSample, Bool_t first)
{
for (std::vector<const TMVA::Event*>::const_iterator e=fEventSample.begin(); e!=fEventSample.end();e++) {
if(!first){
fWeightedResiduals[*e].first -= fForest.back()->CheckEvent(*e,kFALSE);
}
}
fSumOfWeights = 0;
vector< std::pair<Double_t, Double_t> > temp;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++){
temp.push_back(make_pair(fabs(fWeightedResiduals[*e].first),fWeightedResiduals[*e].second));
fSumOfWeights += (*e)->GetWeight();
}
fTransitionPoint = GetWeightedQuantile(temp,0.7,fSumOfWeights);
Int_t i=0;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
if(temp[i].first<=fTransitionPoint)
const_cast<TMVA::Event*>(*e)->SetTarget(0,fWeightedResiduals[*e].first);
else
const_cast<TMVA::Event*>(*e)->SetTarget(0,fTransitionPoint*(fWeightedResiduals[*e].first<0?-1.0:1.0));
i++;
}
}
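////////////////////////////////////////////////////////////////////////////////
/// Calculate the desired quantile of the distribution of the first pair
/// entries, weighted with the values in the second pair entries.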
Double_t TMVA::MethodBDT::GetWeightedQuantile(vector< std::pair<Double_t, Double_t> > vec, const Double_t quantile, const Double_t norm){
Double_t temp = 0.0;
std::sort(vec.begin(), vec.end());
UInt_t i = 0;
while(i<vec.size() && temp <= norm*quantile){
temp += vec[i].second;
i++;
}
if (i >= vec.size()) return 0.;
return vec[i].first;
}
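////////////////////////////////////////////////////////////////////////////////
/// Gradient boost for (multi)class classification: compute the desired
/// response value for each leaf from the accumulated targets and weights, set
/// it (scaled by the shrinkage), and update the event targets.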
Double_t TMVA::MethodBDT::GradBoost(std::vector<const TMVA::Event*>& eventSample, DecisionTree *dt, UInt_t cls)
{
std::map<TMVA::DecisionTreeNode*,std::vector<Double_t> > leaves;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t weight = (*e)->GetWeight();
TMVA::DecisionTreeNode* node = dt->GetEventNode(*(*e));
if ((leaves[node]).empty()){
(leaves[node]).push_back((*e)->GetTarget(cls)* weight);
(leaves[node]).push_back(fabs((*e)->GetTarget(cls))*(1.0-fabs((*e)->GetTarget(cls))) * weight* weight);
}
else {
(leaves[node])[0]+=((*e)->GetTarget(cls)* weight);
(leaves[node])[1]+=fabs((*e)->GetTarget(cls))*(1.0-fabs((*e)->GetTarget(cls))) * weight* weight;
}
}
for (std::map<TMVA::DecisionTreeNode*,std::vector<Double_t> >::iterator iLeave=leaves.begin();
iLeave!=leaves.end();++iLeave){
if ((iLeave->second)[1]<1e-30) (iLeave->second)[1]=1e-30;
(iLeave->first)->SetResponse(fShrinkage/DataInfo().GetNClasses()*(iLeave->second)[0]/((iLeave->second)[1]));
}
DoMulticlass() ? UpdateTargets(fEventSample, cls) : UpdateTargets(fEventSample);
return 1;
}
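////////////////////////////////////////////////////////////////////////////////
/// Gradient boost for regression: each leaf response is the weighted median of
/// the residuals in that leaf plus a robust correction term, scaled by the
/// shrinkage.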
Double_t TMVA::MethodBDT::GradBoostRegression(std::vector<const TMVA::Event*>& eventSample, DecisionTree *dt )
{
std::map<TMVA::DecisionTreeNode*,Double_t > leaveWeights;
std::map<TMVA::DecisionTreeNode*,vector< std::pair<Double_t, Double_t> > > leaves;
UInt_t i =0;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
TMVA::DecisionTreeNode* node = dt->GetEventNode(*(*e));
(leaves[node]).push_back(make_pair(fWeightedResiduals[*e].first,(*e)->GetWeight()));
(leaveWeights[node]) += (*e)->GetWeight();
i++;
}
for (std::map<TMVA::DecisionTreeNode*,vector< std::pair<Double_t, Double_t> > >::iterator iLeave=leaves.begin();
iLeave!=leaves.end();++iLeave){
Double_t shift=0,diff= 0;
Double_t ResidualMedian = GetWeightedQuantile(iLeave->second,0.5,leaveWeights[iLeave->first]);
for(UInt_t j=0;j<((iLeave->second).size());j++){
diff = (iLeave->second)[j].first-ResidualMedian;
shift+=1.0/((iLeave->second).size())*((diff<0)?-1.0:1.0)*TMath::Min(fTransitionPoint,fabs(diff));
}
(iLeave->first)->SetResponse(fShrinkage*(ResidualMedian+shift));
}
UpdateTargetsRegression(*fTrainSample);
return 1;
}
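////////////////////////////////////////////////////////////////////////////////
/// Initialize the targets (and residuals) for the first tree of the gradient boost.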
void TMVA::MethodBDT::InitGradBoost( std::vector<const TMVA::Event*>& eventSample)
{
fSumOfWeights = 0;
fSepType=NULL;
std::vector<std::pair<Double_t, Double_t> > temp;
if(DoRegression()){
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
fWeightedResiduals[*e]= make_pair((*e)->GetTarget(0), (*e)->GetWeight());
fSumOfWeights+=(*e)->GetWeight();
temp.push_back(make_pair(fWeightedResiduals[*e].first,fWeightedResiduals[*e].second));
}
Double_t weightedMedian = GetWeightedQuantile(temp,0.5, fSumOfWeights);
fBoostWeights.push_back(weightedMedian);
std::map<const TMVA::Event*, std::pair<Double_t, Double_t> >::iterator res = fWeightedResiduals.begin();
for (; res!=fWeightedResiduals.end(); ++res ) {
(*res).second.first -= weightedMedian;
}
UpdateTargetsRegression(*fTrainSample,kTRUE);
return;
}
else if(DoMulticlass()){
UInt_t nClasses = DataInfo().GetNClasses();
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
for (UInt_t i=0;i<nClasses;i++){
Double_t r = (*e)->GetClass()==i?(1-1.0/nClasses):(-1.0/nClasses);
const_cast<TMVA::Event*>(*e)->SetTarget(i,r);
fResiduals[*e].push_back(0);
}
}
}
else{
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t r = (DataInfo().IsSignal(*e)?1:0)-0.5;
const_cast<TMVA::Event*>(*e)->SetTarget(0,r);
fResiduals[*e].push_back(0);
}
}
}
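////////////////////////////////////////////////////////////////////////////////
/// Test the tree quality, i.e. its weighted classification accuracy on the
/// pruning-validation sample.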
Double_t TMVA::MethodBDT::TestTreeQuality( DecisionTree *dt )
{
Double_t ncorrect=0, nfalse=0;
for (UInt_t ievt=0; ievt<fValidationSample.size(); ievt++) {
Bool_t isSignalType= (dt->CheckEvent(fValidationSample[ievt]) > fNodePurityLimit ) ? 1 : 0;
if (isSignalType == (DataInfo().IsSignal(fValidationSample[ievt])) ) {
ncorrect += fValidationSample[ievt]->GetWeight();
}
else{
nfalse += fValidationSample[ievt]->GetWeight();
}
}
return ncorrect / (ncorrect + nfalse);
}
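////////////////////////////////////////////////////////////////////////////////
/// Apply the boosting algorithm (the algorithm is selected via the "BoostType"
/// option given in the configuration string).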
Double_t TMVA::MethodBDT::Boost( std::vector<const TMVA::Event*>& eventSample, DecisionTree *dt, UInt_t cls )
{
Double_t returnVal=-1;
if (fBoostType=="AdaBoost") returnVal = this->AdaBoost (eventSample, dt);
else if (fBoostType=="AdaCost") returnVal = this->AdaCost (eventSample, dt);
else if (fBoostType=="Bagging") returnVal = this->Bagging ( );
else if (fBoostType=="RegBoost") returnVal = this->RegBoost (eventSample, dt);
else if (fBoostType=="AdaBoostR2") returnVal = this->AdaBoostR2(eventSample, dt);
else if (fBoostType=="Grad"){
if(DoRegression())
returnVal = this->GradBoostRegression(eventSample, dt);
else if(DoMulticlass())
returnVal = this->GradBoost (eventSample, dt, cls);
else
returnVal = this->GradBoost (eventSample, dt);
}
else {
Log() << kINFO << GetOptions() << Endl;
Log() << kFATAL << "<Boost> unknown boost option " << fBoostType<< " called" << Endl;
}
if (fBaggedBoost){
GetBaggedSubSample(fEventSample);
}
return returnVal;
}
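////////////////////////////////////////////////////////////////////////////////
/// Fill the ROC-integral-vs-tree graph and the control histograms from the
/// test sample for the monitoring plots during the training.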
void TMVA::MethodBDT::BoostMonitor(Int_t iTree)
{
Results* results = Data()->GetResults(GetMethodName(),Types::kTraining, Types::kMaxAnalysisType);
TH1F *tmpS = new TH1F( "tmpS", "", 100 , -1., 1.00001 );
TH1F *tmpB = new TH1F( "tmpB", "", 100 , -1., 1.00001 );
TH1F *tmp;
UInt_t signalClassNr = DataInfo().GetClassInfo("Signal")->GetNumber();
UInt_t nevents = Data()->GetNTestEvents();
for (UInt_t iev=0; iev < nevents; iev++){
const Event* event = GetTestingEvent(iev);
if (event->GetClass() == signalClassNr) {tmp=tmpS;}
else {tmp=tmpB;}
tmp->Fill(PrivateGetMvaValue(event),event->GetWeight());
}
Double_t max=1;
std::vector<TH1F*> hS;
std::vector<TH1F*> hB;
for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
hS.push_back(new TH1F(Form("SigVar%dAtTree%d",ivar,iTree),Form("SigVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
hB.push_back(new TH1F(Form("BkgVar%dAtTree%d",ivar,iTree),Form("BkgVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
results->Store(hS.back(),hS.back()->GetTitle());
results->Store(hB.back(),hB.back()->GetTitle());
}
for (UInt_t iev=0; iev < fEventSample.size(); iev++){
if (fEventSample[iev]->GetBoostWeight() > max) max = 1.01*fEventSample[iev]->GetBoostWeight();
}
TH1F *tmpBoostWeightsS = new TH1F(Form("BoostWeightsInTreeS%d",iTree),Form("BoostWeightsInTreeS%d",iTree),100,0.,max);
TH1F *tmpBoostWeightsB = new TH1F(Form("BoostWeightsInTreeB%d",iTree),Form("BoostWeightsInTreeB%d",iTree),100,0.,max);
results->Store(tmpBoostWeightsS,tmpBoostWeightsS->GetTitle());
results->Store(tmpBoostWeightsB,tmpBoostWeightsB->GetTitle());
TH1F *tmpBoostWeights;
std::vector<TH1F*> *h;
for (UInt_t iev=0; iev < fEventSample.size(); iev++){
if (fEventSample[iev]->GetClass() == signalClassNr) {
tmpBoostWeights=tmpBoostWeightsS;
h=&hS;
}else{
tmpBoostWeights=tmpBoostWeightsB;
h=&hB;
}
tmpBoostWeights->Fill(fEventSample[iev]->GetBoostWeight());
for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
(*h)[ivar]->Fill(fEventSample[iev]->GetValue(ivar),fEventSample[iev]->GetWeight());
}
}
TMVA::PDF *sig = new TMVA::PDF( " PDF Sig", tmpS, TMVA::PDF::kSpline3 );
TMVA::PDF *bkg = new TMVA::PDF( " PDF Bkg", tmpB, TMVA::PDF::kSpline3 );
TGraph* gr=results->GetGraph("BoostMonitorGraph");
Int_t nPoints = gr->GetN();
gr->Set(nPoints+1);
gr->SetPoint(nPoints,(Double_t)iTree+1,GetROCIntegral(sig,bkg));
tmpS->Delete();
tmpB->Delete();
delete sig;
delete bkg;
return;
}
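////////////////////////////////////////////////////////////////////////////////
/// The AdaBoost implementation: with YesNoLeaf, misclassified events get their
/// weight scaled by exp(boostWeight) with boostWeight = beta*log((1-err)/err);
/// otherwise all events are re-weighted with exp(-boostWeight*trueType*dtoutput)
/// (real-valued variant). Afterwards the weights are re-normalised.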
Double_t TMVA::MethodBDT::AdaBoost( std::vector<const TMVA::Event*>& eventSample, DecisionTree *dt )
{
Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;
std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);
std::map<Node*,Int_t> sigEventsInNode;
Double_t maxDev=0;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
sumGlobalw += w;
UInt_t iclass=(*e)->GetClass();
sumw[iclass] += w;
if ( DoRegression() ) {
Double_t tmpDev = TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) );
sumGlobalwfalse += w * tmpDev;
sumGlobalwfalse2 += w * tmpDev*tmpDev;
if (tmpDev > maxDev) maxDev = tmpDev;
}else{
if (fUseYesNoLeaf){
Bool_t isSignalType = (dt->CheckEvent(*e,fUseYesNoLeaf) > fNodePurityLimit );
if (!(isSignalType == DataInfo().IsSignal(*e))) {
sumGlobalwfalse+= w;
}
}else{
Double_t dtoutput = (dt->CheckEvent(*e,fUseYesNoLeaf) - 0.5)*2.;
Int_t trueType;
if (DataInfo().IsSignal(*e)) trueType = 1;
else trueType = -1;
sumGlobalwfalse+= w*trueType*dtoutput;
}
}
}
err = sumGlobalwfalse/sumGlobalw ;
if ( DoRegression() ) {
if (fAdaBoostR2Loss=="linear"){
err = sumGlobalwfalse/maxDev/sumGlobalw ;
}
else if (fAdaBoostR2Loss=="quadratic"){
err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw ;
}
else if (fAdaBoostR2Loss=="exponential"){
err = 0;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
Double_t tmpDev = TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) );
err += w * (1 - exp (-tmpDev/maxDev)) / sumGlobalw;
}
}
else {
Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
<< " namely " << fAdaBoostR2Loss << "\n"
<< "and this is not implemented... a typo in the options ??" <<Endl;
}
}
Log() << kDEBUG << "BDT AdaBoos wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << Endl;
Double_t newSumGlobalw=0;
std::vector<Double_t> newSumw(sumw.size(),0);
Double_t boostWeight=1.;
if (err >= 0.5 && fUseYesNoLeaf) {
if (dt->GetNNodes() == 1){
Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
<< "boost such a thing... if after 1 step the error rate is == 0.5"
<< Endl
<< "please check why this happens, maybe too many events per node requested ?"
<< Endl;
}else{
Log() << kERROR << " The error rate in the BDT boosting is > 0.5. ("<< err
<< ") That should not happen, please check your code (i.e... the BDT code), I "
<< " stop boosting here" << Endl;
return -1;
}
err = 0.5;
} else if (err < 0) {
Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
<< " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
<< " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
<< " for the time being I set it to its absolute value.. just to continue.." << Endl;
err = TMath::Abs(err);
}
if (fUseYesNoLeaf)
boostWeight = TMath::Log((1.-err)/err)*fAdaBoostBeta;
else
boostWeight = TMath::Log((1.+err)/(1-err))*fAdaBoostBeta;
Log() << kDEBUG << "BDT AdaBoos wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << " 1-err/err="<<boostWeight<< " log.."<<TMath::Log(boostWeight)<<Endl;
Results* results = Data()->GetResults(GetMethodName(),Types::kTraining, Types::kMaxAnalysisType);
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
if (fUseYesNoLeaf||DoRegression()){
if ((!( (dt->CheckEvent(*e,fUseYesNoLeaf) > fNodePurityLimit ) == DataInfo().IsSignal(*e))) || DoRegression()) {
Double_t boostfactor = TMath::Exp(boostWeight);
if (DoRegression()) boostfactor = TMath::Power(1/boostWeight,(1.-TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) )/maxDev ) );
if ( (*e)->GetWeight() > 0 ){
(*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
} else {
if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
}
}
}else{
Double_t dtoutput = (dt->CheckEvent(*e,fUseYesNoLeaf) - 0.5)*2.;
Int_t trueType;
if (DataInfo().IsSignal(*e)) trueType = 1;
else trueType = -1;
Double_t boostfactor = TMath::Exp(-1*boostWeight*trueType*dtoutput);
if ( (*e)->GetWeight() > 0 ){
(*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
} else {
if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
}
}
newSumGlobalw+=(*e)->GetWeight();
newSumw[(*e)->GetClass()] += (*e)->GetWeight();
}
Double_t globalNormWeight=( (Double_t) eventSample.size())/newSumGlobalw;
Log() << kDEBUG << "new Nsig="<<newSumw[0]*globalNormWeight << " new Nbkg="<<newSumw[1]*globalNormWeight << Endl;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
if (DataInfo().IsSignal(*e))(*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
else (*e)->ScaleBoostWeight( globalNormWeight );
}
if (!(DoRegression()))results->GetHist("BoostWeights")->Fill(boostWeight);
results->GetHist("BoostWeightsVsTree")->SetBinContent(fForest.size(),boostWeight);
results->GetHist("ErrorFrac")->SetBinContent(fForest.size(),err);
fBoostWeight = boostWeight;
fErrorFraction = err;
return boostWeight;
}
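////////////////////////////////////////////////////////////////////////////////
/// The AdaCost boosting algorithm takes a simple cost matrix (Css, Cts_sb,
/// Ctb_ss, Cbb) into account when boosting: the re-weighting of each event is
/// scaled by the cost associated with its (mis)classification.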
Double_t TMVA::MethodBDT::AdaCost( vector<const TMVA::Event*>& eventSample, DecisionTree *dt )
{
Double_t Css = fCss;
Double_t Cbb = fCbb;
Double_t Cts_sb = fCts_sb;
Double_t Ctb_ss = fCtb_ss;
Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;
std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);
std::map<Node*,Int_t> sigEventsInNode;
for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
sumGlobalWeights += w;
UInt_t iclass=(*e)->GetClass();
sumw[iclass] += w;
if ( DoRegression() ) {
Log() << kFATAL << " AdaCost not implemented for regression"<<Endl;
}else{
Double_t dtoutput = (dt->CheckEvent(*e,false) - 0.5)*2.;
Int_t trueType;
Bool_t isTrueSignal = DataInfo().IsSignal(*e);
Bool_t isSelectedSignal = (dtoutput>0);
if (isTrueSignal) trueType = 1;
else trueType = -1;
Double_t cost=0;
if (isTrueSignal && isSelectedSignal) cost=Css;
else if (isTrueSignal && !isSelectedSignal) cost=Cts_sb;
else if (!isTrueSignal && isSelectedSignal) cost=Ctb_ss;
else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
else Log() << kERROR << "something went wrong in AdaCost" << Endl;
sumGlobalCost+= w*trueType*dtoutput*cost;
}
}
if ( DoRegression() ) {
Log() << kFATAL << " AdaCost not implemented for regression"<<Endl;
}
sumGlobalCost /= sumGlobalWeights;
Double_t newSumGlobalWeights=0;
vector<Double_t> newSumClassWeights(sumw.size(),0);
Double_t boostWeight = TMath::Log((1+sumGlobalCost)/(1-sumGlobalCost)) * fAdaBoostBeta;
Results* results = Data()->GetResults(GetMethodName(),Types::kTraining, Types::kMaxAnalysisType);
for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t dtoutput = (dt->CheckEvent(*e,false) - 0.5)*2.;
Int_t trueType;
Bool_t isTrueSignal = DataInfo().IsSignal(*e);
Bool_t isSelectedSignal = (dtoutput>0);
if (isTrueSignal) trueType = 1;
else trueType = -1;
Double_t cost=0;
if (isTrueSignal && isSelectedSignal) cost=Css;
else if (isTrueSignal && !isSelectedSignal) cost=Cts_sb;
else if (!isTrueSignal && isSelectedSignal) cost=Ctb_ss;
else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
else Log() << kERROR << "something went wrong in AdaCost" << Endl;
Double_t boostfactor = TMath::Exp(-1*boostWeight*trueType*dtoutput*cost);
if (DoRegression())Log() << kFATAL << " AdaCost not implemented for regression"<<Endl;
if ( (*e)->GetWeight() > 0 ){
(*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
if (DoRegression())Log() << kFATAL << " AdaCost not implemented for regression"<<Endl;
} else {
if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
}
newSumGlobalWeights+=(*e)->GetWeight();
newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();
}
Double_t globalNormWeight=Double_t(eventSample.size())/newSumGlobalWeights;
Log() << kDEBUG << "new Nsig="<<newSumClassWeights[0]*globalNormWeight << " new Nbkg="<<newSumClassWeights[1]*globalNormWeight << Endl;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
if (DataInfo().IsSignal(*e))(*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
else (*e)->ScaleBoostWeight( globalNormWeight );
}
if (!(DoRegression()))results->GetHist("BoostWeights")->Fill(boostWeight);
results->GetHist("BoostWeightsVsTree")->SetBinContent(fForest.size(),boostWeight);
results->GetHist("ErrorFrac")->SetBinContent(fForest.size(),err);
fBoostWeight = boostWeight;
fErrorFraction = err;
return boostWeight;
}
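////////////////////////////////////////////////////////////////////////////////
/// Bagging: the boost weight is constant (1); the randomisation comes from
/// drawing a new bagged sub-sample for each tree in GetBaggedSubSample().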
Double_t TMVA::MethodBDT::Bagging( )
{
return 1.;
}
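////////////////////////////////////////////////////////////////////////////////
/// Fill fSubSample by drawing each training event a Poisson(fBaggedSampleFraction)
/// number of times, i.e. resampling with replacement.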
void TMVA::MethodBDT::GetBaggedSubSample(std::vector<const TMVA::Event*>& eventSample)
{
Double_t n;
TRandom3 *trandom = new TRandom3(100*fForest.size()+1234);
if (!fSubSample.empty()) fSubSample.clear();
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
n = trandom->PoissonD(fBaggedSampleFraction);
for (Int_t i=0;i<n;i++) fSubSample.push_back(*e);
}
delete trandom;
return;
}
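// Hook for a dedicated regression boost; it currently does nothing and simply
// returns a constant tree weight of 1.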
Double_t TMVA::MethodBDT::RegBoost( std::vector<const TMVA::Event*>& , DecisionTree* )
{
return 1;
}
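// AdaBoost for regression in the spirit of the AdaBoost.R2 prescription: per-event
// deviations |response - target| are normalised to the largest deviation in the sample
// and combined into an error rate using a linear, quadratic or exponential loss.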
Double_t TMVA::MethodBDT::AdaBoostR2( std::vector<const TMVA::Event*>& eventSample, DecisionTree *dt )
{
if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;
Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;
Double_t maxDev=0;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
sumw += w;
Double_t tmpDev = TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) );
sumwfalse += w * tmpDev;
sumwfalse2 += w * tmpDev*tmpDev;
if (tmpDev > maxDev) maxDev = tmpDev;
}
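// Turn the accumulated deviations into an error rate in [0,1], normalised by maxDev.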
if (fAdaBoostR2Loss=="linear"){
err = sumwfalse/maxDev/sumw ;
}
else if (fAdaBoostR2Loss=="quadratic"){
err = sumwfalse2/maxDev/maxDev/sumw ;
}
else if (fAdaBoostR2Loss=="exponential"){
err = 0;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t w = (*e)->GetWeight();
Double_t tmpDev = TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) );
err += w * (1 - exp (-tmpDev/maxDev)) / sumw;
}
}
else {
Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
<< " namely " << fAdaBoostR2Loss << "\n"
<< "and this is not implemented... a typo in the options ??" <<Endl;
}
if (err >= 0.5) {
if (dt->GetNNodes() == 1){
Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
<< "boost such a thing... if after 1 step the error rate is == 0.5"
<< Endl
<< "please check why this happens, maybe too many events per node requested ?"
<< Endl;
}else{
Log() << kERROR << " The error rate in the BDT boosting is > 0.5. ("<< err
<< ") That should not happen, but is possible for regression trees, and"
<< " should trigger a stop for the boosting. please check your code (i.e... the BDT code), I "
<< " stop boosting " << Endl;
return -1;
}
err = 0.5;
} else if (err < 0) {
Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
<< " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
<< " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
<< " for the time being I set it to its absolute value.. just to continue.." << Endl;
err = TMath::Abs(err);
}
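// beta = err/(1-err) < 1 for err < 0.5: events the tree already describes well
// (small deviation) are downweighted by beta^(1-dev/maxDev), while badly described
// events keep their weight; the tree itself enters the forest with weight ln(1/beta).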
Double_t boostWeight = err / (1.-err);
Double_t newSumw=0;
Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, Types::kMaxAnalysisType);
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
Double_t boostfactor = TMath::Power(boostWeight,(1.-TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) )/maxDev ) );
results->GetHist("BoostWeights")->Fill(boostfactor);
if ( (*e)->GetWeight() > 0 ){
Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
if (newWeight == 0) {
Log() << kINFO << "Weight= " << (*e)->GetWeight() << Endl;
Log() << kINFO << "BoostWeight= " << (*e)->GetBoostWeight() << Endl;
Log() << kINFO << "boostweight="<<boostWeight << " err= " <<err << Endl;
Log() << kINFO << "NewBoostWeight= " << newBoostWeight << Endl;
Log() << kINFO << "boostfactor= " << boostfactor << Endl;
Log() << kINFO << "maxDev = " << maxDev << Endl;
Log() << kINFO << "tmpDev = " << TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) ) << Endl;
Log() << kINFO << "target = " << (*e)->GetTarget(0) << Endl;
Log() << kINFO << "estimate = " << dt->CheckEvent(*e,kFALSE) << Endl;
}
(*e)->SetBoostWeight( newBoostWeight );
} else {
(*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
}
newSumw+=(*e)->GetWeight();
}
Double_t normWeight = sumw / newSumw;
for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();e++) {
(*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
}
results->GetHist("BoostWeightsVsTree")->SetBinContent(fForest.size(),1./boostWeight);
results->GetHist("ErrorFrac")->SetBinContent(fForest.size(),err);
fBoostWeight = boostWeight;
fErrorFraction = err;
return TMath::Log(1./boostWeight);
}
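// Write the forest to the XML weight file: optional preselection cuts as attributes,
// then one child node per tree together with its boost weight.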
void TMVA::MethodBDT::AddWeightsXMLTo( void* parent ) const
{
void* wght = gTools().AddChild(parent, "Weights");
if (fDoPreselection){
for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
gTools().AddAttr( wght, Form("PreselectionLowBkgVar%d",ivar), fIsLowBkgCut[ivar]);
gTools().AddAttr( wght, Form("PreselectionLowBkgVar%dValue",ivar), fLowBkgCut[ivar]);
gTools().AddAttr( wght, Form("PreselectionLowSigVar%d",ivar), fIsLowSigCut[ivar]);
gTools().AddAttr( wght, Form("PreselectionLowSigVar%dValue",ivar), fLowSigCut[ivar]);
gTools().AddAttr( wght, Form("PreselectionHighBkgVar%d",ivar), fIsHighBkgCut[ivar]);
gTools().AddAttr( wght, Form("PreselectionHighBkgVar%dValue",ivar),fHighBkgCut[ivar]);
gTools().AddAttr( wght, Form("PreselectionHighSigVar%d",ivar), fIsHighSigCut[ivar]);
gTools().AddAttr( wght, Form("PreselectionHighSigVar%dValue",ivar),fHighSigCut[ivar]);
}
}
gTools().AddAttr( wght, "NTrees", fForest.size() );
gTools().AddAttr( wght, "AnalysisType", fForest.back()->GetAnalysisType() );
for (UInt_t i=0; i< fForest.size(); i++) {
void* trxml = fForest[i]->AddXMLTo(wght);
gTools().AddAttr( trxml, "boostWeight", fBoostWeights[i] );
gTools().AddAttr( trxml, "itree", i );
}
}
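// Rebuild the forest from an XML weight file: read back the optional preselection
// cuts, then create one decision tree per child node and store its boost weight.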
void TMVA::MethodBDT::ReadWeightsFromXML(void* parent) {
UInt_t i;
for (i=0; i<fForest.size(); i++) delete fForest[i];
fForest.clear();
fBoostWeights.clear();
UInt_t ntrees;
UInt_t analysisType;
Float_t boostWeight;
if (gTools().HasAttr( parent, Form("PreselectionLowBkgVar%d",0))) {
fIsLowBkgCut.resize(GetNvar());
fLowBkgCut.resize(GetNvar());
fIsLowSigCut.resize(GetNvar());
fLowSigCut.resize(GetNvar());
fIsHighBkgCut.resize(GetNvar());
fHighBkgCut.resize(GetNvar());
fIsHighSigCut.resize(GetNvar());
fHighSigCut.resize(GetNvar());
Bool_t tmpBool;
Double_t tmpDouble;
for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
gTools().ReadAttr( parent, Form("PreselectionLowBkgVar%d",ivar), tmpBool);
fIsLowBkgCut[ivar]=tmpBool;
gTools().ReadAttr( parent, Form("PreselectionLowBkgVar%dValue",ivar), tmpDouble);
fLowBkgCut[ivar]=tmpDouble;
gTools().ReadAttr( parent, Form("PreselectionLowSigVar%d",ivar), tmpBool);
fIsLowSigCut[ivar]=tmpBool;
gTools().ReadAttr( parent, Form("PreselectionLowSigVar%dValue",ivar), tmpDouble);
fLowSigCut[ivar]=tmpDouble;
gTools().ReadAttr( parent, Form("PreselectionHighBkgVar%d",ivar), tmpBool);
fIsHighBkgCut[ivar]=tmpBool;
gTools().ReadAttr( parent, Form("PreselectionHighBkgVar%dValue",ivar), tmpDouble);
fHighBkgCut[ivar]=tmpDouble;
gTools().ReadAttr( parent, Form("PreselectionHighSigVar%d",ivar),tmpBool);
fIsHighSigCut[ivar]=tmpBool;
gTools().ReadAttr( parent, Form("PreselectionHighSigVar%dValue",ivar), tmpDouble);
fHighSigCut[ivar]=tmpDouble;
}
}
gTools().ReadAttr( parent, "NTrees", ntrees );
if(gTools().HasAttr(parent, "TreeType")) {
gTools().ReadAttr( parent, "TreeType", analysisType );
} else {
gTools().ReadAttr( parent, "AnalysisType", analysisType );
}
void* ch = gTools().GetChild(parent);
i=0;
while(ch) {
fForest.push_back( dynamic_cast<DecisionTree*>( DecisionTree::CreateFromXML(ch, GetTrainingTMVAVersionCode()) ) );
fForest.back()->SetAnalysisType(Types::EAnalysisType(analysisType));
fForest.back()->SetTreeID(i++);
gTools().ReadAttr(ch,"boostWeight",boostWeight);
fBoostWeights.push_back(boostWeight);
ch = gTools().GetNextChild(ch);
}
}
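// Read the forest from a plain-text weight file (older, non-XML format).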
void TMVA::MethodBDT::ReadWeightsFromStream( std::istream& istr )
{
TString dummy;
Int_t analysisType(0);
istr >> dummy >> fNTrees;
Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;
for (UInt_t i=0;i<fForest.size();i++) delete fForest[i];
fForest.clear();
fBoostWeights.clear();
Int_t iTree;
Double_t boostWeight;
for (int i=0;i<fNTrees;i++) {
istr >> dummy >> iTree >> dummy >> boostWeight;
if (iTree != i) {
fForest.back()->Print( std::cout );
Log() << kFATAL << "Error while reading weight file; mismatch iTree="
<< iTree << " i=" << i
<< " dummy " << dummy
<< " boostweight " << boostWeight
<< Endl;
}
fForest.push_back( new DecisionTree() );
fForest.back()->SetAnalysisType(Types::EAnalysisType(analysisType));
fForest.back()->SetTreeID(i);
fForest.back()->Read(istr, GetTrainingTMVAVersionCode());
fBoostWeights.push_back(boostWeight);
}
}
Double_t TMVA::MethodBDT::GetMvaValue( Double_t* err, Double_t* errUpper ){
return this->GetMvaValue( err, errUpper, 0 );
}
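// Evaluate the MVA value for the current event: if preselection cuts are active and
// the event is caught by one of them, return +-1 directly; otherwise evaluate the
// forest (optionally restricted to the first useNTrees trees).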
Double_t TMVA::MethodBDT::GetMvaValue( Double_t* err, Double_t* errUpper, UInt_t useNTrees )
{
const Event* ev = GetEvent();
if (fDoPreselection) {
Double_t val = ApplyPreselectionCuts(ev);
if (TMath::Abs(val)>0.05) return val;
}
return PrivateGetMvaValue(ev, err, errUpper, useNTrees);
}
Double_t TMVA::MethodBDT::PrivateGetMvaValue(const TMVA::Event* ev, Double_t* err, Double_t* errUpper, UInt_t useNTrees )
{
NoErrorCalc(err, errUpper);
UInt_t nTrees = fForest.size();
if (useNTrees > 0 ) nTrees = useNTrees;
if (fBoostType=="Grad") return GetGradBoostMVA(ev,nTrees);
Double_t myMVA = 0;
Double_t norm = 0;
for (UInt_t itree=0; itree<nTrees; itree++) {
myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,fUseYesNoLeaf);
norm += fBoostWeights[itree];
}
return ( norm > std::numeric_limits<double>::epsilon() ) ? myMVA / norm : 0;
}
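// Multiclass response: trees are stored interleaved by class (tree index modulo the
// number of classes), so the per-class score is the sum of the responses of that
// class's trees; the scores are then mapped to probabilities with a softmax.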
const std::vector<Float_t>& TMVA::MethodBDT::GetMulticlassValues()
{
const TMVA::Event *e = GetEvent();
if (fMulticlassReturnVal == NULL) fMulticlassReturnVal = new std::vector<Float_t>();
fMulticlassReturnVal->clear();
std::vector<double> temp;
UInt_t nClasses = DataInfo().GetNClasses();
for(UInt_t iClass=0; iClass<nClasses; iClass++){
temp.push_back(0.0);
for(UInt_t itree = iClass; itree<fForest.size(); itree+=nClasses){
temp[iClass] += fForest[itree]->CheckEvent(e,kFALSE);
}
}
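// Softmax written in a numerically stable form:
// p_i = exp(F_i) / sum_j exp(F_j) = 1 / (1 + sum_{j!=i} exp(F_j - F_i)).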
for(UInt_t iClass=0; iClass<nClasses; iClass++){
Double_t norm = 0.0;
for(UInt_t j=0;j<nClasses;j++){
if(iClass!=j)
norm+=exp(temp[j]-temp[iClass]);
}
(*fMulticlassReturnVal).push_back(1.0/(1.0+norm));
}
return *fMulticlassReturnVal;
}
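// Regression response: AdaBoostR2 forests use a weighted median of the per-tree
// responses, gradient-boosted forests the sum of tree responses plus the initial
// offset, everything else a boost-weight-averaged response; the result is mapped
// back through the inverse target transformation.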
const std::vector<Float_t> & TMVA::MethodBDT::GetRegressionValues()
{
if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector<Float_t>();
fRegressionReturnVal->clear();
const Event * ev = GetEvent();
Event * evT = new Event(*ev);
Double_t myMVA = 0;
Double_t norm = 0;
if (fBoostType=="AdaBoostR2") {
vector< Double_t > response(fForest.size());
vector< Double_t > weight(fForest.size());
Double_t totalSumOfWeights = 0;
for (UInt_t itree=0; itree<fForest.size(); itree++) {
response[itree] = fForest[itree]->CheckEvent(ev,kFALSE);
weight[itree] = fBoostWeights[itree];
totalSumOfWeights += fBoostWeights[itree];
}
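// Weighted median of the per-tree responses: sort the responses, walk along the
// cumulative boost weights up to half the total, then average the responses in a
// window of about one third of the forest around that point.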
std::vector< std::vector<Double_t> > vtemp;
vtemp.push_back( response );
vtemp.push_back( weight );
gTools().UsefulSortAscending( vtemp );
Int_t t=0;
Double_t sumOfWeights = 0;
while (sumOfWeights <= totalSumOfWeights/2.) {
sumOfWeights += vtemp[1][t];
t++;
}
Double_t rVal=0;
Int_t count=0;
for (UInt_t i= TMath::Max(UInt_t(0),UInt_t(t-(fForest.size()/6)-0.5));
i< TMath::Min(UInt_t(fForest.size()),UInt_t(t+(fForest.size()/6)+0.5)); i++) {
count++;
rVal+=vtemp[0][i];
}
evT->SetTarget(0, rVal/Double_t(count) );
}
else if(fBoostType=="Grad"){
for (UInt_t itree=0; itree<fForest.size(); itree++) {
myMVA += fForest[itree]->CheckEvent(ev,kFALSE);
}
evT->SetTarget(0, myMVA+fBoostWeights[0] );
}
else{
for (UInt_t itree=0; itree<fForest.size(); itree++) {
myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,kFALSE);
norm += fBoostWeights[itree];
}
evT->SetTarget(0, ( norm > std::numeric_limits<double>::epsilon() ) ? myMVA / norm : 0 );
}
const Event* evT2 = GetTransformationHandler().InverseTransform( evT );
fRegressionReturnVal->push_back( evT2->GetTarget(0) );
delete evT;
return *fRegressionReturnVal;
}
void TMVA::MethodBDT::WriteMonitoringHistosToFile( void ) const
{
Log() << kINFO << "Write monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
fMonitorNtuple->Write();
}
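// Variable importance of the forest: the per-tree importances are summed with the
// tree boost weights, the square root is taken, and the result is normalised to
// unit sum over all variables.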
vector< Double_t > TMVA::MethodBDT::GetVariableImportance()
{
fVariableImportance.resize(GetNvar());
for (UInt_t ivar = 0; ivar < GetNvar(); ivar++) {
fVariableImportance[ivar]=0;
}
Double_t sum=0;
for (UInt_t itree = 0; itree < GetNTrees(); itree++) {
std::vector<Double_t> relativeImportance(fForest[itree]->GetVariableImportance());
for (UInt_t i=0; i< relativeImportance.size(); i++) {
fVariableImportance[i] += fBoostWeights[itree] * relativeImportance[i];
}
}
for (UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++){
fVariableImportance[ivar] = TMath::Sqrt(fVariableImportance[ivar]);
sum += fVariableImportance[ivar];
}
for (UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++) fVariableImportance[ivar] /= sum;
return fVariableImportance;
}
Double_t TMVA::MethodBDT::GetVariableImportance( UInt_t ivar )
{
std::vector<Double_t> relativeImportance = this->GetVariableImportance();
if (ivar < (UInt_t)relativeImportance.size()) return relativeImportance[ivar];
else Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
return -1;
}
const TMVA::Ranking* TMVA::MethodBDT::CreateRanking()
{
fRanking = new Ranking( GetName(), "Variable Importance" );
vector< Double_t> importance(this->GetVariableImportance());
for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
fRanking->AddRank( Rank( GetInputLabel(ivar), importance[ivar] ) );
}
return fRanking;
}
void TMVA::MethodBDT::GetHelpMessage() const
{
Log() << Endl;
Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
Log() << Endl;
Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
Log() << "trained using the original training data set with re-weighted " << Endl;
Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
Log() << "events that were misclassified in the previous tree a larger " << Endl;
Log() << "weight in the training of the following tree." << Endl;
Log() << Endl;
Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
Log() << "using a single descriminant variable at a time. A test event " << Endl;
Log() << "ending up after the sequence of left-right splits in a final " << Endl;
Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
Log() << "depending on the majority type of training events in that node." << Endl;
Log() << Endl;
Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
Log() << Endl;
Log() << "By the nature of the binary splits performed on the individual" << Endl;
Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
Log() << "between variables (they need to approximate the linear split in" << Endl;
Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
Log() << "variables individually). Hence decorrelation could be useful " << Endl;
Log() << "to optimise the BDT performance." << Endl;
Log() << Endl;
Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
Log() << Endl;
Log() << "The two most important parameters in the configuration are the " << Endl;
Log() << "minimal number of events requested by a leaf node as percentage of the " <<Endl;
Log() << " number of training events (option \"MinNodeSize\" replacing the actual number " << Endl;
Log() << " of events \"nEventsMin\" as given in earlier versions" << Endl;
Log() << "If this number is too large, detailed features " << Endl;
Log() << "in the parameter space are hard to be modelled. If it is too small, " << Endl;
Log() << "the risk to overtrain rises and boosting seems to be less effective" << Endl;
Log() << " typical values from our current expericience for best performance " << Endl;
Log() << " are between 0.5(%) and 10(%) " << Endl;
Log() << Endl;
Log() << "The default minimal number is currently set to " << Endl;
Log() << " max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
Log() << "and can be changed by the user." << Endl;
Log() << Endl;
Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
Log() << "is also related to overtraining. It is a regularisation parameter " << Endl;
Log() << "that is used when determining after the training which splits " << Endl;
Log() << "are considered statistically insignificant and are removed. The" << Endl;
Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
Log() << "the comparison between efficiencies obtained on the training and" << Endl;
Log() << "the independent test sample. They should be equal within statistical" << Endl;
Log() << "errors, in order to minimize statistical fluctuations in different samples." << Endl;
}
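// Write the response function of the standalone C++ class: optional preselection
// cuts first, then a walk through each tree down to a leaf; the result is either
// the boost-weight-averaged leaf value or, for gradient boosting, the sum of leaf
// responses mapped through 2/(1+exp(-2*F))-1.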
void TMVA::MethodBDT::MakeClassSpecific( std::ostream& fout, const TString& className ) const
{
TString nodeName = className;
nodeName.ReplaceAll("Read","");
nodeName.Append("Node");
fout << " std::vector<"<<nodeName<<"*> fForest; // i.e. root nodes of decision trees" << std::endl;
fout << " std::vector<double> fBoostWeights; // the weights applied in the individual boosts" << std::endl;
fout << "};" << std::endl << std::endl;
fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << " double myMVA = 0;" << std::endl;
if (fDoPreselection){
for (UInt_t ivar = 0; ivar< fIsLowBkgCut.size(); ivar++){
if (fIsLowBkgCut[ivar]){
fout << " if (inputValues["<<ivar<<"] < " << fLowBkgCut[ivar] << ") return -1; // is background preselection cut" << std::endl;
}
if (fIsLowSigCut[ivar]){
fout << " if (inputValues["<<ivar<<"] < "<< fLowSigCut[ivar] << ") return 1; // is signal preselection cut" << std::endl;
}
if (fIsHighBkgCut[ivar]){
fout << " if (inputValues["<<ivar<<"] > "<<fHighBkgCut[ivar] <<") return -1; // is background preselection cut" << std::endl;
}
if (fIsHighSigCut[ivar]){
fout << " if (inputValues["<<ivar<<"] > "<<fHighSigCut[ivar]<<") return 1; // is signal preselection cut" << std::endl;
}
}
}
if (fBoostType!="Grad"){
fout << " double norm = 0;" << std::endl;
}
fout << " for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
fout << " "<<nodeName<<" *current = fForest[itree];" << std::endl;
fout << " while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
fout << " if (current->GoesRight(inputValues)) current=("<<nodeName<<"*)current->GetRight();" << std::endl;
fout << " else current=("<<nodeName<<"*)current->GetLeft();" << std::endl;
fout << " }" << std::endl;
if (fBoostType=="Grad"){
fout << " myMVA += current->GetResponse();" << std::endl;
}else{
if (fUseYesNoLeaf) fout << " myMVA += fBoostWeights[itree] * current->GetNodeType();" << std::endl;
else fout << " myMVA += fBoostWeights[itree] * current->GetPurity();" << std::endl;
fout << " norm += fBoostWeights[itree];" << std::endl;
}
fout << " }" << std::endl;
if (fBoostType=="Grad"){
fout << " return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;
}
else fout << " return myMVA /= norm;" << std::endl;
fout << "};" << std::endl << std::endl;
fout << "void " << className << "::Initialize()" << std::endl;
fout << "{" << std::endl;
for (UInt_t itree=0; itree<GetNTrees(); itree++) {
fout << " // itree = " << itree << std::endl;
fout << " fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << std::endl;
fout << " fForest.push_back( " << std::endl;
this->MakeClassInstantiateNode((DecisionTreeNode*)fForest[itree]->GetRoot(), fout, className);
fout <<" );" << std::endl;
}
fout << " return;" << std::endl;
fout << "};" << std::endl;
fout << " " << std::endl;
fout << "// Clean up" << std::endl;
fout << "inline void " << className << "::Clear() " << std::endl;
fout << "{" << std::endl;
fout << " for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
fout << " delete fForest[itree]; " << std::endl;
fout << " }" << std::endl;
fout << "}" << std::endl;
}
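// Write the lightweight node class used by the standalone C++ classifier, including
// the GoesRight/GoesLeft navigation with optional Fisher-discriminant cuts.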
void TMVA::MethodBDT::MakeClassSpecificHeader( std::ostream& fout, const TString& className) const
{
TString nodeName = className;
nodeName.ReplaceAll("Read","");
nodeName.Append("Node");
fout << "#define NN new "<<nodeName << std::endl;
fout << " " << std::endl;
fout << "#ifndef "<<nodeName<<"__def" << std::endl;
fout << "#define "<<nodeName<<"__def" << std::endl;
fout << " " << std::endl;
fout << "class "<<nodeName<<" {" << std::endl;
fout << " " << std::endl;
fout << "public:" << std::endl;
fout << " " << std::endl;
fout << " // constructor of an essentially \"empty\" node floating in space" << std::endl;
fout << " "<<nodeName<<" ( "<<nodeName<<"* left,"<<nodeName<<"* right," << std::endl;
if (fUseFisherCuts){
fout << " int nFisherCoeff," << std::endl;
for (UInt_t i=0;i<GetNVariables()+1;i++){
fout << " double fisherCoeff"<<i<<"," << std::endl;
}
}
fout << " int selector, double cutValue, bool cutType, " << std::endl;
fout << " int nodeType, double purity, double response ) :" << std::endl;
fout << " fLeft ( left )," << std::endl;
fout << " fRight ( right )," << std::endl;
if (fUseFisherCuts) fout << " fNFisherCoeff ( nFisherCoeff )," << std::endl;
fout << " fSelector ( selector )," << std::endl;
fout << " fCutValue ( cutValue )," << std::endl;
fout << " fCutType ( cutType )," << std::endl;
fout << " fNodeType ( nodeType )," << std::endl;
fout << " fPurity ( purity )," << std::endl;
fout << " fResponse ( response ){" << std::endl;
if (fUseFisherCuts){
for (UInt_t i=0;i<GetNVariables()+1;i++){
fout << " fFisherCoeff.push_back(fisherCoeff"<<i<<");" << std::endl;
}
}
fout << " }" << std::endl << std::endl;
fout << " virtual ~"<<nodeName<<"();" << std::endl << std::endl;
fout << " // test event if it decends the tree at this node to the right" << std::endl;
fout << " virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
fout << " "<<nodeName<<"* GetRight( void ) {return fRight; };" << std::endl << std::endl;
fout << " // test event if it decends the tree at this node to the left " << std::endl;
fout << " virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
fout << " "<<nodeName<<"* GetLeft( void ) { return fLeft; }; " << std::endl << std::endl;
fout << " // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
fout << " double GetPurity( void ) const { return fPurity; } " << std::endl;
fout << " // return the node type" << std::endl;
fout << " int GetNodeType( void ) const { return fNodeType; }" << std::endl;
fout << " double GetResponse(void) const {return fResponse;}" << std::endl << std::endl;
fout << "private:" << std::endl << std::endl;
fout << " "<<nodeName<<"* fLeft; // pointer to the left daughter node" << std::endl;
fout << " "<<nodeName<<"* fRight; // pointer to the right daughter node" << std::endl;
if (fUseFisherCuts){
fout << " int fNFisherCoeff; // =0 if this node doesn use fisher, else =nvar+1 " << std::endl;
fout << " std::vector<double> fFisherCoeff; // the fisher coeff (offset at the last element)" << std::endl;
}
fout << " int fSelector; // index of variable used in node selection (decision tree) " << std::endl;
fout << " double fCutValue; // cut value appplied on this node to discriminate bkg against sig" << std::endl;
fout << " bool fCutType; // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
fout << " int fNodeType; // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << std::endl;
fout << " double fPurity; // Purity of node from training"<< std::endl;
fout << " double fResponse; // Regression response value of node" << std::endl;
fout << "}; " << std::endl;
fout << " " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << " "<<nodeName<<"::~"<<nodeName<<"()" << std::endl;
fout << "{" << std::endl;
fout << " if (fLeft != NULL) delete fLeft;" << std::endl;
fout << " if (fRight != NULL) delete fRight;" << std::endl;
fout << "}; " << std::endl;
fout << " " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "bool "<<nodeName<<"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << " // test event if it decends the tree at this node to the right" << std::endl;
fout << " bool result;" << std::endl;
if (fUseFisherCuts){
fout << " if (fNFisherCoeff == 0){" << std::endl;
fout << " result = (inputValues[fSelector] > fCutValue );" << std::endl;
fout << " }else{" << std::endl;
fout << " double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
fout << " for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
fout << " fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
fout << " result = fisher > fCutValue;" << std::endl;
fout << " }" << std::endl;
}else{
fout << " result = (inputValues[fSelector] > fCutValue );" << std::endl;
}
fout << " if (fCutType == true) return result; //the cuts are selecting Signal ;" << std::endl;
fout << " else return !result;" << std::endl;
fout << "}" << std::endl;
fout << " " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "bool "<<nodeName<<"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << " // test event if it decends the tree at this node to the left" << std::endl;
fout << " if (!this->GoesRight(inputValues)) return true;" << std::endl;
fout << " else return false;" << std::endl;
fout << "}" << std::endl;
fout << " " << std::endl;
fout << "#endif" << std::endl;
fout << " " << std::endl;
}
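// Recursively write one nested "NN(...)" constructor call per node of the tree, so
// that the standalone class can rebuild the forest in its Initialize() method.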
void TMVA::MethodBDT::MakeClassInstantiateNode( DecisionTreeNode *n, std::ostream& fout, const TString& className ) const
{
if (n == NULL) {
Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" <<Endl;
return ;
}
fout << "NN("<<std::endl;
if (n->GetLeft() != NULL){
this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetLeft() , fout, className);
}
else {
fout << "0";
}
fout << ", " <<std::endl;
if (n->GetRight() != NULL){
this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetRight(), fout, className );
}
else {
fout << "0";
}
fout << ", " << std::endl
<< std::setprecision(6);
if (fUseFisherCuts){
fout << n->GetNFisherCoeff() << ", ";
for (UInt_t i=0; i< GetNVariables()+1; i++) {
if (n->GetNFisherCoeff() == 0 ){
fout << "0, ";
}else{
fout << n->GetFisherCoeff(i) << ", ";
}
}
}
fout << n->GetSelector() << ", "
<< n->GetCutValue() << ", "
<< n->GetCutType() << ", "
<< n->GetNodeType() << ", "
<< n->GetPurity() << ","
<< n->GetResponse() << ") ";
}
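// Scan each input variable (with events sorted in that variable) for cut values that
// remove only background or only signal: a cut is suggested if it cuts away at least
// 5% of one class while keeping the other class untouched, and it is placed one
// percent of the variable range beyond the last affected event.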
void TMVA::MethodBDT::DeterminePreselectionCuts(const std::vector<const TMVA::Event*>& eventSample)
{
Double_t nTotS = 0.0, nTotB = 0.0;
Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
std::vector<TMVA::BDTEventWrapper> bdtEventSample;
fIsLowSigCut.assign(GetNvar(),kFALSE);
fIsLowBkgCut.assign(GetNvar(),kFALSE);
fIsHighSigCut.assign(GetNvar(),kFALSE);
fIsHighBkgCut.assign(GetNvar(),kFALSE);
fLowSigCut.assign(GetNvar(),0.);
fLowBkgCut.assign(GetNvar(),0.);
fHighSigCut.assign(GetNvar(),0.);
fHighBkgCut.assign(GetNvar(),0.);
for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
if (DataInfo().IsSignal(*it)){
nTotS += (*it)->GetWeight();
++nTotS_unWeighted;
}
else {
nTotB += (*it)->GetWeight();
++nTotB_unWeighted;
}
bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
}
for( UInt_t ivar = 0; ivar < GetNvar(); ivar++ ) {
TMVA::BDTEventWrapper::SetVarIndex(ivar);
std::sort( bdtEventSample.begin(),bdtEventSample.end() );
Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
for( ; it != it_end; ++it ) {
if (DataInfo().IsSignal(**it))
sigWeightCtr += (**it)->GetWeight();
else
bkgWeightCtr += (**it)->GetWeight();
it->SetCumulativeWeight(false,bkgWeightCtr);
it->SetCumulativeWeight(true,sigWeightCtr);
}
Double_t dVal = (DataInfo().GetVariableInfo(ivar).GetMax() - DataInfo().GetVariableInfo(ivar).GetMin())/100. ;
Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;
for(UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
nSelS = bdtEventSample[iev].GetCumulativeWeight(true);
nSelB = bdtEventSample[iev].GetCumulativeWeight(false);
tmpEffS=nSelS/nTotS;
tmpEffB=nSelB/nTotB;
tmpRejS=1-tmpEffS;
tmpRejB=1-tmpEffB;
if (nSelS==0 && tmpEffB>effB) {effB=tmpEffB; fLowBkgCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowBkgCut[ivar]=kTRUE;}
else if (nSelB==0 && tmpEffS>effS) {effS=tmpEffS; fLowSigCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowSigCut[ivar]=kTRUE;}
else if (nSelB==nTotB && tmpRejS>rejS) {rejS=tmpRejS; fHighSigCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighSigCut[ivar]=kTRUE;}
else if (nSelS==nTotS && tmpRejB>rejB) {rejB=tmpRejB; fHighBkgCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighBkgCut[ivar]=kTRUE;}
}
}
Log() << kINFO << " found and suggest the following possible pre-selection cuts " << Endl;
if (fDoPreselection) Log() << kINFO << "the training will be done after these cuts... and GetMVA value returns +1, (-1) for a signal (bkg) event that passes these cuts" << Endl;
else Log() << kINFO << "as option DoPreselection was not used, these cuts however will not be performed, but the training will see the full sample"<<Endl;
for (UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
if (fIsLowBkgCut[ivar]){
Log() << kINFO << " found cut: Bkg if var " << ivar << " < " << fLowBkgCut[ivar] << Endl;
}
if (fIsLowSigCut[ivar]){
Log() << kINFO << " found cut: Sig if var " << ivar << " < " << fLowSigCut[ivar] << Endl;
}
if (fIsHighBkgCut[ivar]){
Log() << kINFO << " found cut: Bkg if var " << ivar << " > " << fHighBkgCut[ivar] << Endl;
}
if (fIsHighSigCut[ivar]){
Log() << kINFO << " found cut: Sig if var " << ivar << " > " << fHighSigCut[ivar] << Endl;
}
}
return;
}
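// Apply the preselection cuts determined above to a single event: returns -1 if the
// event falls into a background-only region, +1 for a signal-only region, and 0 if
// no preselection cut applies.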
Double_t TMVA::MethodBDT::ApplyPreselectionCuts(const Event* ev)
{
Double_t result=0;
for (UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
if (fIsLowBkgCut[ivar]){
if (ev->GetValue(ivar) < fLowBkgCut[ivar]) result = -1;
}
if (fIsLowSigCut[ivar]){
if (ev->GetValue(ivar) < fLowSigCut[ivar]) result = 1;
}
if (fIsHighBkgCut[ivar]){
if (ev->GetValue(ivar) > fHighBkgCut[ivar]) result = -1;
}
if (fIsHighSigCut[ivar]){
if (ev->GetValue(ivar) > fHighSigCut[ivar]) result = 1;
}
}
return result;
}