149#include <unordered_map>
170 , fSigToBkgFraction(0)
175 , fBaggedGradBoost(
kFALSE)
179 , fMinNodeSizeS(
"5%")
182 , fMinLinCorrForFisher(.8)
183 , fUseExclusiveVars(0)
185 , fNodePurityLimit(0)
190 , fFValidationEvents(0)
192 , fRandomisedTrees(
kFALSE)
194 , fUsePoissonNvars(0)
195 , fUseNTrainEvents(0)
196 , fBaggedSampleFraction(0)
197 , fNoNegWeightsInTraining(
kFALSE)
198 , fInverseBoostNegWeights(
kFALSE)
199 , fPairNegWeightsGlobal(
kFALSE)
200 , fTrainWithNegWeights(
kFALSE)
210 , fSkipNormalization(
kFALSE)
225 , fSigToBkgFraction(0)
230 , fBaggedGradBoost(
kFALSE)
234 , fMinNodeSizeS(
"5%")
237 , fMinLinCorrForFisher(.8)
238 , fUseExclusiveVars(0)
240 , fNodePurityLimit(0)
245 , fFValidationEvents(0)
247 , fRandomisedTrees(
kFALSE)
249 , fUsePoissonNvars(0)
250 , fUseNTrainEvents(0)
251 , fBaggedSampleFraction(0)
252 , fNoNegWeightsInTraining(
kFALSE)
253 , fInverseBoostNegWeights(
kFALSE)
254 , fPairNegWeightsGlobal(
kFALSE)
255 , fTrainWithNegWeights(
kFALSE)
265 , fSkipNormalization(
kFALSE)
335 DeclareOptionRef(fNTrees,
"NTrees",
"Number of trees in the forest");
336 if (DoRegression()) {
337 DeclareOptionRef(fMaxDepth=50,
"MaxDepth",
"Max depth of the decision tree allowed");
339 DeclareOptionRef(fMaxDepth=3,
"MaxDepth",
"Max depth of the decision tree allowed");
342 TString tmp=
"5%";
if (DoRegression()) tmp=
"0.2%";
343 DeclareOptionRef(fMinNodeSizeS=tmp,
"MinNodeSize",
"Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");
345 DeclareOptionRef(fNCuts,
"nCuts",
"Number of grid points in variable range used in finding optimal cut in node splitting");
347 DeclareOptionRef(fBoostType,
"BoostType",
"Boosting type for the trees in the forest (note: AdaCost is still experimental)");
349 AddPreDefVal(
TString(
"AdaBoost"));
350 AddPreDefVal(
TString(
"RealAdaBoost"));
351 AddPreDefVal(
TString(
"AdaCost"));
352 AddPreDefVal(
TString(
"Bagging"));
354 AddPreDefVal(
TString(
"AdaBoostR2"));
356 if (DoRegression()) {
357 fBoostType =
"AdaBoostR2";
359 fBoostType =
"AdaBoost";
361 DeclareOptionRef(fAdaBoostR2Loss=
"Quadratic",
"AdaBoostR2Loss",
"Type of Loss function in AdaBoostR2");
362 AddPreDefVal(
TString(
"Linear"));
363 AddPreDefVal(
TString(
"Quadratic"));
364 AddPreDefVal(
TString(
"Exponential"));
366 DeclareOptionRef(fBaggedBoost=
kFALSE,
"UseBaggedBoost",
"Use only a random subsample of all events for growing the trees in each boost iteration.");
367 DeclareOptionRef(fShrinkage = 1.0,
"Shrinkage",
"Learning rate for BoostType=Grad algorithm");
368 DeclareOptionRef(fAdaBoostBeta=.5,
"AdaBoostBeta",
"Learning rate for AdaBoost algorithm");
369 DeclareOptionRef(fRandomisedTrees,
"UseRandomisedTrees",
"Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");
370 DeclareOptionRef(fUseNvars,
"UseNvars",
"Size of the subset of variables used with RandomisedTree option");
371 DeclareOptionRef(fUsePoissonNvars,
"UsePoissonNvars",
"Interpret \"UseNvars\" not as fixed number but as mean of a Poisson distribution in each split with RandomisedTree option");
372 DeclareOptionRef(fBaggedSampleFraction=.6,
"BaggedSampleFraction",
"Relative size of bagged event sample to original size of the data sample (used whenever bagging is used (i.e. UseBaggedBoost, Bagging,)" );
374 DeclareOptionRef(fUseYesNoLeaf=
kTRUE,
"UseYesNoLeaf",
375 "Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node -> Real-AdaBoost");
376 if (DoRegression()) {
380 DeclareOptionRef(fNegWeightTreatment=
"InverseBoostNegWeights",
"NegWeightTreatment",
"How to treat events with negative weights in the BDT training (particular the boosting) : IgnoreInTraining; Boost With inverse boostweight; Pair events with negative and positive weights in training sample and *annihilate* them (experimental!)");
381 AddPreDefVal(
TString(
"InverseBoostNegWeights"));
382 AddPreDefVal(
TString(
"IgnoreNegWeightsInTraining"));
383 AddPreDefVal(
TString(
"NoNegWeightsInTraining"));
384 AddPreDefVal(
TString(
"PairNegWeightsGlobal"));
389 DeclareOptionRef(fCss=1.,
"Css",
"AdaCost: cost of true signal selected signal");
390 DeclareOptionRef(fCts_sb=1.,
"Cts_sb",
"AdaCost: cost of true signal selected bkg");
391 DeclareOptionRef(fCtb_ss=1.,
"Ctb_ss",
"AdaCost: cost of true bkg selected signal");
392 DeclareOptionRef(fCbb=1.,
"Cbb",
"AdaCost: cost of true bkg selected bkg ");
394 DeclareOptionRef(fNodePurityLimit=0.5,
"NodePurityLimit",
"In boosting/pruning, nodes with purity > NodePurityLimit are signal; background otherwise.");
397 DeclareOptionRef(fSepTypeS,
"SeparationType",
"Separation criterion for node splitting");
398 AddPreDefVal(
TString(
"CrossEntropy"));
399 AddPreDefVal(
TString(
"GiniIndex"));
400 AddPreDefVal(
TString(
"GiniIndexWithLaplace"));
401 AddPreDefVal(
TString(
"MisClassificationError"));
402 AddPreDefVal(
TString(
"SDivSqrtSPlusB"));
403 AddPreDefVal(
TString(
"RegressionVariance"));
404 if (DoRegression()) {
405 fSepTypeS =
"RegressionVariance";
407 fSepTypeS =
"GiniIndex";
410 DeclareOptionRef(fRegressionLossFunctionBDTGS =
"Huber",
"RegressionLossFunctionBDTG",
"Loss function for BDTG regression.");
411 AddPreDefVal(
TString(
"Huber"));
412 AddPreDefVal(
TString(
"AbsoluteDeviation"));
413 AddPreDefVal(
TString(
"LeastSquares"));
415 DeclareOptionRef(fHuberQuantile = 0.7,
"HuberQuantile",
"In the Huber loss function this is the quantile that separates the core from the tails in the residuals distribution.");
417 DeclareOptionRef(fDoBoostMonitor=
kFALSE,
"DoBoostMonitor",
"Create control plot with ROC integral vs tree number");
419 DeclareOptionRef(fUseFisherCuts=
kFALSE,
"UseFisherCuts",
"Use multivariate splits using the Fisher criterion");
420 DeclareOptionRef(fMinLinCorrForFisher=.8,
"MinLinCorrForFisher",
"The minimum linear correlation between two variables demanded for use in Fisher criterion in node splitting");
421 DeclareOptionRef(fUseExclusiveVars=
kFALSE,
"UseExclusiveVars",
"Variables already used in fisher criterion are not anymore analysed individually for node splitting");
// Declares the boolean option "DoPreselection" (default kFALSE).
// The help text used to start with a duplicated "and and", leaving the sentence
// without a verb; it now states what the option does.
424 DeclareOptionRef(fDoPreselection=
kFALSE,
"DoPreselection",
"Determine and apply automatic pre-selection for 100% efficient signal (bkg) cuts prior to training");
// Declares the option "SigToBkgFraction" (default 1).
// The help text was missing its closing parenthesis.
427 DeclareOptionRef(fSigToBkgFraction=1,
"SigToBkgFraction",
"Sig to Bkg ratio used in Training (similar to NodePurityLimit, which cannot be used in real adaboost)");
429 DeclareOptionRef(fPruneMethodS,
"PruneMethod",
"Note: for BDTs use small trees (e.g.MaxDepth=3) and NoPruning: Pruning: Method used for pruning (removal) of statistically insignificant branches ");
430 AddPreDefVal(
TString(
"NoPruning"));
431 AddPreDefVal(
TString(
"ExpectedError"));
432 AddPreDefVal(
TString(
"CostComplexity"));
434 DeclareOptionRef(fPruneStrength,
"PruneStrength",
"Pruning strength");
436 DeclareOptionRef(fFValidationEvents=0.5,
"PruningValFraction",
"Fraction of events to use for optimizing automatic pruning.");
438 DeclareOptionRef(fSkipNormalization=
kFALSE,
"SkipNormalization",
"Skip normalization at initialization, to keep expectation value of BDT output according to the fraction of events");
441 DeclareOptionRef(fMinNodeEvents=0,
"nEventsMin",
"deprecated: Use MinNodeSize (in % of training events) instead");
443 DeclareOptionRef(fBaggedGradBoost=
kFALSE,
"UseBaggedGrad",
"deprecated: Use *UseBaggedBoost* instead: Use only a random subsample of all events for growing the trees in each iteration.");
444 DeclareOptionRef(fBaggedSampleFraction,
"GradBaggingFraction",
"deprecated: Use *BaggedSampleFraction* instead: Defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE. ");
445 DeclareOptionRef(fUseNTrainEvents,
"UseNTrainEvents",
"deprecated: Use *BaggedSampleFraction* instead: Number of randomly picked training events used in randomised (and bagged) trees");
446 DeclareOptionRef(fNNodesMax,
"NNodesMax",
"deprecated: Use MaxDepth instead to limit the tree size" );
458 DeclareOptionRef(fHistoricBool=
kTRUE,
"UseWeightedTrees",
459 "Use weighted trees or simple average in classification from the forest");
460 DeclareOptionRef(fHistoricBool=
kFALSE,
"PruneBeforeBoost",
"Flag to prune the tree before applying boosting algorithm");
461 DeclareOptionRef(fHistoricBool=
kFALSE,
"RenormByClass",
"Individually re-normalize each event class to the original size after boosting");
463 AddPreDefVal(
TString(
"NegWeightTreatment"),
TString(
"IgnoreNegWeights"));
474 else if (fSepTypeS ==
"giniindex") fSepType =
new GiniIndex();
476 else if (fSepTypeS ==
"crossentropy") fSepType =
new CrossEntropy();
477 else if (fSepTypeS ==
"sdivsqrtsplusb") fSepType =
new SdivSqrtSplusB();
478 else if (fSepTypeS ==
"regressionvariance") fSepType = NULL;
480 Log() << kINFO << GetOptions() <<
Endl;
481 Log() << kFATAL <<
"<ProcessOptions> unknown Separation Index option " << fSepTypeS <<
" called" <<
Endl;
484 if(!(fHuberQuantile >= 0.0 && fHuberQuantile <= 1.0)){
485 Log() << kINFO << GetOptions() <<
Endl;
486 Log() << kFATAL <<
"<ProcessOptions> Huber Quantile must be in range [0,1]. Value given, " << fHuberQuantile <<
", does not match this criteria" <<
Endl;
490 fRegressionLossFunctionBDTGS.ToLower();
491 if (fRegressionLossFunctionBDTGS ==
"huber") fRegressionLossFunctionBDTG =
new HuberLossFunctionBDT(fHuberQuantile);
495 Log() << kINFO << GetOptions() <<
Endl;
496 Log() << kFATAL <<
"<ProcessOptions> unknown Regression Loss Function BDT option " << fRegressionLossFunctionBDTGS <<
" called" <<
Endl;
499 fPruneMethodS.ToLower();
504 Log() << kINFO << GetOptions() <<
Endl;
505 Log() << kFATAL <<
"<ProcessOptions> unknown PruneMethod " << fPruneMethodS <<
" option called" <<
Endl;
511 <<
"Sorry automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" <<
Endl;
515 if (fMinNodeEvents > 0){
516 fMinNodeSize =
Double_t(fMinNodeEvents*100.) / Data()->GetNTrainingEvents();
517 Log() << kWARNING <<
"You have explicitly set ** nEventsMin = " << fMinNodeEvents<<
" ** the min absolute number \n"
518 <<
"of events in a leaf node. This is DEPRECATED, please use the option \n"
519 <<
"*MinNodeSize* giving the relative number as percentage of training \n"
520 <<
"events instead. \n"
521 <<
"nEventsMin="<<fMinNodeEvents<<
"--> MinNodeSize="<<fMinNodeSize<<
"%"
523 Log() << kWARNING <<
"Note also that explicitly setting *nEventsMin* so far OVERWRITES the option recommended \n"
524 <<
" *MinNodeSize* = " << fMinNodeSizeS <<
" option !!" <<
Endl ;
// Keep the textual option in sync with the percentage just derived from nEventsMin.
// The conversion has to be "%3.2f": the former "%F3.2" formatted the value with the
// default precision and then appended the literal characters "3.2", producing a
// string that is not a valid number.
525 fMinNodeSizeS =
Form(
"%3.2f",fMinNodeSize);
528 SetMinNodeSize(fMinNodeSizeS);
532 fAdaBoostR2Loss.ToLower();
534 if (fBoostType==
"Grad") {
536 if (fNegWeightTreatment==
"InverseBoostNegWeights"){
537 Log() << kINFO <<
"the option NegWeightTreatment=InverseBoostNegWeights does"
538 <<
" not exist for BoostType=Grad" <<
Endl;
539 Log() << kINFO <<
"--> change to new default NegWeightTreatment=Pray" <<
Endl;
540 Log() << kDEBUG <<
"i.e. simply keep them as if which should work fine for Grad Boost" <<
Endl;
541 fNegWeightTreatment=
"Pray";
542 fNoNegWeightsInTraining=
kFALSE;
544 }
else if (fBoostType==
"RealAdaBoost"){
545 fBoostType =
"AdaBoost";
547 }
else if (fBoostType==
"AdaCost"){
551 if (fFValidationEvents < 0.0) fFValidationEvents = 0.0;
552 if (fAutomatic && fFValidationEvents > 0.5) {
553 Log() << kWARNING <<
"You have chosen to use more than half of your training sample "
554 <<
"to optimize the automatic pruning algorithm. This is probably wasteful "
555 <<
"and your overall results will be degraded. Are you sure you want this?"
560 if (this->Data()->HasNegativeEventWeights()){
561 Log() << kINFO <<
" You are using a Monte Carlo that has also negative weights. "
562 <<
"That should in principle be fine as long as on average you end up with "
563 <<
"something positive. For this you have to make sure that the minimal number "
564 <<
"of (un-weighted) events demanded for a tree node (currently you use: MinNodeSize="
565 << fMinNodeSizeS <<
" ("<< fMinNodeSize <<
"%)"
566 <<
", (or the deprecated equivalent nEventsMin) you can set this via the "
567 <<
"BDT option string when booking the "
568 <<
"classifier) is large enough to allow for reasonable averaging!!! "
569 <<
" If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining "
570 <<
"which ignores events with negative weight in the training. " <<
Endl
571 <<
Endl <<
"Note: You'll get a WARNING message during the training if that should ever happen" <<
Endl;
574 if (DoRegression()) {
575 if (fUseYesNoLeaf && !IsConstructedFromWeightFile()){
576 Log() << kWARNING <<
"Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" <<
Endl;
580 if (fSepType != NULL){
581 Log() << kWARNING <<
"Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" <<
Endl;
585 Log() << kWARNING <<
"Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" <<
Endl;
589 Log() << kWARNING <<
"Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " <<
Endl;
590 Log() << kWARNING <<
"is not implemented for regression analysis ! " <<
Endl;
591 Log() << kWARNING <<
"--> I switch do default nCuts = 20 and use standard node splitting"<<
Endl;
595 if (fRandomisedTrees){
596 Log() << kINFO <<
" Randomised trees use no pruning" <<
Endl;
601 if (fUseFisherCuts) {
602 Log() << kWARNING <<
"When using the option UseFisherCuts, the other option nCuts<0 (i.e. using" <<
Endl;
603 Log() <<
" a more elaborate node splitting algorithm) is not implemented. " <<
Endl;
610 Log() << kERROR <<
" Zero Decision Trees demanded... that does not work !! "
611 <<
" I set it to 1 .. just so that the program does not crash"
616 fNegWeightTreatment.ToLower();
617 if (fNegWeightTreatment ==
"ignorenegweightsintraining") fNoNegWeightsInTraining =
kTRUE;
618 else if (fNegWeightTreatment ==
"nonegweightsintraining") fNoNegWeightsInTraining =
kTRUE;
619 else if (fNegWeightTreatment ==
"inverseboostnegweights") fInverseBoostNegWeights =
kTRUE;
620 else if (fNegWeightTreatment ==
"pairnegweightsglobal") fPairNegWeightsGlobal =
kTRUE;
621 else if (fNegWeightTreatment ==
"pray") Log() << kDEBUG <<
"Yes, good luck with praying " <<
Endl;
623 Log() << kINFO << GetOptions() <<
Endl;
624 Log() << kFATAL <<
"<ProcessOptions> unknown option for treating negative event weights during training " << fNegWeightTreatment <<
" requested" <<
Endl;
627 if (fNegWeightTreatment ==
"pairnegweightsglobal")
628 Log() << kWARNING <<
" you specified the option NegWeightTreatment=PairNegWeightsGlobal : This option is still considered EXPERIMENTAL !! " <<
Endl;
635 while (tmp < fNNodesMax){
639 Log() << kWARNING <<
"You have specified a deprecated option *NNodesMax="<<fNNodesMax
640 <<
"* \n this has been translated to MaxDepth="<<fMaxDepth<<
Endl;
644 if (fUseNTrainEvents>0){
645 fBaggedSampleFraction = (
Double_t) fUseNTrainEvents/Data()->GetNTrainingEvents();
646 Log() << kWARNING <<
"You have specified a deprecated option *UseNTrainEvents="<<fUseNTrainEvents
647 <<
"* \n this has been translated to BaggedSampleFraction="<<fBaggedSampleFraction<<
"(%)"<<
Endl;
650 if (fBoostType==
"Bagging") fBaggedBoost =
kTRUE;
651 if (fBaggedGradBoost){
652 fBaggedBoost =
kTRUE;
653 Log() << kWARNING <<
"You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" <<
Endl;
661 if (sizeInPercent > 0 && sizeInPercent < 50){
662 fMinNodeSize=sizeInPercent;
665 Log() << kFATAL <<
"you have demanded a minimal node size of "
666 << sizeInPercent <<
"% of the training events.. \n"
667 <<
" that somehow does not make sense "<<
Endl;
677 if (sizeInPercent.
IsFloat()) SetMinNodeSize(sizeInPercent.
Atof());
679 Log() << kFATAL <<
"I had problems reading the option MinNodeEvents, which "
680 <<
"after removing a possible % sign now reads " << sizeInPercent <<
Endl;
692 fBoostType =
"AdaBoost";
693 if(DataInfo().GetNClasses()!=0)
697 fBoostType =
"AdaBoostR2";
698 fAdaBoostR2Loss =
"Quadratic";
699 if(DataInfo().GetNClasses()!=0)
705 fPruneMethodS =
"NoPruning";
709 fFValidationEvents = 0.5;
710 fRandomisedTrees =
kFALSE;
713 fUsePoissonNvars =
kTRUE;
718 SetSignalReferenceCut( 0 );
731 for (
UInt_t i=0; i<fForest.size(); i++)
delete fForest[i];
734 fBoostWeights.clear();
735 if (fMonitorNtuple) { fMonitorNtuple->Delete(); fMonitorNtuple=NULL; }
736 fVariableImportance.clear();
738 fLossFunctionEventInfo.clear();
742 if (Data()) Data()->DeleteResults(GetMethodName(),
Types::kTraining, GetAnalysisType());
743 Log() << kDEBUG <<
" successfully(?) reset the method " <<
Endl;
755 for (
UInt_t i=0; i<fForest.size(); i++)
delete fForest[i];
763 if (!HasTrainingTree()) Log() << kFATAL <<
"<Init> Data().TrainingTree() is zero pointer" <<
Endl;
765 if (fEventSample.size() > 0) {
767 for (
UInt_t iev=0; iev<fEventSample.size(); iev++) fEventSample[iev]->SetBoostWeight(1.);
770 UInt_t nevents = Data()->GetNTrainingEvents();
772 std::vector<const TMVA::Event*> tmpEventSample;
773 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
775 Event*
event =
new Event( *GetTrainingEvent(ievt) );
776 tmpEventSample.push_back(event);
779 if (!DoRegression()) DeterminePreselectionCuts(tmpEventSample);
780 else fDoPreselection =
kFALSE;
782 for (
UInt_t i=0; i<tmpEventSample.size(); i++)
delete tmpEventSample[i];
787 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
790 Event*
event =
new Event( *GetTrainingEvent(ievt) );
791 if (fDoPreselection){
792 if (
TMath::Abs(ApplyPreselectionCuts(event)) > 0.05) {
798 if (event->GetWeight() < 0 && (IgnoreEventsWithNegWeightsInTraining() || fNoNegWeightsInTraining)){
799 if (firstNegWeight) {
800 Log() << kWARNING <<
" Note, you have events with negative event weight in the sample, but you've chosen to ignore them" <<
Endl;
804 }
else if (event->GetWeight()==0){
805 if (firstZeroWeight) {
807 Log() <<
"Events with weight == 0 are going to be simply ignored " <<
Endl;
811 if (event->GetWeight() < 0) {
812 fTrainWithNegWeights=
kTRUE;
815 if (fPairNegWeightsGlobal){
816 Log() << kWARNING <<
"Events with negative event weights are found and "
817 <<
" will be removed prior to the actual BDT training by global "
818 <<
" paring (and subsequent annihilation) with positiv weight events"
821 Log() << kWARNING <<
"Events with negative event weights are USED during "
822 <<
"the BDT training. This might cause problems with small node sizes "
823 <<
"or with the boosting. Please remove negative events from training "
824 <<
"using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
825 <<
"observe problems with the boosting"
832 Double_t modulo = 1.0/(fFValidationEvents);
833 Int_t imodulo =
static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ?
ceil(modulo) :
floor(modulo) );
834 if (ievt % imodulo == 0) fValidationSample.push_back( event );
835 else fEventSample.push_back( event );
838 fEventSample.push_back(event);
844 Log() << kINFO <<
"<InitEventSample> Internally I use " << fEventSample.size()
845 <<
" for Training and " << fValidationSample.size()
846 <<
" for Pruning Validation (" << ((
Float_t)fValidationSample.size())/((
Float_t)fEventSample.size()+fValidationSample.size())*100.0
847 <<
"% of training used for validation)" <<
Endl;
851 if (fPairNegWeightsGlobal) PreProcessNegativeEventWeights();
854 if (DoRegression()) {
856 }
else if (DoMulticlass()) {
858 }
else if (!fSkipNormalization) {
860 Log() << kDEBUG <<
"\t<InitEventSample> For classification trees, "<<
Endl;
861 Log() << kDEBUG <<
" \tthe effective number of backgrounds is scaled to match "<<
Endl;
862 Log() << kDEBUG <<
" \tthe signal. Otherwise the first boosting step would do 'just that'!"<<
Endl;
876 Double_t nevents = fEventSample.size();
878 Int_t sumSig=0, sumBkg=0;
879 for (
UInt_t ievt=0; ievt<fEventSample.size(); ievt++) {
880 if ((DataInfo().IsSignal(fEventSample[ievt])) ) {
881 sumSigW += fEventSample[ievt]->GetWeight();
884 sumBkgW += fEventSample[ievt]->GetWeight();
888 if (sumSigW && sumBkgW){
// Per-class scale factors. With f = fSigToBkgFraction:
//   normSig * sumSigW = nevents * f / (1 + f)
//   normBkg * sumBkgW = nevents     / (1 + f)
// so the rescaled signal and background weight sums have ratio f : 1 and add up
// to nevents. (A stray empty statement after normBkg was removed.)
889 Double_t normSig = nevents/((1+fSigToBkgFraction)*sumSigW)*fSigToBkgFraction;
890 Double_t normBkg = nevents/((1+fSigToBkgFraction)*sumBkgW);
891 Log() << kDEBUG <<
"\tre-normalise events such that Sig and Bkg have respective sum of weights = "
892 << fSigToBkgFraction <<
Endl;
893 Log() << kDEBUG <<
" \tsig->sig*"<<normSig <<
"ev. bkg->bkg*"<<normBkg <<
"ev." <<
Endl;
894 Log() << kHEADER <<
"#events: (reweighted) sig: "<< sumSigW*normSig <<
" bkg: " << sumBkgW*normBkg <<
Endl;
895 Log() << kINFO <<
"#events: (unweighted) sig: "<< sumSig <<
" bkg: " << sumBkg <<
Endl;
896 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
897 if ((DataInfo().IsSignal(fEventSample[ievt])) ) fEventSample[ievt]->SetBoostWeight(normSig);
898 else fEventSample[ievt]->SetBoostWeight(normBkg);
901 Log() << kINFO <<
"--> could not determine scaling factors as either there are " <<
Endl;
902 Log() << kINFO <<
" no signal events (sumSigW="<<sumSigW<<
") or no bkg ev. (sumBkgW="<<sumBkgW<<
")"<<
Endl;
907 fTrainSample = &fEventSample;
909 GetBaggedSubSample(fEventSample);
910 fTrainSample = &fSubSample;
936 std::vector<const Event*> negEvents;
937 for (
UInt_t iev = 0; iev < fEventSample.size(); iev++){
938 if (fEventSample[iev]->GetWeight() < 0) {
939 totalNegWeights += fEventSample[iev]->GetWeight();
940 negEvents.push_back(fEventSample[iev]);
942 totalPosWeights += fEventSample[iev]->GetWeight();
944 totalWeights += fEventSample[iev]->GetWeight();
946 if (totalNegWeights == 0 ) {
947 Log() << kINFO <<
"no negative event weights found .. no preprocessing necessary" <<
Endl;
950 Log() << kINFO <<
"found a total of " << totalNegWeights <<
" of negative event weights which I am going to try to pair with positive events to annihilate them" <<
Endl;
951 Log() << kINFO <<
"found a total of " << totalPosWeights <<
" of events with positive weights" <<
Endl;
952 Log() << kINFO <<
"--> total sum of weights = " << totalWeights <<
" = " << totalNegWeights+totalPosWeights <<
Endl;
959 for (
Int_t i=0; i<2; i++){
960 invCov = ((*cov)[i]);
962 std::cout <<
"<MethodBDT::PreProcessNeg...> matrix is almost singular with determinant="
964 <<
" did you use the variables that are linear combinations or highly correlated?"
968 std::cout <<
"<MethodBDT::PreProcessNeg...> matrix is singular with determinant="
970 <<
" did you use the variables that are linear combinations?"
979 Log() << kINFO <<
"Found a total of " << totalNegWeights <<
" in negative weights out of " << fEventSample.size() <<
" training events " <<
Endl;
980 Timer timer(negEvents.size(),
"Negative Event paired");
981 for (
UInt_t nev = 0; nev < negEvents.size(); nev++){
983 Double_t weight = negEvents[nev]->GetWeight();
984 UInt_t iClassID = negEvents[nev]->GetClass();
985 invCov = ((*cov)[iClassID]);
991 for (
UInt_t iev = 0; iev < fEventSample.size(); iev++){
992 if (iClassID==fEventSample[iev]->GetClass() && fEventSample[iev]->GetWeight() > 0){
994 for (
UInt_t ivar=0; ivar < GetNvar(); ivar++){
995 for (
UInt_t jvar=0; jvar<GetNvar(); jvar++){
996 dist += (negEvents[nev]->GetValue(ivar)-fEventSample[iev]->GetValue(ivar))*
997 (*invCov)[ivar][jvar]*
998 (negEvents[nev]->GetValue(jvar)-fEventSample[iev]->GetValue(jvar));
1001 if (dist < minDist) { iMin=iev; minDist=dist;}
1007 Double_t newWeight = (negEvents[nev]->GetWeight() + fEventSample[iMin]->GetWeight());
1009 negEvents[nev]->SetBoostWeight( 0 );
1010 fEventSample[iMin]->SetBoostWeight( newWeight/fEventSample[iMin]->GetOriginalWeight() );
1012 negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() );
1013 fEventSample[iMin]->SetBoostWeight( 0 );
1016 }
else Log() << kFATAL <<
"preprocessing didn't find event to pair with the negative weight ... probably a bug" <<
Endl;
1017 weight = negEvents[nev]->GetWeight();
1020 Log() << kINFO <<
"<Negative Event Pairing> took: " << timer.
GetElapsedTime()
1024 totalNegWeights = 0;
1025 totalPosWeights = 0;
1032 std::vector<const Event*> newEventSample;
1034 for (
UInt_t iev = 0; iev < fEventSample.size(); iev++){
1035 if (fEventSample[iev]->GetWeight() < 0) {
1036 totalNegWeights += fEventSample[iev]->GetWeight();
1037 totalWeights += fEventSample[iev]->GetWeight();
1039 totalPosWeights += fEventSample[iev]->GetWeight();
1040 totalWeights += fEventSample[iev]->GetWeight();
1042 if (fEventSample[iev]->GetWeight() > 0) {
1043 newEventSample.push_back(
new Event(*fEventSample[iev]));
1044 if (fEventSample[iev]->GetClass() == fSignalClass){
1045 sigWeight += fEventSample[iev]->GetWeight();
1048 bkgWeight += fEventSample[iev]->GetWeight();
1053 if (totalNegWeights < 0) Log() << kFATAL <<
" compensation of negative event weights with positive ones did not work " << totalNegWeights <<
Endl;
1055 for (
UInt_t i=0; i<fEventSample.size(); i++)
delete fEventSample[i];
1056 fEventSample = newEventSample;
1058 Log() << kINFO <<
" after PreProcessing, the Event sample is left with " << fEventSample.size() <<
" events (unweighted), all with positive weights, adding up to " << totalWeights <<
Endl;
1059 Log() << kINFO <<
" nSig="<<nSig <<
" sigWeight="<<sigWeight <<
" nBkg="<<nBkg <<
" bkgWeight="<<bkgWeight <<
Endl;
1071 std::map<TString,TMVA::Interval*> tuneParameters;
1072 std::map<TString,Double_t> tunedParameters;
1081 tuneParameters.insert(std::pair<TString,Interval*>(
"NTrees",
new Interval(10,1000,5)));
1082 tuneParameters.insert(std::pair<TString,Interval*>(
"MaxDepth",
new Interval(2,4,3)));
1083 tuneParameters.insert(std::pair<TString,Interval*>(
"MinNodeSize",
new LogInterval(1,30,30)));
1088 if (fBoostType==
"AdaBoost"){
1089 tuneParameters.insert(std::pair<TString,Interval*>(
"AdaBoostBeta",
new Interval(.2,1.,5)));
1091 }
else if (fBoostType==
"Grad"){
1092 tuneParameters.insert(std::pair<TString,Interval*>(
"Shrinkage",
new Interval(0.05,0.50,5)));
1094 }
else if (fBoostType==
"Bagging" && fRandomisedTrees){
1097 tuneParameters.insert(std::pair<TString,Interval*>(
"UseNvars",
new Interval(min_var,max_var,4)));
1101 Log()<<kINFO <<
" the following BDT parameters will be tuned on the respective *grid*\n"<<
Endl;
1102 std::map<TString,TMVA::Interval*>::iterator it;
1103 for(it=tuneParameters.begin(); it!= tuneParameters.end(); ++it){
1104 Log() << kWARNING << it->first <<
Endl;
1105 std::ostringstream oss;
1106 (it->second)->Print(oss);
1112 tunedParameters=optimize.
optimize();
1114 return tunedParameters;
1123 std::map<TString,Double_t>::iterator it;
1124 for(it=tuneParameters.begin(); it!= tuneParameters.end(); ++it){
1125 Log() << kWARNING << it->first <<
" = " << it->second <<
Endl;
1126 if (it->first ==
"MaxDepth" ) SetMaxDepth ((
Int_t)it->second);
1127 else if (it->first ==
"MinNodeSize" ) SetMinNodeSize (it->second);
1128 else if (it->first ==
"NTrees" ) SetNTrees ((
Int_t)it->second);
1129 else if (it->first ==
"NodePurityLimit") SetNodePurityLimit (it->second);
1130 else if (it->first ==
"AdaBoostBeta" ) SetAdaBoostBeta (it->second);
1131 else if (it->first ==
"Shrinkage" ) SetShrinkage (it->second);
1132 else if (it->first ==
"UseNvars" ) SetUseNvars ((
Int_t)it->second);
1133 else if (it->first ==
"BaggedSampleFraction" ) SetBaggedSampleFraction (it->second);
1134 else Log() << kFATAL <<
" SetParameter for " << it->first <<
" not yet implemented " <<
Endl;
1152 Log() << kERROR <<
" Zero Decision Trees demanded... that does not work !! "
1153 <<
" I set it to 1 .. just so that the program does not crash"
1158 if (fInteractive && fInteractive->NotInitialized()){
1159 std::vector<TString> titles = {
"Boost weight",
"Error Fraction"};
1160 fInteractive->Init(titles);
1162 fIPyMaxIter = fNTrees;
1163 fExitFromTraining =
false;
1167 if (IsNormalised()) Log() << kFATAL <<
"\"Normalise\" option cannot be used with BDT; "
1168 <<
"please remove the option from the configuration string, or "
1169 <<
"use \"!Normalise\""
1173 Log() << kINFO <<
"Regression Loss Function: "<< fRegressionLossFunctionBDTG->Name() <<
Endl;
1175 Log() << kINFO <<
"Training "<< fNTrees <<
" Decision Trees ... patience please" <<
Endl;
1177 Log() << kDEBUG <<
"Training with maximal depth = " <<fMaxDepth
1178 <<
", MinNodeEvents=" << fMinNodeEvents
1179 <<
", NTrees="<<fNTrees
1180 <<
", NodePurityLimit="<<fNodePurityLimit
1181 <<
", AdaBoostBeta="<<fAdaBoostBeta
1187 TString hname =
"AdaBooost weight distribution";
1193 if (DoRegression()) {
1197 hname=
"Boost event weights distribution";
1202 TH1*
h =
new TH1F(
Form(
"%s_BoostWeight",DataInfo().GetName()),hname,nBins,xMin,xMax);
1203 TH1* nodesBeforePruningVsTree =
new TH1I(
Form(
"%s_NodesBeforePruning",DataInfo().GetName()),
"nodes before pruning",fNTrees,0,fNTrees);
1204 TH1* nodesAfterPruningVsTree =
new TH1I(
Form(
"%s_NodesAfterPruning",DataInfo().GetName()),
"nodes after pruning",fNTrees,0,fNTrees);
1208 if(!DoMulticlass()){
1211 h->SetXTitle(
"boost weight");
1212 results->
Store(
h,
"BoostWeights");
1216 if (fDoBoostMonitor){
1217 TH2* boostMonitor =
new TH2F(
"BoostMonitor",
"ROC Integral Vs iTree",2,0,fNTrees,2,0,1.05);
1219 boostMonitor->
SetYTitle(
"ROC Integral");
1220 results->
Store(boostMonitor,
"BoostMonitor");
1222 boostMonitorGraph->
SetName(
"BoostMonitorGraph");
1223 boostMonitorGraph->
SetTitle(
"ROCIntegralVsNTrees");
1224 results->
Store(boostMonitorGraph,
"BoostMonitorGraph");
1228 h =
new TH1F(
"BoostWeightVsTree",
"Boost weights vs tree",fNTrees,0,fNTrees);
1229 h->SetXTitle(
"#tree");
1230 h->SetYTitle(
"boost weight");
1231 results->
Store(
h,
"BoostWeightsVsTree");
1234 h =
new TH1F(
"ErrFractHist",
"error fraction vs tree number",fNTrees,0,fNTrees);
1235 h->SetXTitle(
"#tree");
1236 h->SetYTitle(
"error fraction");
1237 results->
Store(
h,
"ErrorFrac");
1240 nodesBeforePruningVsTree->
SetXTitle(
"#tree");
1241 nodesBeforePruningVsTree->
SetYTitle(
"#tree nodes");
1242 results->
Store(nodesBeforePruningVsTree);
1245 nodesAfterPruningVsTree->
SetXTitle(
"#tree");
1246 nodesAfterPruningVsTree->
SetYTitle(
"#tree nodes");
1247 results->
Store(nodesAfterPruningVsTree);
1251 fMonitorNtuple=
new TTree(
"MonitorNtuple",
"BDT variables");
1252 fMonitorNtuple->Branch(
"iTree",&fITree,
"iTree/I");
1253 fMonitorNtuple->Branch(
"boostWeight",&fBoostWeight,
"boostWeight/D");
1254 fMonitorNtuple->Branch(
"errorFraction",&fErrorFraction,
"errorFraction/D");
1256 Timer timer( fNTrees, GetName() );
1257 Int_t nNodesBeforePruningCount = 0;
1258 Int_t nNodesAfterPruningCount = 0;
1260 Int_t nNodesBeforePruning = 0;
1261 Int_t nNodesAfterPruning = 0;
1263 if(fBoostType==
"Grad"){
1264 InitGradBoost(fEventSample);
1271 while (itree < fNTrees && continueBoost){
1272 if (fExitFromTraining)
break;
1273 fIPyCurrentIter = itree;
1286 if (fBoostType!=
"Grad"){
1287 Log() << kFATAL <<
"Multiclass is currently only supported by gradient boost. "
1288 <<
"Please change boost option accordingly (BoostType=Grad)." <<
Endl;
1291 UInt_t nClasses = DataInfo().GetNClasses();
1292 for (
UInt_t i=0;i<nClasses;i++){
// Grow one tree per class i for the current boosting iteration and append it to the forest.
// The final argument now uses the same per-class index (itree*nClasses+i) as the
// argument before fNodePurityLimit, matching the single-class construction, which
// passes itree in both positions. The former "+1" gave every class tree of an
// iteration the same value.
// NOTE(review): these two arguments are presumably the random seed and the tree ID —
// confirm against the DecisionTree constructor.
1296 fForest.push_back(
new DecisionTree( fSepType, fMinNodeSize, fNCuts, &(DataInfo()), i,
1297 fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
1298 itree*nClasses+i, fNodePurityLimit, itree*nClasses+i));
1299 fForest.back()->SetNVars(GetNvar());
1300 if (fUseFisherCuts) {
1301 fForest.back()->SetUseFisherCuts();
1302 fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
1303 fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
1307 nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);
1308 Double_t bw = this->Boost(*fTrainSample, fForest.back(),i);
1310 fBoostWeights.push_back(bw);
1312 fBoostWeights.push_back(0);
1313 Log() << kWARNING <<
"stopped boosting at itree="<<itree <<
Endl;
1322 fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
1323 itree, fNodePurityLimit, itree);
1325 fForest.push_back(dt);
1326 fForest.back()->SetNVars(GetNvar());
1327 if (fUseFisherCuts) {
1328 fForest.back()->SetUseFisherCuts();
1329 fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
1330 fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
1333 nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);
1335 if (fUseYesNoLeaf && !DoRegression() && fBoostType!=
"Grad") {
1336 nNodesBeforePruning = fForest.back()->CleanTree();
1339 nNodesBeforePruningCount += nNodesBeforePruning;
1340 nodesBeforePruningVsTree->
SetBinContent(itree+1,nNodesBeforePruning);
1342 fForest.back()->SetPruneMethod(fPruneMethod);
1343 fForest.back()->SetPruneStrength(fPruneStrength);
1345 std::vector<const Event*> * validationSample = NULL;
1346 if(fAutomatic) validationSample = &fValidationSample;
1347 Double_t bw = this->Boost(*fTrainSample, fForest.back());
1349 fBoostWeights.push_back(bw);
1351 fBoostWeights.push_back(0);
1352 Log() << kWARNING <<
"stopped boosting at itree="<<itree <<
Endl;
1361 if (fUseYesNoLeaf && !DoRegression() && fBoostType!=
"Grad"){
1362 fForest.back()->CleanTree();
1364 nNodesAfterPruning = fForest.back()->GetNNodes();
1365 nNodesAfterPruningCount += nNodesAfterPruning;
1366 nodesAfterPruningVsTree->
SetBinContent(itree+1,nNodesAfterPruning);
1369 fInteractive->AddPoint(itree, fBoostWeight, fErrorFraction);
1372 fMonitorNtuple->Fill();
1373 if (fDoBoostMonitor){
1374 if (! DoRegression() ){
// Call BoostMonitor(itree) on a thinning schedule rather than for every tree:
//   - always for the last tree (itree == fNTrees-1) and for every 500th tree;
//   - every 250th tree while itree < 1000;
//   - every 100th tree while itree < 500;
//   - every  50th tree while itree < 250;
//   - every  25th tree while itree < 150;
//   - every  10th tree while itree < 50;
//   - every   5th tree while itree < 20.
// itree == 0 satisfies the conditions, so the first tree is always monitored.
1375 if ( itree==fNTrees-1 || (!(itree%500)) ||
1376 (!(itree%250) && itree <1000)||
1377 (!(itree%100) && itree < 500)||
1378 (!(itree%50) && itree < 250)||
1379 (!(itree%25) && itree < 150)||
1380 (!(itree%10) && itree < 50)||
1381 (!(itree%5) && itree < 20)
1382 ) BoostMonitor(itree);
1390 Log() << kDEBUG <<
"\t<Train> elapsed time: " << timer.
GetElapsedTime()
1393 Log() << kDEBUG <<
"\t<Train> average number of nodes (w/o pruning) : "
1394 << nNodesBeforePruningCount/GetNTrees() <<
Endl;
1397 Log() << kDEBUG <<
"\t<Train> average number of nodes before/after pruning : "
1398 << nNodesBeforePruningCount/GetNTrees() <<
" / "
1399 << nNodesAfterPruningCount/GetNTrees()
1407 Log() << kDEBUG <<
"Now I delete the privat data sample"<<
Endl;
1408 for (
UInt_t i=0; i<fEventSample.size(); i++)
delete fEventSample[i];
1409 for (
UInt_t i=0; i<fValidationSample.size(); i++)
delete fValidationSample[i];
1410 fEventSample.clear();
1411 fValidationSample.clear();
1413 if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
1424 for (
UInt_t itree=0; itree<nTrees; itree++) {
// Map `sum` onto a value between -1 and 1: 2/(1+exp(-2*sum)) - 1 is
// algebraically tanh(sum), so sum == 0 yields 0 and large |sum| saturates at +/-1.
1429 return 2.0/(1.0+
exp(-2.0*
sum))-1;
1437 if (DoMulticlass()) {
1438 UInt_t nClasses = DataInfo().GetNClasses();
1439 Bool_t isLastClass = (cls == nClasses - 1);
1451 std::map<const TMVA::Event *, std::vector<double>> & residuals = this->fResiduals;
1454 auto update_residuals = [&residuals, &lastTree, cls](
const TMVA::Event *
e) {
1458 auto update_residuals_last = [&residuals, &lastTree, cls, nClasses](
const TMVA::Event *
e) {
1461 auto &residualsThisEvent = residuals[
e];
1463 std::vector<Double_t> expCache(nClasses, 0.0);
1464 std::transform(residualsThisEvent.begin(),
1465 residualsThisEvent.begin() + nClasses,
1466 expCache.begin(), [](
Double_t d) { return exp(d); });
1468 Double_t exp_sum = std::accumulate(expCache.begin(),
1469 expCache.begin() + nClasses,
1472 for (
UInt_t i = 0; i < nClasses; i++) {
1473 Double_t p_cls = expCache[i] / exp_sum;
1475 Double_t res = (
e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
1482 .
Foreach(update_residuals_last, eventSample);
1485 .
Foreach(update_residuals, eventSample);
1491 std::vector<Double_t> expCache;
1493 expCache.resize(nClasses);
1496 for (
auto e : eventSample) {
1497 fResiduals[
e].at(cls) += fForest.back()->CheckEvent(
e,
kFALSE);
1499 auto &residualsThisEvent = fResiduals[
e];
1500 std::transform(residualsThisEvent.begin(),
1501 residualsThisEvent.begin() + nClasses,
1502 expCache.begin(), [](
Double_t d) { return exp(d); });
1504 Double_t exp_sum = std::accumulate(expCache.begin(),
1505 expCache.begin() + nClasses,
1508 for (
UInt_t i = 0; i < nClasses; i++) {
1509 Double_t p_cls = expCache[i] / exp_sum;
1511 Double_t res = (
e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
1518 std::map<const TMVA::Event *, std::vector<double>> & residuals = this->fResiduals;
1521 UInt_t signalClass = DataInfo().GetSignalClassIndex();
1524 auto update_residuals = [&residuals, &lastTree, signalClass](
const TMVA::Event *
e) {
1525 double & residualAt0 = residuals[
e].at(0);
1528 Double_t p_sig = 1.0 / (1.0 +
exp(-2.0 * residualAt0));
1529 Double_t res = ((
e->GetClass() == signalClass) ? (1.0 - p_sig) : (-p_sig));
1535 .
Foreach(update_residuals, eventSample);
1537 for (
auto e : eventSample) {
1538 double & residualAt0 = residuals[
e].at(0);
1541 Double_t p_sig = 1.0 / (1.0 +
exp(-2.0 * residualAt0));
1542 Double_t res = ((
e->GetClass() == signalClass) ? (1.0 - p_sig) : (-p_sig));
1565 auto f = [
this, &nPartitions](
UInt_t partition = 0) ->
Int_t {
1566 Int_t start = 1.0 * partition / nPartitions * this->fEventSample.size();
1567 Int_t end = (partition + 1.0) / nPartitions * this->fEventSample.size();
1569 for (
Int_t i = start; i < end; ++i) {
1588 fRegressionLossFunctionBDTG->SetTargets(eventSample, fLossFunctionEventInfo);
1602 std::unordered_map<TMVA::DecisionTreeNode*, LeafInfo> leaves;
1603 for (
auto e : eventSample) {
1606 auto &
v = leaves[node];
1607 auto target =
e->GetTarget(cls);
1608 v.sumWeightTarget += target * weight;
1609 v.sum2 += fabs(target) * (1.0 - fabs(target)) * weight;
1611 for (
auto &iLeave : leaves) {
1612 constexpr auto minValue = 1
e-30;
1613 if (iLeave.second.sum2 < minValue) {
1614 iLeave.second.sum2 = minValue;
1616 const Double_t K = DataInfo().GetNClasses();
1617 iLeave.first->SetResponse(fShrinkage * (K - 1) / K * iLeave.second.sumWeightTarget / iLeave.second.sum2);
1622 DoMulticlass() ? UpdateTargets(fEventSample, cls) : UpdateTargets(fEventSample);
1634 std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > > leaves;
1635 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
1637 (leaves[node]).push_back(fLossFunctionEventInfo[*
e]);
1644 for (std::map<
TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > >::iterator iLeave=leaves.begin();
1645 iLeave!=leaves.end();++iLeave){
1646 Double_t fit = fRegressionLossFunctionBDTG->Fit(iLeave->second);
1647 (iLeave->first)->SetResponse(fShrinkage*fit);
1650 UpdateTargetsRegression(*fTrainSample);
1665 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
1669 fRegressionLossFunctionBDTG->Init(fLossFunctionEventInfo, fBoostWeights);
1670 UpdateTargetsRegression(*fTrainSample,
kTRUE);
1674 else if(DoMulticlass()){
1675 UInt_t nClasses = DataInfo().GetNClasses();
1676 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
1677 for (
UInt_t i=0;i<nClasses;i++){
1679 Double_t r = (*e)->GetClass()==i?(1-1.0/nClasses):(-1.0/nClasses);
1681 fResiduals[*
e].push_back(0);
1686 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
1687 Double_t r = (DataInfo().IsSignal(*
e)?1:0)-0.5;
1689 fResiduals[*
e].push_back(0);
1700 for (
UInt_t ievt=0; ievt<fValidationSample.size(); ievt++) {
1701 Bool_t isSignalType= (dt->
CheckEvent(fValidationSample[ievt]) > fNodePurityLimit ) ? 1 : 0;
1703 if (isSignalType == (DataInfo().IsSignal(fValidationSample[ievt])) ) {
1704 ncorrect += fValidationSample[ievt]->GetWeight();
1707 nfalse += fValidationSample[ievt]->GetWeight();
1711 return ncorrect / (ncorrect + nfalse);
1722 if (fBoostType==
"AdaBoost") returnVal = this->AdaBoost (eventSample, dt);
1723 else if (fBoostType==
"AdaCost") returnVal = this->AdaCost (eventSample, dt);
1724 else if (fBoostType==
"Bagging") returnVal = this->Bagging ( );
1725 else if (fBoostType==
"RegBoost") returnVal = this->RegBoost (eventSample, dt);
1726 else if (fBoostType==
"AdaBoostR2") returnVal = this->AdaBoostR2(eventSample, dt);
1727 else if (fBoostType==
"Grad"){
1729 returnVal = this->GradBoostRegression(eventSample, dt);
1730 else if(DoMulticlass())
1731 returnVal = this->GradBoost (eventSample, dt, cls);
1733 returnVal = this->GradBoost (eventSample, dt);
1736 Log() << kINFO << GetOptions() <<
Endl;
1737 Log() << kFATAL <<
"<Boost> unknown boost option " << fBoostType<<
" called" <<
Endl;
1741 GetBaggedSubSample(fEventSample);
1756 TH1F *tmpS =
new TH1F(
"tmpS",
"", 100 , -1., 1.00001 );
1757 TH1F *tmpB =
new TH1F(
"tmpB",
"", 100 , -1., 1.00001 );
1761 UInt_t signalClassNr = DataInfo().GetClassInfo(
"Signal")->GetNumber();
1771 UInt_t nevents = Data()->GetNTestEvents();
1772 for (
UInt_t iev=0; iev < nevents; iev++){
1773 const Event*
event = GetTestingEvent(iev);
1775 if (event->GetClass() == signalClassNr) {tmp=tmpS;}
1777 tmp->
Fill(PrivateGetMvaValue(event),event->GetWeight());
1781 std::vector<TH1F*> hS;
1782 std::vector<TH1F*> hB;
1783 for (
UInt_t ivar=0; ivar<GetNvar(); ivar++){
1784 hS.push_back(
new TH1F(
Form(
"SigVar%dAtTree%d",ivar,iTree),
Form(
"SigVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
1785 hB.push_back(
new TH1F(
Form(
"BkgVar%dAtTree%d",ivar,iTree),
Form(
"BkgVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
1786 results->
Store(hS.back(),hS.back()->GetTitle());
1787 results->
Store(hB.back(),hB.back()->GetTitle());
1791 for (
UInt_t iev=0; iev < fEventSample.size(); iev++){
1792 if (fEventSample[iev]->GetBoostWeight() > max) max = 1.01*fEventSample[iev]->GetBoostWeight();
1794 TH1F *tmpBoostWeightsS =
new TH1F(
Form(
"BoostWeightsInTreeS%d",iTree),
Form(
"BoostWeightsInTreeS%d",iTree),100,0.,max);
1795 TH1F *tmpBoostWeightsB =
new TH1F(
Form(
"BoostWeightsInTreeB%d",iTree),
Form(
"BoostWeightsInTreeB%d",iTree),100,0.,max);
1796 results->
Store(tmpBoostWeightsS,tmpBoostWeightsS->
GetTitle());
1797 results->
Store(tmpBoostWeightsB,tmpBoostWeightsB->
GetTitle());
1799 TH1F *tmpBoostWeights;
1800 std::vector<TH1F*> *
h;
1802 for (
UInt_t iev=0; iev < fEventSample.size(); iev++){
1803 if (fEventSample[iev]->GetClass() == signalClassNr) {
1804 tmpBoostWeights=tmpBoostWeightsS;
1807 tmpBoostWeights=tmpBoostWeightsB;
1810 tmpBoostWeights->
Fill(fEventSample[iev]->GetBoostWeight());
1811 for (
UInt_t ivar=0; ivar<GetNvar(); ivar++){
1812 (*h)[ivar]->Fill(fEventSample[iev]->GetValue(ivar),fEventSample[iev]->GetWeight());
1848 Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;
1850 std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);
1853 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
1856 UInt_t iclass=(*e)->GetClass();
1859 if ( DoRegression() ) {
1861 sumGlobalwfalse += w * tmpDev;
1862 sumGlobalwfalse2 += w * tmpDev*tmpDev;
1863 if (tmpDev > maxDev) maxDev = tmpDev;
1868 if (!(isSignalType == DataInfo().IsSignal(*
e))) {
1869 sumGlobalwfalse+= w;
1874 if (DataInfo().IsSignal(*
e)) trueType = 1;
1876 sumGlobalwfalse+= w*trueType*dtoutput;
1881 err = sumGlobalwfalse/sumGlobalw ;
1882 if ( DoRegression() ) {
1884 if (fAdaBoostR2Loss==
"linear"){
1885 err = sumGlobalwfalse/maxDev/sumGlobalw ;
1887 else if (fAdaBoostR2Loss==
"quadratic"){
1888 err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw ;
1890 else if (fAdaBoostR2Loss==
"exponential"){
1892 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
1895 err += w * (1 -
exp (-tmpDev/maxDev)) / sumGlobalw;
1900 Log() << kFATAL <<
" you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
1901 <<
" namely " << fAdaBoostR2Loss <<
"\n"
1902 <<
"and this is not implemented... a typo in the options ??" <<
Endl;
1906 Log() << kDEBUG <<
"BDT AdaBoos wrong/all: " << sumGlobalwfalse <<
"/" << sumGlobalw <<
Endl;
1910 std::vector<Double_t> newSumw(sumw.size(),0);
1913 if (err >= 0.5 && fUseYesNoLeaf) {
1917 Log() << kERROR <<
" YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
1918 <<
"boost such a thing... if after 1 step the error rate is == 0.5"
1920 <<
"please check why this happens, maybe too many events per node requested ?"
1924 Log() << kERROR <<
" The error rate in the BDT boosting is > 0.5. ("<< err
1925 <<
") That should not happen, please check your code (i.e... the BDT code), I "
1926 <<
" stop boosting here" <<
Endl;
1930 }
else if (err < 0) {
1931 Log() << kERROR <<
" The error rate in the BDT boosting is < 0. That can happen"
1932 <<
" due to improper treatment of negative weights in a Monte Carlo.. (if you have"
1933 <<
" an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
1934 <<
" for the time being I set it to its absolute value.. just to continue.." <<
Endl;
1938 boostWeight =
TMath::Log((1.-err)/err)*fAdaBoostBeta;
1940 boostWeight =
TMath::Log((1.+err)/(1-err))*fAdaBoostBeta;
1943 Log() << kDEBUG <<
"BDT AdaBoos wrong/all: " << sumGlobalwfalse <<
"/" << sumGlobalw <<
" 1-err/err="<<boostWeight<<
" log.."<<
TMath::Log(boostWeight)<<
Endl;
1948 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
1950 if (fUseYesNoLeaf||DoRegression()){
1951 if ((!( (dt->
CheckEvent(*
e,fUseYesNoLeaf) > fNodePurityLimit ) == DataInfo().IsSignal(*
e))) || DoRegression()) {
1955 if ( (*e)->GetWeight() > 0 ){
1956 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1958 if (DoRegression()) results->
GetHist(
"BoostWeights")->
Fill(boostfactor);
1960 if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
1961 else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1969 if (DataInfo().IsSignal(*
e)) trueType = 1;
1973 if ( (*e)->GetWeight() > 0 ){
1974 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1976 if (DoRegression()) results->
GetHist(
"BoostWeights")->
Fill(boostfactor);
1978 if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
1979 else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
1982 newSumGlobalw+=(*e)->GetWeight();
1983 newSumw[(*e)->GetClass()] += (*e)->GetWeight();
1989 Log() << kDEBUG <<
"new Nsig="<<newSumw[0]*globalNormWeight <<
" new Nbkg="<<newSumw[1]*globalNormWeight <<
Endl;
1992 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
1996 if (DataInfo().IsSignal(*
e))(*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
1997 else (*e)->ScaleBoostWeight( globalNormWeight );
2000 if (!(DoRegression()))results->
GetHist(
"BoostWeights")->
Fill(boostWeight);
2004 fBoostWeight = boostWeight;
2005 fErrorFraction = err;
2031 Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;
2033 std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);
2035 for (vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
2037 sumGlobalWeights += w;
2038 UInt_t iclass=(*e)->GetClass();
2042 if ( DoRegression() ) {
2043 Log() << kFATAL <<
" AdaCost not implemented for regression"<<
Endl;
2048 Bool_t isTrueSignal = DataInfo().IsSignal(*
e);
2049 Bool_t isSelectedSignal = (dtoutput>0);
2050 if (isTrueSignal) trueType = 1;
2054 if (isTrueSignal && isSelectedSignal) cost=Css;
2055 else if (isTrueSignal && !isSelectedSignal) cost=Cts_sb;
2056 else if (!isTrueSignal && isSelectedSignal) cost=Ctb_ss;
2057 else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
2058 else Log() << kERROR <<
"something went wrong in AdaCost" <<
Endl;
2060 sumGlobalCost+= w*trueType*dtoutput*cost;
2065 if ( DoRegression() ) {
2066 Log() << kFATAL <<
" AdaCost not implemented for regression"<<
Endl;
2071 sumGlobalCost /= sumGlobalWeights;
2076 vector<Double_t> newSumClassWeights(sumw.size(),0);
2078 Double_t boostWeight =
TMath::Log((1+sumGlobalCost)/(1-sumGlobalCost)) * fAdaBoostBeta;
2082 for (vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
2085 Bool_t isTrueSignal = DataInfo().IsSignal(*
e);
2086 Bool_t isSelectedSignal = (dtoutput>0);
2087 if (isTrueSignal) trueType = 1;
2091 if (isTrueSignal && isSelectedSignal) cost=Css;
2092 else if (isTrueSignal && !isSelectedSignal) cost=Cts_sb;
2093 else if (!isTrueSignal && isSelectedSignal) cost=Ctb_ss;
2094 else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
2095 else Log() << kERROR <<
"something went wrong in AdaCost" <<
Endl;
2098 if (DoRegression())Log() << kFATAL <<
" AdaCost not implemented for regression"<<
Endl;
2099 if ( (*e)->GetWeight() > 0 ){
2100 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
2102 if (DoRegression())Log() << kFATAL <<
" AdaCost not implemented for regression"<<
Endl;
2104 if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
2107 newSumGlobalWeights+=(*e)->GetWeight();
2108 newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();
2113 Double_t globalNormWeight=
Double_t(eventSample.size())/newSumGlobalWeights;
2114 Log() << kDEBUG <<
"new Nsig="<<newSumClassWeights[0]*globalNormWeight <<
" new Nbkg="<<newSumClassWeights[1]*globalNormWeight <<
Endl;
2117 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
2120 if (DataInfo().IsSignal(*
e))(*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
2121 else (*e)->ScaleBoostWeight( globalNormWeight );
2125 if (!(DoRegression()))results->
GetHist(
"BoostWeights")->
Fill(boostWeight);
2129 fBoostWeight = boostWeight;
2130 fErrorFraction = err;
2157 if (!fSubSample.empty()) fSubSample.clear();
2159 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
2160 n = trandom->
PoissonD(fBaggedSampleFraction);
2161 for (
Int_t i=0;i<
n;i++) fSubSample.push_back(*
e);
2195 if ( !DoRegression() ) Log() << kFATAL <<
"Somehow you chose a regression boost method for a classification job" <<
Endl;
2197 Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;
2199 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
2204 sumwfalse += w * tmpDev;
2205 sumwfalse2 += w * tmpDev*tmpDev;
2206 if (tmpDev > maxDev) maxDev = tmpDev;
2210 if (fAdaBoostR2Loss==
"linear"){
2211 err = sumwfalse/maxDev/sumw ;
2213 else if (fAdaBoostR2Loss==
"quadratic"){
2214 err = sumwfalse2/maxDev/maxDev/sumw ;
2216 else if (fAdaBoostR2Loss==
"exponential"){
2218 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
2221 err += w * (1 -
exp (-tmpDev/maxDev)) / sumw;
2226 Log() << kFATAL <<
" you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
2227 <<
" namely " << fAdaBoostR2Loss <<
"\n"
2228 <<
"and this is not implemented... a typo in the options ??" <<
Endl;
2236 Log() << kERROR <<
" YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
2237 <<
"boost such a thing... if after 1 step the error rate is == 0.5"
2239 <<
"please check why this happens, maybe too many events per node requested ?"
2243 Log() << kERROR <<
" The error rate in the BDT boosting is > 0.5. ("<< err
2244 <<
") That should not happen, but is possible for regression trees, and"
2245 <<
" should trigger a stop for the boosting. please check your code (i.e... the BDT code), I "
2246 <<
" stop boosting " <<
Endl;
2250 }
else if (err < 0) {
2251 Log() << kERROR <<
" The error rate in the BDT boosting is < 0. That can happen"
2252 <<
" due to improper treatment of negative weights in a Monte Carlo.. (if you have"
2253 <<
" an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
2254 <<
" for the time being I set it to its absolute value.. just to continue.." <<
Endl;
2258 Double_t boostWeight = err / (1.-err);
2263 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
2265 results->
GetHist(
"BoostWeights")->
Fill(boostfactor);
2267 if ( (*e)->GetWeight() > 0 ){
2268 Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
2269 Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
2270 if (newWeight == 0) {
2271 Log() << kINFO <<
"Weight= " << (*e)->GetWeight() <<
Endl;
2272 Log() << kINFO <<
"BoostWeight= " << (*e)->GetBoostWeight() <<
Endl;
2273 Log() << kINFO <<
"boostweight="<<boostWeight <<
" err= " <<err <<
Endl;
2274 Log() << kINFO <<
"NewBoostWeight= " << newBoostWeight <<
Endl;
2275 Log() << kINFO <<
"boostfactor= " << boostfactor <<
Endl;
2276 Log() << kINFO <<
"maxDev = " << maxDev <<
Endl;
2278 Log() << kINFO <<
"target = " << (*e)->GetTarget(0) <<
Endl;
2281 (*e)->SetBoostWeight( newBoostWeight );
2284 (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
2286 newSumw+=(*e)->GetWeight();
2290 Double_t normWeight = sumw / newSumw;
2291 for (std::vector<const TMVA::Event*>::const_iterator
e=eventSample.begin();
e!=eventSample.end();++
e) {
2294 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
2301 fBoostWeight = boostWeight;
2302 fErrorFraction = err;
2314 if (fDoPreselection){
2315 for (
UInt_t ivar=0; ivar<GetNvar(); ivar++){
2316 gTools().
AddAttr( wght,
Form(
"PreselectionLowBkgVar%d",ivar), fIsLowBkgCut[ivar]);
2317 gTools().
AddAttr( wght,
Form(
"PreselectionLowBkgVar%dValue",ivar), fLowBkgCut[ivar]);
2318 gTools().
AddAttr( wght,
Form(
"PreselectionLowSigVar%d",ivar), fIsLowSigCut[ivar]);
2319 gTools().
AddAttr( wght,
Form(
"PreselectionLowSigVar%dValue",ivar), fLowSigCut[ivar]);
2320 gTools().
AddAttr( wght,
Form(
"PreselectionHighBkgVar%d",ivar), fIsHighBkgCut[ivar]);
2321 gTools().
AddAttr( wght,
Form(
"PreselectionHighBkgVar%dValue",ivar),fHighBkgCut[ivar]);
2322 gTools().
AddAttr( wght,
Form(
"PreselectionHighSigVar%d",ivar), fIsHighSigCut[ivar]);
2323 gTools().
AddAttr( wght,
Form(
"PreselectionHighSigVar%dValue",ivar),fHighSigCut[ivar]);
2329 gTools().
AddAttr( wght,
"AnalysisType", fForest.back()->GetAnalysisType() );
2331 for (
UInt_t i=0; i< fForest.size(); i++) {
2332 void* trxml = fForest[i]->AddXMLTo(wght);
2343 for (i=0; i<fForest.size(); i++)
delete fForest[i];
2345 fBoostWeights.clear();
2352 if (
gTools().HasAttr( parent,
Form(
"PreselectionLowBkgVar%d",0))) {
2353 fIsLowBkgCut.resize(GetNvar());
2354 fLowBkgCut.resize(GetNvar());
2355 fIsLowSigCut.resize(GetNvar());
2356 fLowSigCut.resize(GetNvar());
2357 fIsHighBkgCut.resize(GetNvar());
2358 fHighBkgCut.resize(GetNvar());
2359 fIsHighSigCut.resize(GetNvar());
2360 fHighSigCut.resize(GetNvar());
2364 for (
UInt_t ivar=0; ivar<GetNvar(); ivar++){
2366 fIsLowBkgCut[ivar]=tmpBool;
2368 fLowBkgCut[ivar]=tmpDouble;
2370 fIsLowSigCut[ivar]=tmpBool;
2372 fLowSigCut[ivar]=tmpDouble;
2374 fIsHighBkgCut[ivar]=tmpBool;
2376 fHighBkgCut[ivar]=tmpDouble;
2378 fIsHighSigCut[ivar]=tmpBool;
2380 fHighSigCut[ivar]=tmpDouble;
2387 if(
gTools().HasAttr(parent,
"TreeType")) {
2398 fForest.back()->SetTreeID(i++);
2400 fBoostWeights.push_back(boostWeight);
2412 Int_t analysisType(0);
2415 istr >> dummy >> fNTrees;
2416 Log() << kINFO <<
"Read " << fNTrees <<
" Decision trees" <<
Endl;
2418 for (
UInt_t i=0;i<fForest.size();i++)
delete fForest[i];
2420 fBoostWeights.clear();
2423 for (
int i=0;i<fNTrees;i++) {
2424 istr >> dummy >> iTree >> dummy >> boostWeight;
2426 fForest.back()->Print( std::cout );
2427 Log() << kFATAL <<
"Error while reading weight file; mismatch iTree="
2428 << iTree <<
" i=" << i
2429 <<
" dummy " << dummy
2430 <<
" boostweight " << boostWeight
2435 fForest.back()->SetTreeID(i);
2436 fForest.back()->
Read(istr, GetTrainingTMVAVersionCode());
2437 fBoostWeights.push_back(boostWeight);
2444 return this->GetMvaValue( err, errUpper, 0 );
2454 const Event* ev = GetEvent();
2455 if (fDoPreselection) {
2456 Double_t val = ApplyPreselectionCuts(ev);
2459 return PrivateGetMvaValue(ev, err, errUpper, useNTrees);
2471 NoErrorCalc(err, errUpper);
2475 UInt_t nTrees = fForest.size();
2477 if (useNTrees > 0 ) nTrees = useNTrees;
2479 if (fBoostType==
"Grad")
return GetGradBoostMVA(ev,nTrees);
2483 for (
UInt_t itree=0; itree<nTrees; itree++) {
2485 myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,fUseYesNoLeaf);
2486 norm += fBoostWeights[itree];
2488 return ( norm > std::numeric_limits<double>::epsilon() ) ? myMVA /= norm : 0 ;
2498 if (fMulticlassReturnVal == NULL) fMulticlassReturnVal =
new std::vector<Float_t>();
2499 fMulticlassReturnVal->clear();
2501 UInt_t nClasses = DataInfo().GetNClasses();
2502 std::vector<Double_t> temp(nClasses);
2503 auto forestSize = fForest.size();
2506 std::vector<TMVA::DecisionTree *> forest = fForest;
2507 auto get_output = [&
e, &forest, &temp, forestSize, nClasses](
UInt_t iClass) {
2508 for (
UInt_t itree = iClass; itree < forestSize; itree += nClasses) {
2509 temp[iClass] += forest[itree]->CheckEvent(
e,
kFALSE);
2519 for (
UInt_t itree = 0; itree < forestSize; ++itree) {
2520 temp[classOfTree] += fForest[itree]->CheckEvent(
e,
kFALSE);
2521 if (++classOfTree == nClasses) classOfTree = 0;
2527 std::transform(temp.begin(), temp.end(), temp.begin(), [](
Double_t d){return exp(d);});
2529 Double_t exp_sum = std::accumulate(temp.begin(), temp.end(), 0.0);
2531 for (
UInt_t i = 0; i < nClasses; i++) {
2532 Double_t p_cls = temp[i] / exp_sum;
2533 (*fMulticlassReturnVal).push_back(p_cls);
2536 return *fMulticlassReturnVal;
2545 if (fRegressionReturnVal == NULL) fRegressionReturnVal =
new std::vector<Float_t>();
2546 fRegressionReturnVal->clear();
2548 const Event * ev = GetEvent();
2553 if (fBoostType==
"AdaBoostR2") {
2564 vector< Double_t > response(fForest.size());
2565 vector< Double_t > weight(fForest.size());
2568 for (
UInt_t itree=0; itree<fForest.size(); itree++) {
2569 response[itree] = fForest[itree]->CheckEvent(ev,
kFALSE);
2570 weight[itree] = fBoostWeights[itree];
2571 totalSumOfWeights += fBoostWeights[itree];
2574 std::vector< std::vector<Double_t> > vtemp;
2575 vtemp.push_back( response );
2576 vtemp.push_back( weight );
2581 while (sumOfWeights <= totalSumOfWeights/2.) {
2582 sumOfWeights += vtemp[1][t];
2596 else if(fBoostType==
"Grad"){
2597 for (
UInt_t itree=0; itree<fForest.size(); itree++) {
2598 myMVA += fForest[itree]->CheckEvent(ev,
kFALSE);
2601 evT->
SetTarget(0, myMVA+fBoostWeights[0] );
2604 for (
UInt_t itree=0; itree<fForest.size(); itree++) {
2606 myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,
kFALSE);
2607 norm += fBoostWeights[itree];
2610 evT->
SetTarget(0, ( norm > std::numeric_limits<double>::epsilon() ) ? myMVA /= norm : 0 );
2615 const Event* evT2 = GetTransformationHandler().InverseTransform( evT );
2616 fRegressionReturnVal->push_back( evT2->
GetTarget(0) );
2621 return *fRegressionReturnVal;
2630 Log() << kDEBUG <<
"\tWrite monitoring histograms to file: " << BaseDir()->GetPath() <<
Endl;
2634 fMonitorNtuple->
Write();
2645 fVariableImportance.resize(GetNvar());
2646 for (
UInt_t ivar = 0; ivar < GetNvar(); ivar++) {
2647 fVariableImportance[ivar]=0;
2650 for (
UInt_t itree = 0; itree < GetNTrees(); itree++) {
2651 std::vector<Double_t> relativeImportance(fForest[itree]->GetVariableImportance());
2652 for (
UInt_t i=0; i< relativeImportance.size(); i++) {
2653 fVariableImportance[i] += fBoostWeights[itree] * relativeImportance[i];
2657 for (
UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++){
2658 fVariableImportance[ivar] =
TMath::Sqrt(fVariableImportance[ivar]);
2659 sum += fVariableImportance[ivar];
2661 for (
UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++) fVariableImportance[ivar] /=
sum;
2663 return fVariableImportance;
2673 std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2674 if (ivar < (
UInt_t)relativeImportance.size())
return relativeImportance[ivar];
2675 else Log() << kFATAL <<
"<GetVariableImportance> ivar = " << ivar <<
" is out of range " <<
Endl;
2686 fRanking =
new Ranking( GetName(),
"Variable Importance" );
2687 vector< Double_t> importance(this->GetVariableImportance());
2689 for (
UInt_t ivar=0; ivar<GetNvar(); ivar++) {
2691 fRanking->AddRank(
Rank( GetInputLabel(ivar), importance[ivar] ) );
2705 Log() <<
"Boosted Decision Trees are a collection of individual decision" <<
Endl;
2706 Log() <<
"trees which form a multivariate classifier by (weighted) majority " <<
Endl;
2707 Log() <<
"vote of the individual trees. Consecutive decision trees are " <<
Endl;
2708 Log() <<
"trained using the original training data set with re-weighted " <<
Endl;
2709 Log() <<
"events. By default, the AdaBoost method is employed, which gives " <<
Endl;
2710 Log() <<
"events that were misclassified in the previous tree a larger " <<
Endl;
2711 Log() <<
"weight in the training of the following tree." <<
Endl;
2713 Log() <<
"Decision trees are a sequence of binary splits of the data sample" <<
Endl;
2714 Log() <<
"using a single discriminant variable at a time. A test event " <<
Endl;
2715 Log() <<
"ending up after the sequence of left-right splits in a final " <<
Endl;
2716 Log() <<
"(\"leaf\") node is classified as either signal or background" <<
Endl;
2717 Log() <<
"depending on the majority type of training events in that node." <<
Endl;
2721 Log() <<
"By the nature of the binary splits performed on the individual" <<
Endl;
2722 Log() <<
"variables, decision trees do not deal well with linear correlations" <<
Endl;
2723 Log() <<
"between variables (they need to approximate the linear split in" <<
Endl;
2724 Log() <<
"the two dimensional space by a sequence of splits on the two " <<
Endl;
2725 Log() <<
"variables individually). Hence decorrelation could be useful " <<
Endl;
2726 Log() <<
"to optimise the BDT performance." <<
Endl;
2730 Log() <<
"The two most important parameters in the configuration are the " <<
Endl;
2731 Log() <<
"minimal number of events requested by a leaf node as percentage of the " <<
Endl;
2732 Log() <<
" number of training events (option \"MinNodeSize\" replacing the actual number " <<
Endl;
2733 Log() <<
" of events \"nEventsMin\" as given in earlier versions" <<
Endl;
2734 Log() <<
"If this number is too large, detailed features " <<
Endl;
2735 Log() <<
"in the parameter space are hard to be modelled. If it is too small, " <<
Endl;
2736 Log() <<
"the risk to overtrain rises and boosting seems to be less effective" <<
Endl;
2737 Log() <<
" typical values from our current experience for best performance " <<
Endl;
2738 Log() <<
" are between 0.5(%) and 10(%) " <<
Endl;
2740 Log() <<
"The default minimal number is currently set to " <<
Endl;
2741 Log() <<
" max(20, (N_training_events / N_variables^2 / 10)) " <<
Endl;
2742 Log() <<
"and can be changed by the user." <<
Endl;
2744 Log() <<
"The other crucial parameter, the pruning strength (\"PruneStrength\")," <<
Endl;
2745 Log() <<
"is also related to overtraining. It is a regularisation parameter " <<
Endl;
2746 Log() <<
"that is used when determining after the training which splits " <<
Endl;
2747 Log() <<
"are considered statistically insignificant and are removed. The" <<
Endl;
2748 Log() <<
"user is advised to carefully watch the BDT screen output for" <<
Endl;
2749 Log() <<
"the comparison between efficiencies obtained on the training and" <<
Endl;
2750 Log() <<
"the independent test sample. They should be equal within statistical" <<
Endl;
2751 Log() <<
"errors, in order to minimize statistical fluctuations in different samples." <<
Endl;
2763 fout <<
" std::vector<"<<nodeName<<
"*> fForest; // i.e. root nodes of decision trees" << std::endl;
2764 fout <<
" std::vector<double> fBoostWeights; // the weights applied in the individual boosts" << std::endl;
2765 fout <<
"};" << std::endl << std::endl;
2768 fout <<
"std::vector<double> ReadBDTG::GetMulticlassValues__( const std::vector<double>& inputValues ) const" << std::endl;
2769 fout <<
"{" << std::endl;
2770 fout <<
" uint nClasses = " << DataInfo().GetNClasses() <<
";" << std::endl;
2771 fout <<
" std::vector<double> fMulticlassReturnVal;" << std::endl;
2772 fout <<
" fMulticlassReturnVal.reserve(nClasses);" << std::endl;
2774 fout <<
" std::vector<double> temp(nClasses);" << std::endl;
2775 fout <<
" auto forestSize = fForest.size();" << std::endl;
2776 fout <<
" // trees 0, nClasses, 2*nClasses, ... belong to class 0" << std::endl;
2777 fout <<
" // trees 1, nClasses+1, 2*nClasses+1, ... belong to class 1 and so forth" << std::endl;
2778 fout <<
" uint classOfTree = 0;" << std::endl;
2779 fout <<
" for (uint itree = 0; itree < forestSize; ++itree) {" << std::endl;
2780 fout <<
" BDTGNode *current = fForest[itree];" << std::endl;
2781 fout <<
" while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
2782 fout <<
" if (current->GoesRight(inputValues)) current=(BDTGNode*)current->GetRight();" << std::endl;
2783 fout <<
" else current=(BDTGNode*)current->GetLeft();" << std::endl;
2784 fout <<
" }" << std::endl;
2785 fout <<
" temp[classOfTree] += current->GetResponse();" << std::endl;
2786 fout <<
" if (++classOfTree == nClasses) classOfTree = 0; // cheap modulo" << std::endl;
2787 fout <<
" }" << std::endl;
2789 fout <<
" // we want to calculate sum of exp(temp[j] - temp[i]) for all i,j (i!=j)" << std::endl;
2790 fout <<
" // first calculate exp(), then replace minus with division." << std::endl;
2791 fout <<
" std::transform(temp.begin(), temp.end(), temp.begin(), [](double d){return exp(d);});" << std::endl;
2793 fout <<
" for(uint iClass=0; iClass<nClasses; iClass++){" << std::endl;
2794 fout <<
" double norm = 0.0;" << std::endl;
2795 fout <<
" for(uint j=0;j<nClasses;j++){" << std::endl;
2796 fout <<
" if(iClass!=j)" << std::endl;
2797 fout <<
" norm += temp[j] / temp[iClass];" << std::endl;
2798 fout <<
" }" << std::endl;
2799 fout <<
" fMulticlassReturnVal.push_back(1.0/(1.0+norm));" << std::endl;
2800 fout <<
" }" << std::endl;
2802 fout <<
" return fMulticlassReturnVal;" << std::endl;
2803 fout <<
"}" << std::endl;
2805 fout <<
"double " << className <<
"::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
2806 fout <<
"{" << std::endl;
2807 fout <<
" double myMVA = 0;" << std::endl;
2808 if (fDoPreselection){
2809 for (
UInt_t ivar = 0; ivar< fIsLowBkgCut.size(); ivar++){
2810 if (fIsLowBkgCut[ivar]){
2811 fout <<
" if (inputValues["<<ivar<<
"] < " << fLowBkgCut[ivar] <<
") return -1; // is background preselection cut" << std::endl;
2813 if (fIsLowSigCut[ivar]){
2814 fout <<
" if (inputValues["<<ivar<<
"] < "<< fLowSigCut[ivar] <<
") return 1; // is signal preselection cut" << std::endl;
2816 if (fIsHighBkgCut[ivar]){
2817 fout <<
" if (inputValues["<<ivar<<
"] > "<<fHighBkgCut[ivar] <<
") return -1; // is background preselection cut" << std::endl;
2819 if (fIsHighSigCut[ivar]){
2820 fout <<
" if (inputValues["<<ivar<<
"] > "<<fHighSigCut[ivar]<<
") return 1; // is signal preselection cut" << std::endl;
2825 if (fBoostType!=
"Grad"){
2826 fout <<
" double norm = 0;" << std::endl;
2828 fout <<
" for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
2829 fout <<
" "<<nodeName<<
" *current = fForest[itree];" << std::endl;
2830 fout <<
" while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
2831 fout <<
" if (current->GoesRight(inputValues)) current=("<<nodeName<<
"*)current->GetRight();" << std::endl;
2832 fout <<
" else current=("<<nodeName<<
"*)current->GetLeft();" << std::endl;
2833 fout <<
" }" << std::endl;
2834 if (fBoostType==
"Grad"){
2835 fout <<
" myMVA += current->GetResponse();" << std::endl;
2837 if (fUseYesNoLeaf) fout <<
" myMVA += fBoostWeights[itree] * current->GetNodeType();" << std::endl;
2838 else fout <<
" myMVA += fBoostWeights[itree] * current->GetPurity();" << std::endl;
2839 fout <<
" norm += fBoostWeights[itree];" << std::endl;
2841 fout <<
" }" << std::endl;
2842 if (fBoostType==
"Grad"){
2843 fout <<
" return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;
2845 else fout <<
" return myMVA /= norm;" << std::endl;
2846 fout <<
"}" << std::endl << std::endl;
2849 fout <<
"void " << className <<
"::Initialize()" << std::endl;
2850 fout <<
"{" << std::endl;
2851 fout <<
" double inf = std::numeric_limits<double>::infinity();" << std::endl;
2852 fout <<
" double nan = std::numeric_limits<double>::quiet_NaN();" << std::endl;
2854 for (
UInt_t itree=0; itree<GetNTrees(); itree++) {
2855 fout <<
" // itree = " << itree << std::endl;
2856 fout <<
" fBoostWeights.push_back(" << fBoostWeights[itree] <<
");" << std::endl;
2857 fout <<
" fForest.push_back( " << std::endl;
2858 this->MakeClassInstantiateNode((
DecisionTreeNode*)fForest[itree]->GetRoot(), fout, className);
2859 fout <<
" );" << std::endl;
2861 fout <<
" return;" << std::endl;
2862 fout <<
"};" << std::endl;
2864 fout <<
"// Clean up" << std::endl;
2865 fout <<
"inline void " << className <<
"::Clear() " << std::endl;
2866 fout <<
"{" << std::endl;
2867 fout <<
" for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
2868 fout <<
" delete fForest[itree]; " << std::endl;
2869 fout <<
" }" << std::endl;
2870 fout <<
"}" << std::endl;
// Writes the source text of a self-contained decision-tree node class to `fout`.
// The emitted class is named by `nodeName`; presumably this is the body of
// MethodBDT::MakeClassSpecificHeader.
// NOTE(review): the function signature, the definition of `nodeName`, and several
// closing braces / else lines are not visible in this excerpt; the comments below
// describe only the statements that are shown.
// Headers required by the emitted code.
2882 fout <<
"#include <algorithm>" << std::endl;
2883 fout <<
"#include <limits>" << std::endl;
// Emit the macro NN as shorthand for `new <nodeName>`.
2886 fout <<
"#define NN new "<<nodeName << std::endl;
// Include guard `<nodeName>__def` around the emitted class; closed by the "#endif" at the end.
2889 fout <<
"#ifndef "<<nodeName<<
"__def" << std::endl;
2890 fout <<
"#define "<<nodeName<<
"__def" << std::endl;
// Start of the emitted class declaration.
2892 fout <<
"class "<<nodeName<<
" {" << std::endl;
2894 fout <<
"public:" << std::endl;
// Emitted constructor: takes the two daughter pointers first.
2896 fout <<
" // constructor of an essentially \"empty\" node floating in space" << std::endl;
2897 fout <<
" "<<nodeName<<
" ( "<<nodeName<<
"* left,"<<nodeName<<
"* right," << std::endl;
// With Fisher cuts enabled the constructor additionally takes the coefficient count
// and GetNVariables()+1 individual coefficients (fisherCoeff0, fisherCoeff1, ...).
2898 if (fUseFisherCuts){
2899 fout <<
" int nFisherCoeff," << std::endl;
2900 for (
UInt_t i=0;i<GetNVariables()+1;i++){
2901 fout <<
" double fisherCoeff"<<i<<
"," << std::endl;
// Remaining constructor parameters, followed by the member-initializer list.
2904 fout <<
" int selector, double cutValue, bool cutType, " << std::endl;
2905 fout <<
" int nodeType, double purity, double response ) :" << std::endl;
2906 fout <<
" fLeft ( left )," << std::endl;
2907 fout <<
" fRight ( right )," << std::endl;
2908 if (fUseFisherCuts) fout <<
" fNFisherCoeff ( nFisherCoeff )," << std::endl;
2909 fout <<
" fSelector ( selector )," << std::endl;
2910 fout <<
" fCutValue ( cutValue )," << std::endl;
2911 fout <<
" fCutType ( cutType )," << std::endl;
2912 fout <<
" fNodeType ( nodeType )," << std::endl;
2913 fout <<
" fPurity ( purity )," << std::endl;
2914 fout <<
" fResponse ( response ){" << std::endl;
// Emitted constructor body: copy the individual Fisher coefficients into the fFisherCoeff vector.
2915 if (fUseFisherCuts){
2916 for (
UInt_t i=0;i<GetNVariables()+1;i++){
2917 fout <<
" fFisherCoeff.push_back(fisherCoeff"<<i<<
");" << std::endl;
2920 fout <<
" }" << std::endl << std::endl;
// Emitted destructor declaration and public accessors.
2921 fout <<
" virtual ~"<<nodeName<<
"();" << std::endl << std::endl;
2922 fout <<
" // test event if it descends the tree at this node to the right" << std::endl;
2923 fout <<
" virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
2924 fout <<
" "<<nodeName<<
"* GetRight( void ) {return fRight; };" << std::endl << std::endl;
2925 fout <<
" // test event if it descends the tree at this node to the left " << std::endl;
2926 fout <<
" virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
2927 fout <<
" "<<nodeName<<
"* GetLeft( void ) { return fLeft; }; " << std::endl << std::endl;
2928 fout <<
" // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
2929 fout <<
" double GetPurity( void ) const { return fPurity; } " << std::endl;
2930 fout <<
" // return the node type" << std::endl;
2931 fout <<
" int GetNodeType( void ) const { return fNodeType; }" << std::endl;
2932 fout <<
" double GetResponse(void) const {return fResponse;}" << std::endl << std::endl;
// Emitted private data members; the Fisher members exist only when fUseFisherCuts is set.
2933 fout <<
"private:" << std::endl << std::endl;
2934 fout <<
" "<<nodeName<<
"* fLeft; // pointer to the left daughter node" << std::endl;
2935 fout <<
" "<<nodeName<<
"* fRight; // pointer to the right daughter node" << std::endl;
2936 if (fUseFisherCuts){
2937 fout <<
" int fNFisherCoeff; // =0 if this node doesn't use fisher, else =nvar+1 " << std::endl;
2938 fout <<
" std::vector<double> fFisherCoeff; // the fisher coeff (offset at the last element)" << std::endl;
2940 fout <<
" int fSelector; // index of variable used in node selection (decision tree) " << std::endl;
2941 fout <<
" double fCutValue; // cut value applied on this node to discriminate bkg against sig" << std::endl;
2942 fout <<
" bool fCutType; // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
2943 fout <<
" int fNodeType; // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << std::endl;
2944 fout <<
" double fPurity; // Purity of node from training"<< std::endl;
2945 fout <<
" double fResponse; // Regression response value of node" << std::endl;
2946 fout <<
"}; " << std::endl;
// Emitted destructor definition: deletes both daughters, so deleting a node
// recursively frees the subtree below it.
2948 fout <<
"//_______________________________________________________________________" << std::endl;
2949 fout <<
" "<<nodeName<<
"::~"<<nodeName<<
"()" << std::endl;
2950 fout <<
"{" << std::endl;
2951 fout <<
" if (fLeft != NULL) delete fLeft;" << std::endl;
2952 fout <<
" if (fRight != NULL) delete fRight;" << std::endl;
2953 fout <<
"}; " << std::endl;
// Emitted GoesRight(): evaluates the node's cut on the input vector.
2955 fout <<
"//_______________________________________________________________________" << std::endl;
2956 fout <<
"bool "<<nodeName<<
"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
2957 fout <<
"{" << std::endl;
2958 fout <<
" // test event if it descends the tree at this node to the right" << std::endl;
2959 fout <<
" bool result;" << std::endl;
// Fisher variant: a node with fNFisherCoeff == 0 uses the plain single-variable cut (>=);
// otherwise the last coefficient is the offset, the others weight the inputs, and the
// sum is compared with a strict '>' against fCutValue.
2960 if (fUseFisherCuts){
2961 fout <<
" if (fNFisherCoeff == 0){" << std::endl;
2962 fout <<
" result = (inputValues[fSelector] >= fCutValue );" << std::endl;
2963 fout <<
" }else{" << std::endl;
2964 fout <<
" double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
2965 fout <<
" for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
2966 fout <<
" fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
2967 fout <<
" result = fisher > fCutValue;" << std::endl;
2968 fout <<
" }" << std::endl;
// Plain single-variable cut.
// NOTE(review): presumably the non-Fisher (else) branch; the separating line is not visible here.
2970 fout <<
" result = (inputValues[fSelector] >= fCutValue );" << std::endl;
// fCutType == false inverts the outcome of the comparison.
2972 fout <<
" if (fCutType == true) return result; //the cuts are selecting Signal ;" << std::endl;
2973 fout <<
" else return !result;" << std::endl;
2974 fout <<
"}" << std::endl;
// Emitted GoesLeft(): the logical negation of GoesRight().
2976 fout <<
"//_______________________________________________________________________" << std::endl;
2977 fout <<
"bool "<<nodeName<<
"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
2978 fout <<
"{" << std::endl;
2979 fout <<
" // test event if it descends the tree at this node to the left" << std::endl;
2980 fout <<
" if (!this->GoesRight(inputValues)) return true;" << std::endl;
2981 fout <<
" else return false;" << std::endl;
2982 fout <<
"}" << std::endl;
// Close the include guard.
2984 fout <<
"#endif" << std::endl;
// Writes one "NN( left, right, <node parameters> )" expression for node `n` to `fout`,
// calling itself for each non-null daughter so that a whole subtree becomes one nested expression.
// NOTE(review): the function signature, the guard around the fatal message, the output
// produced for a null daughter, and the closing braces are not visible in this excerpt.
// Fatal error path; presumably taken when `n` is null - confirm against the full source.
2994 Log() << kFATAL <<
"MakeClassInstantiateNode: started with undefined node" <<
Endl;
// Open the node expression, then emit the left daughter first.
2997 fout <<
"NN("<<std::endl;
2998 if (
n->GetLeft() != NULL){
2999 this->MakeClassInstantiateNode( (
DecisionTreeNode*)
n->GetLeft() , fout, className);
// Argument separator, then the right daughter.
3004 fout <<
", " <<std::endl;
3005 if (
n->GetRight() != NULL){
3006 this->MakeClassInstantiateNode( (
DecisionTreeNode*)
n->GetRight(), fout, className );
// Numeric node parameters that follow are written with a stream precision of 6.
3011 fout <<
", " << std::endl
3012 << std::setprecision(6);
// Fisher cuts: emit the coefficient count, then iterate over GetNVariables()+1 coefficient slots.
// NOTE(review): what is written when GetNFisherCoeff() == 0 is not visible in this excerpt.
3013 if (fUseFisherCuts){
3014 fout <<
n->GetNFisherCoeff() <<
", ";
3015 for (
UInt_t i=0; i< GetNVariables()+1; i++) {
3016 if (
n->GetNFisherCoeff() == 0 ){
3019 fout <<
n->GetFisherCoeff(i) <<
", ";
// Remaining arguments in the order selector, cut value, cut type, node type, purity, response,
// then the closing parenthesis of the NN( ... ) expression.
3023 fout <<
n->GetSelector() <<
", "
3024 <<
n->GetCutValue() <<
", "
3025 <<
n->GetCutType() <<
", "
3026 <<
n->GetNodeType() <<
", "
3027 <<
n->GetPurity() <<
","
3028 <<
n->GetResponse() <<
") ";
// Scans `eventSample` variable by variable for one-sided cuts below/above which only
// signal or only background is found, and records them in the fLow*/fHigh* cut members.
// Presumably the body of MethodBDT::DeterminePreselectionCuts.
// NOTE(review): the function signature, the declarations of nTotS/nTotB, the filling of
// bdtEventSample, the assignments of tmpRejS/tmpRejB and the closing braces are not
// visible in this excerpt; comments describe only the statements shown.
3039 Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
3041 std::vector<TMVA::BDTEventWrapper> bdtEventSample;
// Reset all per-variable cut flags and cut values before searching.
3043 fIsLowSigCut.assign(GetNvar(),
kFALSE);
3044 fIsLowBkgCut.assign(GetNvar(),
kFALSE);
3045 fIsHighSigCut.assign(GetNvar(),
kFALSE);
3046 fIsHighBkgCut.assign(GetNvar(),
kFALSE);
3048 fLowSigCut.assign(GetNvar(),0.);
3049 fLowBkgCut.assign(GetNvar(),0.);
3050 fHighSigCut.assign(GetNvar(),0.);
3051 fHighBkgCut.assign(GetNvar(),0.);
// Accumulate the total event weight separately for signal (nTotS) and background (nTotB).
3056 for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
3057 if (DataInfo().IsSignal(*it)){
3058 nTotS += (*it)->GetWeight();
3062 nTotB += (*it)->GetWeight();
// Per-variable search.
3068 for(
UInt_t ivar = 0; ivar < GetNvar(); ivar++ ) {
// Sort the wrapped events (ordering is defined by BDTEventWrapper's comparison; presumably
// by the value of variable ivar - confirm), then store running signal/background weight sums
// on each wrapper.
3070 std::sort( bdtEventSample.begin(),bdtEventSample.end() );
3072 Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
3073 std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
3074 for( ; it != it_end; ++it ) {
3075 if (DataInfo().IsSignal(**it))
3076 sigWeightCtr += (**it)->GetWeight();
3078 bkgWeightCtr += (**it)->GetWeight();
3080 it->SetCumulativeWeight(
false,bkgWeightCtr);
3081 it->SetCumulativeWeight(
true,sigWeightCtr);
// dVal: one hundredth of the variable's [min,max] range; used below to shift the cut
// away from the event value at which it was found.
3086 Double_t dVal = (DataInfo().GetVariableInfo(ivar).GetMax() - DataInfo().GetVariableInfo(ivar).GetMin())/100. ;
// The running best fractions start at 0.05, so a cut is only recorded if it isolates
// more than 5% of the respective class weight.
3087 Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
3088 Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;
// Walk through the sorted events (starting at index 1) and read the cumulative weights.
3093 for(
UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
3096 nSelS = bdtEventSample[iev].GetCumulativeWeight(
true);
3097 nSelB = bdtEventSample[iev].GetCumulativeWeight(
false);
// Fractions of the total signal/background weight accumulated so far.
// NOTE(review): no guard against nTotS or nTotB being zero is visible here.
3099 tmpEffS=nSelS/nTotS;
3100 tmpEffB=nSelB/nTotB;
// Record the best cut of each kind:
//   no signal accumulated yet       -> low-side background cut (value - dVal)
//   no background accumulated yet   -> low-side signal cut     (value - dVal)
//   all background accumulated      -> high-side signal cut    (value + dVal)
//   all signal accumulated          -> high-side background cut(value + dVal)
3103 if (nSelS==0 && tmpEffB>effB) {effB=tmpEffB; fLowBkgCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowBkgCut[ivar]=
kTRUE;}
3104 else if (nSelB==0 && tmpEffS>effS) {effS=tmpEffS; fLowSigCut[ivar] = bdtEventSample[iev].GetVal() - dVal; fIsLowSigCut[ivar]=
kTRUE;}
3105 else if (nSelB==nTotB && tmpRejS>rejS) {rejS=tmpRejS; fHighSigCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighSigCut[ivar]=
kTRUE;}
3106 else if (nSelS==nTotS && tmpRejB>rejB) {rejB=tmpRejB; fHighBkgCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighBkgCut[ivar]=
kTRUE;}
// Report the cuts that were found (debug level); the message depends on whether
// fDoPreselection is set.
3111 Log() << kDEBUG <<
" \tfound and suggest the following possible pre-selection cuts " <<
Endl;
3112 if (fDoPreselection) Log() << kDEBUG <<
"\tthe training will be done after these cuts... and GetMVA value returns +1, (-1) for a signal (bkg) event that passes these cuts" <<
Endl;
3113 else Log() << kDEBUG <<
"\tas option DoPreselection was not used, these cuts however will not be performed, but the training will see the full sample"<<
Endl;
3114 for (
UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
3115 if (fIsLowBkgCut[ivar]){
3116 Log() << kDEBUG <<
" \tfound cut: Bkg if var " << ivar <<
" < " << fLowBkgCut[ivar] <<
Endl;
3118 if (fIsLowSigCut[ivar]){
3119 Log() << kDEBUG <<
" \tfound cut: Sig if var " << ivar <<
" < " << fLowSigCut[ivar] <<
Endl;
3121 if (fIsHighBkgCut[ivar]){
3122 Log() << kDEBUG <<
" \tfound cut: Bkg if var " << ivar <<
" > " << fHighBkgCut[ivar] <<
Endl;
3124 if (fIsHighSigCut[ivar]){
3125 Log() << kDEBUG <<
" \tfound cut: Sig if var " << ivar <<
" > " << fHighSigCut[ivar] <<
Endl;
// Checks event `ev` against every enabled preselection cut and sets `result` to
// -1 when a background cut fires and to +1 when a signal cut fires.
// Presumably the body of MethodBDT::ApplyPreselectionCuts.
// NOTE(review): the function signature, the declaration/initial value of `result`,
// the return statement and the closing braces are not visible in this excerpt.
// No early exit is visible, so a later matching cut appears to overwrite an earlier one.
3140 for (
UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
// Low-side background cut: value below threshold -> background.
3141 if (fIsLowBkgCut[ivar]){
3142 if (ev->
GetValue(ivar) < fLowBkgCut[ivar]) result = -1;
// Low-side signal cut: value below threshold -> signal.
3144 if (fIsLowSigCut[ivar]){
3145 if (ev->
GetValue(ivar) < fLowSigCut[ivar]) result = 1;
// High-side background cut: value above threshold -> background.
3147 if (fIsHighBkgCut[ivar]){
3148 if (ev->
GetValue(ivar) > fHighBkgCut[ivar]) result = -1;
// High-side signal cut: value above threshold -> signal.
3150 if (fIsHighSigCut[ivar]){
3151 if (ev->
GetValue(ivar) > fHighSigCut[ivar]) result = 1;
#define REGISTER_METHOD(CLASS)
for example
char * Form(const char *fmt,...)
A pseudo container class which is a generator of indices.
A TGraph is an object made of two arrays X and Y with npoints each.
virtual void SetPoint(Int_t i, Double_t x, Double_t y)
Set x and y values for point number i.
virtual void SetName(const char *name="")
Set graph name.
virtual void SetTitle(const char *title="")
Change (i.e.
virtual void Set(Int_t n)
Set number of points in the graph Existing coordinates are preserved New coordinates above fNpoints a...
1-D histogram with a float per channel (see TH1 documentation)
1-D histogram with an int per channel (see TH1 documentation)
TH1 is the base class of all histogram classes in ROOT.
virtual void SetXTitle(const char *title)
virtual Int_t Fill(Double_t x)
Increment bin with abscissa X by 1.
virtual void SetBinContent(Int_t bin, Double_t content)
Set bin content see convention for numbering bins in TH1::GetBin In case the bin number is greater th...
virtual void SetYTitle(const char *title)
2-D histogram with a float per channel (see TH1 documentation)
Service class for 2-Dim histogram classes.
Absolute Deviation BDT Loss Function.
static void SetVarIndex(Int_t iVar)
Executor & GetThreadExecutor()
Get executor class for multi-thread usage In case when MT is not enabled will return a serial executo...
static Config & Instance()
static function: returns TMVA instance
Implementation of the CrossEntropy as separation criterion.
Class that contains all the data information.
static void SetIsTraining(bool on)
Implementation of a Decision Tree.
TMVA::DecisionTreeNode * GetEventNode(const TMVA::Event &e) const
get the pointer to the leaf node where a particular event ends up in... (used in gradient boosting)
Double_t CheckEvent(const TMVA::Event *, Bool_t UseYesNoLeaf=kFALSE) const
the event e is put into the decision tree (starting at the root node) and the output is NodeType (sig...
static DecisionTree * CreateFromXML(void *node, UInt_t tmva_Version_Code=TMVA_VERSION_CODE)
re-create a new tree (decision tree or search tree) from XML
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
void SetTarget(UInt_t itgt, Float_t value)
set the target value (dimension itgt) to value
Float_t GetTarget(UInt_t itgt) const
void Foreach(Function func, unsigned int nTimes, unsigned nChunks=0)
wrap TExecutor::Foreach
unsigned int GetPoolSize() const
auto Map(F func, unsigned nTimes) -> std::vector< typename std::result_of< F()>::type >
Wrap TExecutor::Map functions.
Implementation of the GiniIndex With Laplace correction as separation criterion.
Implementation of the GiniIndex as separation criterion.
The TMVA::Interval Class.
Least Squares BDT Loss Function.
The TMVA::Interval Class.
Analysis of Boosted Decision Trees.
void Init(void)
Common initialisation with defaults for the BDT-Method.
static const Int_t fgDebugLevel
MethodBDT(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
The standard constructor for the "boosted decision trees".
void BoostMonitor(Int_t iTree)
Fills the ROCIntegral vs Itree from the testSample for the monitoring plots during the training.
const std::vector< Float_t > & GetMulticlassValues()
Get the multiclass MVA response for the BDT classifier.
Double_t AdaBoostR2(std::vector< const TMVA::Event * > &, DecisionTree *dt)
Adaption of the AdaBoost to regression problems (see H.Drucker 1997).
void MakeClassSpecific(std::ostream &, const TString &) const
Make ROOT-independent C++ class for classifier response (classifier-specific implementation).
void GetHelpMessage() const
Get help message text.
LossFunctionBDT * fRegressionLossFunctionBDTG
void DeterminePreselectionCuts(const std::vector< const TMVA::Event * > &eventSample)
Find useful preselection cuts that will be applied before any Decision Tree training.
Double_t GradBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt, UInt_t cls=0)
Calculate the desired response value for each region.
const Ranking * CreateRanking()
Compute ranking of input variables.
virtual void SetTuneParameters(std::map< TString, Double_t > tuneParameters)
Set the tuning parameters according to the argument.
Double_t AdaCost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
The AdaCost boosting algorithm takes a simple cost Matrix (currently fixed for all events....
void DeclareOptions()
Define the options (their key words).
virtual std::map< TString, Double_t > OptimizeTuningParameters(TString fomType="ROCIntegral", TString fitType="FitGA")
Call the Optimizer with the set of parameters and ranges that are meant to be tuned.
Double_t Boost(std::vector< const TMVA::Event * > &, DecisionTree *dt, UInt_t cls=0)
Apply the boosting algorithm (the algorithm is selected via the "option" given in the constructor).
Double_t TestTreeQuality(DecisionTree *dt)
Test the tree quality in terms of misclassification.
Double_t Bagging()
Call it boot-strapping, re-sampling or whatever you like, in the end it is nothing else but applying ...
void UpdateTargets(std::vector< const TMVA::Event * > &, UInt_t cls=0)
Calculate residual for all events.
void UpdateTargetsRegression(std::vector< const TMVA::Event * > &, Bool_t first=kFALSE)
Calculate residuals for all events and update targets for next iter.
Double_t GradBoostRegression(std::vector< const TMVA::Event * > &, DecisionTree *dt)
Implementation of M_TreeBoost using any loss function as described by Friedman 1999.
void WriteMonitoringHistosToFile(void) const
Here we could write some histograms created during the processing to the output file.
virtual ~MethodBDT(void)
Destructor.
void AddWeightsXMLTo(void *parent) const
Write weights to XML.
Double_t GetGradBoostMVA(const TMVA::Event *e, UInt_t nTrees)
Returns MVA value: -1 for background, 1 for signal.
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
BDT can handle classification with multiple classes and regression with one regression-target.
Double_t RegBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
A special boosting only for Regression (not implemented).
void InitEventSample()
Initialize the event sample (i.e. reset the boost-weights... etc).
Double_t ApplyPreselectionCuts(const Event *ev)
Apply the preselection cuts before even bothering about any Decision Trees in GetMVA.
void SetMinNodeSize(Double_t sizeInPercent)
void ProcessOptions()
The option string is decoded, for available options see "DeclareOptions".
void PreProcessNegativeEventWeights()
O.k.
void MakeClassInstantiateNode(DecisionTreeNode *n, std::ostream &fout, const TString &className) const
Recursively descends a tree and writes the node instance to the output stream.
Double_t AdaBoost(std::vector< const TMVA::Event * > &, DecisionTree *dt)
The AdaBoost implementation.
std::vector< Double_t > GetVariableImportance()
Return the relative variable importance, normalized to all variables together having the importance 1...
Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
Double_t PrivateGetMvaValue(const TMVA::Event *ev, Double_t *err=0, Double_t *errUpper=0, UInt_t useNTrees=0)
Return the MVA value (range [-1;1]) that classifies the event according to the majority vote from the...
void InitGradBoost(std::vector< const TMVA::Event * > &)
Initialize targets for first tree.
void Train(void)
BDT training.
void GetBaggedSubSample(std::vector< const TMVA::Event * > &)
Fills fEventSample with fBaggedSampleFraction*NEvents random training events.
const std::vector< Float_t > & GetRegressionValues()
Get the regression value generated by the BDTs.
SeparationBase * fSepType
void ReadWeightsFromXML(void *parent)
Reads the BDT from the xml file.
void ReadWeightsFromStream(std::istream &istr)
Read the weights (BDT coefficients).
void Reset(void)
Reset the method, as if it had just been instantiated (forget all training etc.).
void MakeClassSpecificHeader(std::ostream &, const TString &) const
Specific class header.
void DeclareCompatibilityOptions()
Options that are used ONLY for the READER to ensure backward compatibility.
Virtual base Class for all MVA method.
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Implementation of the MisClassificationError as separation criterion.
std::map< TString, Double_t > optimize()
PDF wrapper for histograms; uses user-defined spline interpolation.
Ranking for variables in method (implementation)
Class that is the base-class for a vector of result.
TGraph * GetGraph(const TString &alias) const
TH1 * GetHist(const TString &alias) const
void Store(TObject *obj, const char *alias=0)
Implementation of the SdivSqrtSplusB as separation criterion.
Timing information for training and evaluation of MVA methods.
TString GetElapsedTime(Bool_t Scientific=kTRUE)
returns pretty string with elapsed time
void DrawProgressBar(Int_t, const TString &comment="")
draws progress bar in color or B&W caution:
Singleton class for Global types used by TMVA.
virtual Double_t Determinant() const
TMatrixTSym< Element > & Invert(Double_t *det=0)
Invert the matrix and calculate its determinant Notice that the LU decomposition is used instead of B...
virtual const char * GetTitle() const
Returns title of object.
virtual Int_t Write(const char *name=0, Int_t option=0, Int_t bufsize=0)
Write this object to the current directory.
virtual void Delete(Option_t *option="")
Delete this object.
virtual Int_t Read(const char *name)
Read contents of object with specified name from the current directory.
Random number generator class based on M.
virtual Double_t PoissonD(Double_t mean)
Generates a random number according to a Poisson law.
Double_t Atof() const
Return floating-point value contained in string.
Bool_t IsFloat() const
Returns kTRUE if string contains a floating point or integer number.
TString & ReplaceAll(const TString &s1, const TString &s2)
TString & Append(const char *cs)
A TTree represents a columnar dataset.
TSeq< unsigned int > TSeqU
create variable transformations
MsgLogger & Endl(MsgLogger &ml)
Short_t Max(Short_t a, Short_t b)
Int_t FloorNint(Double_t x)
Double_t Sqrt(Double_t x)
LongDouble_t Power(LongDouble_t x, LongDouble_t y)
Int_t CeilNint(Double_t x)
Short_t Min(Short_t a, Short_t b)
static uint64_t sum(uint64_t i)