#include <unordered_map>
   , fSigToBkgFraction(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fSkipNormalization(kFALSE)
   , fSigToBkgFraction(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fSkipNormalization(kFALSE)
   DeclareOptionRef(fNTrees, "NTrees", "Number of trees in the forest");
   if (DoRegression()) {
      DeclareOptionRef(fMaxDepth=50,"MaxDepth","Max depth of the decision tree allowed");
   } else {
      DeclareOptionRef(fMaxDepth=3,"MaxDepth","Max depth of the decision tree allowed");
   }

   TString tmp="5%"; if (DoRegression()) tmp="0.2%";
   DeclareOptionRef(fMinNodeSizeS=tmp, "MinNodeSize", "Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");

   DeclareOptionRef(fNCuts, "nCuts", "Number of grid points in variable range used in finding optimal cut in node splitting");

   DeclareOptionRef(fBoostType, "BoostType", "Boosting type for the trees in the forest (note: AdaCost is still experimental)");

   AddPreDefVal(TString("AdaBoost"));
   AddPreDefVal(TString("RealAdaBoost"));
   AddPreDefVal(TString("AdaCost"));
   AddPreDefVal(TString("Bagging"));
   AddPreDefVal(TString("AdaBoostR2"));

   if (DoRegression()) {
      fBoostType = "AdaBoostR2";
   } else {
      fBoostType = "AdaBoost";
   }
   DeclareOptionRef(fAdaBoostR2Loss="Quadratic", "AdaBoostR2Loss", "Type of Loss function in AdaBoostR2");
   AddPreDefVal(TString("Linear"));
   AddPreDefVal(TString("Quadratic"));
   AddPreDefVal(TString("Exponential"));

   DeclareOptionRef(fBaggedBoost=kFALSE, "UseBaggedBoost", "Use only a random subsample of all events for growing the trees in each boost iteration.");
   DeclareOptionRef(fShrinkage = 1.0, "Shrinkage", "Learning rate for BoostType=Grad algorithm");
   DeclareOptionRef(fAdaBoostBeta=.5, "AdaBoostBeta", "Learning rate for AdaBoost algorithm");
   DeclareOptionRef(fRandomisedTrees, "UseRandomisedTrees", "Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");
   DeclareOptionRef(fUseNvars, "UseNvars", "Size of the subset of variables used with RandomisedTree option");
   DeclareOptionRef(fUsePoissonNvars, "UsePoissonNvars", "Interpret \"UseNvars\" not as fixed number but as mean of a Poisson distribution in each split with RandomisedTree option");
   DeclareOptionRef(fBaggedSampleFraction=.6, "BaggedSampleFraction", "Relative size of bagged event sample to original size of the data sample (used whenever bagging is used (i.e. UseBaggedBoost, Bagging,)" );

   DeclareOptionRef(fUseYesNoLeaf=kTRUE, "UseYesNoLeaf",
                    "Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node -> Real-AdaBoost");
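   // Illustrative usage note (not part of this file): the options declared above are parsed from the
   // option string given when booking the method; a typical call could look like
   //    factory->BookMethod(dataloader, TMVA::Types::kBDT, "BDT",
   //                        "NTrees=800:MaxDepth=3:MinNodeSize=5%:nCuts=20:BoostType=AdaBoost:AdaBoostBeta=0.5");
   // The option names follow the declarations above; the example values are only a sketch of common settings.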
   if (DoRegression()) {
      fUseYesNoLeaf = kFALSE;
   }

   DeclareOptionRef(fNegWeightTreatment="InverseBoostNegWeights", "NegWeightTreatment", "How to treat events with negative weights in the BDT training (particular the boosting) : IgnoreInTraining; Boost With inverse boostweight; Pair events with negative and positive weights in training sample and *annihilate* them (experimental!)");
   AddPreDefVal(TString("InverseBoostNegWeights"));
   AddPreDefVal(TString("IgnoreNegWeightsInTraining"));
   AddPreDefVal(TString("NoNegWeightsInTraining"));
   AddPreDefVal(TString("PairNegWeightsGlobal"));

   DeclareOptionRef(fCss=1.,    "Css",    "AdaCost: cost of true signal selected signal");
   DeclareOptionRef(fCts_sb=1., "Cts_sb", "AdaCost: cost of true signal selected bkg");
   DeclareOptionRef(fCtb_ss=1., "Ctb_ss", "AdaCost: cost of true bkg selected signal");
   DeclareOptionRef(fCbb=1.,    "Cbb",    "AdaCost: cost of true bkg selected bkg ");

   DeclareOptionRef(fNodePurityLimit=0.5, "NodePurityLimit", "In boosting/pruning, nodes with purity > NodePurityLimit are signal; background otherwise.");

   DeclareOptionRef(fSepTypeS, "SeparationType", "Separation criterion for node splitting");
   AddPreDefVal(TString("CrossEntropy"));
   AddPreDefVal(TString("GiniIndex"));
   AddPreDefVal(TString("GiniIndexWithLaplace"));
   AddPreDefVal(TString("MisClassificationError"));
   AddPreDefVal(TString("SDivSqrtSPlusB"));
   AddPreDefVal(TString("RegressionVariance"));
   if (DoRegression()) {
      fSepTypeS = "RegressionVariance";
   } else {
      fSepTypeS = "GiniIndex";
   }

   DeclareOptionRef(fRegressionLossFunctionBDTGS = "Huber", "RegressionLossFunctionBDTG", "Loss function for BDTG regression.");
   AddPreDefVal(TString("Huber"));
   AddPreDefVal(TString("AbsoluteDeviation"));
   AddPreDefVal(TString("LeastSquares"));

   DeclareOptionRef(fHuberQuantile = 0.7, "HuberQuantile", "In the Huber loss function this is the quantile that separates the core from the tails in the residuals distribution.");
   DeclareOptionRef(fDoBoostMonitor=kFALSE, "DoBoostMonitor", "Create control plot with ROC integral vs tree number");

   DeclareOptionRef(fUseFisherCuts=kFALSE, "UseFisherCuts", "Use multivariate splits using the Fisher criterion");
   DeclareOptionRef(fMinLinCorrForFisher=.8, "MinLinCorrForFisher", "The minimum linear correlation between two variables demanded for use in Fisher criterion in node splitting");
   DeclareOptionRef(fUseExclusiveVars=kFALSE, "UseExclusiveVars", "Variables already used in fisher criterion are not anymore analysed individually for node splitting");

   DeclareOptionRef(fDoPreselection=kFALSE, "DoPreselection", "and and apply automatic pre-selection for 100% efficient signal (bkg) cuts prior to training");

   DeclareOptionRef(fSigToBkgFraction=1, "SigToBkgFraction", "Sig to Bkg ratio used in Training (similar to NodePurityLimit, which cannot be used in real adaboost");

   DeclareOptionRef(fPruneMethodS, "PruneMethod", "Note: for BDTs use small trees (e.g.MaxDepth=3) and NoPruning: Pruning: Method used for pruning (removal) of statistically insignificant branches ");
   AddPreDefVal(TString("NoPruning"));
   AddPreDefVal(TString("ExpectedError"));
   AddPreDefVal(TString("CostComplexity"));

   DeclareOptionRef(fPruneStrength, "PruneStrength", "Pruning strength");

   DeclareOptionRef(fFValidationEvents=0.5, "PruningValFraction", "Fraction of events to use for optimizing automatic pruning.");

   DeclareOptionRef(fSkipNormalization=kFALSE, "SkipNormalization", "Skip normalization at initialization, to keep expectation value of BDT output according to the fraction of events");

   DeclareOptionRef(fMinNodeEvents=0, "nEventsMin", "deprecated: Use MinNodeSize (in % of training events) instead");

   DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad", "deprecated: Use *UseBaggedBoost* instead: Use only a random subsample of all events for growing the trees in each iteration.");
   DeclareOptionRef(fBaggedSampleFraction, "GradBaggingFraction", "deprecated: Use *BaggedSampleFraction* instead: Defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE. ");
   DeclareOptionRef(fUseNTrainEvents, "UseNTrainEvents", "deprecated: Use *BaggedSampleFraction* instead: Number of randomly picked training events used in randomised (and bagged) trees");
   DeclareOptionRef(fNNodesMax, "NNodesMax", "deprecated: Use MaxDepth instead to limit the tree size" );

   DeclareOptionRef(fHistoricBool=kTRUE, "UseWeightedTrees",
                    "Use weighted trees or simple average in classification from the forest");
   DeclareOptionRef(fHistoricBool=kFALSE, "PruneBeforeBoost", "Flag to prune the tree before applying boosting algorithm");
   DeclareOptionRef(fHistoricBool=kFALSE, "RenormByClass", "Individually re-normalize each event class to the original size after boosting");

   AddPreDefVal(TString("NegWeightTreatment"), TString("IgnoreNegWeights"));
   else if (fSepTypeS == "giniindex")          fSepType = new GiniIndex();
   else if (fSepTypeS == "crossentropy")       fSepType = new CrossEntropy();
   else if (fSepTypeS == "sdivsqrtsplusb")     fSepType = new SdivSqrtSplusB();
   else if (fSepTypeS == "regressionvariance") fSepType = NULL;
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown Separation Index option " << fSepTypeS << " called" << Endl;
   }

   if(!(fHuberQuantile >= 0.0 && fHuberQuantile <= 1.0)){
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> Huber Quantile must be in range [0,1]. Value given, " << fHuberQuantile << ", does not match this criteria" << Endl;
   }

   fRegressionLossFunctionBDTGS.ToLower();
   if (fRegressionLossFunctionBDTGS == "huber") fRegressionLossFunctionBDTG = new HuberLossFunctionBDT(fHuberQuantile);
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown Regression Loss Function BDT option " << fRegressionLossFunctionBDTGS << " called" << Endl;
   }

   fPruneMethodS.ToLower();
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown PruneMethod " << fPruneMethodS << " option called" << Endl;

         << "Sorry automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;
   if (fMinNodeEvents > 0){
      fMinNodeSize = Double_t(fMinNodeEvents*100.) / Data()->GetNTrainingEvents();
      Log() << kWARNING << "You have explicitly set ** nEventsMin = " << fMinNodeEvents << " ** the min absolute number \n"
            << "of events in a leaf node. This is DEPRECATED, please use the option \n"
            << "*MinNodeSize* giving the relative number as percentage of training \n"
            << "events instead. \n"
            << "nEventsMin="<<fMinNodeEvents<<"--> MinNodeSize="<<fMinNodeSize<<"%"
            << Endl;
      Log() << kWARNING << "Note also that explicitly setting *nEventsMin* so far OVERWRITES the option recommended \n"
            << " *MinNodeSize* = " << fMinNodeSizeS << " option !!" << Endl ;
      fMinNodeSizeS = Form("%F3.2",fMinNodeSize);
   } else {
      SetMinNodeSize(fMinNodeSizeS);
   }
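   // Clarifying note (sketch of the block above): the deprecated absolute cut nEventsMin is converted
   // into the relative node-size cut as  MinNodeSize[%] = nEventsMin * 100 / N_trainingEvents, and an
   // explicitly given nEventsMin overrides whatever was passed via MinNodeSize.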
   fAdaBoostR2Loss.ToLower();

   if (fBoostType=="Grad") {
      if (fNegWeightTreatment=="InverseBoostNegWeights"){
         Log() << kINFO << "the option NegWeightTreatment=InverseBoostNegWeights does"
               << " not exist for BoostType=Grad" << Endl;
         Log() << kINFO << "--> change to new default NegWeightTreatment=Pray" << Endl;
         Log() << kDEBUG << "i.e. simply keep them as if which should work fine for Grad Boost" << Endl;
         fNegWeightTreatment="Pray";
         fNoNegWeightsInTraining=kFALSE;
      }
   } else if (fBoostType=="RealAdaBoost"){
      fBoostType = "AdaBoost";
   } else if (fBoostType=="AdaCost"){
   }

   if (fFValidationEvents < 0.0) fFValidationEvents = 0.0;
   if (fAutomatic && fFValidationEvents > 0.5) {
      Log() << kWARNING << "You have chosen to use more than half of your training sample "
            << "to optimize the automatic pruning algorithm. This is probably wasteful "
            << "and your overall results will be degraded. Are you sure you want this?"
            << Endl;
   }

   if (this->Data()->HasNegativeEventWeights()){
      Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
            << "That should in principle be fine as long as on average you end up with "
            << "something positive. For this you have to make sure that the minimal number "
            << "of (un-weighted) events demanded for a tree node (currently you use: MinNodeSize="
            << fMinNodeSizeS << " ("<< fMinNodeSize << "%)"
            << ", (or the deprecated equivalent nEventsMin) you can set this via the "
            << "BDT option string when booking the "
            << "classifier) is large enough to allow for reasonable averaging!!! "
            << " If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining "
            << "which ignores events with negative weight in the training. " << Endl
            << Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
   }
   if (DoRegression()) {
      if (fUseYesNoLeaf && !IsConstructedFromWeightFile()){
         Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;
      }
      if (fSepType != NULL){
         Log() << kWARNING << "Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" << Endl;
      }
         Log() << kWARNING << "Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" << Endl;
         Log() << kWARNING << "Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " << Endl;
         Log() << kWARNING << "is not implemented for regression analysis ! " << Endl;
         Log() << kWARNING << "--> I switch do default nCuts = 20 and use standard node splitting"<<Endl;
   }

   if (fRandomisedTrees){
      Log() << kINFO << " Randomised trees use no pruning" << Endl;
   }

   if (fUseFisherCuts) {
      Log() << kWARNING << "When using the option UseFisherCuts, the other option nCuts<0 (i.e. using" << Endl;
      Log() << " a more elaborate node splitting algorithm) is not implemented. " << Endl;
   }

      Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
            << " I set it to 1 .. just so that the program does not crash" << Endl;

   fNegWeightTreatment.ToLower();
   if      (fNegWeightTreatment == "ignorenegweightsintraining") fNoNegWeightsInTraining = kTRUE;
   else if (fNegWeightTreatment == "nonegweightsintraining")     fNoNegWeightsInTraining = kTRUE;
   else if (fNegWeightTreatment == "inverseboostnegweights")     fInverseBoostNegWeights = kTRUE;
   else if (fNegWeightTreatment == "pairnegweightsglobal")       fPairNegWeightsGlobal   = kTRUE;
   else if (fNegWeightTreatment == "pray")  Log() << kDEBUG << "Yes, good luck with praying " << Endl;
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown option for treating negative event weights during training " << fNegWeightTreatment << " requested" << Endl;
   }

   if (fNegWeightTreatment == "pairnegweightsglobal")
      Log() << kWARNING << " you specified the option NegWeightTreatment=PairNegWeightsGlobal : This option is still considered EXPERIMENTAL !! " << Endl;
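   // Clarifying note (sketch of the mapping above): the NegWeightTreatment string is lower-cased and
   // translated into flags -- IgnoreNegWeightsInTraining/NoNegWeightsInTraining drop such events,
   // InverseBoostNegWeights boosts them with the inverse boost weight, PairNegWeightsGlobal pairs and
   // annihilates them against positive-weight events before training, and Pray simply keeps them.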
      while (tmp < fNNodesMax){
         tmp *= 2;
         fMaxDepth++;
      }
      Log() << kWARNING << "You have specified a deprecated option *NNodesMax="<<fNNodesMax
            << "* \n this has been translated to MaxDepth="<<fMaxDepth<<Endl;

   if (fUseNTrainEvents>0){
      fBaggedSampleFraction = (Double_t) fUseNTrainEvents/Data()->GetNTrainingEvents();
      Log() << kWARNING << "You have specified a deprecated option *UseNTrainEvents="<<fUseNTrainEvents
            << "* \n this has been translated to BaggedSampleFraction="<<fBaggedSampleFraction<<"(%)"<<Endl;
   }

   if (fBoostType=="Bagging") fBaggedBoost = kTRUE;
   if (fBaggedGradBoost){
      fBaggedBoost = kTRUE;
      Log() << kWARNING << "You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" << Endl;
   }
   if (sizeInPercent > 0 && sizeInPercent < 50){
      fMinNodeSize=sizeInPercent;
   } else {
      Log() << kFATAL << "you have demanded a minimal node size of "
            << sizeInPercent << "% of the training events.. \n"
            << " that somehow does not make sense "<<Endl;
   }

   if (sizeInPercent.IsFloat()) SetMinNodeSize(sizeInPercent.Atof());
   else {
      Log() << kFATAL << "I had problems reading the option MinNodeEvents, which "
            << "after removing a possible % sign now reads " << sizeInPercent << Endl;
   }
      fBoostType      = "AdaBoost";
      if(DataInfo().GetNClasses()!=0)
         fMinNodeSize = 5.;

      fBoostType      = "AdaBoostR2";
      fAdaBoostR2Loss = "Quadratic";
      if(DataInfo().GetNClasses()!=0)
         fMinNodeSize = .2;

   fPruneMethodS = "NoPruning";

   fFValidationEvents = 0.5;
   fRandomisedTrees   = kFALSE;

   fUsePoissonNvars = kTRUE;

   SetSignalReferenceCut( 0 );
   for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];

   fBoostWeights.clear();
   if (fMonitorNtuple) { fMonitorNtuple->Delete(); fMonitorNtuple=NULL; }
   fVariableImportance.clear();

   fLossFunctionEventInfo.clear();

   Log() << kDEBUG << " successfully(?) reset the method " << Endl;
   for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];

   if (!HasTrainingTree()) Log() << kFATAL << "<Init> Data().TrainingTree() is zero pointer" << Endl;
   if (fEventSample.size() > 0) {
      for (UInt_t iev=0; iev<fEventSample.size(); iev++) fEventSample[iev]->SetBoostWeight(1.);
   } else {
      UInt_t nevents = Data()->GetNTrainingEvents();

      std::vector<const TMVA::Event*> tmpEventSample;
      for (Long64_t ievt=0; ievt<nevents; ievt++) {
         Event* event = new Event( *GetTrainingEvent(ievt) );
         tmpEventSample.push_back(event);
      }

      if (!DoRegression()) DeterminePreselectionCuts(tmpEventSample);
      else fDoPreselection = kFALSE;

      for (UInt_t i=0; i<tmpEventSample.size(); i++) delete tmpEventSample[i];

      for (Long64_t ievt=0; ievt<nevents; ievt++) {
         Event* event = new Event( *GetTrainingEvent(ievt) );
         if (fDoPreselection){
            if (TMath::Abs(ApplyPreselectionCuts(event)) > 0.05) {
            }
         }

         if (event->GetWeight() < 0 && (IgnoreEventsWithNegWeightsInTraining() || fNoNegWeightsInTraining)){
            if (firstNegWeight) {
               Log() << kWARNING << " Note, you have events with negative event weight in the sample, but you've chosen to ignore them" << Endl;
            }
         } else if (event->GetWeight()==0){
            if (firstZeroWeight) {
               Log() << "Events with weight == 0 are going to be simply ignored " << Endl;
            }
         }

         if (event->GetWeight() < 0) {
            fTrainWithNegWeights=kTRUE;
            if (fPairNegWeightsGlobal){
               Log() << kWARNING << "Events with negative event weights are found and "
                     << " will be removed prior to the actual BDT training by global "
                     << " paring (and subsequent annihilation) with positiv weight events" << Endl;
            } else {
               Log() << kWARNING << "Events with negative event weights are USED during "
                     << "the BDT training. This might cause problems with small node sizes "
                     << "or with the boosting. Please remove negative events from training "
                     << "using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
                     << "observe problems with the boosting" << Endl;
            }
         }

         if (fAutomatic) {
            Double_t modulo = 1.0/(fFValidationEvents);
            Int_t   imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );
            if (ievt % imodulo == 0) fValidationSample.push_back( event );
            else                     fEventSample.push_back( event );
         } else {
            fEventSample.push_back(event);
         }
      }

      Log() << kINFO << "<InitEventSample> Internally I use " << fEventSample.size()
            << " for Training and " << fValidationSample.size()
            << " for Pruning Validation (" << ((Float_t)fValidationSample.size())/((Float_t)fEventSample.size()+fValidationSample.size())*100.0
            << "% of training used for validation)" << Endl;
   }
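   // Clarifying note (sketch): the pruning-validation split above keeps every imodulo-th event for the
   // validation sample, where imodulo is 1/PruningValFraction rounded to the nearest integer, so the
   // default PruningValFraction=0.5 corresponds to every 2nd event.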
   if (fPairNegWeightsGlobal) PreProcessNegativeEventWeights();

   if (DoRegression()) {
   } else if (DoMulticlass()) {
   } else if (!fSkipNormalization) {
      Log() << kDEBUG << "\t<InitEventSample> For classification trees, "<< Endl;
      Log() << kDEBUG << " \tthe effective number of backgrounds is scaled to match "<<Endl;
      Log() << kDEBUG << " \tthe signal. Otherwise the first boosting step would do 'just that'!"<<Endl;

      Double_t nevents = fEventSample.size();
      Int_t sumSig=0, sumBkg=0;
      for (UInt_t ievt=0; ievt<fEventSample.size(); ievt++) {
         if ((DataInfo().IsSignal(fEventSample[ievt])) ) {
            sumSigW += fEventSample[ievt]->GetWeight();
         } else {
            sumBkgW += fEventSample[ievt]->GetWeight();
         }
      }

      if (sumSigW && sumBkgW){
         Double_t normSig = nevents/((1+fSigToBkgFraction)*sumSigW)*fSigToBkgFraction;
         Double_t normBkg = nevents/((1+fSigToBkgFraction)*sumBkgW); ;
         Log() << kDEBUG << "\tre-normalise events such that Sig and Bkg have respective sum of weights = "
               << fSigToBkgFraction << Endl;
         Log() << kDEBUG << " \tsig->sig*"<<normSig << "ev. bkg->bkg*"<<normBkg << "ev." <<Endl;
         Log() << kHEADER << "#events: (reweighted) sig: "<< sumSigW*normSig << " bkg: " << sumBkgW*normBkg << Endl;
         Log() << kINFO << "#events: (unweighted) sig: "<< sumSig << " bkg: " << sumBkg << Endl;
         for (Long64_t ievt=0; ievt<nevents; ievt++) {
            if ((DataInfo().IsSignal(fEventSample[ievt])) ) fEventSample[ievt]->SetBoostWeight(normSig);
            else                                            fEventSample[ievt]->SetBoostWeight(normBkg);
         }
      } else {
         Log() << kINFO << "--> could not determine scaling factors as either there are " << Endl;
         Log() << kINFO << " no signal events (sumSigW="<<sumSigW<<") or no bkg ev. (sumBkgW="<<sumBkgW<<")"<<Endl;
      }
   }

   fTrainSample = &fEventSample;
   if (fBaggedBoost){
      GetBaggedSubSample(fEventSample);
      fTrainSample = &fSubSample;
   }
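   // Clarifying note (sketch of the renormalisation above): with r = fSigToBkgFraction, the boost
   // weights are scaled such that the signal sum of weights becomes N*r/(1+r) and the background sum
   // becomes N/(1+r), i.e. the weighted sig/bkg ratio after this step equals SigToBkgFraction.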
   std::vector<const Event*> negEvents;
   for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetWeight() < 0) {
         totalNegWeights += fEventSample[iev]->GetWeight();
         negEvents.push_back(fEventSample[iev]);
      } else {
         totalPosWeights += fEventSample[iev]->GetWeight();
      }
      totalWeights += fEventSample[iev]->GetWeight();
   }
   if (totalNegWeights == 0 ) {
      Log() << kINFO << "no negative event weights found .. no preprocessing necessary" << Endl;
   } else {
      Log() << kINFO << "found a total of " << totalNegWeights << " of negative event weights which I am going to try to pair with positive events to annihilate them" << Endl;
      Log() << kINFO << "found a total of " << totalPosWeights << " of events with positive weights" << Endl;
      Log() << kINFO << "--> total sum of weights = " << totalWeights << " = " << totalNegWeights+totalPosWeights << Endl;
   }

   for (Int_t i=0; i<2; i++){
      invCov = ((*cov)[i]);
         std::cout << "<MethodBDT::PreProcessNeg...> matrix is almost singular with determinant="
                   << " did you use the variables that are linear combinations or highly correlated?"
                   << std::endl;
         std::cout << "<MethodBDT::PreProcessNeg...> matrix is singular with determinant="
                   << " did you use the variables that are linear combinations?"
                   << std::endl;
   }

   Log() << kINFO << "Found a total of " << totalNegWeights << " in negative weights out of " << fEventSample.size() << " training events " << Endl;
   Timer timer(negEvents.size(),"Negative Event paired");
   for (UInt_t nev = 0; nev < negEvents.size(); nev++){
      Double_t weight = negEvents[nev]->GetWeight();
      UInt_t  iClassID = negEvents[nev]->GetClass();
      invCov = ((*cov)[iClassID]);
      for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
         if (iClassID==fEventSample[iev]->GetClass() && fEventSample[iev]->GetWeight() > 0){
            for (UInt_t ivar=0; ivar < GetNvar(); ivar++){
               for (UInt_t jvar=0; jvar<GetNvar(); jvar++){
                  dist += (negEvents[nev]->GetValue(ivar)-fEventSample[iev]->GetValue(ivar))*
                     (*invCov)[ivar][jvar]*
                     (negEvents[nev]->GetValue(jvar)-fEventSample[iev]->GetValue(jvar));
               }
            }
            if (dist < minDist) { iMin=iev; minDist=dist;}
         }
      }

      if (iMin > -1) {
         Double_t newWeight = (negEvents[nev]->GetWeight() + fEventSample[iMin]->GetWeight());
         if (newWeight > 0){
            negEvents[nev]->SetBoostWeight( 0 );
            fEventSample[iMin]->SetBoostWeight( newWeight/fEventSample[iMin]->GetOriginalWeight() );
         } else {
            negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() );
            fEventSample[iMin]->SetBoostWeight( 0 );
         }
      } else Log() << kFATAL << "preprocessing didn't find event to pair with the negative weight ... probably a bug" << Endl;
      weight = negEvents[nev]->GetWeight();
   }

   totalNegWeights = 0;
   totalPosWeights = 0;
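   // Clarifying note (sketch of the pairing above): each negative-weight event is matched to the
   // closest positive-weight event of the same class, where "closest" is measured with the
   // covariance-weighted (Mahalanobis-like) distance dist = dx^T * invCov * dx; the two weights are
   // then summed so that the pair effectively annihilates, leaving only non-negative weights behind.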
   std::vector<const Event*> newEventSample;

   for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetWeight() < 0) {
         totalNegWeights += fEventSample[iev]->GetWeight();
         totalWeights    += fEventSample[iev]->GetWeight();
      } else {
         totalPosWeights += fEventSample[iev]->GetWeight();
         totalWeights    += fEventSample[iev]->GetWeight();
      }
      if (fEventSample[iev]->GetWeight() > 0) {
         newEventSample.push_back(new Event(*fEventSample[iev]));
         if (fEventSample[iev]->GetClass() == fSignalClass){
            sigWeight += fEventSample[iev]->GetWeight();
         } else {
            bkgWeight += fEventSample[iev]->GetWeight();
         }
      }
   }
   if (totalNegWeights < 0) Log() << kFATAL << " compensation of negative event weights with positive ones did not work " << totalNegWeights << Endl;

   for (UInt_t i=0; i<fEventSample.size(); i++) delete fEventSample[i];
   fEventSample = newEventSample;

   Log() << kINFO << " after PreProcessing, the Event sample is left with " << fEventSample.size() << " events (unweighted), all with positive weights, adding up to " << totalWeights << Endl;
   Log() << kINFO << " nSig="<<nSig << " sigWeight="<<sigWeight << " nBkg="<<nBkg << " bkgWeight="<<bkgWeight << Endl;
   std::map<TString,TMVA::Interval*> tuneParameters;
   std::map<TString,Double_t> tunedParameters;

   tuneParameters.insert(std::pair<TString,Interval*>("NTrees",      new Interval(10,1000,5)));
   tuneParameters.insert(std::pair<TString,Interval*>("MaxDepth",    new Interval(2,4,3)));
   tuneParameters.insert(std::pair<TString,Interval*>("MinNodeSize", new LogInterval(1,30,30)));

   if (fBoostType=="AdaBoost"){
      tuneParameters.insert(std::pair<TString,Interval*>("AdaBoostBeta", new Interval(.2,1.,5)));
   } else if (fBoostType=="Grad"){
      tuneParameters.insert(std::pair<TString,Interval*>("Shrinkage", new Interval(0.05,0.50,5)));
   } else if (fBoostType=="Bagging" && fRandomisedTrees){
      tuneParameters.insert(std::pair<TString,Interval*>("UseNvars", new Interval(min_var,max_var,4)));
   }

   Log()<<kINFO << " the following BDT parameters will be tuned on the respective *grid*\n"<<Endl;
   std::map<TString,TMVA::Interval*>::iterator it;
   for(it=tuneParameters.begin(); it!= tuneParameters.end(); ++it){
      Log() << kWARNING << it->first << Endl;
      std::ostringstream oss;
      (it->second)->Print(oss);
   }

   tunedParameters=optimize.optimize();

   return tunedParameters;
   std::map<TString,Double_t>::iterator it;
   for(it=tuneParameters.begin(); it!= tuneParameters.end(); ++it){
      Log() << kWARNING << it->first << " = " << it->second << Endl;
      if      (it->first == "MaxDepth"       ) SetMaxDepth        ((Int_t)it->second);
      else if (it->first == "MinNodeSize"    ) SetMinNodeSize     (it->second);
      else if (it->first == "NTrees"         ) SetNTrees          ((Int_t)it->second);
      else if (it->first == "NodePurityLimit") SetNodePurityLimit (it->second);
      else if (it->first == "AdaBoostBeta"   ) SetAdaBoostBeta    (it->second);
      else if (it->first == "Shrinkage"      ) SetShrinkage       (it->second);
      else if (it->first == "UseNvars"       ) SetUseNvars        ((Int_t)it->second);
      else if (it->first == "BaggedSampleFraction" ) SetBaggedSampleFraction (it->second);
      else Log() << kFATAL << " SetParameter for " << it->first << " not yet implemented " <<Endl;
   }
   Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
         << " I set it to 1 .. just so that the program does not crash" << Endl;

   if (fInteractive && fInteractive->NotInitialized()){
      std::vector<TString> titles = {"Boost weight", "Error Fraction"};
      fInteractive->Init(titles);
   }
   fIPyMaxIter = fNTrees;
   fExitFromTraining = false;

   if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with BDT; "
                             << "please remove the option from the configuration string, or "
                             << "use \"!Normalise\"" << Endl;

   Log() << kINFO << "Regression Loss Function: "<< fRegressionLossFunctionBDTG->Name() << Endl;

   Log() << kINFO << "Training "<< fNTrees << " Decision Trees ... patience please" << Endl;

   Log() << kDEBUG << "Training with maximal depth = " <<fMaxDepth
         << ", MinNodeEvents=" << fMinNodeEvents
         << ", NTrees="<<fNTrees
         << ", NodePurityLimit="<<fNodePurityLimit
         << ", AdaBoostBeta="<<fAdaBoostBeta
         << Endl;

   TString hname = "AdaBooost weight distribution";

   if (DoRegression()) {
      hname="Boost event weights distribution";
   }

   TH1* nodesBeforePruningVsTree = new TH1I(Form("%s_NodesBeforePruning",DataInfo().GetName()),"nodes before pruning",fNTrees,0,fNTrees);
   TH1* nodesAfterPruningVsTree  = new TH1I(Form("%s_NodesAfterPruning",DataInfo().GetName()),"nodes after pruning",fNTrees,0,fNTrees);

   if(!DoMulticlass()){
      h->SetXTitle("boost weight");
      results->Store(h, "BoostWeights");

      if (fDoBoostMonitor){
         TH2* boostMonitor = new TH2F("BoostMonitor","ROC Integral Vs iTree",2,0,fNTrees,2,0,1.05);
         boostMonitor->SetYTitle("ROC Integral");
         results->Store(boostMonitor, "BoostMonitor");
         boostMonitorGraph->SetName("BoostMonitorGraph");
         boostMonitorGraph->SetTitle("ROCIntegralVsNTrees");
         results->Store(boostMonitorGraph, "BoostMonitorGraph");
      }

      h = new TH1F("BoostWeightVsTree","Boost weights vs tree",fNTrees,0,fNTrees);
      h->SetXTitle("#tree");
      h->SetYTitle("boost weight");
      results->Store(h, "BoostWeightsVsTree");

      h = new TH1F("ErrFractHist","error fraction vs tree number",fNTrees,0,fNTrees);
      h->SetXTitle("#tree");
      h->SetYTitle("error fraction");
      results->Store(h, "ErrorFrac");

      nodesBeforePruningVsTree->SetXTitle("#tree");
      nodesBeforePruningVsTree->SetYTitle("#tree nodes");
      results->Store(nodesBeforePruningVsTree);

      nodesAfterPruningVsTree->SetXTitle("#tree");
      nodesAfterPruningVsTree->SetYTitle("#tree nodes");
      results->Store(nodesAfterPruningVsTree);
   }

   fMonitorNtuple= new TTree("MonitorNtuple","BDT variables");
   fMonitorNtuple->Branch("iTree",&fITree,"iTree/I");
   fMonitorNtuple->Branch("boostWeight",&fBoostWeight,"boostWeight/D");
   fMonitorNtuple->Branch("errorFraction",&fErrorFraction,"errorFraction/D");

   Int_t nNodesBeforePruningCount = 0;
   Int_t nNodesAfterPruningCount = 0;

   Int_t nNodesBeforePruning = 0;
   Int_t nNodesAfterPruning = 0;

   if(fBoostType=="Grad"){
      InitGradBoost(fEventSample);
   }
   while (itree < fNTrees && continueBoost){
      if (fExitFromTraining) break;
      fIPyCurrentIter = itree;

      if (fBoostType!="Grad"){
         Log() << kFATAL << "Multiclass is currently only supported by gradient boost. "
               << "Please change boost option accordingly (BoostType=Grad)." << Endl;
      }

      UInt_t nClasses = DataInfo().GetNClasses();
      for (UInt_t i=0;i<nClasses;i++){
         fForest.push_back( new DecisionTree( fSepType, fMinNodeSize, fNCuts, &(DataInfo()), i,
                                              fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
                                              itree*nClasses+i, fNodePurityLimit, itree*nClasses+1));
         fForest.back()->SetNVars(GetNvar());
         if (fUseFisherCuts) {
            fForest.back()->SetUseFisherCuts();
            fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
            fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
         }

         nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);
         Double_t bw = this->Boost(*fTrainSample, fForest.back(),i);
         if (bw > 0) {
            fBoostWeights.push_back(bw);
         } else {
            fBoostWeights.push_back(0);
            Log() << kWARNING << "stopped boosting at itree="<<itree << Endl;
         }
      }

                                           fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
                                           itree, fNodePurityLimit, itree);

      fForest.push_back(dt);
      fForest.back()->SetNVars(GetNvar());
      if (fUseFisherCuts) {
         fForest.back()->SetUseFisherCuts();
         fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
         fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
      }

      nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);

      if (fUseYesNoLeaf && !DoRegression() && fBoostType!="Grad") {
         nNodesBeforePruning = fForest.back()->CleanTree();
      }

      nNodesBeforePruningCount += nNodesBeforePruning;
      nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);

      fForest.back()->SetPruneMethod(fPruneMethod);
      fForest.back()->SetPruneStrength(fPruneStrength);

      std::vector<const Event*> * validationSample = NULL;
      if(fAutomatic) validationSample = &fValidationSample;
      Double_t bw = this->Boost(*fTrainSample, fForest.back());
      if (bw > 0) {
         fBoostWeights.push_back(bw);
      } else {
         fBoostWeights.push_back(0);
         Log() << kWARNING << "stopped boosting at itree="<<itree << Endl;
      }

      if (fUseYesNoLeaf && !DoRegression() && fBoostType!="Grad"){
         fForest.back()->CleanTree();
      }
      nNodesAfterPruning = fForest.back()->GetNNodes();
      nNodesAfterPruningCount += nNodesAfterPruning;
      nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);

      fInteractive->AddPoint(itree, fBoostWeight, fErrorFraction);

      fMonitorNtuple->Fill();
      if (fDoBoostMonitor){
         if (! DoRegression() ){
            if ( itree==fNTrees-1 ||  (!(itree%500)) ||
                 (!(itree%250) && itree <1000)||
                 (!(itree%100) && itree < 500)||
                 (!(itree%50)  && itree < 250)||
                 (!(itree%25)  && itree < 150)||
                 (!(itree%10)  && itree <  50)||
                 (!(itree%5)   && itree <  20)
                 ) BoostMonitor(itree);
         }
      }
   }

   Log() << kDEBUG << "\t<Train> average number of nodes (w/o pruning) : "
         << nNodesBeforePruningCount/GetNTrees() << Endl;

   Log() << kDEBUG << "\t<Train> average number of nodes before/after pruning : "
         << nNodesBeforePruningCount/GetNTrees() << " / "
         << nNodesAfterPruningCount/GetNTrees()
         << Endl;

   Log() << kDEBUG << "Now I delete the privat data sample"<< Endl;
   for (UInt_t i=0; i<fEventSample.size();      i++) delete fEventSample[i];
   for (UInt_t i=0; i<fValidationSample.size(); i++) delete fValidationSample[i];
   fEventSample.clear();
   fValidationSample.clear();

   if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
   for (UInt_t itree=0; itree<nTrees; itree++) {
   }

   return 2.0/(1.0+exp(-2.0*sum))-1;
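   // Clarifying note (sketch): for BoostType=Grad the summed tree responses F(x) are mapped onto (-1,1)
   // via 2/(1+exp(-2*F)) - 1, which equals tanh(F); F itself is the additive gradient-boost score.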
   if (DoMulticlass()) {
      UInt_t nClasses = DataInfo().GetNClasses();
      Bool_t isLastClass = (cls == nClasses - 1);

      std::map<const TMVA::Event *, std::vector<double>> &residuals = this->fResiduals;

      auto update_residuals = [&residuals, &lastTree, cls](const TMVA::Event *e) {
      };

      auto update_residuals_last = [&residuals, &lastTree, cls, nClasses](const TMVA::Event *e) {
         auto &residualsThisEvent = residuals[e];

         std::vector<Double_t> expCache(nClasses, 0.0);
         std::transform(residualsThisEvent.begin(),
                        residualsThisEvent.begin() + nClasses,
                        expCache.begin(), [](Double_t d) { return exp(d); });

         Double_t exp_sum = std::accumulate(expCache.begin(),
                                            expCache.begin() + nClasses,
                                            0.0);

         for (UInt_t i = 0; i < nClasses; i++) {
            Double_t p_cls = expCache[i] / exp_sum;
            Double_t res = (e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
         }
      };

         .Foreach(update_residuals_last, eventSample);
         .Foreach(update_residuals, eventSample);

      std::vector<Double_t> expCache;
      expCache.resize(nClasses);

      for (auto e : eventSample) {
         fResiduals[e].at(cls) += fForest.back()->CheckEvent(e, kFALSE);

         auto &residualsThisEvent = fResiduals[e];
         std::transform(residualsThisEvent.begin(),
                        residualsThisEvent.begin() + nClasses,
                        expCache.begin(), [](Double_t d) { return exp(d); });

         Double_t exp_sum = std::accumulate(expCache.begin(),
                                            expCache.begin() + nClasses,
                                            0.0);

         for (UInt_t i = 0; i < nClasses; i++) {
            Double_t p_cls = expCache[i] / exp_sum;
            Double_t res = (e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
         }
      }
   }
   std::map<const TMVA::Event *, std::vector<double>> &residuals = this->fResiduals;

   UInt_t signalClass = DataInfo().GetSignalClassIndex();

   auto update_residuals = [&residuals, &lastTree, signalClass](const TMVA::Event *e) {
      double &residualAt0 = residuals[e].at(0);

      Double_t p_sig = 1.0 / (1.0 + exp(-2.0 * residualAt0));
      Double_t res = ((e->GetClass() == signalClass) ? (1.0 - p_sig) : (-p_sig));
   };

      .Foreach(update_residuals, eventSample);

   for (auto e : eventSample) {
      double &residualAt0 = residuals[e].at(0);

      Double_t p_sig = 1.0 / (1.0 + exp(-2.0 * residualAt0));
      Double_t res = ((e->GetClass() == signalClass) ? (1.0 - p_sig) : (-p_sig));
   }
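   // Clarifying note (sketch): for two-class gradient boosting the per-event pseudo-residual used as
   // the tree target is the gradient of the binomial log-likelihood, res = y - p_sig with y in {0,1}
   // and p_sig = 1/(1+exp(-2*F)), where F is the current additive score stored in fResiduals.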
   auto f = [this, &nPartitions](UInt_t partition = 0) -> Int_t {
      Int_t start = 1.0 * partition / nPartitions * this->fEventSample.size();
      Int_t end   = (partition + 1.0) / nPartitions * this->fEventSample.size();

      for (Int_t i = start; i < end; ++i) {
      }
   };

   fRegressionLossFunctionBDTG->SetTargets(eventSample, fLossFunctionEventInfo);
   std::unordered_map<TMVA::DecisionTreeNode*, LeafInfo> leaves;
   for (auto e : eventSample) {
      auto &v = leaves[node];
      auto target = e->GetTarget(cls);
      v.sumWeightTarget += target * weight;
      v.sum2 += fabs(target) * (1.0 - fabs(target)) * weight;
   }
   for (auto &iLeave : leaves) {
      constexpr auto minValue = 1e-30;
      if (iLeave.second.sum2 < minValue) {
         iLeave.second.sum2 = minValue;
      }
      const Double_t K = DataInfo().GetNClasses();
      iLeave.first->SetResponse(fShrinkage * (K - 1) / K * iLeave.second.sumWeightTarget / iLeave.second.sum2);
   }

   DoMulticlass() ? UpdateTargets(fEventSample, cls) : UpdateTargets(fEventSample);
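   // Clarifying note (sketch): the leaf response set above is the usual multiclass gradient-boost
   // (Newton-like) step, shrinkage * (K-1)/K * sum_i(w_i*t_i) / sum_i(|t_i|*(1-|t_i|)*w_i), where the
   // t_i are the pseudo-residual targets of the events ending up in that leaf and K the number of classes.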
   std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > > leaves;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      (leaves[node]).push_back(fLossFunctionEventInfo[*e]);
   }

   for (std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > >::iterator iLeave=leaves.begin();
        iLeave!=leaves.end();++iLeave){
      Double_t fit = fRegressionLossFunctionBDTG->Fit(iLeave->second);
      (iLeave->first)->SetResponse(fShrinkage*fit);
   }

   UpdateTargetsRegression(*fTrainSample);
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
   }
   fRegressionLossFunctionBDTG->Init(fLossFunctionEventInfo, fBoostWeights);
   UpdateTargetsRegression(*fTrainSample, kTRUE);

   else if(DoMulticlass()){
      UInt_t nClasses = DataInfo().GetNClasses();
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
         for (UInt_t i=0;i<nClasses;i++){
            Double_t r = (*e)->GetClass()==i?(1-1.0/nClasses):(-1.0/nClasses);
            fResiduals[*e].push_back(0);
         }
      }
   } else {
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
         Double_t r = (DataInfo().IsSignal(*e)?1:0)-0.5;
         fResiduals[*e].push_back(0);
      }
   }
   for (UInt_t ievt=0; ievt<fValidationSample.size(); ievt++) {
      Bool_t isSignalType= (dt->CheckEvent(fValidationSample[ievt]) > fNodePurityLimit ) ? 1 : 0;

      if (isSignalType == (DataInfo().IsSignal(fValidationSample[ievt])) ) {
         ncorrect += fValidationSample[ievt]->GetWeight();
      } else {
         nfalse += fValidationSample[ievt]->GetWeight();
      }
   }

   return ncorrect / (ncorrect + nfalse);
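   // Clarifying note (sketch): the quantity returned above is the weighted fraction of
   // pruning-validation events the tree classifies correctly, where "signal-type" means the tree
   // response exceeds fNodePurityLimit.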
   if      (fBoostType=="AdaBoost")    returnVal = this->AdaBoost  (eventSample, dt);
   else if (fBoostType=="AdaCost")     returnVal = this->AdaCost   (eventSample, dt);
   else if (fBoostType=="Bagging")     returnVal = this->Bagging   ( );
   else if (fBoostType=="RegBoost")    returnVal = this->RegBoost  (eventSample, dt);
   else if (fBoostType=="AdaBoostR2")  returnVal = this->AdaBoostR2(eventSample, dt);
   else if (fBoostType=="Grad"){
      if (DoRegression())
         returnVal = this->GradBoostRegression(eventSample, dt);
      else if(DoMulticlass())
         returnVal = this->GradBoost (eventSample, dt, cls);
      else
         returnVal = this->GradBoost (eventSample, dt);
   } else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<Boost> unknown boost option " << fBoostType<< " called" << Endl;
   }

   GetBaggedSubSample(fEventSample);
   TH1F *tmpS = new TH1F( "tmpS", "", 100 , -1., 1.00001 );
   TH1F *tmpB = new TH1F( "tmpB", "", 100 , -1., 1.00001 );

   UInt_t signalClassNr = DataInfo().GetClassInfo("Signal")->GetNumber();

   UInt_t nevents = Data()->GetNTestEvents();
   for (UInt_t iev=0; iev < nevents; iev++){
      const Event* event = GetTestingEvent(iev);

      if (event->GetClass() == signalClassNr) {tmp=tmpS;}

      tmp->Fill(PrivateGetMvaValue(event),event->GetWeight());
   }

   std::vector<TH1F*> hS;
   std::vector<TH1F*> hB;
   for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
      hS.push_back(new TH1F(Form("SigVar%dAtTree%d",ivar,iTree),Form("SigVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
      hB.push_back(new TH1F(Form("BkgVar%dAtTree%d",ivar,iTree),Form("BkgVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
      results->Store(hS.back(),hS.back()->GetTitle());
      results->Store(hB.back(),hB.back()->GetTitle());
   }

   for (UInt_t iev=0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetBoostWeight() > max) max = 1.01*fEventSample[iev]->GetBoostWeight();
   }
   TH1F *tmpBoostWeightsS = new TH1F(Form("BoostWeightsInTreeS%d",iTree),Form("BoostWeightsInTreeS%d",iTree),100,0.,max);
   TH1F *tmpBoostWeightsB = new TH1F(Form("BoostWeightsInTreeB%d",iTree),Form("BoostWeightsInTreeB%d",iTree),100,0.,max);
   results->Store(tmpBoostWeightsS,tmpBoostWeightsS->GetTitle());
   results->Store(tmpBoostWeightsB,tmpBoostWeightsB->GetTitle());

   TH1F *tmpBoostWeights;
   std::vector<TH1F*> *h;

   for (UInt_t iev=0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetClass() == signalClassNr) {
         tmpBoostWeights=tmpBoostWeightsS;
      } else {
         tmpBoostWeights=tmpBoostWeightsB;
      }
      tmpBoostWeights->Fill(fEventSample[iev]->GetBoostWeight());
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         (*h)[ivar]->Fill(fEventSample[iev]->GetValue(ivar),fEventSample[iev]->GetWeight());
      }
   }
   Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      UInt_t iclass=(*e)->GetClass();

      if ( DoRegression() ) {
         sumGlobalwfalse  += w * tmpDev;
         sumGlobalwfalse2 += w * tmpDev*tmpDev;
         if (tmpDev > maxDev) maxDev = tmpDev;
      }

      if (!(isSignalType == DataInfo().IsSignal(*e))) {
         sumGlobalwfalse+= w;
      }

      if (DataInfo().IsSignal(*e)) trueType = 1;
      sumGlobalwfalse+= w*trueType*dtoutput;
   }

   err = sumGlobalwfalse/sumGlobalw ;
   if ( DoRegression() ) {
      if (fAdaBoostR2Loss=="linear"){
         err = sumGlobalwfalse/maxDev/sumGlobalw ;
      }
      else if (fAdaBoostR2Loss=="quadratic"){
         err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw ;
      }
      else if (fAdaBoostR2Loss=="exponential"){
         for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
            err += w * (1 - exp (-tmpDev/maxDev)) / sumGlobalw;
         }
      }
      else {
         Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
               << " namely " << fAdaBoostR2Loss << "\n"
               << "and this is not implemented... a typo in the options ??" <<Endl;
      }
   }

   Log() << kDEBUG << "BDT AdaBoos wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << Endl;

   std::vector<Double_t> newSumw(sumw.size(),0);

   if (err >= 0.5 && fUseYesNoLeaf) {
         Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
               << "boost such a thing... if after 1 step the error rate is == 0.5"
               << "please check why this happens, maybe too many events per node requested ?"
               << Endl;
         Log() << kERROR << " The error rate in the BDT boosting is > 0.5. ("<< err
               << ") That should not happen, please check your code (i.e... the BDT code), I "
               << " stop boosting here" << Endl;
   } else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;
   }

   boostWeight = TMath::Log((1.-err)/err)*fAdaBoostBeta;

   boostWeight = TMath::Log((1.+err)/(1-err))*fAdaBoostBeta;

   Log() << kDEBUG << "BDT AdaBoos wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw
         << " 1-err/err="<<boostWeight<< " log.."<<TMath::Log(boostWeight)<<Endl;
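   // Clarifying note (sketch): the boost weight computed above is alpha = AdaBoostBeta * ln((1-err)/err)
   // (with the symmetric ln((1+err)/(1-err)) form as the alternative variant used here); the per-event
   // boostfactor applied below is derived from this weight before the sample weights are re-normalised.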
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      if (fUseYesNoLeaf||DoRegression()){
         if ((!( (dt->CheckEvent(*e,fUseYesNoLeaf) > fNodePurityLimit ) == DataInfo().IsSignal(*e))) || DoRegression()) {
            if ( (*e)->GetWeight() > 0 ){
               (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
               if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
            } else {
               if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
               else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
            }
         }
      } else {
         if (DataInfo().IsSignal(*e)) trueType = 1;

         if ( (*e)->GetWeight() > 0 ){
            (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
            if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
         } else {
            if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
            else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
         }
      }
      newSumGlobalw+=(*e)->GetWeight();
      newSumw[(*e)->GetClass()] += (*e)->GetWeight();
   }

   Log() << kDEBUG << "new Nsig="<<newSumw[0]*globalNormWeight << " new Nbkg="<<newSumw[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      if (DataInfo().IsSignal(*e))(*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
      else                        (*e)->ScaleBoostWeight( globalNormWeight );
   }

   if (!(DoRegression()))results->GetHist("BoostWeights")->Fill(boostWeight);

   fBoostWeight = boostWeight;
   fErrorFraction = err;
   Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0);

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      sumGlobalWeights += w;
      UInt_t iclass=(*e)->GetClass();

      if ( DoRegression() ) {
         Log() << kFATAL << " AdaCost not implemented for regression"<<Endl;
      } else {
         Bool_t isTrueSignal = DataInfo().IsSignal(*e);
         Bool_t isSelectedSignal = (dtoutput>0);
         if (isTrueSignal) trueType = 1;

         if      (isTrueSignal  && isSelectedSignal)  cost=Css;
         else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
         else if (!isTrueSignal && isSelectedSignal)  cost=Ctb_ss;
         else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
         else Log() << kERROR << "something went wrong in AdaCost" << Endl;

         sumGlobalCost+= w*trueType*dtoutput*cost;
      }
   }

   if ( DoRegression() ) {
      Log() << kFATAL << " AdaCost not implemented for regression"<<Endl;
   }

   sumGlobalCost /= sumGlobalWeights;

   vector<Double_t> newSumClassWeights(sumw.size(),0);

   Double_t boostWeight = TMath::Log((1+sumGlobalCost)/(1-sumGlobalCost)) * fAdaBoostBeta;

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      Bool_t isTrueSignal = DataInfo().IsSignal(*e);
      Bool_t isSelectedSignal = (dtoutput>0);
      if (isTrueSignal) trueType = 1;

      if      (isTrueSignal  && isSelectedSignal)  cost=Css;
      else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
      else if (!isTrueSignal && isSelectedSignal)  cost=Ctb_ss;
      else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
      else Log() << kERROR << "something went wrong in AdaCost" << Endl;

      if (DoRegression()) Log() << kFATAL << " AdaCost not implemented for regression"<<Endl;
      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
         if (DoRegression()) Log() << kFATAL << " AdaCost not implemented for regression"<<Endl;
      } else {
         if ( fInverseBoostNegWeights )(*e)->ScaleBoostWeight( 1. / boostfactor);
      }

      newSumGlobalWeights+=(*e)->GetWeight();
      newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();
   }

   Double_t globalNormWeight=Double_t(eventSample.size())/newSumGlobalWeights;
   Log() << kDEBUG << "new Nsig="<<newSumClassWeights[0]*globalNormWeight << " new Nbkg="<<newSumClassWeights[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      if (DataInfo().IsSignal(*e))(*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
      else                        (*e)->ScaleBoostWeight( globalNormWeight );
   }

   if (!(DoRegression()))results->GetHist("BoostWeights")->Fill(boostWeight);

   fBoostWeight = boostWeight;
   fErrorFraction = err;
   if (!fSubSample.empty()) fSubSample.clear();

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      n = trandom->PoissonD(fBaggedSampleFraction);
      for (Int_t i=0;i<n;i++) fSubSample.push_back(*e);
   }
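   // Clarifying note (sketch): this is Poisson-bootstrap style bagging -- every event enters the
   // bagged sub-sample n times with n drawn from Poisson(BaggedSampleFraction), so the sub-sample has
   // on average BaggedSampleFraction times the original number of entries.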
   if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;

   Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      sumwfalse  += w * tmpDev;
      sumwfalse2 += w * tmpDev*tmpDev;
      if (tmpDev > maxDev) maxDev = tmpDev;
   }

   if (fAdaBoostR2Loss=="linear"){
      err = sumwfalse/maxDev/sumw ;
   }
   else if (fAdaBoostR2Loss=="quadratic"){
      err = sumwfalse2/maxDev/maxDev/sumw ;
   }
   else if (fAdaBoostR2Loss=="exponential"){
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
         err += w * (1 - exp (-tmpDev/maxDev)) / sumw;
      }
   }
   else {
      Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
            << " namely " << fAdaBoostR2Loss << "\n"
            << "and this is not implemented... a typo in the options ??" <<Endl;
   }

         Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
               << "boost such a thing... if after 1 step the error rate is == 0.5"
               << "please check why this happens, maybe too many events per node requested ?"
               << Endl;
         Log() << kERROR << " The error rate in the BDT boosting is > 0.5. ("<< err
               << ") That should not happen, but is possible for regression trees, and"
               << " should trigger a stop for the boosting. please check your code (i.e... the BDT code), I "
               << " stop boosting " << Endl;
   } else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;
   }

   Double_t boostWeight = err / (1.-err);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      results->GetHist("BoostWeights")->Fill(boostfactor);

      if ( (*e)->GetWeight() > 0 ){
         Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
         Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
         if (newWeight == 0) {
            Log() << kINFO << "Weight= " << (*e)->GetWeight() << Endl;
            Log() << kINFO << "BoostWeight= " << (*e)->GetBoostWeight() << Endl;
            Log() << kINFO << "boostweight="<<boostWeight << " err= " <<err << Endl;
            Log() << kINFO << "NewBoostWeight= " << newBoostWeight << Endl;
            Log() << kINFO << "boostfactor= " << boostfactor << Endl;
            Log() << kINFO << "maxDev = " << maxDev << Endl;
            Log() << kINFO << "target = " << (*e)->GetTarget(0) << Endl;
         }
         (*e)->SetBoostWeight( newBoostWeight );
      } else {
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
      }
      newSumw+=(*e)->GetWeight();
   }

   Double_t normWeight = sumw / newSumw;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end();++e) {
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
   }

   fBoostWeight = boostWeight;
   fErrorFraction = err;
   if (fDoPreselection){
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         gTools().AddAttr( wght, Form("PreselectionLowBkgVar%d",ivar),       fIsLowBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowBkgVar%dValue",ivar),  fLowBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowSigVar%d",ivar),       fIsLowSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowSigVar%dValue",ivar),  fLowSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighBkgVar%d",ivar),      fIsHighBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighBkgVar%dValue",ivar), fHighBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighSigVar%d",ivar),      fIsHighSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighSigVar%dValue",ivar), fHighSigCut[ivar]);
      }
   }

   gTools().AddAttr( wght, "AnalysisType", fForest.back()->GetAnalysisType() );

   for (UInt_t i=0; i< fForest.size(); i++) {
      void* trxml = fForest[i]->AddXMLTo(wght);
   }
   for (i=0; i<fForest.size(); i++) delete fForest[i];

   fBoostWeights.clear();

   if (gTools().HasAttr( parent, Form("PreselectionLowBkgVar%d",0))) {
      fIsLowBkgCut.resize(GetNvar());
      fLowBkgCut.resize(GetNvar());
      fIsLowSigCut.resize(GetNvar());
      fLowSigCut.resize(GetNvar());
      fIsHighBkgCut.resize(GetNvar());
      fHighBkgCut.resize(GetNvar());
      fIsHighSigCut.resize(GetNvar());
      fHighSigCut.resize(GetNvar());

      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         fIsLowBkgCut[ivar]=tmpBool;
         fLowBkgCut[ivar]=tmpDouble;
         fIsLowSigCut[ivar]=tmpBool;
         fLowSigCut[ivar]=tmpDouble;
         fIsHighBkgCut[ivar]=tmpBool;
         fHighBkgCut[ivar]=tmpDouble;
         fIsHighSigCut[ivar]=tmpBool;
         fHighSigCut[ivar]=tmpDouble;
      }
   }

   if(gTools().HasAttr(parent, "TreeType")) {

   fForest.back()->SetTreeID(i++);

   fBoostWeights.push_back(boostWeight);
   Int_t analysisType(0);

   istr >> dummy >> fNTrees;
   Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;

   for (UInt_t i=0;i<fForest.size();i++) delete fForest[i];

   fBoostWeights.clear();

   for (int i=0;i<fNTrees;i++) {
      istr >> dummy >> iTree >> dummy >> boostWeight;
         fForest.back()->Print( std::cout );
         Log() << kFATAL << "Error while reading weight file; mismatch iTree="
               << iTree << " i=" << i
               << " dummy " << dummy
               << " boostweight " << boostWeight
               << Endl;
      fForest.back()->SetTreeID(i);
      fForest.back()->Read(istr, GetTrainingTMVAVersionCode());
      fBoostWeights.push_back(boostWeight);
   }
   return this->GetMvaValue( err, errUpper, 0 );

   const Event* ev = GetEvent();
   if (fDoPreselection) {
      Double_t val = ApplyPreselectionCuts(ev);
   }

   return PrivateGetMvaValue(ev, err, errUpper, useNTrees);

   NoErrorCalc(err, errUpper);

   UInt_t nTrees = fForest.size();

   if (useNTrees > 0 ) nTrees = useNTrees;

   if (fBoostType=="Grad") return GetGradBoostMVA(ev,nTrees);

   for (UInt_t itree=0; itree<nTrees; itree++) {
      myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,fUseYesNoLeaf);
      norm  += fBoostWeights[itree];
   }
   if (fMulticlassReturnVal == NULL) fMulticlassReturnVal = new std::vector<Float_t>();
   fMulticlassReturnVal->clear();

   UInt_t nClasses = DataInfo().GetNClasses();
   std::vector<Double_t> temp(nClasses);
   auto forestSize = fForest.size();

   std::vector<TMVA::DecisionTree *> forest = fForest;
   auto get_output = [&e, &forest, &temp, forestSize, nClasses](UInt_t iClass) {
      for (UInt_t itree = iClass; itree < forestSize; itree += nClasses) {
         temp[iClass] += forest[itree]->CheckEvent(e, kFALSE);
      }
   };

   for (UInt_t itree = 0; itree < forestSize; ++itree) {
      temp[classOfTree] += fForest[itree]->CheckEvent(e, kFALSE);
      if (++classOfTree == nClasses) classOfTree = 0;
   }

   std::transform(temp.begin(), temp.end(), temp.begin(), [](Double_t d){return exp(d);});

   Double_t exp_sum = std::accumulate(temp.begin(), temp.end(), 0.0);

   for (UInt_t i = 0; i < nClasses; i++) {
      Double_t p_cls = temp[i] / exp_sum;
      (*fMulticlassReturnVal).push_back(p_cls);
   }

   return *fMulticlassReturnVal;
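   // Clarifying note (sketch): the multiclass response above is a softmax -- each class accumulates
   // the responses of "its" trees (every nClasses-th tree of the forest), the sums are exponentiated,
   // and the returned values are exp(F_k) / sum_j exp(F_j), i.e. normalised class probabilities.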
// TMVA::MethodBDT::GetRegressionValues -- regression response of the forest
if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector<Float_t>();
fRegressionReturnVal->clear();

const Event * ev = GetEvent();
// ... (evT, a copy of the event, receives the regression target below)

if (fBoostType=="AdaBoostR2") {
   // the response is the weighted median of the individual tree responses
   vector< Double_t > response(fForest.size());
   vector< Double_t > weight(fForest.size());
   // ...
   for (UInt_t itree=0; itree<fForest.size(); itree++) {
      response[itree]    = fForest[itree]->CheckEvent(ev,kFALSE);
      weight[itree]      = fBoostWeights[itree];
      totalSumOfWeights += fBoostWeights[itree];
   }

   std::vector< std::vector<Double_t> > vtemp;
   vtemp.push_back( response );
   vtemp.push_back( weight );
   // ... (sort the response/weight pairs by response value, then scan for the weighted median)
   while (sumOfWeights <= totalSumOfWeights/2.) {
      sumOfWeights += vtemp[1][t];
      // ...
   }
   // ...
}
else if(fBoostType=="Grad"){
   for (UInt_t itree=0; itree<fForest.size(); itree++) {
      myMVA += fForest[itree]->CheckEvent(ev,kFALSE);
   }
   // ...
   evT->SetTarget(0, myMVA+fBoostWeights[0] );
}
else {
   for (UInt_t itree=0; itree<fForest.size(); itree++) {
      myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,kFALSE);
      norm  += fBoostWeights[itree];
   }
   // ...
}
// ...
const Event* evT2 = GetTransformationHandler().InverseTransform( evT );
fRegressionReturnVal->push_back( evT2->GetTarget(0) );
// ...
return *fRegressionReturnVal;
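
// Illustrative sketch: for BoostType=AdaBoostR2 the regression response is the weighted median
// of the individual tree responses -- sort the (response, weight) pairs by response value and
// take the first response at which the cumulative weight exceeds half of the total. The pairs
// below are hypothetical.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
   std::vector<std::pair<double, double>> respWeight = {   // (tree response, boost weight)
      {0.8, 1.0}, {0.2, 0.5}, {0.5, 2.0}, {0.9, 0.3}};
   std::sort(respWeight.begin(), respWeight.end());        // sort by response value
   double total = 0.;
   for (const auto& rw : respWeight) total += rw.second;
   double cumulative = 0.;
   std::size_t t = 0;
   while (t < respWeight.size() && cumulative <= total / 2.) cumulative += respWeight[t++].second;
   std::cout << "weighted median response = " << respWeight[t > 0 ? t - 1 : 0].first << std::endl;
   return 0;
}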
Log() << kDEBUG << "\tWrite monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
// ...
fMonitorNtuple->Write();
// TMVA::MethodBDT::GetVariableImportance -- boost-weighted importance, normalised to sum to one
fVariableImportance.resize(GetNvar());
for (UInt_t ivar = 0; ivar < GetNvar(); ivar++) {
   fVariableImportance[ivar]=0;
}
// ...
for (UInt_t itree = 0; itree < GetNTrees(); itree++) {
   std::vector<Double_t> relativeImportance(fForest[itree]->GetVariableImportance());
   for (UInt_t i=0; i< relativeImportance.size(); i++) {
      fVariableImportance[i] += fBoostWeights[itree] * relativeImportance[i];
   }
}
// ...
for (UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++){
   fVariableImportance[ivar] = TMath::Sqrt(fVariableImportance[ivar]);
   sum += fVariableImportance[ivar];
}
for (UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++) fVariableImportance[ivar] /= sum;

return fVariableImportance;
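
// Illustrative sketch: the forest-level variable importance above is the boost-weighted sum of
// the per-tree importances; the square root is taken and the result is normalised so that the
// importances add up to one. The numbers below are hypothetical.
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
   std::vector<double> importance = {4.0, 1.0, 0.25};   // hypothetical weighted sums per variable
   for (double& v : importance) v = std::sqrt(v);        // -> 2, 1, 0.5
   double sum = std::accumulate(importance.begin(), importance.end(), 0.0);
   for (double& v : importance) v /= sum;                // now the importances sum to 1
   for (double v : importance) std::cout << v << " ";
   std::cout << std::endl;
   return 0;
}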
std::vector<Double_t> relativeImportance = this->GetVariableImportance();
if (ivar < (UInt_t)relativeImportance.size()) return relativeImportance[ivar];
else Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
vector< Double_t> importance(this->GetVariableImportance());

for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
   fRanking->AddRank( Rank( GetInputLabel(ivar), importance[ivar] ) );
}
// TMVA::MethodBDT::GetHelpMessage -- print the help text for this method
Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
Log() << "trained using the original training data set with re-weighted " << Endl;
Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
Log() << "events that were misclassified in the previous tree a larger " << Endl;
Log() << "weight in the training of the following tree." << Endl;

Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
Log() << "using a single discriminant variable at a time. A test event " << Endl;
Log() << "ending up after the sequence of left-right splits in a final " << Endl;
Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
Log() << "depending on the majority type of training events in that node." << Endl;

Log() << "By the nature of the binary splits performed on the individual" << Endl;
Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
Log() << "between variables (they need to approximate the linear split in" << Endl;
Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
Log() << "variables individually). Hence decorrelation could be useful " << Endl;
Log() << "to optimise the BDT performance." << Endl;

Log() << "The two most important parameters in the configuration are the " << Endl;
Log() << "minimal number of events required in a leaf node, given as a percentage of the " << Endl;
Log() << "number of training events (option \"MinNodeSize\", replacing the absolute number " << Endl;
Log() << "of events \"nEventsMin\" used in earlier versions)." << Endl;
Log() << "If this number is too large, detailed features " << Endl;
Log() << "in the parameter space cannot be modelled. If it is too small, " << Endl;
Log() << "the risk of overtraining rises and boosting becomes less effective;" << Endl;
Log() << "typical values from our current experience for best performance " << Endl;
Log() << "are between 0.5(%) and 10(%)." << Endl;

Log() << "The default minimal number is currently set to " << Endl;
Log() << "   max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
Log() << "and can be changed by the user." << Endl;

Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
Log() << "is also related to overtraining. It is a regularisation parameter " << Endl;
Log() << "that is used when determining after the training which splits " << Endl;
Log() << "are considered statistically insignificant and are removed. The" << Endl;
Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
Log() << "the comparison between efficiencies obtained on the training and" << Endl;
Log() << "the independent test sample. They should be equal within statistical" << Endl;
Log() << "errors, in order to minimize statistical fluctuations in different samples." << Endl;
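
// Illustrative sketch of the re-weighting idea described in the help text above: with AdaBoost,
// events misclassified by the current tree are scaled up before the next tree is grown. The
// exact boost-weight formula used by MethodBDT depends on the configured options; here a common
// AdaBoost form, (1-err)/err softened by an exponent playing the role of AdaBoostBeta, is
// assumed, and the error rate and event list are hypothetical.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
   double err  = 0.2;                                    // hypothetical misclassification rate
   double beta = 0.5;                                    // plays the role of the AdaBoostBeta option
   double boostWeight = std::pow((1. - err) / err, beta);
   std::vector<double> evtWeight     = {1., 1., 1., 1.};
   std::vector<bool>   misclassified = {false, true, false, true};
   for (std::size_t i = 0; i < evtWeight.size(); ++i)
      if (misclassified[i]) evtWeight[i] *= boostWeight; // misclassified events gain weight
   for (double w : evtWeight) std::cout << w << " ";
   std::cout << std::endl;
   return 0;
}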
// TMVA::MethodBDT::MakeClassSpecific -- write a ROOT-independent C++ class for the classifier response
fout << "   std::vector<"<<nodeName<<"*> fForest;       // i.e. root nodes of decision trees" << std::endl;
fout << "   std::vector<double>  fBoostWeights; // the weights applied in the individual boosts" << std::endl;
fout << "};" << std::endl << std::endl;
fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   double myMVA = 0;" << std::endl;
if (fDoPreselection){
   for (UInt_t ivar = 0; ivar< fIsLowBkgCut.size(); ivar++){
      if (fIsLowBkgCut[ivar]){
         fout << "   if (inputValues["<<ivar<<"] < " << fLowBkgCut[ivar] << ") return -1;  // is background preselection cut" << std::endl;
      }
      if (fIsLowSigCut[ivar]){
         fout << "   if (inputValues["<<ivar<<"] < " << fLowSigCut[ivar] << ") return 1;  // is signal preselection cut" << std::endl;
      }
      if (fIsHighBkgCut[ivar]){
         fout << "   if (inputValues["<<ivar<<"] > " << fHighBkgCut[ivar] << ") return -1;  // is background preselection cut" << std::endl;
      }
      if (fIsHighSigCut[ivar]){
         fout << "   if (inputValues["<<ivar<<"] > " << fHighSigCut[ivar] << ") return 1;  // is signal preselection cut" << std::endl;
      }
   }
}

if (fBoostType!="Grad"){
   fout << "   double norm = 0;" << std::endl;
}
fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
fout << "      "<<nodeName<<" *current = fForest[itree];" << std::endl;
fout << "      while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
fout << "         if (current->GoesRight(inputValues)) current=("<<nodeName<<"*)current->GetRight();" << std::endl;
fout << "         else current=("<<nodeName<<"*)current->GetLeft();" << std::endl;
fout << "      }" << std::endl;
if (fBoostType=="Grad"){
   fout << "      myMVA += current->GetResponse();" << std::endl;
}
else {
   if (fUseYesNoLeaf) fout << "      myMVA += fBoostWeights[itree] * current->GetNodeType();" << std::endl;
   else               fout << "      myMVA += fBoostWeights[itree] * current->GetPurity();" << std::endl;
   fout << "      norm += fBoostWeights[itree];" << std::endl;
}
fout << "   }" << std::endl;
if (fBoostType=="Grad"){
   fout << "   return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;
}
else fout << "   return myMVA /= norm;" << std::endl;
fout << "};" << std::endl << std::endl;
fout << "void " << className << "::Initialize()" << std::endl;
fout << "{" << std::endl;
// ...
for (UInt_t itree=0; itree<GetNTrees(); itree++) {
   fout << "  // itree = " << itree << std::endl;
   fout << "  fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << std::endl;
   fout << "  fForest.push_back( " << std::endl;
   this->MakeClassInstantiateNode((DecisionTreeNode*)fForest[itree]->GetRoot(), fout, className);
   fout << "   );" << std::endl;
}
fout << "   return;" << std::endl;
fout << "};" << std::endl;
fout << " " << std::endl;
fout << "// Clean up" << std::endl;
fout << "inline void " << className << "::Clear() " << std::endl;
fout << "{" << std::endl;
fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
fout << "      delete fForest[itree]; " << std::endl;
fout << "   }" << std::endl;
fout << "}" << std::endl;
// TMVA::MethodBDT::MakeClassSpecificHeader -- write the node class used by the standalone response class
fout << "#define NN new "<<nodeName << std::endl;
fout << "   " << std::endl;
fout << "#ifndef "<<nodeName<<"__def" << std::endl;
fout << "#define "<<nodeName<<"__def" << std::endl;
fout << "   " << std::endl;
fout << "class "<<nodeName<<" {" << std::endl;
fout << "   " << std::endl;
fout << "public:" << std::endl;
fout << "   " << std::endl;
fout << "   // constructor of an essentially \"empty\" node floating in space" << std::endl;
fout << "   "<<nodeName<<" ( "<<nodeName<<"* left,"<<nodeName<<"* right," << std::endl;
if (fUseFisherCuts){
   fout << "                          int nFisherCoeff," << std::endl;
   for (UInt_t i=0;i<GetNVariables()+1;i++){
      fout << "                          double fisherCoeff"<<i<<"," << std::endl;
   }
}
fout << "                          int selector, double cutValue, bool cutType, " << std::endl;
fout << "                          int nodeType, double purity, double response ) :" << std::endl;
fout << "   fLeft         ( left         )," << std::endl;
fout << "   fRight        ( right        )," << std::endl;
if (fUseFisherCuts) fout << "   fNFisherCoeff ( nFisherCoeff )," << std::endl;
fout << "   fSelector     ( selector     )," << std::endl;
fout << "   fCutValue     ( cutValue     )," << std::endl;
fout << "   fCutType      ( cutType      )," << std::endl;
fout << "   fNodeType     ( nodeType     )," << std::endl;
fout << "   fPurity       ( purity       )," << std::endl;
fout << "   fResponse     ( response ){" << std::endl;
if (fUseFisherCuts){
   for (UInt_t i=0;i<GetNVariables()+1;i++){
      fout << "   fFisherCoeff.push_back(fisherCoeff"<<i<<");" << std::endl;
   }
}
fout << "   }" << std::endl << std::endl;
fout << "   virtual ~"<<nodeName<<"();" << std::endl << std::endl;
fout << "   // test event if it descends the tree at this node to the right" << std::endl;
fout << "   virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
fout << "   "<<nodeName<<"* GetRight( void ) {return fRight; };" << std::endl << std::endl;
fout << "   // test event if it descends the tree at this node to the left " << std::endl;
fout << "   virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
fout << "   "<<nodeName<<"* GetLeft( void ) { return fLeft; }; " << std::endl << std::endl;
fout << "   // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
fout << "   double GetPurity( void ) const { return fPurity; } " << std::endl;
fout << "   // return the node type" << std::endl;
fout << "   int    GetNodeType( void ) const { return fNodeType; }" << std::endl;
fout << "   double GetResponse(void) const {return fResponse;}" << std::endl << std::endl;
fout << "private:" << std::endl << std::endl;
fout << "   "<<nodeName<<"*   fLeft;          // pointer to the left daughter node" << std::endl;
fout << "   "<<nodeName<<"*   fRight;         // pointer to the right daughter node" << std::endl;
if (fUseFisherCuts){
   fout << "   int                   fNFisherCoeff;  // =0 if this node doesn't use fisher, else =nvar+1 " << std::endl;
   fout << "   std::vector<double>   fFisherCoeff;   // the fisher coeff (offset at the last element)" << std::endl;
}
fout << "   int                   fSelector;      // index of variable used in node selection (decision tree) " << std::endl;
fout << "   double                fCutValue;      // cut value applied on this node to discriminate bkg against sig" << std::endl;
fout << "   bool                  fCutType;       // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
fout << "   int                   fNodeType;      // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << std::endl;
fout << "   double                fPurity;        // Purity of node from training"<< std::endl;
fout << "   double                fResponse;      // Regression response value of node" << std::endl;
fout << "}; " << std::endl;
fout << "   " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "   "<<nodeName<<"::~"<<nodeName<<"()" << std::endl;
fout << "{" << std::endl;
fout << "   if (fLeft  != NULL) delete fLeft;" << std::endl;
fout << "   if (fRight != NULL) delete fRight;" << std::endl;
fout << "}; " << std::endl;
fout << "   " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "bool "<<nodeName<<"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   // test event if it descends the tree at this node to the right" << std::endl;
fout << "   bool result;" << std::endl;
if (fUseFisherCuts){
   fout << "   if (fNFisherCoeff == 0){" << std::endl;
   fout << "     result = (inputValues[fSelector] > fCutValue );" << std::endl;
   fout << "   }else{" << std::endl;
   fout << "     double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
   fout << "     for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
   fout << "       fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
   fout << "     result = fisher > fCutValue;" << std::endl;
   fout << "   }" << std::endl;
}
else {
   fout << "   result = (inputValues[fSelector] > fCutValue );" << std::endl;
}
fout << "   if (fCutType == true) return result; //the cuts are selecting Signal ;" << std::endl;
fout << "   else return !result;" << std::endl;
fout << "}" << std::endl;
fout << "   " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "bool "<<nodeName<<"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   // test event if it descends the tree at this node to the left" << std::endl;
fout << "   if (!this->GoesRight(inputValues)) return true;" << std::endl;
fout << "   else return false;" << std::endl;
fout << "}" << std::endl;
fout << "   " << std::endl;
fout << "#endif" << std::endl;
fout << "   " << std::endl;
// TMVA::MethodBDT::MakeClassInstantiateNode -- recursively write the node instances to the output stream
Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" << Endl;
// ...
fout << "NN("<<std::endl;
if (n->GetLeft() != NULL){
   this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetLeft() , fout, className);
}
// ...
fout << ", " <<std::endl;
if (n->GetRight() != NULL){
   this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetRight(), fout, className );
}
// ...
fout << ", " << std::endl
     << std::setprecision(6);
if (fUseFisherCuts){
   fout << n->GetNFisherCoeff() << ", ";
   for (UInt_t i=0; i< GetNVariables()+1; i++) {
      if (n->GetNFisherCoeff() == 0 ){
         // ...
      }
      else {
         fout << n->GetFisherCoeff(i) << ", ";
      }
   }
}
fout << n->GetSelector() << ", "
     << n->GetCutValue() << ", "
     << n->GetCutType() << ", "
     << n->GetNodeType() << ", "
     << n->GetPurity() << ","
     << n->GetResponse() << ") ";
// TMVA::MethodBDT::DeterminePreselectionCuts -- find useful preselection cuts from the training sample
Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;

std::vector<TMVA::BDTEventWrapper> bdtEventSample;

fIsLowSigCut.assign(GetNvar(),kFALSE);
fIsLowBkgCut.assign(GetNvar(),kFALSE);
fIsHighSigCut.assign(GetNvar(),kFALSE);
fIsHighBkgCut.assign(GetNvar(),kFALSE);

fLowSigCut.assign(GetNvar(),0.);
fLowBkgCut.assign(GetNvar(),0.);
fHighSigCut.assign(GetNvar(),0.);
fHighBkgCut.assign(GetNvar(),0.);

// count the total signal and background weight in the training sample
for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
   if (DataInfo().IsSignal(*it)){
      nTotS += (*it)->GetWeight();
      // ...
   }
   else {
      nTotB += (*it)->GetWeight();
      // ...
   }
}

for( UInt_t ivar = 0; ivar < GetNvar(); ivar++ ) { // loop over all discriminating variables
   // ...
   std::sort( bdtEventSample.begin(),bdtEventSample.end() );

   Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
   std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
   for( ; it != it_end; ++it ) {
      if (DataInfo().IsSignal(**it))
         sigWeightCtr += (**it)->GetWeight();
      else
         bkgWeightCtr += (**it)->GetWeight();
      // store the cumulative signal and background weights up to this event
      it->SetCumulativeWeight(false,bkgWeightCtr);
      it->SetCumulativeWeight(true,sigWeightCtr);
   }

   Double_t dVal = (DataInfo().GetVariableInfo(ivar).GetMax() - DataInfo().GetVariableInfo(ivar).GetMin())/100. ;
   Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
   Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;
   // ...
   for( UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
      // ...
      nSelS = bdtEventSample[iev].GetCumulativeWeight(true);
      nSelB = bdtEventSample[iev].GetCumulativeWeight(false);
      // ...
      tmpEffS=nSelS/nTotS;
      tmpEffB=nSelB/nTotB;
      // ...
      if      (nSelS==0     && tmpEffB>effB) {effB=tmpEffB; fLowBkgCut[ivar]  = bdtEventSample[iev].GetVal() - dVal; fIsLowBkgCut[ivar]=kTRUE;}
      else if (nSelB==0     && tmpEffS>effS) {effS=tmpEffS; fLowSigCut[ivar]  = bdtEventSample[iev].GetVal() - dVal; fIsLowSigCut[ivar]=kTRUE;}
      else if (nSelB==nTotB && tmpRejS>rejS) {rejS=tmpRejS; fHighSigCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighSigCut[ivar]=kTRUE;}
      else if (nSelS==nTotS && tmpRejB>rejB) {rejB=tmpRejB; fHighBkgCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighBkgCut[ivar]=kTRUE;}
   }
}

Log() << kDEBUG << " \tfound and suggest the following possible pre-selection cuts " << Endl;
if (fDoPreselection) Log() << kDEBUG << "\tthe training will be done after these cuts, and the GetMVA value returns +1 (-1) for a signal (bkg) event that passes these cuts" << Endl;
else Log() << kDEBUG << "\tas the option DoPreselection was not used, these cuts will not be applied; the training will see the full sample" << Endl;
for (UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
   if (fIsLowBkgCut[ivar]){
      Log() << kDEBUG << " \tfound cut: Bkg if var " << ivar << " < " << fLowBkgCut[ivar] << Endl;
   }
   if (fIsLowSigCut[ivar]){
      Log() << kDEBUG << " \tfound cut: Sig if var " << ivar << " < " << fLowSigCut[ivar] << Endl;
   }
   if (fIsHighBkgCut[ivar]){
      Log() << kDEBUG << " \tfound cut: Bkg if var " << ivar << " > " << fHighBkgCut[ivar] << Endl;
   }
   if (fIsHighSigCut[ivar]){
      Log() << kDEBUG << " \tfound cut: Sig if var " << ivar << " > " << fHighSigCut[ivar] << Endl;
   }
}
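
// Illustrative sketch of the scan above: events are sorted in one variable and the cumulative
// signal/background weights are tracked; if an edge region of the distribution contains only
// background (or only signal) with a non-negligible fraction of the total weight, a preselection
// cut just outside that region is recorded. The sample below is hypothetical.
#include <algorithm>
#include <iostream>
#include <vector>

struct Evt { double val; double weight; bool isSignal; };

int main() {
   std::vector<Evt> sample = {{0.1, 1., false}, {0.3, 1., false}, {0.5, 1., true},
                              {0.7, 1., true},  {0.9, 1., false}};
   std::sort(sample.begin(), sample.end(), [](const Evt& a, const Evt& b) { return a.val < b.val; });
   double totS = 0., totB = 0.;
   for (const auto& e : sample) (e.isSignal ? totS : totB) += e.weight;
   double selS = 0., selB = 0., lowBkgCut = 0.;
   bool haveCut = false;
   for (const auto& e : sample) {
      (e.isSignal ? selS : selB) += e.weight;
      if (selS == 0. && selB / totB > 0.05) { lowBkgCut = e.val; haveCut = true; } // pure-bkg edge
   }
   if (haveCut) std::cout << "suggest: background if var < " << lowBkgCut << std::endl;
   return 0;
}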
// TMVA::MethodBDT::ApplyPreselectionCuts -- apply the preselection cuts before evaluating any decision tree
for (UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
   if (fIsLowBkgCut[ivar]){
      if (ev->GetValue(ivar) < fLowBkgCut[ivar]) result = -1;  // is background
   }
   if (fIsLowSigCut[ivar]){
      if (ev->GetValue(ivar) < fLowSigCut[ivar]) result = 1;   // is signal
   }
   if (fIsHighBkgCut[ivar]){
      if (ev->GetValue(ivar) > fHighBkgCut[ivar]) result = -1; // is background
   }
   if (fIsHighSigCut[ivar]){
      if (ev->GetValue(ivar) > fHighSigCut[ivar]) result = 1;  // is signal
   }
}