// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Or Cohen, Jan Therhaag, Eckhard von Toerne

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : MethodBoost                                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Class for boosting a single TMVA classifier                               *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker   <Andreas.Hocker@cern.ch>   - CERN, Switzerland          *
 *      Peter Speckmayer  <Peter.Speckmazer@cern.ch> - CERN, Switzerland          *
 *      Joerg Stelzer     <Joerg.Stelzer@cern.ch>    - CERN, Switzerland          *
 *      Helge Voss        <Helge.Voss@cern.ch>       - MPI-K Heidelberg, Germany  *
 *      Jan Therhaag      <Jan.Therhaag@cern.ch>     - U of Bonn, Germany         *
 *      Eckhard v. Toerne <evt@uni-bonn.de>          - U of Bonn, Germany         *
 *                                                                                *
 * Copyright (c) 2005-2011:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (see tmva/doc/LICENSE)                                                         *
 **********************************************************************************/

/*! \class TMVA::MethodBoost
\ingroup TMVA

Class for boosting a TMVA method.

This class boosts a single classifier: the classifier is trained
several times, and after each pass the event weights are modified
according to how well the classifier performed on the training
sample.

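A minimal booking sketch (illustrative only; the `factory` and `dataloader`
objects are assumed to come from the usual TMVA::Factory workflow). Options
starting with "Boost_" are handled by MethodBoost itself; everything else is
passed on to the boosted classifier:

~~~ {.cpp}
// book a Fisher discriminant and boost it 100 times with AdaBoost
factory->BookMethod( dataloader, TMVA::Types::kFisher, "BoostedFisher",
                     "H:!V:Boost_Num=100:Boost_Type=AdaBoost:Boost_AdaBoostBeta=1.0" );
~~~
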
*/

#include "TMVA/MethodBoost.h"

#include "TMVA/ClassifierFactory.h"
#include "TMVA/Config.h"
#include "TMVA/Configurable.h"
#include "TMVA/DataSet.h"
#include "TMVA/DataSetInfo.h"
#include "TMVA/IMethod.h"
#include "TMVA/MethodBase.h"
#include "TMVA/MethodCategory.h"
#include "TMVA/MethodCompositeBase.h"
#include "TMVA/MethodDT.h"
#include "TMVA/MethodFisher.h"
#include "TMVA/PDF.h"
#include "TMVA/Results.h"
#include "TMVA/Timer.h"
#include "TMVA/Tools.h"
#include "TMVA/Types.h"

#include "TMVA/SeparationBase.h"
#include "TMVA/QuickMVAProbEstimator.h"
#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/MisClassificationError.h"

#include "TRandom3.h"
#include "TMath.h"
#include "TH1F.h"
#include "TH2F.h"
#include "TSpline.h"
#include "TDirectory.h"
#include "TTree.h"

#include <iostream>
#include <algorithm>
#include <vector>
#include <cmath>


REGISTER_METHOD(Boost)


////////////////////////////////////////////////////////////////////////////////

TMVA::MethodBoost::MethodBoost( const TString& jobName,
                                const TString& methodTitle,
                                DataSetInfo& theData,
                                const TString& theOption ) :
   TMVA::MethodCompositeBase( jobName, Types::kBoost, methodTitle, theData, theOption)
   , fBoostNum(0)
   , fDetailedMonitoring(kFALSE)
   , fAdaBoostBeta(0)
   , fRandomSeed(0)
   , fBaggedSampleFraction(0)
   , fBoostedMethodTitle(methodTitle)
   , fBoostedMethodOptions(theOption)
   , fMonitorBoostedMethod(kFALSE)
   , fMonitorTree(0)
   , fBoostWeight(0)
   , fMethodError(0)
   , fROC_training(0.0)
   , fOverlap_integral(0.0)
   , fMVAvalues(0)
{
   fMVAvalues = new std::vector<Float_t>;
   fDataSetManager = NULL;
   fHistoricBoolOption = kFALSE;
}

////////////////////////////////////////////////////////////////////////////////

TMVA::MethodBoost::MethodBoost( DataSetInfo& dsi,
                                const TString& theWeightFile )
   : TMVA::MethodCompositeBase( Types::kBoost, dsi, theWeightFile)
   , fBoostNum(0)
   , fDetailedMonitoring(kFALSE)
   , fAdaBoostBeta(0)
   , fRandomSeed(0)
   , fBaggedSampleFraction(0)
   , fBoostedMethodTitle("")
   , fBoostedMethodOptions("")
   , fMonitorBoostedMethod(kFALSE)
   , fMonitorTree(0)
   , fBoostWeight(0)
   , fMethodError(0)
   , fROC_training(0.0)
   , fOverlap_integral(0.0)
   , fMVAvalues(0)
{
   fMVAvalues = new std::vector<Float_t>;
   fDataSetManager = NULL;
   fHistoricBoolOption = kFALSE;
}

////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::MethodBoost::~MethodBoost( void )
{
   fMethodWeight.clear();

   // the histograms themselves are deleted when the file is closed

   fTrainSigMVAHist.clear();
   fTrainBgdMVAHist.clear();
   fBTrainSigMVAHist.clear();
   fBTrainBgdMVAHist.clear();
   fTestSigMVAHist.clear();
   fTestBgdMVAHist.clear();

   if (fMVAvalues) {
      delete fMVAvalues;
      fMVAvalues = 0;
   }
}


////////////////////////////////////////////////////////////////////////////////
/// Boost can handle classification with 2 classes and regression with one regression-target

Bool_t TMVA::MethodBoost::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ )
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   // if (type == Types::kRegression && numberTargets == 1) return kTRUE;
   return kFALSE;
}


////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::DeclareOptions()
{
   DeclareOptionRef( fBoostNum = 1, "Boost_Num",
                     "Number of times the classifier is boosted" );

   DeclareOptionRef( fMonitorBoostedMethod = kTRUE, "Boost_MonitorMethod",
                     "Write monitoring histograms for each boosted classifier" );

   DeclareOptionRef( fDetailedMonitoring = kFALSE, "Boost_DetailedMonitoring",
                     "Produce histograms for detailed boost monitoring" );

   DeclareOptionRef( fBoostType = "AdaBoost", "Boost_Type", "Boosting type for the classifiers" );
   AddPreDefVal(TString("RealAdaBoost"));
   AddPreDefVal(TString("AdaBoost"));
   AddPreDefVal(TString("Bagging"));

   DeclareOptionRef( fBaggedSampleFraction = .6, "Boost_BaggedSampleFraction",
                     "Relative size of bagged event sample to original size of the data sample (used whenever bagging is used)" );

   DeclareOptionRef( fAdaBoostBeta = 1.0, "Boost_AdaBoostBeta",
                     "The AdaBoost parameter that sets the effect of every boost step on the events' weights" );

   DeclareOptionRef( fTransformString = "step", "Boost_Transform",
                     "Type of transform applied to every boosted method: linear, log, step" );
   AddPreDefVal(TString("step"));
   AddPreDefVal(TString("linear"));
   AddPreDefVal(TString("log"));
   AddPreDefVal(TString("gauss"));

   DeclareOptionRef( fRandomSeed = 0, "Boost_RandomSeed",
                     "Seed for random number generator used for bagging" );

   TMVA::MethodCompositeBase::fMethods.reserve(fBoostNum);
}

////////////////////////////////////////////////////////////////////////////////
/// options that are used ONLY for the READER to ensure backward compatibility;
/// they are hence without any effect (the reader is only reading the training
/// options that HAD been used at the training of the .xml weight file at hand)

void TMVA::MethodBoost::DeclareCompatibilityOptions()
{

   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef( fHistoricOption = "ByError", "Boost_MethodWeightType",
                     "How to set the final weight of the boosted classifiers" );
   AddPreDefVal(TString("ByError"));
   AddPreDefVal(TString("Average"));
   AddPreDefVal(TString("ByROC"));
   AddPreDefVal(TString("ByOverlap"));
   AddPreDefVal(TString("LastMethod"));

   DeclareOptionRef( fHistoricOption = "step", "Boost_Transform",
                     "Type of transform applied to every boosted method: linear, log, step" );
   AddPreDefVal(TString("step"));
   AddPreDefVal(TString("linear"));
   AddPreDefVal(TString("log"));
   AddPreDefVal(TString("gauss"));

   // the option
   //    DeclareOptionRef( fBoostType = "AdaBoost", "Boost_Type", "Boosting type for the classifiers" );
   // still exists, but these two formerly allowed values
   AddPreDefVal(TString("HighEdgeGauss"));
   AddPreDefVal(TString("HighEdgeCoPara"));
   // have since been removed; they are accepted here only for backward compatibility

   DeclareOptionRef( fHistoricBoolOption, "Boost_RecalculateMVACut",
                     "Recalculate the classifier MVA Signallike cut at every boost iteration" );

}

////////////////////////////////////////////////////////////////////////////////
/// just registering the string from which the boosted classifier will be created

Bool_t TMVA::MethodBoost::BookMethod( Types::EMVA theMethod, TString methodTitle, TString theOption )
{
   fBoostedMethodName    = Types::Instance().GetMethodName( theMethod );
   fBoostedMethodTitle   = methodTitle;
   fBoostedMethodOptions = theOption;
   TString opts = theOption;
   opts.ToLower();
   // if (opts.Contains("vartransform")) Log() << kFATAL << "It is not possible to use boost in conjunction with variable transform. Please remove either Boost_Num or VarTransform from the option string"<< methodTitle<<Endl;

   return kTRUE;
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::Init()
{
}

////////////////////////////////////////////////////////////////////////////////
/// initialisation routine

void TMVA::MethodBoost::InitHistos()
{

   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());

   results->Store(new TH1F("MethodWeight","Normalized Classifier Weight",fBoostNum,0,fBoostNum),"ClassifierWeight");
   results->Store(new TH1F("BoostWeight","Boost Weight",fBoostNum,0,fBoostNum),"BoostWeight");
   results->Store(new TH1F("ErrFraction","Error Fraction (by boosted event weights)",fBoostNum,0,fBoostNum),"ErrorFraction");
   if (fDetailedMonitoring){
      results->Store(new TH1F("ROCIntegral_test","ROC integral of single classifier (testing sample)",fBoostNum,0,fBoostNum),"ROCIntegral_test");
      results->Store(new TH1F("ROCIntegralBoosted_test","ROC integral of boosted method (testing sample)",fBoostNum,0,fBoostNum),"ROCIntegralBoosted_test");
      results->Store(new TH1F("ROCIntegral_train","ROC integral of single classifier (training sample)",fBoostNum,0,fBoostNum),"ROCIntegral_train");
      results->Store(new TH1F("ROCIntegralBoosted_train","ROC integral of boosted method (training sample)",fBoostNum,0,fBoostNum),"ROCIntegralBoosted_train");
      results->Store(new TH1F("OverlapIntegral_train","Overlap integral (training sample)",fBoostNum,0,fBoostNum),"Overlap");
   }


   results->GetHist("ClassifierWeight")->GetXaxis()->SetTitle("Index of boosted classifier");
   results->GetHist("ClassifierWeight")->GetYaxis()->SetTitle("Classifier Weight");
   results->GetHist("BoostWeight")->GetXaxis()->SetTitle("Index of boosted classifier");
   results->GetHist("BoostWeight")->GetYaxis()->SetTitle("Boost Weight");
   results->GetHist("ErrorFraction")->GetXaxis()->SetTitle("Index of boosted classifier");
   results->GetHist("ErrorFraction")->GetYaxis()->SetTitle("Error Fraction");
   if (fDetailedMonitoring){
      results->GetHist("ROCIntegral_test")->GetXaxis()->SetTitle("Index of boosted classifier");
      results->GetHist("ROCIntegral_test")->GetYaxis()->SetTitle("ROC integral of single classifier");
      results->GetHist("ROCIntegralBoosted_test")->GetXaxis()->SetTitle("Number of boosts");
      results->GetHist("ROCIntegralBoosted_test")->GetYaxis()->SetTitle("ROC integral boosted");
      results->GetHist("ROCIntegral_train")->GetXaxis()->SetTitle("Index of boosted classifier");
      results->GetHist("ROCIntegral_train")->GetYaxis()->SetTitle("ROC integral of single classifier");
      results->GetHist("ROCIntegralBoosted_train")->GetXaxis()->SetTitle("Number of boosts");
      results->GetHist("ROCIntegralBoosted_train")->GetYaxis()->SetTitle("ROC integral boosted");
      results->GetHist("Overlap")->GetXaxis()->SetTitle("Index of boosted classifier");
      results->GetHist("Overlap")->GetYaxis()->SetTitle("Overlap integral");
   }

   results->Store(new TH1F("SoverBtotal","S/B in reweighted training sample",fBoostNum,0,fBoostNum),"SoverBtotal");
   results->GetHist("SoverBtotal")->GetYaxis()->SetTitle("S/B (boosted sample)");
   results->GetHist("SoverBtotal")->GetXaxis()->SetTitle("Index of boosted classifier");

   results->Store(new TH1F("SeparationGain","SeparationGain",fBoostNum,0,fBoostNum),"SeparationGain");
   results->GetHist("SeparationGain")->GetYaxis()->SetTitle("SeparationGain");
   results->GetHist("SeparationGain")->GetXaxis()->SetTitle("Index of boosted classifier");



   fMonitorTree = new TTree("MonitorBoost","Boost variables");
   fMonitorTree->Branch("iMethod",&fCurrentMethodIdx,"iMethod/I");
   fMonitorTree->Branch("boostWeight",&fBoostWeight,"boostWeight/D");
   fMonitorTree->Branch("errorFraction",&fMethodError,"errorFraction/D");
   fMonitorBoostedMethod = kTRUE;

}


////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::CheckSetup()
{
   Log() << kDEBUG << "CheckSetup: fBoostType="<<fBoostType << Endl;
   Log() << kDEBUG << "CheckSetup: fAdaBoostBeta="<<fAdaBoostBeta<<Endl;
   Log() << kDEBUG << "CheckSetup: fBoostWeight="<<fBoostWeight<<Endl;
   Log() << kDEBUG << "CheckSetup: fMethodError="<<fMethodError<<Endl;
   Log() << kDEBUG << "CheckSetup: fBoostNum="<<fBoostNum << Endl;
   Log() << kDEBUG << "CheckSetup: fRandomSeed=" << fRandomSeed<< Endl;
   Log() << kDEBUG << "CheckSetup: fTrainSigMVAHist.size()="<<fTrainSigMVAHist.size()<<Endl;
   Log() << kDEBUG << "CheckSetup: fTestSigMVAHist.size()="<<fTestSigMVAHist.size()<<Endl;
   Log() << kDEBUG << "CheckSetup: fMonitorBoostedMethod=" << (fMonitorBoostedMethod? "true" : "false") << Endl;
   Log() << kDEBUG << "CheckSetup: MName=" << fBoostedMethodName << " Title="<< fBoostedMethodTitle<< Endl;
   Log() << kDEBUG << "CheckSetup: MOptions="<< fBoostedMethodOptions << Endl;
   Log() << kDEBUG << "CheckSetup: fMonitorTree=" << fMonitorTree <<Endl;
   Log() << kDEBUG << "CheckSetup: fCurrentMethodIdx=" <<fCurrentMethodIdx << Endl;
   if (fMethods.size()>0) Log() << kDEBUG << "CheckSetup: fMethods[0]" <<fMethods[0]<<Endl;
   Log() << kDEBUG << "CheckSetup: fMethodWeight.size()" << fMethodWeight.size() << Endl;
   if (fMethodWeight.size()>0) Log() << kDEBUG << "CheckSetup: fMethodWeight[0]="<<fMethodWeight[0]<<Endl;
   Log() << kDEBUG << "CheckSetup: trying to repair things" << Endl;

}
////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::Train()
{
   TDirectory* methodDir = nullptr;
   TString     dirName, dirTitle;
   Int_t       StopCounter = 0;
   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());


   InitHistos();

   if (Data()->GetNTrainingEvents()==0) Log() << kFATAL << "<Train> Data() has zero events" << Endl;
   Data()->SetCurrentType(Types::kTraining);

   if (fMethods.size() > 0) fMethods.clear();
   fMVAvalues->resize(Data()->GetNTrainingEvents(), 0.0);

   Log() << kINFO << "Training "<< fBoostNum << " " << fBoostedMethodName << " with title " << fBoostedMethodTitle << " Classifiers ... patience please" << Endl;
   Timer timer( fBoostNum, GetName() );

   ResetBoostWeights();

   // clean boosted method options
   CleanBoostOptions();


   // remove transformations for individual boosting steps;
   // the transformation of the main method will be rerouted to each of the boost steps
   Ssiz_t varTrafoStart = fBoostedMethodOptions.Index("~VarTransform=");
   if (varTrafoStart > 0) {
      Ssiz_t varTrafoEnd = fBoostedMethodOptions.Index(":",varTrafoStart);
      if (varTrafoEnd < varTrafoStart)
         varTrafoEnd = fBoostedMethodOptions.Length();
      fBoostedMethodOptions.Remove(varTrafoStart,varTrafoEnd-varTrafoStart);
   }

   //
   // training and boosting the classifiers
   for (fCurrentMethodIdx=0;fCurrentMethodIdx<fBoostNum;fCurrentMethodIdx++) {
      // the first classifier shows the option string output, the rest do not
      if (fCurrentMethodIdx>0) TMVA::MsgLogger::InhibitOutput();

      IMethod* method = ClassifierFactory::Instance().Create(
         fBoostedMethodName.Data(), GetJobName(), TString::Format("%s_B%04i", fBoostedMethodTitle.Data(), fCurrentMethodIdx),
         DataInfo(), fBoostedMethodOptions);
      TMVA::MsgLogger::EnableOutput();

      // suppressing the rest of the classifier output the right way
      fCurrentMethod = (dynamic_cast<MethodBase*>(method));

      if (fCurrentMethod==0) {
         Log() << kFATAL << "uups.. guess the booking of the " << fCurrentMethodIdx << "-th classifier somehow failed" << Endl;
         return; // hope that makes coverity happy (as it fears I might use the pointer later on, not knowing that kFATAL exits)
      }

      // set fDataSetManager if MethodCategory (to enable Category to create datasetinfo objects) // DSMTEST
      if (fCurrentMethod->GetMethodType() == Types::kCategory) { // DSMTEST
         MethodCategory *methCat = (dynamic_cast<MethodCategory*>(fCurrentMethod)); // DSMTEST
         if (!methCat) // DSMTEST
            Log() << kFATAL << "Method with type kCategory cannot be casted to MethodCategory. /MethodBoost" << Endl; // DSMTEST
         methCat->fDataSetManager = fDataSetManager; // DSMTEST
      } // DSMTEST

      fCurrentMethod->SetMsgType(kWARNING);
      fCurrentMethod->SetupMethod();
      fCurrentMethod->ParseOptions();
      // put SetAnalysisType here for the needs of MLP
      fCurrentMethod->SetAnalysisType( GetAnalysisType() );
      fCurrentMethod->ProcessSetup();
      fCurrentMethod->CheckSetup();


      // reroute transformationhandler
      fCurrentMethod->RerouteTransformationHandler (&(this->GetTransformationHandler()));


      // creating the directory of the classifier
      if(!IsSilentFile())
      {
         if (fMonitorBoostedMethod) {
            methodDir = GetFile()->GetDirectory(dirName=TString::Format("%s_B%04i",fBoostedMethodName.Data(),fCurrentMethodIdx));
            if (!methodDir) {
               methodDir = BaseDir()->mkdir(dirName,dirTitle=TString::Format("Directory Boosted %s #%04i", fBoostedMethodName.Data(),fCurrentMethodIdx));
            }
            fCurrentMethod->SetMethodDir(methodDir);
            fCurrentMethod->BaseDir()->cd();
         }
      }

      // training
      TMVA::MethodCompositeBase::fMethods.push_back( method );
      timer.DrawProgressBar( fCurrentMethodIdx );
      if (fCurrentMethodIdx==0) MonitorBoost(Types::kBoostProcBegin,fCurrentMethodIdx);
      MonitorBoost(Types::kBeforeTraining,fCurrentMethodIdx);
      TMVA::MsgLogger::InhibitOutput(); //suppressing Logger outside the method
      if (fBoostType=="Bagging") Bagging(); // the first classifier should also be trained on a bagged sample
      SingleTrain();
      TMVA::MsgLogger::EnableOutput();
      if(!IsSilentFile()) fCurrentMethod->WriteMonitoringHistosToFile();

      // calculate MVA values of current method for all events in training sample
      // (used later on to get 'misclassified events' etc. for the boosting)
      CalcMVAValues();

      if(!IsSilentFile()) if (fCurrentMethodIdx==0 && fMonitorBoostedMethod) CreateMVAHistorgrams();

      // get ROC integral and overlap integral for single method on
      // training sample if fMethodWeightType == "ByROC" or the user
      // wants detailed monitoring

      // boosting (reweight training sample)
      MonitorBoost(Types::kBeforeBoosting,fCurrentMethodIdx);
      SingleBoost(fCurrentMethod);

      MonitorBoost(Types::kAfterBoosting,fCurrentMethodIdx);
      results->GetHist("BoostWeight")->SetBinContent(fCurrentMethodIdx+1,fBoostWeight);
      results->GetHist("ErrorFraction")->SetBinContent(fCurrentMethodIdx+1,fMethodError);

      if (fDetailedMonitoring) {
         fROC_training = GetBoostROCIntegral(kTRUE, Types::kTraining, kTRUE);
         results->GetHist("ROCIntegral_test")->SetBinContent(fCurrentMethodIdx+1, GetBoostROCIntegral(kTRUE, Types::kTesting));
         results->GetHist("ROCIntegralBoosted_test")->SetBinContent(fCurrentMethodIdx+1, GetBoostROCIntegral(kFALSE, Types::kTesting));
         results->GetHist("ROCIntegral_train")->SetBinContent(fCurrentMethodIdx+1, fROC_training);
         results->GetHist("ROCIntegralBoosted_train")->SetBinContent(fCurrentMethodIdx+1, GetBoostROCIntegral(kFALSE, Types::kTraining));
         results->GetHist("Overlap")->SetBinContent(fCurrentMethodIdx+1, fOverlap_integral);
      }



      fMonitorTree->Fill();

      // stop boosting if needed when error has reached 0.5
      // thought of counting a few steps, but it doesn't seem to be necessary
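      // (Why 0.5: for two classes an error fraction of 0.5 is random guessing,
      // so the AdaBoost weight ln((1-err)/err) of such a classifier vanishes
      // and further boosting cannot improve the combination; the check below
      // therefore aborts the boosting loop.)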
      Log() << kDEBUG << "AdaBoost (methodErr) err = " << fMethodError << Endl;
      if (fMethodError > 0.49999) StopCounter++;
      if (StopCounter > 0 && fBoostType != "Bagging") {
         timer.DrawProgressBar( fBoostNum );
         fBoostNum = fCurrentMethodIdx+1;
         Log() << kINFO << "Error rate has reached 0.5 ("<< fMethodError<<"), boosting process stopped at #" << fBoostNum << " classifier" << Endl;
         if (fBoostNum < 5)
            Log() << kINFO << "The classifier might be too strong to boost with Beta = " << fAdaBoostBeta << ", try reducing it." <<Endl;
         break;
      }
   }

   // as MethodBoost does not act on a private event sample (like MethodBDT does), we need to remember not
   // to leave "boosted" events to the next classifier in the factory

   ResetBoostWeights();

   Timer* timer1 = new Timer( fBoostNum, GetName() );
   // normalizing the weights of the classifiers
   for (fCurrentMethodIdx=0;fCurrentMethodIdx<fBoostNum;fCurrentMethodIdx++) {
      // performing post-boosting actions

      timer1->DrawProgressBar( fCurrentMethodIdx );

      if (fCurrentMethodIdx==fBoostNum) {
         Log() << kINFO << "Elapsed time: " << timer1->GetElapsedTime()
               << " " << Endl;
      }

      TH1F* tmp = dynamic_cast<TH1F*>( results->GetHist("ClassifierWeight") );
      if (tmp) tmp->SetBinContent(fCurrentMethodIdx+1,fMethodWeight[fCurrentMethodIdx]);

   }

   // Ensure that in case of only 1 boost the method weight equals
   // 1.0. This avoids unexpected behaviour in case of very bad
   // classifiers which have fBoostWeight=1 or fMethodError=0.5,
   // because their weight would be set to zero. This behaviour is
   // not ok if one boosts just one time.
   if (fMethods.size()==1) fMethodWeight[0] = 1.0;

   MonitorBoost(Types::kBoostProcEnd);

   delete timer1;
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::CleanBoostOptions()
{
   fBoostedMethodOptions = GetOptions();
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::CreateMVAHistorgrams()
{
   if (fBoostNum <= 0) Log() << kFATAL << "CreateHistograms called before fBoostNum is initialized" << Endl;
   // calculating histogram boundaries and creating the histograms..
   // nrms = number of rms around the average to use for the outline (of the 0th classifier)
   Double_t meanS, meanB, rmsS, rmsB, xmin, xmax, nrms = 10;
   Int_t signalClass = 0;
   if (DataInfo().GetClassInfo("Signal") != 0) {
      signalClass = DataInfo().GetClassInfo("Signal")->GetNumber();
   }
   gTools().ComputeStat( GetEventCollection( Types::kMaxTreeType ), fMVAvalues,
                         meanS, meanB, rmsS, rmsB, xmin, xmax, signalClass );

   fNbins = gConfig().fVariablePlotting.fNbinsXOfROCCurve;
   xmin = TMath::Max( TMath::Min(meanS - nrms*rmsS, meanB - nrms*rmsB ), xmin ) - 0.00001;
   xmax = TMath::Min( TMath::Max(meanS + nrms*rmsS, meanB + nrms*rmsB ), xmax ) + 0.00001;

   // creating all the histograms
   for (UInt_t imtd=0; imtd<fBoostNum; imtd++) {
      fTrainSigMVAHist .push_back( new TH1F( TString::Format("MVA_Train_S_%04i",imtd).Data(), "MVA_Train_S",        fNbins, xmin, xmax ) );
      fTrainBgdMVAHist .push_back( new TH1F( TString::Format("MVA_Train_B%04i", imtd).Data(), "MVA_Train_B",        fNbins, xmin, xmax ) );
      fBTrainSigMVAHist.push_back( new TH1F( TString::Format("MVA_BTrain_S%04i",imtd).Data(), "MVA_BoostedTrain_S", fNbins, xmin, xmax ) );
      fBTrainBgdMVAHist.push_back( new TH1F( TString::Format("MVA_BTrain_B%04i",imtd).Data(), "MVA_BoostedTrain_B", fNbins, xmin, xmax ) );
      fTestSigMVAHist  .push_back( new TH1F( TString::Format("MVA_Test_S%04i",  imtd).Data(), "MVA_Test_S",         fNbins, xmin, xmax ) );
      fTestBgdMVAHist  .push_back( new TH1F( TString::Format("MVA_Test_B%04i",  imtd).Data(), "MVA_Test_B",         fNbins, xmin, xmax ) );
   }
}

////////////////////////////////////////////////////////////////////////////////
/// resetting back the boosted weights of the events to 1

void TMVA::MethodBoost::ResetBoostWeights()
{
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      const Event *ev = Data()->GetEvent(ievt);
      ev->SetBoostWeight( 1.0 );
   }
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::WriteMonitoringHistosToFile( void ) const
{
   TDirectory* dir=0;
   if (fMonitorBoostedMethod) {
      for (UInt_t imtd=0;imtd<fBoostNum;imtd++) {

         //writing the histograms in the specific classifier's directory
         MethodBase* m = dynamic_cast<MethodBase*>(fMethods[imtd]);
         if (!m) continue;
         dir = m->BaseDir();
         dir->cd();
         fTrainSigMVAHist[imtd]->SetDirectory(dir);
         fTrainSigMVAHist[imtd]->Write();
         fTrainBgdMVAHist[imtd]->SetDirectory(dir);
         fTrainBgdMVAHist[imtd]->Write();
         fBTrainSigMVAHist[imtd]->SetDirectory(dir);
         fBTrainSigMVAHist[imtd]->Write();
         fBTrainBgdMVAHist[imtd]->SetDirectory(dir);
         fBTrainBgdMVAHist[imtd]->Write();
      }
   }

   // going back to the original folder
   BaseDir()->cd();

   fMonitorTree->Write();
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::TestClassification()
{
   MethodBase::TestClassification();
   if (fMonitorBoostedMethod) {
      UInt_t nloop = fTestSigMVAHist.size();
      if (fMethods.size()<nloop) nloop = fMethods.size();
      //running over all the events and populating the test MVA histograms
      Data()->SetCurrentType(Types::kTesting);
      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         const Event* ev = GetEvent(ievt);
         Float_t w = ev->GetWeight();
         if (DataInfo().IsSignal(ev)) {
            for (UInt_t imtd=0; imtd<nloop; imtd++) {
               fTestSigMVAHist[imtd]->Fill(fMethods[imtd]->GetMvaValue(),w);
            }
         }
         else {
            for (UInt_t imtd=0; imtd<nloop; imtd++) {
               fTestBgdMVAHist[imtd]->Fill(fMethods[imtd]->GetMvaValue(),w);
            }
         }
      }
      Data()->SetCurrentType(Types::kTraining);
   }
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::MethodBoost::WriteEvaluationHistosToFile(Types::ETreeType treetype)
{
   MethodBase::WriteEvaluationHistosToFile(treetype);
   if (treetype==Types::kTraining) return;
   UInt_t nloop = fTestSigMVAHist.size();
   if (fMethods.size()<nloop) nloop = fMethods.size();
   if (fMonitorBoostedMethod) {
      TDirectory* dir=0;
      for (UInt_t imtd=0;imtd<nloop;imtd++) {
         //writing the histograms in the specific classifier's directory
         MethodBase* mva = dynamic_cast<MethodBase*>(fMethods[imtd]);
         if (!mva) continue;
         dir = mva->BaseDir();
         if (dir==0) continue;
         dir->cd();
         fTestSigMVAHist[imtd]->SetDirectory(dir);
         fTestSigMVAHist[imtd]->Write();
         fTestBgdMVAHist[imtd]->SetDirectory(dir);
         fTestBgdMVAHist[imtd]->Write();
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// process user options

void TMVA::MethodBoost::ProcessOptions()
{
}

////////////////////////////////////////////////////////////////////////////////
/// train the boosted classifier of the current boost step

void TMVA::MethodBoost::SingleTrain()
{
   Data()->SetCurrentType(Types::kTraining);
   MethodBase* meth = dynamic_cast<MethodBase*>(GetLastMethod());
   if (meth){
      meth->SetSilentFile(IsSilentFile());
      if(IsModelPersistence()){
         TString _fFileDir = DataInfo().GetName();
         _fFileDir += "/" + gConfig().GetIONames().fWeightFileDir;
         meth->SetWeightFileDir(_fFileDir);
      }
      meth->SetModelPersistence(IsModelPersistence());
      meth->TrainMethod();
   }
}

////////////////////////////////////////////////////////////////////////////////
/// find the CUT on the individual MVA that defines an event as
/// correct or misclassified (to be used in the boosting process)

void TMVA::MethodBoost::FindMVACut(MethodBase *method)
{
   if (!method || method->GetMethodType() == Types::kDT ){ return;}

   // create fine-binned histograms of the MVA distribution
   const Int_t nBins=10001;
   Double_t minMVA=150000;
   Double_t maxMVA=-150000;
   for (Long64_t ievt=0; ievt<Data()->GetNEvents(); ievt++) {
      GetEvent(ievt);
      Double_t val=method->GetMvaValue();
      //Helge .. I think one could very well use fMVAValues for that ... -->to do
      if (val>maxMVA) maxMVA=val;
      if (val<minMVA) minMVA=val;
   }
   maxMVA = maxMVA+(maxMVA-minMVA)/nBins;

   TH1D *mvaS  = new TH1D(TString::Format("MVAS_%d",fCurrentMethodIdx) ,"",nBins,minMVA,maxMVA);
   TH1D *mvaB  = new TH1D(TString::Format("MVAB_%d",fCurrentMethodIdx) ,"",nBins,minMVA,maxMVA);
   TH1D *mvaSC = new TH1D(TString::Format("MVASC_%d",fCurrentMethodIdx),"",nBins,minMVA,maxMVA);
   TH1D *mvaBC = new TH1D(TString::Format("MVABC_%d",fCurrentMethodIdx),"",nBins,minMVA,maxMVA);


   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());
   if (fDetailedMonitoring){
      results->Store(mvaS, TString::Format("MVAS_%d",fCurrentMethodIdx));
      results->Store(mvaB, TString::Format("MVAB_%d",fCurrentMethodIdx));
      results->Store(mvaSC,TString::Format("MVASC_%d",fCurrentMethodIdx));
      results->Store(mvaBC,TString::Format("MVABC_%d",fCurrentMethodIdx));
   }

   for (Long64_t ievt=0; ievt<Data()->GetNEvents(); ievt++) {

      Double_t weight = GetEvent(ievt)->GetWeight();
      Double_t mvaVal=method->GetMvaValue();
      if (DataInfo().IsSignal(GetEvent(ievt))){
         mvaS->Fill(mvaVal,weight);
      }else {
         mvaB->Fill(mvaVal,weight);
      }
   }
   SeparationBase *sepGain;


   // Boosting should use Misclassification not Gini Index (changed, Helge 31.5.2013)
   // WARNING! It works with Misclassification only if you fix the signal to
   // background at every step. Strangely enough, there are better results
   // ( as seen with BDT ) if you use Gini Index, and accept that sometimes no
   // sensible cut is found - i.e. the cut is then outside the MVA value range,
   // all events are classified as background and then according to the Boost
   // algorithm something is renormed 'automatically' ... so that in the next
   // step again the result is something sensible.
   // Strange ... that THIS is supposed to be right?

   // SeparationBase *sepGain2 = new MisClassificationError();
   //sepGain = new MisClassificationError();
   sepGain = new GiniIndex();
   //sepGain = new CrossEntropy();

   Double_t sTot = mvaS->GetSum();
   Double_t bTot = mvaB->GetSum();

   mvaSC->SetBinContent(1,mvaS->GetBinContent(1));
   mvaBC->SetBinContent(1,mvaB->GetBinContent(1));
   Double_t sSel=0;
   Double_t bSel=0;
   Double_t separationGain=sepGain->GetSeparationGain(sSel,bSel,sTot,bTot);
   Double_t mvaCut=mvaSC->GetBinLowEdge(1);
   Double_t sSelCut=sSel;
   Double_t bSelCut=bSel;
   // std::cout << "minMVA =" << minMVA << " maxMVA = " << maxMVA << " width = " << mvaSC->GetBinWidth(1) << std::endl;

   // for (Int_t ibin=1;ibin<=nBins;ibin++) std::cout << " cutvalues[" << ibin<<"]="<<mvaSC->GetBinLowEdge(ibin) << " " << mvaSC->GetBinCenter(ibin) << std::endl;
   Double_t mvaCutOrientation=1; // 1 if mva > mvaCut --> Signal and -1 if mva < mvaCut (i.e. mva*-1 > mvaCut*-1) --> Signal
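   // Scan over all bin edges of the cumulated S and B histograms: each
   // candidate cut splits the events into (sSel,bSel) below and
   // (sTot-sSel,bTot-bSel) above the cut; the cut maximising the separation
   // gain of this split (here measured with the Gini index, an impurity
   // proportional to s*b/(s+b)^2 per node) is retained.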
   for (Int_t ibin=1;ibin<=nBins;ibin++){
      mvaSC->SetBinContent(ibin,mvaS->GetBinContent(ibin)+mvaSC->GetBinContent(ibin-1));
      mvaBC->SetBinContent(ibin,mvaB->GetBinContent(ibin)+mvaBC->GetBinContent(ibin-1));

      sSel=mvaSC->GetBinContent(ibin);
      bSel=mvaBC->GetBinContent(ibin);

      // if (ibin==nBins){
      //    std::cout << "Last bin s="<< sSel <<" b="<<bSel << " s="<< sTot-sSel <<" b="<<bTot-bSel << std::endl;
      // }

      if (separationGain < sepGain->GetSeparationGain(sSel,bSel,sTot,bTot)
          // && (mvaSC->GetBinCenter(ibin) >0 || (fCurrentMethodIdx+1)%2 )
          ){
         separationGain = sepGain->GetSeparationGain(sSel,bSel,sTot,bTot);
         // mvaCut=mvaSC->GetBinCenter(ibin);
         mvaCut=mvaSC->GetBinLowEdge(ibin+1);
         // if (sSel/bSel > (sTot-sSel)/(bTot-bSel)) mvaCutOrientation=-1;
         if (sSel*(bTot-bSel) > (sTot-sSel)*bSel) mvaCutOrientation=-1;
         else                                     mvaCutOrientation=1;
         sSelCut=sSel;
         bSelCut=bSel;
         // std::cout << "new cut at " << mvaCut << "with s="<<sTot-sSel << " b="<<bTot-bSel << std::endl;
      }
      /*
        Double_t ori;
        if (sSel/bSel > (sTot-sSel)/(bTot-bSel)) ori=-1;
        else ori=1;
        std::cout << ibin << " mvacut="<<mvaCut
        << " sTot=" << sTot
        << " bTot=" << bTot
        << " sSel=" << sSel
        << " bSel=" << bSel
        << " s/b(1)=" << sSel/bSel
        << " s/b(2)=" << (sTot-sSel)/(bTot-bSel)
        << " sepGain="<<sepGain->GetSeparationGain(sSel,bSel,sTot,bTot)
        << " sepGain2="<<sepGain2->GetSeparationGain(sSel,bSel,sTot,bTot)
        << " " <<ori
        << std::endl;
      */

   }

   if (0){
      double parentIndex=sepGain->GetSeparationIndex(sTot,bTot);
      double leftIndex  =sepGain->GetSeparationIndex(sSelCut,bSelCut);
      double rightIndex =sepGain->GetSeparationIndex(sTot-sSelCut,bTot-bSelCut);
      std::cout
         << " sTot=" << sTot
         << " bTot=" << bTot
         << " s="<<sSelCut
         << " b="<<bSelCut
         << " s2="<<(sTot-sSelCut)
         << " b2="<<(bTot-bSelCut)
         << " s/b(1)=" << sSelCut/bSelCut
         << " s/b(2)=" << (sTot-sSelCut)/(bTot-bSelCut)
         << " index before cut=" << parentIndex
         << " after: left=" << leftIndex
         << " after: right=" << rightIndex
         << " sepGain=" << parentIndex-leftIndex-rightIndex
         << " sepGain="<<separationGain
         << " sepGain="<<sepGain->GetSeparationGain(sSelCut,bSelCut,sTot,bTot)
         << " cut=" << mvaCut
         << " idx="<<fCurrentMethodIdx
         << " cutOrientation="<<mvaCutOrientation
         << std::endl;
   }
   method->SetSignalReferenceCut(mvaCut);
   method->SetSignalReferenceCutOrientation(mvaCutOrientation);

   results->GetHist("SeparationGain")->SetBinContent(fCurrentMethodIdx+1,separationGain);


   Log() << kDEBUG << "(old step) Setting method cut to " <<method->GetSignalReferenceCut()<< Endl;

   if(IsSilentFile())
   {
      mvaS ->Delete();
      mvaB ->Delete();
      mvaSC->Delete();
      mvaBC->Delete();
   }
}

////////////////////////////////////////////////////////////////////////////////

Double_t TMVA::MethodBoost::SingleBoost(MethodBase* method)
{
   Double_t returnVal = -1;


   if      (fBoostType=="AdaBoost")     returnVal = this->AdaBoost (method,1);
   else if (fBoostType=="RealAdaBoost") returnVal = this->AdaBoost (method,0);
   else if (fBoostType=="Bagging")      returnVal = this->Bagging  ();
   else{
      Log() << kFATAL << "<Boost> unknown boost option " << fBoostType<< " called" << Endl;
   }
   fMethodWeight.push_back(returnVal);
   return returnVal;
}
////////////////////////////////////////////////////////////////////////////////
/// the standard (discrete or real) AdaBoost algorithm

Double_t TMVA::MethodBoost::AdaBoost( MethodBase* method, Bool_t discreteAdaBoost )
{
   if (!method) {
      Log() << kWARNING << " AdaBoost called without classifier reference - needed for calculating AdaBoost " << Endl;
      return 0;
   }

   Float_t w,v; Bool_t sig=kTRUE;
   Double_t sumAll=0, sumWrong=0;
   Bool_t* WrongDetection=new Bool_t[GetNEvents()];
   QuickMVAProbEstimator *MVAProb=NULL;

   if (discreteAdaBoost) {
      FindMVACut(method);
      Log() << kDEBUG << " individual mva cut value = " << method->GetSignalReferenceCut() << Endl;
   } else {
      MVAProb=new TMVA::QuickMVAProbEstimator();
      // the RealAdaBoost does not use a simple "yes (signal)" or "no (background)"
      // answer from your single MVA, but a "signal probability" instead (in the BDT case,
      // that would be the 'purity' in the leaf node). For some MLP parameters, the MVA output
      // can also be interpreted as a probability, but here I try a general approach to get this
      // probability from the MVA distributions...

      for (Long64_t evt=0; evt<GetNEvents(); evt++) {
         const Event* ev = Data()->GetEvent(evt);
         MVAProb->AddEvent(fMVAvalues->at(evt),ev->GetWeight(),ev->GetClass());
      }
   }


   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) WrongDetection[ievt]=kTRUE;

   // finding the wrong events and calculating their total weights
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      const Event* ev = GetEvent(ievt);
      sig=DataInfo().IsSignal(ev);
      v = fMVAvalues->at(ievt);
      w = ev->GetWeight();
      sumAll += w;
      if(!IsSilentFile())
      {
         if (fMonitorBoostedMethod) {
            if (sig) {
               fBTrainSigMVAHist[fCurrentMethodIdx]->Fill(v,w);
               fTrainSigMVAHist[fCurrentMethodIdx]->Fill(v,ev->GetOriginalWeight());
            }
            else {
               fBTrainBgdMVAHist[fCurrentMethodIdx]->Fill(v,w);
               fTrainBgdMVAHist[fCurrentMethodIdx]->Fill(v,ev->GetOriginalWeight());
            }
         }
      }

      if (discreteAdaBoost){
         if (sig == method->IsSignalLike(fMVAvalues->at(ievt))){
            WrongDetection[ievt]=kFALSE;
         }else{
            WrongDetection[ievt]=kTRUE;
            sumWrong+=w;
         }
      }else{
         Double_t mvaProb = MVAProb->GetMVAProbAt((Float_t)fMVAvalues->at(ievt));
         mvaProb = 2*(mvaProb-0.5);
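         // map the signal probability from [0,1] onto [-1,1]: the sign plays
         // the role of the class decision, the magnitude that of the
         // confidence (this is what distinguishes real from discrete AdaBoost)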
         Int_t trueType;
         if (DataInfo().IsSignal(ev)) trueType = 1;
         else trueType = -1;
         if (trueType*mvaProb < 0) sumWrong += w;
      }
   }

   fMethodError=sumWrong/sumAll;

   // calculating the fMethodError and the boostWeight out of it uses the formula
   // w = ((1-err)/err)^beta

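   // Numerically (a hedged example, not from the original source): with
   // err = 0.25 and beta = 1 the discrete branch below gives
   //    boostWeight = ln((1-0.25)/0.25) = ln 3 ~ 1.10,
   // so each misclassified event is later rescaled by
   //    boostfactor = exp(boostWeight) = 3
   // before all weights are renormalized back to the original sum.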
   Double_t boostWeight=0;

   if (fMethodError == 0) { //no misclassification made.. perfect, no boost ;)
      Log() << kWARNING << "Your classifier worked perfectly on the training sample --> serious overtraining expected and no boosting done " << Endl;
   }else{

      if (discreteAdaBoost)
         boostWeight = TMath::Log((1.-fMethodError)/fMethodError)*fAdaBoostBeta;
      else
         boostWeight = TMath::Log((1.+fMethodError)/(1-fMethodError))*fAdaBoostBeta;


      // std::cout << "boostweight = " << boostWeight << std::endl;

      // AdaBoost: rescale the weight of the wrong events according to the error level
      // over the entire training sample, rescaling all the weights to have the same sum, but without
      // touching the original weights (changing only the boosted weight of all the events);
      // first reweight

      Double_t newSum=0., oldSum=0.;


      Double_t boostfactor = TMath::Exp(boostWeight);


      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         const Event* ev = Data()->GetEvent(ievt);
         oldSum += ev->GetWeight();
         if (discreteAdaBoost){
            // events are classified as Signal OR background .. right or wrong
            if (WrongDetection[ievt] && boostWeight != 0) {
               if (ev->GetWeight() > 0) ev->ScaleBoostWeight(boostfactor);
               else                     ev->ScaleBoostWeight(1./boostfactor);
            }
            // if (ievt<30) std::cout<<ievt<<" var0="<<ev->GetValue(0)<<" var1="<<ev->GetValue(1)<<" weight="<<ev->GetWeight() << " boostby:"<<boostfactor<<std::endl;

         }else{
            // events are classified by their probability of being signal or background
            // (eventually you should write this one - i.e. re-use the MVA values that were already
            // calculated and stored.. however, for the moment ..)
            Double_t mvaProb = MVAProb->GetMVAProbAt((Float_t)fMVAvalues->at(ievt));
            mvaProb = 2*(mvaProb-0.5);
            // mvaProb = (1-mvaProb);

            Int_t trueType=1;
            if (DataInfo().IsSignal(ev)) trueType = 1;
            else trueType = -1;

            Double_t boostfactor = TMath::Exp(-1*boostWeight*trueType*mvaProb);
            if (ev->GetWeight() > 0) ev->ScaleBoostWeight(boostfactor);
            else                     ev->ScaleBoostWeight(1./boostfactor);

         }
         newSum += ev->GetWeight();
      }

      Double_t normWeight = oldSum/newSum;
      // next normalize the weights
      Double_t normSig=0, normBkg=0;
      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         const Event* ev = Data()->GetEvent(ievt);
         ev->ScaleBoostWeight(normWeight);
         if (ev->GetClass()) normSig+=ev->GetWeight();
         else                normBkg+=ev->GetWeight();
      }

      Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());
      results->GetHist("SoverBtotal")->SetBinContent(fCurrentMethodIdx+1, normSig/normBkg);

      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         const Event* ev = Data()->GetEvent(ievt);

         if (ev->GetClass()) ev->ScaleBoostWeight(oldSum/normSig/2);
         else                ev->ScaleBoostWeight(oldSum/normBkg/2);
      }
   }

   delete[] WrongDetection;
   if (MVAProb) delete MVAProb;

   fBoostWeight = boostWeight; // used ONLY for the monitoring tree

   return boostWeight;
}


////////////////////////////////////////////////////////////////////////////////
/// Bagging or Bootstrap boosting, gives new random poisson weight for every event

Double_t TMVA::MethodBoost::Bagging()
{
   TRandom3 *trandom = new TRandom3(fRandomSeed+fMethods.size());
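   // A boost weight drawn from PoissonD(mu) with mu = fBaggedSampleFraction
   // (default 0.6) emulates bootstrap resampling: each event enters the
   // bagged sample with a Poisson-distributed multiplicity, and events that
   // draw 0 effectively drop out of this training iteration.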
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      const Event* ev = Data()->GetEvent(ievt);
      ev->SetBoostWeight(trandom->PoissonD(fBaggedSampleFraction));
   }
   delete trandom;
   fBoostWeight = 1; // used ONLY for the monitoring tree
   return 1.;
}


////////////////////////////////////////////////////////////////////////////////
/// Get help message text
///
/// typical length of text line:
///         "|--------------------------------------------------------------|"

void TMVA::MethodBoost::GetHelpMessage() const
{
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "This method combines several classifiers of one species in a " << Endl;
   Log() << "single multivariate quantity via the boost algorithm." << Endl;
   Log() << "The output is a weighted sum over all individual classifiers." << Endl;
   Log() << "By default, the AdaBoost method is employed, which gives " << Endl;
   Log() << "events that were misclassified in the previous tree a larger " << Endl;
   Log() << "weight in the training of the following classifier." << Endl;
   Log() << "Optionally, Bagged boosting can also be applied." << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "The most important parameters in the configuration are the " << Endl;
   Log() << "number of boosts applied (Boost_Num) and the choice of boosting" << Endl;
   Log() << "(Boost_Type), which can be set to either AdaBoost or Bagging." << Endl;
   Log() << "AdaBoosting: The most important parameter in this configuration" << Endl;
   Log() << "is the beta parameter (Boost_AdaBoostBeta). " << Endl;
   Log() << "When boosting a linear classifier, it is sometimes advantageous" << Endl;
   Log() << "to transform the MVA output non-linearly. The following options" << Endl;
   Log() << "are available: step, linear, log, and gauss; the default is step." << Endl;
   Log() << Endl;
   Log() << "Some classifiers are hard to boost and do not improve much by" << Endl;
   Log() << "boosting; some even slightly deteriorate due to the boosting." << Endl;
   Log() << "The booking of the boost method is special since it requires" << Endl;
   Log() << "the booking of the method to be boosted and the boost itself." << Endl;
   Log() << "This is solved by booking the method to be boosted and adding" << Endl;
   Log() << "all Boost parameters, which all begin with \"Boost_\", to the" << Endl;
   Log() << "options string. The factory separates the options and initiates" << Endl;
   Log() << "the boost process. The TMVA macro directory contains the example" << Endl;
   Log() << "macro \"Boost.C\"." << Endl;
}

////////////////////////////////////////////////////////////////////////////////

const TMVA::Ranking* TMVA::MethodBoost::CreateRanking()
{
   return 0;
}

////////////////////////////////////////////////////////////////////////////////
/// return boosted MVA response

Double_t TMVA::MethodBoost::GetMvaValue( Double_t* err, Double_t* errUpper )
{
   Double_t mvaValue = 0;
   Double_t norm     = 0;
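   // The boosted response is the method-weighted average of the (optionally
   // transformed) individual MVA outputs:
   //    mvaValue = sum_i fMethodWeight[i]*T(val_i) / sum_i fMethodWeight[i],
   // where the transform T is chosen via fTransformString below.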
   Double_t epsilon = TMath::Exp(-1.);
   //Double_t fact    = TMath::Exp(-1.)+TMath::Exp(1.);
   for (UInt_t i=0;i< fMethods.size(); i++){
      MethodBase* m = dynamic_cast<MethodBase*>(fMethods[i]);
      if (m==0) continue;
      Double_t val = fTmpEvent ? m->GetMvaValue(fTmpEvent) : m->GetMvaValue();
      Double_t sigcut = m->GetSignalReferenceCut();

      // default is no transform
      if (fTransformString == "linear"){

      }
      else if (fTransformString == "log"){
         if (val < sigcut) val = sigcut;

         val = TMath::Log((val-sigcut)+epsilon);
      }
      else if (fTransformString == "step" ){
         if (m->IsSignalLike(val)) val =  1.;
         else                      val = -1.;
      }
      else if (fTransformString == "gauss"){
         val = TMath::Gaus((val-sigcut),1);
      }
      else {
         Log() << kFATAL << "error unknown transformation " << fTransformString<<Endl;
      }
      mvaValue+=val*fMethodWeight[i];
      norm    +=fMethodWeight[i];
      // std::cout << "mva("<<i<<") = "<<val<<" " << valx<< " " << mvaValue<<" and sigcut="<<sigcut << std::endl;
   }
   mvaValue/=norm;
   // cannot determine error
   NoErrorCalc(err, errUpper);

   return mvaValue;
}

////////////////////////////////////////////////////////////////////////////////
/// Calculate the ROC integral of a single classifier or even the
/// whole boosted classifier. The tree type (training or testing
/// sample) is specified by 'eTT'.
///
/// If tree type kTraining is set, the original training sample is
/// used to compute the ROC integral (original weights).
///
/// - singleMethod - if kTRUE, return ROC integral of single (last
///                  trained) classifier; if kFALSE, return ROC
///                  integral of full classifier
///
/// - eTT - tree type (Types::kTraining / Types::kTesting)
///
/// - CalcOverlapIntergral - if kTRUE, the overlap integral of the
///                  signal/background MVA distributions
///                  is calculated and stored in
///                  'fOverlap_integral'

Double_t TMVA::MethodBoost::GetBoostROCIntegral(Bool_t singleMethod, Types::ETreeType eTT, Bool_t CalcOverlapIntergral)
{
   // set data sample training / testing
   Data()->SetCurrentType(eTT);

   MethodBase* method = singleMethod ? dynamic_cast<MethodBase*>(fMethods.back()) : 0; // ToDo CoVerity flags this line as there is no protection against a zero-pointer delivered by dynamic_cast
   // to make CoVerity happy (although, OF COURSE, the last method in the committee
   // has to be also of type MethodBase as ANY method is... hence the dynamic_cast
   // will never be "zero" ...
   if (singleMethod && !method) {
      Log() << kFATAL << " What do you do? Your method:"
            << fMethods.back()->GetName()
            << " seems not to be a proper TMVA method"
            << Endl;
      std::exit(1);
   }
   Double_t err = 0.0;

   // temporarily renormalize the method weights in case of evaluation
   // of full classifier.
   // save the old normalization of the methods
   std::vector<Double_t> OldMethodWeight(fMethodWeight);
   if (!singleMethod) {
      // calculate sum of weights of all methods
      Double_t AllMethodsWeight = 0;
      for (UInt_t i=0; i<=fCurrentMethodIdx; i++)
         AllMethodsWeight += fMethodWeight.at(i);
      // normalize the weights of the classifiers
      if (AllMethodsWeight != 0.0) {
         for (UInt_t i=0; i<=fCurrentMethodIdx; i++)
            fMethodWeight[i] /= AllMethodsWeight;
      }
   }

   // calculate MVA values
   Double_t meanS, meanB, rmsS, rmsB, xmin, xmax, nrms = 10;
   std::vector<Float_t>* mvaRes;
   if (singleMethod && eTT==Types::kTraining)
      mvaRes = fMVAvalues; // values already calculated
   else {
      mvaRes = new std::vector<Float_t>(GetNEvents());
      for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
         GetEvent(ievt);
         (*mvaRes)[ievt] = singleMethod ? method->GetMvaValue(&err) : GetMvaValue(&err);
      }
   }

   // restore the method weights
   if (!singleMethod)
      fMethodWeight = OldMethodWeight;

   // now create histograms for calculation of the ROC integral
   Int_t signalClass = 0;
   if (DataInfo().GetClassInfo("Signal") != 0) {
      signalClass = DataInfo().GetClassInfo("Signal")->GetNumber();
   }
   gTools().ComputeStat( GetEventCollection(eTT), mvaRes,
                         meanS, meanB, rmsS, rmsB, xmin, xmax, signalClass );

   fNbins = gConfig().fVariablePlotting.fNbinsXOfROCCurve;
   xmin = TMath::Max( TMath::Min(meanS - nrms*rmsS, meanB - nrms*rmsB ), xmin ) - 0.0001;
   xmax = TMath::Min( TMath::Max(meanS + nrms*rmsS, meanB + nrms*rmsB ), xmax ) + 0.0001;

   // calculate ROC integral
   TH1* mva_s = new TH1F( "MVA_S", "MVA_S", fNbins, xmin, xmax );
   TH1* mva_b = new TH1F( "MVA_B", "MVA_B", fNbins, xmin, xmax );
   TH1 *mva_s_overlap=0, *mva_b_overlap=0;
   if (CalcOverlapIntergral) {
      mva_s_overlap = new TH1F( "MVA_S_OVERLAP", "MVA_S_OVERLAP", fNbins, xmin, xmax );
      mva_b_overlap = new TH1F( "MVA_B_OVERLAP", "MVA_B_OVERLAP", fNbins, xmin, xmax );
   }
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      const Event* ev = GetEvent(ievt);
      Float_t w = (eTT==Types::kTesting ? ev->GetWeight() : ev->GetOriginalWeight());
      if (DataInfo().IsSignal(ev)) mva_s->Fill( (*mvaRes)[ievt], w );
      else                         mva_b->Fill( (*mvaRes)[ievt], w );

      if (CalcOverlapIntergral) {
         Float_t w_ov = ev->GetWeight();
         if (DataInfo().IsSignal(ev))
            mva_s_overlap->Fill( (*mvaRes)[ievt], w_ov );
         else
            mva_b_overlap->Fill( (*mvaRes)[ievt], w_ov );
      }
   }
   gTools().NormHist( mva_s );
   gTools().NormHist( mva_b );
   PDF *fS = new PDF( "PDF Sig", mva_s, PDF::kSpline2 );
   PDF *fB = new PDF( "PDF Bkg", mva_b, PDF::kSpline2 );
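   // The ROC integral computed from the spline-smoothed signal and background
   // PDFs is the area under the signal-efficiency vs. background-rejection
   // curve: 0.5 corresponds to a random classifier, 1 to perfect separation.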

   // calculate ROC integral from fS, fB
   Double_t ROC = MethodBase::GetROCIntegral(fS, fB);

   // calculate overlap integral
   if (CalcOverlapIntergral) {
      gTools().NormHist( mva_s_overlap );
      gTools().NormHist( mva_b_overlap );

      fOverlap_integral = 0.0;
      for (Int_t bin=1; bin<=mva_s_overlap->GetNbinsX(); bin++){
         Double_t bc_s = mva_s_overlap->GetBinContent(bin);
         Double_t bc_b = mva_b_overlap->GetBinContent(bin);
         if (bc_s > 0.0 && bc_b > 0.0)
            fOverlap_integral += TMath::Min(bc_s, bc_b);
      }

      delete mva_s_overlap;
      delete mva_b_overlap;
   }

   delete mva_s;
   delete mva_b;
   delete fS;
   delete fB;
   if (!(singleMethod && eTT==Types::kTraining)) delete mvaRes;

   Data()->SetCurrentType(Types::kTraining);

   return ROC;
}

void TMVA::MethodBoost::CalcMVAValues()
{
   // Calculate MVA values of current method fMethods.back() on
   // training sample

   Data()->SetCurrentType(Types::kTraining);
   MethodBase* method = dynamic_cast<MethodBase*>(fMethods.back());
   if (!method) {
      Log() << kFATAL << "dynamic cast to MethodBase* failed" <<Endl;
      return;
   }
   // calculate MVA values
   for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
      GetEvent(ievt);
      fMVAvalues->at(ievt) = method->GetMvaValue();
   }

   // fill cumulative mva distribution


}

////////////////////////////////////////////////////////////////////////////////
/// fill various monitoring histograms from information of the individual classifiers that
/// have been boosted.
/// Of course this depends very much on the individual classifiers; so far this
/// monitoring is actually implemented only for Decision Trees.

void TMVA::MethodBoost::MonitorBoost( Types::EBoostStage stage, UInt_t methodIndex )
{
   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());

   if (GetCurrentMethod(methodIndex)->GetMethodType() == TMVA::Types::kDT) {
      TMVA::MethodDT* currentDT = dynamic_cast<TMVA::MethodDT*>(GetCurrentMethod(methodIndex));
      if (currentDT){
         if (stage == Types::kBoostProcBegin){
            results->Store(new TH1I("NodesBeforePruning","nodes before pruning",this->GetBoostNum(),0,this->GetBoostNum()),"NodesBeforePruning");
            results->Store(new TH1I("NodesAfterPruning","nodes after pruning",this->GetBoostNum(),0,this->GetBoostNum()),"NodesAfterPruning");
         }

         if (stage == Types::kBeforeTraining){
         }
         else if (stage == Types::kBeforeBoosting){
            results->GetHist("NodesBeforePruning")->SetBinContent(methodIndex+1,currentDT->GetNNodesBeforePruning());
            results->GetHist("NodesAfterPruning")->SetBinContent(methodIndex+1,currentDT->GetNNodes());
         }
         else if (stage == Types::kAfterBoosting){

         }
         else if (stage != Types::kBoostProcEnd){
            Log() << kINFO << "<Train> average number of nodes before/after pruning : "
                  << results->GetHist("NodesBeforePruning")->GetMean() << " / "
                  << results->GetHist("NodesAfterPruning")->GetMean()
                  << Endl;
         }
      }

   }else if (GetCurrentMethod(methodIndex)->GetMethodType() == TMVA::Types::kFisher) {
      if (stage == Types::kAfterBoosting){
         TMVA::MsgLogger::EnableOutput();
      }
   }else{
      if (methodIndex < 3){
         Log() << kDEBUG << "No detailed boost monitoring for "
               << GetCurrentMethod(methodIndex)->GetMethodName()
               << " yet available " << Endl;
      }
   }

   //boosting plots universal for all classifiers 'typically for debug purposes only as they are not general enough'

   if (stage == Types::kBeforeBoosting){
      // if you want to display the weighted events for 2D case at each boost step:
      if (fDetailedMonitoring){
         // the following code is useful only for 2D examples - mainly illustration for debug/educational purposes:
         if (DataInfo().GetNVariables() == 2) {
            results->Store(new TH2F(TString::Format("EventDistSig_%d",methodIndex),TString::Format("EventDistSig_%d",methodIndex),100,0,7,100,0,7));
            results->GetHist(TString::Format("EventDistSig_%d",methodIndex))->SetMarkerColor(4);
            results->Store(new TH2F(TString::Format("EventDistBkg_%d",methodIndex),TString::Format("EventDistBkg_%d",methodIndex),100,0,7,100,0,7));
            results->GetHist(TString::Format("EventDistBkg_%d",methodIndex))->SetMarkerColor(2);

            Data()->SetCurrentType(Types::kTraining);
            for (Long64_t ievt=0; ievt<GetNEvents(); ievt++) {
               const Event* ev = GetEvent(ievt);
               Float_t w = ev->GetWeight();
               Float_t v0= ev->GetValue(0);
               Float_t v1= ev->GetValue(1);
               // if (ievt<3) std::cout<<ievt<<" var0="<<v0<<" var1="<<v1<<" weight="<<w<<std::endl;
               TH2* h;
               if (DataInfo().IsSignal(ev)) h = results->GetHist2D(TString::Format("EventDistSig_%d",methodIndex));
               else                         h = results->GetHist2D(TString::Format("EventDistBkg_%d",methodIndex));
               if (h) h->Fill(v0,v1,w);
            }
         }
      }
   }

   return;
}