Logo ROOT   6.08/07
Reference Guide
MethodCuts.cxx
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Matt Jachowski, Peter Speckmayer, Eckhard von Toerne, Helge Voss, Kai Voss
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate Data analysis *
6  * Package: TMVA *
7  * Class : TMVA::MethodCuts *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Implementation (see header for description) *
12  * *
13  * Authors (alphabetical): *
14  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15  * Matt Jachowski <jachowski@stanford.edu> - Stanford University, USA *
16  * Peter Speckmayer <speckmay@mail.cern.ch> - CERN, Switzerland *
17  * Eckhard von Toerne <evt@physik.uni-bonn.de> - U. of Bonn, Germany *
18  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
19  * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
20  * *
21  * Copyright (c) 2005: *
22  * CERN, Switzerland *
23  * U. of Victoria, Canada *
24  * MPI-K Heidelberg, Germany *
25  * *
26  * Redistribution and use in source and binary forms, with or without *
27  * modification, are permitted according to the terms listed in LICENSE *
28  * (http://tmva.sourceforge.net/LICENSE) *
29  **********************************************************************************/
30 
31 ////////////////////////////////////////////////////////////////////////////////
32 
33 /* Begin_Html
34  Multivariate optimisation of signal efficiency for given background
35  efficiency, applying rectangular minimum and maximum requirements.
36 
37  <p>
38  Also implemented is a "decorrelate/diagonlized cuts approach",
39  which improves over the uncorrelated cuts ansatz by
40  transforming linearly the input variables into a diagonal space,
41  using the square-root of the covariance matrix.
42 
43  <p>
44  <font size="-1">
45  Other optimisation criteria, such as maximising the signal significance-
46  squared, S^2/(S+B), with S and B being the signal and background yields,
47  correspond to a particular point in the optimised background rejection
48  versus signal efficiency curve. This working point requires the knowledge
49  of the expected yields, which is not the case in general. Note also that
50  for rare signals, Poissonian statistics should be used, which modifies
51  the significance criterion.
52  </font>
53 
54  <p>
55  The rectangular cut of a volume in the variable space is performed using
56  a binary tree to sort the training events. This provides a significant
57  reduction in computing time (up to several orders of magnitudes, depending
58  on the complexity of the problem at hand).
59 
60  <p>
61  Technically, optimisation is achieved in TMVA by two methods:
62 
63  <ol>
64  <li>Monte Carlo generation using uniform priors for the lower cut value,
65  and the cut width, thrown within the variable ranges.
66 
67  <li>A Genetic Algorithm (GA) searches for the optimal ("fittest") cut sample.
68  The GA is configurable by many external settings through the option
69  string. For difficult cases (such as many variables), some tuning
70  may be necessary to achieve satisfying results
71  </ol>
72 
73  <p>
74  <font size="-1">
75  Attempts to use Minuit fits (Simplex ot Migrad) instead have not shown
76  superior results, and often failed due to convergence at local minima.
77  </font>
78 
79  <p>
80  The tests we have performed so far showed that in generic applications,
81  the GA is superior to MC sampling, and hence GA is the default method.
82  It is worthwhile trying both anyway.
83 
84  <b>Decorrelated (or "diagonalized") Cuts</b>
85 
86  <p>
87  See class description for Method Likelihood for a detailed explanation.
88  End_Html */
89 //
90 
91 #include "TMVA/MethodCuts.h"
92 
93 #include "TMVA/BinarySearchTree.h"
94 #include "TMVA/ClassifierFactory.h"
95 #include "TMVA/Config.h"
96 #include "TMVA/Configurable.h"
97 #include "TMVA/DataSet.h"
98 #include "TMVA/DataSetInfo.h"
99 #include "TMVA/Event.h"
100 #include "TMVA/IFitterTarget.h"
101 #include "TMVA/IMethod.h"
102 #include "TMVA/GeneticFitter.h"
103 #include "TMVA/Interval.h"
104 #include "TMVA/FitterBase.h"
105 #include "TMVA/MCFitter.h"
106 #include "TMVA/MethodBase.h"
107 #include "TMVA/MethodFDA.h"
108 #include "TMVA/MinuitFitter.h"
109 #include "TMVA/MsgLogger.h"
110 #include "TMVA/PDF.h"
111 #include "TMVA/Results.h"
113 #include "TMVA/Timer.h"
114 #include "TMVA/Tools.h"
116 #include "TMVA/Types.h"
117 #include "TMVA/TSpline1.h"
119 
120 #include "Riostream.h"
121 #include "TH1F.h"
122 #include "TObjString.h"
123 #include "TDirectory.h"
124 #include "TMath.h"
125 #include "TGraph.h"
126 #include "TSpline.h"
127 #include "TRandom3.h"
128 
129 #include <cstdlib>
130 #include <iostream>
131 
132 using std::atof;
133 
134 REGISTER_METHOD(Cuts)
135 
137 
139 
140 ////////////////////////////////////////////////////////////////////////////////
141 /// standard constructor
142 
144  const TString& methodTitle,
145  DataSetInfo& theData,
146  const TString& theOption ) :
147  MethodBase( jobName, Types::kCuts, methodTitle, theData, theOption),
148  fFitMethod ( kUseGeneticAlgorithm ),
149  fEffMethod ( kUseEventSelection ),
150  fFitParams (0),
151  fTestSignalEff(0.7),
152  fEffSMin ( 0 ),
153  fEffSMax ( 0 ),
154  fCutRangeMin( 0 ),
155  fCutRangeMax( 0 ),
156  fBinaryTreeS( 0 ),
157  fBinaryTreeB( 0 ),
158  fCutMin ( 0 ),
159  fCutMax ( 0 ),
160  fTmpCutMin ( 0 ),
161  fTmpCutMax ( 0 ),
162  fAllVarsI ( 0 ),
163  fNpar ( 0 ),
164  fEffRef ( 0 ),
165  fRangeSign ( 0 ),
166  fRandom ( 0 ),
167  fMeanS ( 0 ),
168  fMeanB ( 0 ),
169  fRmsS ( 0 ),
170  fRmsB ( 0 ),
171  fEffBvsSLocal( 0 ),
172  fVarHistS ( 0 ),
173  fVarHistB ( 0 ),
174  fVarHistS_smooth( 0 ),
175  fVarHistB_smooth( 0 ),
176  fVarPdfS ( 0 ),
177  fVarPdfB ( 0 ),
178  fNegEffWarning( kFALSE )
179 {
180 }
181 
182 ////////////////////////////////////////////////////////////////////////////////
183 /// construction from weight file
184 
186  const TString& theWeightFile) :
187  MethodBase( Types::kCuts, theData, theWeightFile),
190  fFitParams (0),
191  fTestSignalEff(0.7),
192  fEffSMin ( 0 ),
193  fEffSMax ( 0 ),
194  fCutRangeMin( 0 ),
195  fCutRangeMax( 0 ),
196  fBinaryTreeS( 0 ),
197  fBinaryTreeB( 0 ),
198  fCutMin ( 0 ),
199  fCutMax ( 0 ),
200  fTmpCutMin ( 0 ),
201  fTmpCutMax ( 0 ),
202  fAllVarsI ( 0 ),
203  fNpar ( 0 ),
204  fEffRef ( 0 ),
205  fRangeSign ( 0 ),
206  fRandom ( 0 ),
207  fMeanS ( 0 ),
208  fMeanB ( 0 ),
209  fRmsS ( 0 ),
210  fRmsB ( 0 ),
211  fEffBvsSLocal( 0 ),
212  fVarHistS ( 0 ),
213  fVarHistB ( 0 ),
214  fVarHistS_smooth( 0 ),
215  fVarHistB_smooth( 0 ),
216  fVarPdfS ( 0 ),
217  fVarPdfB ( 0 ),
219 {
220 }
221 
222 ////////////////////////////////////////////////////////////////////////////////
223 /// Cuts can only handle classification with 2 classes
224 
226  UInt_t /*numberTargets*/ )
227 {
228  return (type == Types::kClassification && numberClasses == 2);
229 }
230 
231 ////////////////////////////////////////////////////////////////////////////////
232 /// default initialisation called by all constructors
233 
235 {
236  fVarHistS = fVarHistB = 0;
238  fVarPdfS = fVarPdfB = 0;
239  fFitParams = 0;
241  fEffSMin = 0;
242  fEffSMax = 0;
243 
244  // vector with fit results
245  fNpar = 2*GetNvar();
246  fRangeSign = new std::vector<Int_t> ( GetNvar() );
247  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) (*fRangeSign)[ivar] = +1;
248 
249  fMeanS = new std::vector<Double_t>( GetNvar() );
250  fMeanB = new std::vector<Double_t>( GetNvar() );
251  fRmsS = new std::vector<Double_t>( GetNvar() );
252  fRmsB = new std::vector<Double_t>( GetNvar() );
253 
254  // get the variable specific options, first initialize default
255  fFitParams = new std::vector<EFitParameters>( GetNvar() );
256  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) (*fFitParams)[ivar] = kNotEnforced;
257 
259  fTestSignalEff = -1;
260 
261  // create LUT for cuts
262  fCutMin = new Double_t*[GetNvar()];
263  fCutMax = new Double_t*[GetNvar()];
264  for (UInt_t i=0; i<GetNvar(); i++) {
265  fCutMin[i] = new Double_t[fNbins];
266  fCutMax[i] = new Double_t[fNbins];
267  }
268 
269  // init
270  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
271  for (Int_t ibin=0; ibin<fNbins; ibin++) {
272  fCutMin[ivar][ibin] = 0;
273  fCutMax[ivar][ibin] = 0;
274  }
275  }
276 
277  fTmpCutMin = new Double_t[GetNvar()];
278  fTmpCutMax = new Double_t[GetNvar()];
279 }
280 
281 ////////////////////////////////////////////////////////////////////////////////
282 /// destructor
283 
285 {
286  delete fRangeSign;
287  delete fMeanS;
288  delete fMeanB;
289  delete fRmsS;
290  delete fRmsB;
291  delete fFitParams;
292  delete fEffBvsSLocal;
293 
294  if (NULL != fCutRangeMin) delete [] fCutRangeMin;
295  if (NULL != fCutRangeMax) delete [] fCutRangeMax;
296  if (NULL != fAllVarsI) delete [] fAllVarsI;
297 
298  for (UInt_t i=0;i<GetNvar();i++) {
299  if (NULL != fCutMin[i] ) delete [] fCutMin[i];
300  if (NULL != fCutMax[i] ) delete [] fCutMax[i];
301  if (NULL != fCutRange[i]) delete fCutRange[i];
302  }
303 
304  if (NULL != fCutMin) delete [] fCutMin;
305  if (NULL != fCutMax) delete [] fCutMax;
306 
307  if (NULL != fTmpCutMin) delete [] fTmpCutMin;
308  if (NULL != fTmpCutMax) delete [] fTmpCutMax;
309 
310  if (NULL != fBinaryTreeS) delete fBinaryTreeS;
311  if (NULL != fBinaryTreeB) delete fBinaryTreeB;
312 }
313 
314 ////////////////////////////////////////////////////////////////////////////////
315 /// define the options (their key words) that can be set in the option string
316 /// know options:
317 /// Method <string> Minimisation method
318 /// available values are: MC Monte Carlo <default>
319 /// GA Genetic Algorithm
320 /// SA Simulated annealing
321 ///
322 /// EffMethod <string> Efficiency selection method
323 /// available values are: EffSel <default>
324 /// EffPDF
325 ///
326 /// VarProp <string> Property of variable 1 for the MC method (taking precedence over the
327 /// globale setting. The same values as for the global option are available. Variables 1..10 can be
328 /// set this way
329 ///
330 /// CutRangeMin/Max <float> user-defined ranges in which cuts are varied
331 
333 {
334  DeclareOptionRef(fFitMethodS = "GA", "FitMethod", "Minimisation Method (GA, SA, and MC are the primary methods to be used; the others have been introduced for testing purposes and are depreciated)");
335  AddPreDefVal(TString("GA"));
336  AddPreDefVal(TString("SA"));
337  AddPreDefVal(TString("MC"));
338  AddPreDefVal(TString("MCEvents"));
339  AddPreDefVal(TString("MINUIT"));
340  AddPreDefVal(TString("EventScan"));
341 
342  // selection type
343  DeclareOptionRef(fEffMethodS = "EffSel", "EffMethod", "Selection Method");
344  AddPreDefVal(TString("EffSel"));
345  AddPreDefVal(TString("EffPDF"));
346 
347  // cut ranges
348  fCutRange.resize(GetNvar());
349  fCutRangeMin = new Double_t[GetNvar()];
350  fCutRangeMax = new Double_t[GetNvar()];
351  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
352  fCutRange[ivar] = 0;
353  fCutRangeMin[ivar] = fCutRangeMax[ivar] = -1;
354  }
355 
356  DeclareOptionRef( fCutRangeMin, GetNvar(), "CutRangeMin", "Minimum of allowed cut range (set per variable)" );
357  DeclareOptionRef( fCutRangeMax, GetNvar(), "CutRangeMax", "Maximum of allowed cut range (set per variable)" );
358 
359  fAllVarsI = new TString[GetNvar()];
360 
361  for (UInt_t i=0; i<GetNvar(); i++) fAllVarsI[i] = "NotEnforced";
362 
363  DeclareOptionRef(fAllVarsI, GetNvar(), "VarProp", "Categorisation of cuts");
364  AddPreDefVal(TString("NotEnforced"));
365  AddPreDefVal(TString("FMax"));
366  AddPreDefVal(TString("FMin"));
367  AddPreDefVal(TString("FSmart"));
368 }
369 
370 ////////////////////////////////////////////////////////////////////////////////
371 /// process user options
372 /// sanity check, do not allow the input variables to be normalised, because this
373 /// only creates problems when interpreting the cuts
374 
376 {
377  if (IsNormalised()) {
378  Log() << kWARNING << "Normalisation of the input variables for cut optimisation is not" << Endl;
379  Log() << kWARNING << "supported because this provides intransparent cut values, and no" << Endl;
380  Log() << kWARNING << "improvement in the performance of the algorithm." << Endl;
381  Log() << kWARNING << "Please remove \"Normalise\" option from booking option string" << Endl;
382  Log() << kWARNING << "==> Will reset normalisation flag to \"False\"" << Endl;
384  }
385 
387  Log() << kFATAL << "Mechanism to ignore events with negative weights in training not yet available for method: "
388  << GetMethodTypeName()
389  << " --> Please remove \"IgnoreNegWeightsInTraining\" option from booking string."
390  << Endl;
391  }
392 
393  if (fFitMethodS == "MC" ) fFitMethod = kUseMonteCarlo;
394  else if (fFitMethodS == "MCEvents") fFitMethod = kUseMonteCarloEvents;
395  else if (fFitMethodS == "GA" ) fFitMethod = kUseGeneticAlgorithm;
396  else if (fFitMethodS == "SA" ) fFitMethod = kUseSimulatedAnnealing;
397  else if (fFitMethodS == "MINUIT" ) {
399  Log() << kWARNING << "poor performance of MINUIT in MethodCuts; preferred fit method: GA" << Endl;
400  }
401  else if (fFitMethodS == "EventScan" ) fFitMethod = kUseEventScan;
402  else Log() << kFATAL << "unknown minimisation method: " << fFitMethodS << Endl;
403 
404  if (fEffMethodS == "EFFSEL" ) fEffMethod = kUseEventSelection; // highly recommended
405  else if (fEffMethodS == "EFFPDF" ) fEffMethod = kUsePDFs;
407 
408  // options output
409  Log() << kINFO << Form("Use optimization method: \"%s\"",
410  (fFitMethod == kUseMonteCarlo) ? "Monte Carlo" :
411  (fFitMethod == kUseMonteCarlo) ? "Monte-Carlo-Event sampling" :
412  (fFitMethod == kUseEventScan) ? "Full Event Scan (slow)" :
413  (fFitMethod == kUseMinuit) ? "MINUIT" : "Genetic Algorithm" ) << Endl;
414  Log() << kINFO << Form("Use efficiency computation method: \"%s\"",
415  (fEffMethod == kUseEventSelection) ? "Event Selection" : "PDF" ) << Endl;
416 
417  // cut ranges
418  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
419  fCutRange[ivar] = new Interval( fCutRangeMin[ivar], fCutRangeMax[ivar] );
420  }
421 
422  // individual options
423  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
424  EFitParameters theFitP = kNotEnforced;
425  if (fAllVarsI[ivar] == "" || fAllVarsI[ivar] == "NotEnforced") theFitP = kNotEnforced;
426  else if (fAllVarsI[ivar] == "FMax" ) theFitP = kForceMax;
427  else if (fAllVarsI[ivar] == "FMin" ) theFitP = kForceMin;
428  else if (fAllVarsI[ivar] == "FSmart" ) theFitP = kForceSmart;
429  else {
430  Log() << kFATAL << "unknown value \'" << fAllVarsI[ivar]
431  << "\' for fit parameter option " << Form("VarProp[%i]",ivar) << Endl;
432  }
433  (*fFitParams)[ivar] = theFitP;
434 
435  if (theFitP != kNotEnforced)
436  Log() << kINFO << "Use \"" << fAllVarsI[ivar]
437  << "\" cuts for variable: " << "'" << (*fInputVars)[ivar] << "'" << Endl;
438  }
439 }
440 
441 ////////////////////////////////////////////////////////////////////////////////
442 /// cut evaluation: returns 1.0 if event passed, 0.0 otherwise
443 
445 {
446  // cannot determine error
447  NoErrorCalc(err, errUpper);
448 
449  // sanity check
450  if (fCutMin == NULL || fCutMax == NULL || fNbins == 0) {
451  Log() << kFATAL << "<Eval_Cuts> fCutMin/Max have zero pointer. "
452  << "Did you book Cuts ?" << Endl;
453  }
454 
455  const Event* ev = GetEvent();
456 
457  // sanity check
458  if (fTestSignalEff > 0) {
459  // get efficiency bin
461  if (ibin < 0 ) ibin = 0;
462  else if (ibin >= fNbins) ibin = fNbins - 1;
463 
464  Bool_t passed = kTRUE;
465  for (UInt_t ivar=0; ivar<GetNvar(); ivar++)
466  passed &= ( (ev->GetValue(ivar) > fCutMin[ivar][ibin]) &&
467  (ev->GetValue(ivar) <= fCutMax[ivar][ibin]) );
468 
469  return passed ? 1. : 0. ;
470  }
471  else return 0;
472 }
473 
474 ////////////////////////////////////////////////////////////////////////////////
475 /// print cuts
476 
478 {
479  std::vector<Double_t> cutsMin;
480  std::vector<Double_t> cutsMax;
481  Int_t ibin = fEffBvsSLocal->FindBin( effS );
482 
483  Double_t trueEffS = GetCuts( effS, cutsMin, cutsMax );
484 
485  // retrieve variable expressions (could be transformations)
486  std::vector<TString>* varVec = 0;
487  if (GetTransformationHandler().GetNumOfTransformations() == 0) {
488  // no transformation applied, replace by current variables
489  varVec = new std::vector<TString>;
490  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
491  varVec->push_back( DataInfo().GetVariableInfo(ivar).GetLabel() );
492  }
493  }
494  else if (GetTransformationHandler().GetNumOfTransformations() == 1) {
495  // get transformation string
497  }
498  else {
499  // replace transformation print by current variables and indicated incompleteness
500  varVec = new std::vector<TString>;
501  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
502  varVec->push_back( DataInfo().GetVariableInfo(ivar).GetLabel() + " [transformed]" );
503  }
504  }
505 
506  UInt_t maxL = 0;
507  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
508  if ((UInt_t)(*varVec)[ivar].Length() > maxL) maxL = (*varVec)[ivar].Length();
509  }
510  UInt_t maxLine = 20+maxL+16;
511 
512  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
513  Log() << Endl;
514  Log() << kHEADER << "Cut values for requested signal efficiency: " << trueEffS << Endl;
515  Log() << kINFO << "Corresponding background efficiency : " << fEffBvsSLocal->GetBinContent( ibin ) << Endl;
516  if (GetTransformationHandler().GetNumOfTransformations() == 1) {
517  Log() << kINFO << "Transformation applied to input variables : \""
519  }
520  else if (GetTransformationHandler().GetNumOfTransformations() > 1) {
521  Log() << kINFO << "[ More than one (=" << GetTransformationHandler().GetNumOfTransformations() << ") "
522  << " transformations applied in transformation chain; cuts applied on transformed quantities ] " << Endl;
523  }
524  else {
525  Log() << kINFO << "Transformation applied to input variables : None" << Endl;
526  }
527  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
528  Log() << Endl;
529  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
530  Log() << kINFO
531  << "Cut[" << std::setw(2) << ivar << "]: "
532  << std::setw(10) << cutsMin[ivar]
533  << " < "
534  << std::setw(maxL) << (*varVec)[ivar]
535  << " <= "
536  << std::setw(10) << cutsMax[ivar] << Endl;
537  }
538  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
539  Log() << Endl;
540 
541  delete varVec; // yes, ownership has been given to us
542 }
543 
544 ////////////////////////////////////////////////////////////////////////////////
545 /// retrieve cut values for given signal efficiency
546 /// assume vector of correct size !!
547 
549 {
550  std::vector<Double_t> cMin( GetNvar() );
551  std::vector<Double_t> cMax( GetNvar() );
552  Double_t trueEffS = GetCuts( effS, cMin, cMax );
553  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
554  cutMin[ivar] = cMin[ivar];
555  cutMax[ivar] = cMax[ivar];
556  }
557  return trueEffS;
558 }
559 
560 ////////////////////////////////////////////////////////////////////////////////
561 /// retrieve cut values for given signal efficiency
562 
564  std::vector<Double_t>& cutMin,
565  std::vector<Double_t>& cutMax ) const
566 {
567  // find corresponding bin
568  Int_t ibin = fEffBvsSLocal->FindBin( effS );
569 
570  // get the true efficiency which is the one on the "left hand" side of the bin
571  Double_t trueEffS = fEffBvsSLocal->GetBinLowEdge( ibin );
572 
573  ibin--; // the 'cut' vector has 0...fNbins indices
574  if (ibin < 0 ) ibin = 0;
575  else if (ibin >= fNbins) ibin = fNbins - 1;
576 
577  cutMin.clear();
578  cutMax.clear();
579  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
580  cutMin.push_back( fCutMin[ivar][ibin] );
581  cutMax.push_back( fCutMax[ivar][ibin] );
582  }
583 
584  return trueEffS;
585 }
586 
587 ////////////////////////////////////////////////////////////////////////////////
588 /// training method: here the cuts are optimised for the training sample
589 
591 {
592  if (fEffMethod == kUsePDFs) CreateVariablePDFs(); // create PDFs for variables
593 
594  // create binary trees (global member variables) for signal and background
595  if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
596  if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
597 
598  // the variables may be transformed by a transformation method: to coherently
599  // treat signal and background one must decide which transformation type shall
600  // be used: our default is signal-type
601 
606 
607  for (UInt_t ivar =0; ivar < Data()->GetNVariables(); ivar++) {
608  (*fMeanS)[ivar] = fBinaryTreeS->Mean(Types::kSignal, ivar);
609  (*fRmsS)[ivar] = fBinaryTreeS->RMS (Types::kSignal, ivar);
610  (*fMeanB)[ivar] = fBinaryTreeB->Mean(Types::kBackground, ivar);
611  (*fRmsB)[ivar] = fBinaryTreeB->RMS (Types::kBackground, ivar);
612 
613  // update interval ?
618 
619  // redefine ranges to be slightly smaller and larger than xmin and xmax, respectively
620  Double_t eps = 0.01*(xmax - xmin);
621  xmin -= eps;
622  xmax += eps;
623 
624  if (TMath::Abs(fCutRange[ivar]->GetMin() - fCutRange[ivar]->GetMax()) < 1.0e-300 ) {
625  fCutRange[ivar]->SetMin( xmin );
626  fCutRange[ivar]->SetMax( xmax );
627  }
628  else if (xmin > fCutRange[ivar]->GetMin()) fCutRange[ivar]->SetMin( xmin );
629  else if (xmax < fCutRange[ivar]->GetMax()) fCutRange[ivar]->SetMax( xmax );
630  }
631 
632  std::vector<TH1F*> signalDist, bkgDist;
633 
634  // this is important: reset the branch addresses of the training tree to the current event
635  delete fEffBvsSLocal;
636  fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
637  TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
638  fEffBvsSLocal->SetDirectory(0); // it's local
639 
640  // init
641  for (Int_t ibin=1; ibin<=fNbins; ibin++) fEffBvsSLocal->SetBinContent( ibin, -0.1 );
642 
643  // --------------------------------------------------------------------------
646  fFitMethod == kUseMinuit ||
648 
649  // ranges
650  std::vector<Interval*> ranges;
651 
652  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
653 
654  Int_t nbins = 0;
655  if (DataInfo().GetVariableInfo(ivar).GetVarType() == 'I') {
656  nbins = Int_t(fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin()) + 1;
657  }
658 
659  if ((*fFitParams)[ivar] == kForceSmart) {
660  if ((*fMeanS)[ivar] > (*fMeanB)[ivar]) (*fFitParams)[ivar] = kForceMax;
661  else (*fFitParams)[ivar] = kForceMin;
662  }
663 
664  if ((*fFitParams)[ivar] == kForceMin) {
665  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMin(), nbins ) );
666  ranges.push_back( new Interval( 0, fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
667  }
668  else if ((*fFitParams)[ivar] == kForceMax) {
669  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMax(), nbins ) );
670  ranges.push_back( new Interval( fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(),
671  fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
672  }
673  else {
674  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMax(), nbins ) );
675  ranges.push_back( new Interval( 0, fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
676  }
677  }
678 
679  // create the fitter
680  FitterBase* fitter = NULL;
681 
682  switch (fFitMethod) {
684  fitter = new GeneticFitter( *this, Form("%sFitter_GA", GetName()), ranges, GetOptions() );
685  break;
686  case kUseMonteCarlo:
687  fitter = new MCFitter ( *this, Form("%sFitter_MC", GetName()), ranges, GetOptions() );
688  break;
689  case kUseMinuit:
690  fitter = new MinuitFitter ( *this, Form("%sFitter_MINUIT", GetName()), ranges, GetOptions() );
691  break;
693  fitter = new SimulatedAnnealingFitter( *this, Form("%sFitter_SA", GetName()), ranges, GetOptions() );
694  break;
695  default:
696  Log() << kFATAL << "Wrong fit method: " << fFitMethod << Endl;
697  }
698 
700 
701  fitter->CheckForUnusedOptions();
702 
703  // perform the fit
704  fitter->Run();
705 
706  // clean up
707  for (UInt_t ivar=0; ivar<ranges.size(); ivar++) delete ranges[ivar];
708  delete fitter;
709 
710  }
711  // --------------------------------------------------------------------------
712  else if (fFitMethod == kUseEventScan) {
713 
714  Int_t nevents = Data()->GetNEvents();
715  Int_t ic = 0;
716 
717  // timing of MC
718  Int_t nsamples = Int_t(0.5*nevents*(nevents - 1));
719  Timer timer( nsamples, GetName() );
720  fIPyMaxIter = nsamples;
721 
722  Log() << kINFO << "Running full event scan: " << Endl;
723  for (Int_t ievt1=0; ievt1<nevents; ievt1++) {
724  for (Int_t ievt2=ievt1+1; ievt2<nevents; ievt2++) {
725 
726  fIPyCurrentIter = ic;
727  if (fExitFromTraining) break;
728  EstimatorFunction( ievt1, ievt2 );
729 
730  // what's the time please?
731  ic++;
732  if ((nsamples<10000) || ic%10000 == 0) timer.DrawProgressBar( ic );
733  }
734  }
735  }
736  // --------------------------------------------------------------------------
737  else if (fFitMethod == kUseMonteCarloEvents) {
738 
739  Int_t nsamples = 200000;
740  UInt_t seed = 100;
741  DeclareOptionRef( nsamples, "SampleSize", "Number of Monte-Carlo-Event samples" );
742  DeclareOptionRef( seed, "Seed", "Seed for the random generator (0 takes random seeds)" );
743  ParseOptions();
744 
745  Int_t nevents = Data()->GetNEvents();
746  Int_t ic = 0;
747 
748  // timing of MC
749  Timer timer( nsamples, GetName() );
750  fIPyMaxIter = nsamples;
751 
752  // random generator
753  TRandom3*rnd = new TRandom3( seed );
754 
755  Log() << kINFO << "Running Monte-Carlo-Event sampling over " << nsamples << " events" << Endl;
756  std::vector<Double_t> pars( 2*GetNvar() );
757 
758  for (Int_t itoy=0; itoy<nsamples; itoy++) {
759  fIPyCurrentIter = ic;
760  if (fExitFromTraining) break;
761 
762  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
763 
764  // generate minimum and delta cuts for this variable
765 
766  // retrieve signal events
767  Bool_t isSignal = kFALSE;
768  Int_t ievt1, ievt2;
769  Double_t evt1 = 0., evt2 = 0.;
770  Int_t nbreak = 0;
771  while (!isSignal) {
772  ievt1 = Int_t(rnd->Uniform(0.,1.)*nevents);
773  ievt2 = Int_t(rnd->Uniform(0.,1.)*nevents);
774 
775  const Event *ev1 = GetEvent(ievt1);
776  isSignal = DataInfo().IsSignal(ev1);
777  evt1 = ev1->GetValue( ivar );
778 
779  const Event *ev2 = GetEvent(ievt2);
780  isSignal &= DataInfo().IsSignal(ev2);
781  evt2 = ev2->GetValue( ivar );
782 
783  if (nbreak++ > 10000) Log() << kFATAL << "<MCEvents>: could not find signal events"
784  << " after 10000 trials - do you have signal events in your sample ?"
785  << Endl;
786  isSignal = 1;
787  }
788 
789  // sort
790  if (evt1 > evt2) { Double_t z = evt1; evt1 = evt2; evt2 = z; }
791  pars[2*ivar] = evt1;
792  pars[2*ivar+1] = evt2 - evt1;
793  }
794 
795  // compute estimator
796  EstimatorFunction( pars );
797 
798  // what's the time please?
799  ic++;
800  if ((nsamples<1000) || ic%1000 == 0) timer.DrawProgressBar( ic );
801  }
802 
803  delete rnd;
804  }
805  // --------------------------------------------------------------------------
806  else Log() << kFATAL << "Unknown minimisation method: " << fFitMethod << Endl;
807 
808  if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
809  if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
810 
811  // force cut ranges within limits
812  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
813  for (Int_t ibin=0; ibin<fNbins; ibin++) {
814 
815  if ((*fFitParams)[ivar] == kForceMin && fCutMin[ivar][ibin] > -fgMaxAbsCutVal) {
816  fCutMin[ivar][ibin] = -fgMaxAbsCutVal;
817  }
818  if ((*fFitParams)[ivar] == kForceMax && fCutMax[ivar][ibin] < fgMaxAbsCutVal) {
819  fCutMax[ivar][ibin] = fgMaxAbsCutVal;
820  }
821  }
822  }
823 
824  // some output
825  // the efficiency which is asked for has to be slightly higher than the bin-borders.
826  // if not, then the wrong bin is taken in some cases.
827  Double_t epsilon = 0.0001;
828  for (Double_t eff=0.1; eff<0.95; eff += 0.1) PrintCuts( eff+epsilon );
829 
832 }
833 
834 ////////////////////////////////////////////////////////////////////////////////
835 /// nothing to test
836 
838 {
839 }
840 
841 ////////////////////////////////////////////////////////////////////////////////
842 /// for full event scan
843 
845 {
846  const Event *ev1 = GetEvent(ievt1);
847  if (!DataInfo().IsSignal(ev1)) return -1;
848 
849  const Event *ev2 = GetEvent(ievt2);
850  if (!DataInfo().IsSignal(ev2)) return -1;
851 
852  const Int_t nvar = GetNvar();
853  Double_t* evt1 = new Double_t[nvar];
854  Double_t* evt2 = new Double_t[nvar];
855 
856  for (Int_t ivar=0; ivar<nvar; ivar++) {
857  evt1[ivar] = ev1->GetValue( ivar );
858  evt2[ivar] = ev2->GetValue( ivar );
859  }
860 
861  // determine cuts
862  std::vector<Double_t> pars;
863  for (Int_t ivar=0; ivar<nvar; ivar++) {
864  Double_t cutMin;
865  Double_t cutMax;
866  if (evt1[ivar] < evt2[ivar]) {
867  cutMin = evt1[ivar];
868  cutMax = evt2[ivar];
869  }
870  else {
871  cutMin = evt2[ivar];
872  cutMax = evt1[ivar];
873  }
874 
875  pars.push_back( cutMin );
876  pars.push_back( cutMax - cutMin );
877  }
878 
879  delete [] evt1;
880  delete [] evt2;
881 
882  return ComputeEstimator( pars );
883 }
884 
885 ////////////////////////////////////////////////////////////////////////////////
886 /// returns estimator for "cut fitness" used by GA
887 
888 Double_t TMVA::MethodCuts::EstimatorFunction( std::vector<Double_t>& pars )
889 {
890  return ComputeEstimator( pars );
891 }
892 
893 ////////////////////////////////////////////////////////////////////////////////
894 /// returns estimator for "cut fitness" used by GA
895 /// there are two requirements:
896 /// 1) the signal efficiency must be equal to the required one in the
897 /// efficiency scan
898 /// 2) the background efficiency must be as small as possible
899 /// the requirement 1) has priority over 2)
900 
901 Double_t TMVA::MethodCuts::ComputeEstimator( std::vector<Double_t>& pars )
902 {
903  // caution: the npar gives the _free_ parameters
904  // however: the "pars" array contains all parameters
905 
906  // determine cuts
907  Double_t effS = 0, effB = 0;
908  this->MatchParsToCuts( pars, &fTmpCutMin[0], &fTmpCutMax[0] );
909 
910  // retrieve signal and background efficiencies for given cut
911  switch (fEffMethod) {
912  case kUsePDFs:
913  this->GetEffsfromPDFs (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
914  break;
915  case kUseEventSelection:
916  this->GetEffsfromSelection (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
917  break;
918  default:
919  this->GetEffsfromSelection (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
920  }
921 
922  Double_t eta = 0;
923 
924  // test for a estimator function which optimizes on the whole background-rejection signal-efficiency plot
925 
926  // get the backg-reject. and sig-eff for the parameters given to this function
927  // effS, effB
928 
929  // get best background rejection for given signal efficiency
930  Int_t ibinS = fEffBvsSLocal->FindBin( effS );
931 
932  Double_t effBH = fEffBvsSLocal->GetBinContent( ibinS );
933  Double_t effBH_left = (ibinS > 1 ) ? fEffBvsSLocal->GetBinContent( ibinS-1 ) : effBH;
934  Double_t effBH_right = (ibinS < fNbins) ? fEffBvsSLocal->GetBinContent( ibinS+1 ) : effBH;
935 
936  Double_t average = 0.5*(effBH_left + effBH_right);
937  if (effBH < effB) average = effBH;
938 
939  // if the average of the bin right and left is larger than this one, add the difference to
940  // the current value of the estimator (because you can do at least so much better)
941  eta = ( -TMath::Abs(effBH-average) + (1.0 - (effBH - effB))) / (1.0 + effS);
942  // alternative idea
943  //if (effBH<0) eta = (1.e-6+effB)/(1.0 + effS);
944  //else eta = (effB - effBH) * (1.0 + 10.* effS);
945 
946  // if a point is found which is better than an existing one, ... replace it.
947  // preliminary best event -> backup
948  if (effBH < 0 || effBH > effB) {
949  fEffBvsSLocal->SetBinContent( ibinS, effB );
950  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
951  fCutMin[ivar][ibinS-1] = fTmpCutMin[ivar]; // bin 1 stored in index 0
952  fCutMax[ivar][ibinS-1] = fTmpCutMax[ivar];
953  }
954  }
955 
956  // caution (!) this value is not good for a decision for MC, .. it is designed for GA
957  // but .. it doesn't matter, as MC samplings are independent from the former ones
958  // and the replacement of the best variables by better ones is done about 10 lines above.
959  // ( if (effBH < 0 || effBH > effB) { .... )
960 
961  if (ibinS<=1) {
962  // add penalty for effS=0 bin
963  // to avoid that the minimizer gets stuck in the zero-bin
964  // force it towards higher efficiency
965  Double_t penalty=0.,diff=0.;
966  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
967  diff=(fCutRange[ivar]->GetMax()-fTmpCutMax[ivar])/(fCutRange[ivar]->GetMax()-fCutRange[ivar]->GetMin());
968  penalty+=diff*diff;
969  diff=(fCutRange[ivar]->GetMin()-fTmpCutMin[ivar])/(fCutRange[ivar]->GetMax()-fCutRange[ivar]->GetMin());
970  penalty+=4.*diff*diff;
971  }
972 
973  if (effS<1.e-4) return 10.0+penalty;
974  else return 10.*(1.-10.*effS);
975  }
976  return eta;
977 }
978 
979 ////////////////////////////////////////////////////////////////////////////////
980 /// translates parameters into cuts
981 
982 void TMVA::MethodCuts::MatchParsToCuts( const std::vector<Double_t> & pars,
983  Double_t* cutMin, Double_t* cutMax )
984 {
985  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
986  Int_t ipar = 2*ivar;
987  cutMin[ivar] = ((*fRangeSign)[ivar] > 0) ? pars[ipar] : pars[ipar] - pars[ipar+1];
988  cutMax[ivar] = ((*fRangeSign)[ivar] > 0) ? pars[ipar] + pars[ipar+1] : pars[ipar];
989  }
990 }
991 
992 ////////////////////////////////////////////////////////////////////////////////
993 /// translate the cuts into parameters (obsolete function)
994 
995 void TMVA::MethodCuts::MatchCutsToPars( std::vector<Double_t>& pars,
996  Double_t** cutMinAll, Double_t** cutMaxAll, Int_t ibin )
997 {
998  if (ibin < 1 || ibin > fNbins) Log() << kFATAL << "::MatchCutsToPars: bin error: "
999  << ibin << Endl;
1000 
1001  const UInt_t nvar = GetNvar();
1002  Double_t *cutMin = new Double_t[nvar];
1003  Double_t *cutMax = new Double_t[nvar];
1004  for (UInt_t ivar=0; ivar<nvar; ivar++) {
1005  cutMin[ivar] = cutMinAll[ivar][ibin-1];
1006  cutMax[ivar] = cutMaxAll[ivar][ibin-1];
1007  }
1008 
1009  MatchCutsToPars( pars, cutMin, cutMax );
1010  delete [] cutMin;
1011  delete [] cutMax;
1012 }
1013 
1014 ////////////////////////////////////////////////////////////////////////////////
1015 /// translates cuts into parameters
1016 
1017 void TMVA::MethodCuts::MatchCutsToPars( std::vector<Double_t>& pars,
1018  Double_t* cutMin, Double_t* cutMax )
1019 {
1020  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1021  Int_t ipar = 2*ivar;
1022  pars[ipar] = ((*fRangeSign)[ivar] > 0) ? cutMin[ivar] : cutMax[ivar];
1023  pars[ipar+1] = cutMax[ivar] - cutMin[ivar];
1024  }
1025 }
1026 
1027 ////////////////////////////////////////////////////////////////////////////////
1028 /// compute signal and background efficiencies from PDFs
1029 /// for given cut sample
1030 
1032  Double_t& effS, Double_t& effB )
1033 {
1034  effS = 1.0;
1035  effB = 1.0;
1036  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1037  effS *= (*fVarPdfS)[ivar]->GetIntegral( cutMin[ivar], cutMax[ivar] );
1038  effB *= (*fVarPdfB)[ivar]->GetIntegral( cutMin[ivar], cutMax[ivar] );
1039  }
1040 
1041  // quick fix to prevent from efficiencies < 0
1042  if( effS < 0.0 ) {
1043  effS = 0.0;
1044  if( !fNegEffWarning ) Log() << kWARNING << "Negative signal efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
1046  }
1047  if( effB < 0.0 ) {
1048  effB = 0.0;
1049  if( !fNegEffWarning ) Log() << kWARNING << "Negative background efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
1051  }
1052 }
1053 
1054 ////////////////////////////////////////////////////////////////////////////////
1055 /// compute signal and background efficiencies from event counting
1056 /// for given cut sample
1057 
1059  Double_t& effS, Double_t& effB)
1060 {
1061  Float_t nTotS = 0, nTotB = 0;
1062  Float_t nSelS = 0, nSelB = 0;
1063 
1064  Volume* volume = new Volume( cutMin, cutMax, GetNvar() );
1065 
1066  // search for all events lying in the volume, and add up their weights
1067  nSelS = fBinaryTreeS->SearchVolume( volume );
1068  nSelB = fBinaryTreeB->SearchVolume( volume );
1069 
1070  delete volume;
1071 
1072  // total number of "events" (sum of weights) as reference to compute efficiency
1073  nTotS = fBinaryTreeS->GetSumOfWeights();
1074  nTotB = fBinaryTreeB->GetSumOfWeights();
1075 
1076  // sanity check
1077  if (nTotS == 0 && nTotB == 0) {
1078  Log() << kFATAL << "<GetEffsfromSelection> fatal error in zero total number of events:"
1079  << " nTotS, nTotB: " << nTotS << " " << nTotB << " ***" << Endl;
1080  }
1081 
1082  // efficiencies
1083  if (nTotS == 0 ) {
1084  effS = 0;
1085  effB = nSelB/nTotB;
1086  Log() << kWARNING << "<ComputeEstimator> zero number of signal events" << Endl;
1087  }
1088  else if (nTotB == 0) {
1089  effB = 0;
1090  effS = nSelS/nTotS;
1091  Log() << kWARNING << "<ComputeEstimator> zero number of background events" << Endl;
1092  }
1093  else {
1094  effS = nSelS/nTotS;
1095  effB = nSelB/nTotB;
1096  }
1097 
1098  // quick fix to prevent from efficiencies < 0
1099  if( effS < 0.0 ) {
1100  effS = 0.0;
1101  if( !fNegEffWarning ) Log() << kWARNING << "Negative signal efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
1103  }
1104  if( effB < 0.0 ) {
1105  effB = 0.0;
1106  if( !fNegEffWarning ) Log() << kWARNING << "Negative background efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
1108  }
1109 }
1110 
1111 ////////////////////////////////////////////////////////////////////////////////
1112 /// for PDF method: create efficiency reference histograms and PDFs
1113 
1115 {
1116  // create list of histograms and PDFs
1117  fVarHistS = new std::vector<TH1*>( GetNvar() );
1118  fVarHistB = new std::vector<TH1*>( GetNvar() );
1119  fVarHistS_smooth = new std::vector<TH1*>( GetNvar() );
1120  fVarHistB_smooth = new std::vector<TH1*>( GetNvar() );
1121  fVarPdfS = new std::vector<PDF*>( GetNvar() );
1122  fVarPdfB = new std::vector<PDF*>( GetNvar() );
1123 
1124  Int_t nsmooth = 0;
1125 
1126  // get min and max values of all events
1127  Double_t minVal = DBL_MAX;
1128  Double_t maxVal = -DBL_MAX;
1129  for( UInt_t ievt=0; ievt<Data()->GetNEvents(); ievt++ ){
1130  const Event *ev = GetEvent(ievt);
1131  Float_t val = ev->GetValue(ievt);
1132  if( val > minVal ) minVal = val;
1133  if( val < maxVal ) maxVal = val;
1134  }
1135 
1136  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1137 
1138  // ---- signal
1139  TString histTitle = (*fInputVars)[ivar] + " signal training";
1140  TString histName = (*fInputVars)[ivar] + "_sig";
1141  // TString drawOpt = (*fInputVars)[ivar] + ">>h(";
1142  // drawOpt += fNbins;
1143  // drawOpt += ")";
1144 
1145  // selection
1146  // Data().GetTrainingTree()->Draw( drawOpt, "type==1", "goff" );
1147  // (*fVarHistS)[ivar] = (TH1F*)gDirectory->Get("h");
1148  // (*fVarHistS)[ivar]->SetName(histName);
1149  // (*fVarHistS)[ivar]->SetTitle(histTitle);
1150 
1151  (*fVarHistS)[ivar] = new TH1F(histName.Data(), histTitle.Data(), fNbins, minVal, maxVal );
1152 
1153  // ---- background
1154  histTitle = (*fInputVars)[ivar] + " background training";
1155  histName = (*fInputVars)[ivar] + "_bgd";
1156  // drawOpt = (*fInputVars)[ivar] + ">>h(";
1157  // drawOpt += fNbins;
1158  // drawOpt += ")";
1159 
1160  // Data().GetTrainingTree()->Draw( drawOpt, "type==0", "goff" );
1161  // (*fVarHistB)[ivar] = (TH1F*)gDirectory->Get("h");
1162  // (*fVarHistB)[ivar]->SetName(histName);
1163  // (*fVarHistB)[ivar]->SetTitle(histTitle);
1164 
1165 
1166  (*fVarHistB)[ivar] = new TH1F(histName.Data(), histTitle.Data(), fNbins, minVal, maxVal );
1167 
1168  for( UInt_t ievt=0; ievt<Data()->GetNEvents(); ievt++ ){
1169  const Event *ev = GetEvent(ievt);
1170  Float_t val = ev->GetValue(ievt);
1171  if( DataInfo().IsSignal(ev) ){
1172  (*fVarHistS)[ivar]->Fill( val );
1173  }else{
1174  (*fVarHistB)[ivar]->Fill( val );
1175  }
1176  }
1177 
1178 
1179 
1180  // make copy for smoothed histos
1181  (*fVarHistS_smooth)[ivar] = (TH1F*)(*fVarHistS)[ivar]->Clone();
1182  histTitle = (*fInputVars)[ivar] + " signal training smoothed ";
1183  histTitle += nsmooth;
1184  histTitle +=" times";
1185  histName = (*fInputVars)[ivar] + "_sig_smooth";
1186  (*fVarHistS_smooth)[ivar]->SetName(histName);
1187  (*fVarHistS_smooth)[ivar]->SetTitle(histTitle);
1188 
1189  // smooth
1190  (*fVarHistS_smooth)[ivar]->Smooth(nsmooth);
1191 
1192  // ---- background
1193  // histTitle = (*fInputVars)[ivar] + " background training";
1194  // histName = (*fInputVars)[ivar] + "_bgd";
1195  // drawOpt = (*fInputVars)[ivar] + ">>h(";
1196  // drawOpt += fNbins;
1197  // drawOpt += ")";
1198 
1199  // Data().GetTrainingTree()->Draw( drawOpt, "type==0", "goff" );
1200  // (*fVarHistB)[ivar] = (TH1F*)gDirectory->Get("h");
1201  // (*fVarHistB)[ivar]->SetName(histName);
1202  // (*fVarHistB)[ivar]->SetTitle(histTitle);
1203 
1204  // make copy for smoothed histos
1205  (*fVarHistB_smooth)[ivar] = (TH1F*)(*fVarHistB)[ivar]->Clone();
1206  histTitle = (*fInputVars)[ivar]+" background training smoothed ";
1207  histTitle += nsmooth;
1208  histTitle +=" times";
1209  histName = (*fInputVars)[ivar]+"_bgd_smooth";
1210  (*fVarHistB_smooth)[ivar]->SetName(histName);
1211  (*fVarHistB_smooth)[ivar]->SetTitle(histTitle);
1212 
1213  // smooth
1214  (*fVarHistB_smooth)[ivar]->Smooth(nsmooth);
1215 
1216  // create PDFs
1217  (*fVarPdfS)[ivar] = new PDF( TString(GetName()) + " PDF Var Sig " + GetInputVar( ivar ), (*fVarHistS_smooth)[ivar], PDF::kSpline2 );
1218  (*fVarPdfB)[ivar] = new PDF( TString(GetName()) + " PDF Var Bkg " + GetInputVar( ivar ), (*fVarHistB_smooth)[ivar], PDF::kSpline2 );
1219  }
1220 }
1221 
1222 ////////////////////////////////////////////////////////////////////////////////
1223 /// read the cuts from stream
1224 
1226 {
1227  TString dummy;
1228  UInt_t dummyInt;
1229 
1230  // first the dimensions
1231  istr >> dummy >> dummy;
1232  // coverity[tainted_data_argument]
1233  istr >> dummy >> fNbins;
1234 
1235  // get rid of one read-in here because we read in once all ready to check for decorrelation
1236  istr >> dummy >> dummy >> dummy >> dummy >> dummy >> dummy >> dummyInt >> dummy ;
1237 
1238  // sanity check
1239  if (dummyInt != Data()->GetNVariables()) {
1240  Log() << kFATAL << "<ReadWeightsFromStream> fatal error: mismatch "
1241  << "in number of variables: " << dummyInt << " != " << Data()->GetNVariables() << Endl;
1242  }
1243  //SetNvar(dummyInt);
1244 
1245  // print some information
1246  if (fFitMethod == kUseMonteCarlo) {
1247  Log() << kWARNING << "Read cuts optimised using sample of MC events" << Endl;
1248  }
1249  else if (fFitMethod == kUseMonteCarloEvents) {
1250  Log() << kWARNING << "Read cuts optimised using sample of MC events" << Endl;
1251  }
1252  else if (fFitMethod == kUseGeneticAlgorithm) {
1253  Log() << kINFO << "Read cuts optimised using Genetic Algorithm" << Endl;
1254  }
1255  else if (fFitMethod == kUseSimulatedAnnealing) {
1256  Log() << kINFO << "Read cuts optimised using Simulated Annealing algorithm" << Endl;
1257  }
1258  else if (fFitMethod == kUseEventScan) {
1259  Log() << kINFO << "Read cuts optimised using Full Event Scan" << Endl;
1260  }
1261  else {
1262  Log() << kWARNING << "unknown method: " << fFitMethod << Endl;
1263  }
1264  Log() << kINFO << "in " << fNbins << " signal efficiency bins and for " << GetNvar() << " variables" << Endl;
1265 
1266  // now read the cuts
1267  char buffer[200];
1268  istr.getline(buffer,200);
1269  istr.getline(buffer,200);
1270 
1271  Int_t tmpbin;
1272  Float_t tmpeffS, tmpeffB;
1273  if (fEffBvsSLocal != 0) delete fEffBvsSLocal;
1274  fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
1275  TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
1276  fEffBvsSLocal->SetDirectory(0); // it's local
1277 
1278  for (Int_t ibin=0; ibin<fNbins; ibin++) {
1279  istr >> tmpbin >> tmpeffS >> tmpeffB;
1280  fEffBvsSLocal->SetBinContent( ibin+1, tmpeffB );
1281 
1282  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1283  istr >> fCutMin[ivar][ibin] >> fCutMax[ivar][ibin];
1284  }
1285  }
1286 
1288  fEffSMax = fEffBvsSLocal->GetBinCenter(fNbins);
1289 }
1290 
1291 ////////////////////////////////////////////////////////////////////////////////
1292 /// create XML description for LD classification and regression
1293 /// (for arbitrary number of output classes/targets)
1294 
1295 void TMVA::MethodCuts::AddWeightsXMLTo( void* parent ) const
1296 {
1297  // write all necessary information to the stream
1298  std::vector<Double_t> cutsMin;
1299  std::vector<Double_t> cutsMax;
1300 
1301  void* wght = gTools().AddChild(parent, "Weights");
1302  gTools().AddAttr( wght, "OptimisationMethod", (Int_t)fEffMethod);
1303  gTools().AddAttr( wght, "FitMethod", (Int_t)fFitMethod );
1304  gTools().AddAttr( wght, "nbins", fNbins );
1305  gTools().AddComment( wght, Form( "Below are the optimised cuts for %i variables: Format: ibin(hist) effS effB cutMin[ivar=0] cutMax[ivar=0] ... cutMin[ivar=n-1] cutMax[ivar=n-1]", GetNvar() ) );
1306 
1307  // NOTE: The signal efficiency written out into
1308  // the weight file does not correspond to the center of the bin within which the
1309  // background rejection is maximised (as before) but to the lower left edge of it.
1310  // This is because the cut optimisation algorithm determines the best background
1311  // rejection for all signal efficiencies belonging into a bin. Since the best background
1312  // rejection is in general obtained for the lowest possible signal efficiency, the
1313  // reference signal efficeincy is the lowest value in the bin.
1314 
1315  for (Int_t ibin=0; ibin<fNbins; ibin++) {
1316  Double_t effS = fEffBvsSLocal->GetBinCenter ( ibin + 1 );
1317  Double_t trueEffS = GetCuts( effS, cutsMin, cutsMax );
1318  if (TMath::Abs(trueEffS) < 1e-10) trueEffS = 0;
1319 
1320  void* binxml = gTools().AddChild( wght, "Bin" );
1321  gTools().AddAttr( binxml, "ibin", ibin+1 );
1322  gTools().AddAttr( binxml, "effS", trueEffS );
1323  gTools().AddAttr( binxml, "effB", fEffBvsSLocal->GetBinContent( ibin + 1 ) );
1324  void* cutsxml = gTools().AddChild( binxml, "Cuts" );
1325  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1326  gTools().AddAttr( cutsxml, Form( "cutMin_%i", ivar ), cutsMin[ivar] );
1327  gTools().AddAttr( cutsxml, Form( "cutMax_%i", ivar ), cutsMax[ivar] );
1328  }
1329  }
1330 }
1331 
1332 ////////////////////////////////////////////////////////////////////////////////
1333 /// read coefficients from xml weight file
1334 
1336 {
1337  // delete old min and max
1338  for (UInt_t i=0; i<GetNvar(); i++) {
1339  if (fCutMin[i] != 0) delete [] fCutMin[i];
1340  if (fCutMax[i] != 0) delete [] fCutMax[i];
1341  }
1342  if (fCutMin != 0) delete [] fCutMin;
1343  if (fCutMax != 0) delete [] fCutMax;
1344 
1345  Int_t tmpEffMethod, tmpFitMethod;
1346  gTools().ReadAttr( wghtnode, "OptimisationMethod", tmpEffMethod );
1347  gTools().ReadAttr( wghtnode, "FitMethod", tmpFitMethod );
1348  gTools().ReadAttr( wghtnode, "nbins", fNbins );
1349 
1350  fEffMethod = (EEffMethod)tmpEffMethod;
1351  fFitMethod = (EFitMethodType)tmpFitMethod;
1352 
1353  // print some information
1354  if (fFitMethod == kUseMonteCarlo) {
1355  Log() << kINFO << "Read cuts optimised using sample of MC events" << Endl;
1356  }
1357  else if (fFitMethod == kUseMonteCarloEvents) {
1358  Log() << kINFO << "Read cuts optimised using sample of MC-Event events" << Endl;
1359  }
1360  else if (fFitMethod == kUseGeneticAlgorithm) {
1361  Log() << kINFO << "Read cuts optimised using Genetic Algorithm" << Endl;
1362  }
1363  else if (fFitMethod == kUseSimulatedAnnealing) {
1364  Log() << kINFO << "Read cuts optimised using Simulated Annealing algorithm" << Endl;
1365  }
1366  else if (fFitMethod == kUseEventScan) {
1367  Log() << kINFO << "Read cuts optimised using Full Event Scan" << Endl;
1368  }
1369  else {
1370  Log() << kWARNING << "unknown method: " << fFitMethod << Endl;
1371  }
1372  Log() << kINFO << "Reading " << fNbins << " signal efficiency bins for " << GetNvar() << " variables" << Endl;
1373 
1374  delete fEffBvsSLocal;
1375  fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
1376  TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
1377  fEffBvsSLocal->SetDirectory(0); // it's local
1378  for (Int_t ibin=1; ibin<=fNbins; ibin++) fEffBvsSLocal->SetBinContent( ibin, -0.1 ); // Init
1379 
1380  fCutMin = new Double_t*[GetNvar()];
1381  fCutMax = new Double_t*[GetNvar()];
1382  for (UInt_t i=0;i<GetNvar();i++) {
1383  fCutMin[i] = new Double_t[fNbins];
1384  fCutMax[i] = new Double_t[fNbins];
1385  }
1386 
1387  // read efficeincies and cuts
1388  Int_t tmpbin;
1389  Float_t tmpeffS, tmpeffB;
1390  void* ch = gTools().GetChild(wghtnode,"Bin");
1391  while (ch) {
1392  // if (strcmp(gTools().GetName(ch),"Bin") !=0) {
1393  // ch = gTools().GetNextChild(ch);
1394  // continue;
1395  // }
1396 
1397  gTools().ReadAttr( ch, "ibin", tmpbin );
1398  gTools().ReadAttr( ch, "effS", tmpeffS );
1399  gTools().ReadAttr( ch, "effB", tmpeffB );
1400 
1401  // sanity check
1402  if (tmpbin-1 >= fNbins || tmpbin-1 < 0) {
1403  Log() << kFATAL << "Mismatch in bins: " << tmpbin-1 << " >= " << fNbins << Endl;
1404  }
1405 
1406  fEffBvsSLocal->SetBinContent( tmpbin, tmpeffB );
1407  void* ct = gTools().GetChild(ch);
1408  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1409  gTools().ReadAttr( ct, Form( "cutMin_%i", ivar ), fCutMin[ivar][tmpbin-1] );
1410  gTools().ReadAttr( ct, Form( "cutMax_%i", ivar ), fCutMax[ivar][tmpbin-1] );
1411  }
1412  ch = gTools().GetNextChild(ch, "Bin");
1413  }
1414 }
1415 
1416 ////////////////////////////////////////////////////////////////////////////////
1417 /// write histograms and PDFs to file for monitoring purposes
1418 
1420 {
1421  Log() << kINFO << "Write monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
1422 
1423  fEffBvsSLocal->Write();
1424 
1425  // save reference histograms to file
1426  if (fEffMethod == kUsePDFs) {
1427  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1428  (*fVarHistS)[ivar]->Write();
1429  (*fVarHistB)[ivar]->Write();
1430  (*fVarHistS_smooth)[ivar]->Write();
1431  (*fVarHistB_smooth)[ivar]->Write();
1432  (*fVarPdfS)[ivar]->GetPDFHist()->Write();
1433  (*fVarPdfB)[ivar]->GetPDFHist()->Write();
1434  }
1435  }
1436 }
1437 
1438 ////////////////////////////////////////////////////////////////////////////////
1439 /// - overloaded function to create background efficiency (rejection) versus
1440 /// signal efficiency plot (first call of this function)
1441 /// - the function returns the signal efficiency at background efficiency
1442 /// indicated in theString
1443 ///
1444 /// "theString" must have two entries:
1445 /// [0]: "Efficiency"
1446 /// [1]: the value of background efficiency at which the signal efficiency
1447 /// is to be returned
1448 
1450 {
1451  // parse input string for required background efficiency
1452  TList* list = gTools().ParseFormatLine( theString );
1453  // sanity check
1454  if (list->GetSize() != 2) {
1455  Log() << kFATAL << "<GetTrainingEfficiency> wrong number of arguments"
1456  << " in string: " << theString
1457  << " | required format, e.g., Efficiency:0.05" << Endl;
1458  return -1;
1459  }
1460 
1462 
1463  // that will be the value of the efficiency retured (does not affect
1464  // the efficiency-vs-bkg plot which is done anyway.
1465  Float_t effBref = atof( ((TObjString*)list->At(1))->GetString() );
1466 
1467  delete list;
1468 
1469  // first round ? --> create histograms
1470  if (results->GetHist("EFF_BVSS_TR")==0) {
1471 
1472  if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
1473  if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
1474 
1479  // there is no really good equivalent to the fEffS; fEffB (efficiency vs cutvalue)
1480  // for the "Cuts" method (unless we had only one cut). Maybe later I might add here
1481  // histograms for each of the cuts...but this would require also a change in the
1482  // base class, and it is not really necessary, as we get exactly THIS info from the
1483  // "evaluateAllVariables" anyway.
1484 
1485  // now create efficiency curve: background versus signal
1486  TH1* eff_bvss_tr = new TH1F( GetTestvarName() + "_trainingEffBvsS", GetTestvarName() + "", fNbins, 0, 1 );
1487  for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_bvss_tr->SetBinContent( ibin, -0.1 ); // Init
1488  TH1* rej_bvss_tr = new TH1F( GetTestvarName() + "_trainingRejBvsS", GetTestvarName() + "", fNbins, 0, 1 );
1489  for (Int_t ibin=1; ibin<=fNbins; ibin++) rej_bvss_tr->SetBinContent( ibin, 0. ); // Init
1490  results->Store(eff_bvss_tr, "EFF_BVSS_TR");
1491  results->Store(rej_bvss_tr, "REJ_BVSS_TR");
1492 
1493  // use root finder
1494 
1495  // make the background-vs-signal efficiency plot
1496  Double_t* tmpCutMin = new Double_t[GetNvar()];
1497  Double_t* tmpCutMax = new Double_t[GetNvar()];
1498  Int_t nFailedBins=0;
1499  for (Int_t bini=1; bini<=fNbins; bini++) {
1500  for (UInt_t ivar=0; ivar <GetNvar(); ivar++){
1501  tmpCutMin[ivar] = fCutMin[ivar][bini-1];
1502  tmpCutMax[ivar] = fCutMax[ivar][bini-1];
1503  }
1504  // find cut value corresponding to a given signal efficiency
1505  Double_t effS, effB;
1506  this->GetEffsfromSelection( &tmpCutMin[0], &tmpCutMax[0], effS, effB);
1507  // check that effS matches bini
1508  Int_t effBin = eff_bvss_tr->GetXaxis()->FindBin(effS);
1509  if (effBin != bini){
1510  Log()<< kVERBOSE << "unable to fill efficiency bin " << bini<< " " << effBin <<Endl;
1511  nFailedBins++;
1512  }
1513  else{
1514  // and fill histograms
1515  eff_bvss_tr->SetBinContent( bini, effB );
1516  rej_bvss_tr->SetBinContent( bini, 1.0-effB );
1517  }
1518  }
1519  if (nFailedBins>0) Log()<< kWARNING << " unable to fill "<< nFailedBins <<" efficiency bins " <<Endl;
1520 
1521  delete [] tmpCutMin;
1522  delete [] tmpCutMax;
1523 
1524  // create splines for histogram
1525  fSplTrainEffBvsS = new TSpline1( "trainEffBvsS", new TGraph( eff_bvss_tr ) );
1526  }
1527 
1528  // must exist...
1529  if (NULL == fSplTrainEffBvsS) return 0.0;
1530 
1531  // now find signal efficiency that corresponds to required background efficiency
1532  Double_t effS = 0., effB, effS_ = 0., effB_ = 0.;
1533  Int_t nbins_ = 1000;
1534 
1535  // loop over efficiency bins until the background eff. matches the requirement
1536  for (Int_t bini=1; bini<=nbins_; bini++) {
1537  // get corresponding signal and background efficiencies
1538  effS = (bini - 0.5)/Float_t(nbins_);
1539  effB = fSplTrainEffBvsS->Eval( effS );
1540 
1541  // find signal efficiency that corresponds to required background efficiency
1542  if ((effB - effBref)*(effB_ - effBref) < 0) break;
1543  effS_ = effS;
1544  effB_ = effB;
1545  }
1546 
1547  return 0.5*(effS + effS_);
1548 }
1549 
1550 ////////////////////////////////////////////////////////////////////////////////
1551 /// - overloaded function to create background efficiency (rejection) versus
1552 /// signal efficiency plot (first call of this function)
1553 /// - the function returns the signal efficiency at background efficiency
1554 /// indicated in theString
1555 ///
1556 /// "theString" must have two entries:
1557 /// [0]: "Efficiency"
1558 /// [1]: the value of background efficiency at which the signal efficiency
1559 /// is to be returned
1560 
1562 {
1563  Data()->SetCurrentType(type);
1564 
1566 
1567  // parse input string for required background efficiency
1568  TList* list = gTools().ParseFormatLine( theString, ":" );
1569 
1570  if (list->GetSize() > 2) {
1571  delete list;
1572  Log() << kFATAL << "<GetEfficiency> wrong number of arguments"
1573  << " in string: " << theString
1574  << " | required format, e.g., Efficiency:0.05, or empty string" << Endl;
1575  return -1;
1576  }
1577 
1578  // sanity check
1579  Bool_t computeArea = (list->GetSize() < 2); // the area is computed
1580 
1581  // that will be the value of the efficiency retured (does not affect
1582  // the efficiency-vs-bkg plot which is done anyway.
1583  Float_t effBref = (computeArea?1.:atof( ((TObjString*)list->At(1))->GetString() ));
1584 
1585  delete list;
1586 
1587 
1588  // first round ? --> create histograms
1589  if (results->GetHist("MVA_EFF_BvsS")==0) {
1590 
1591  if (fBinaryTreeS!=0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
1592  if (fBinaryTreeB!=0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
1593 
1594  // the variables may be transformed by a transformation method: to coherently
1595  // treat signal and background one must decide which transformation type shall
1596  // be used: our default is signal-type
1601 
1602  // there is no really good equivalent to the fEffS; fEffB (efficiency vs cutvalue)
1603  // for the "Cuts" method (unless we had only one cut). Maybe later I might add here
1604  // histograms for each of the cuts...but this would require also a change in the
1605  // base class, and it is not really necessary, as we get exactly THIS info from the
1606  // "evaluateAllVariables" anyway.
1607 
1608  // now create efficiency curve: background versus signal
1609  TH1* eff_BvsS = new TH1F( GetTestvarName() + "_effBvsS", GetTestvarName() + "", fNbins, 0, 1 );
1610  for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_BvsS->SetBinContent( ibin, -0.1 ); // Init
1611  TH1* rej_BvsS = new TH1F( GetTestvarName() + "_rejBvsS", GetTestvarName() + "", fNbins, 0, 1 );
1612  for (Int_t ibin=1; ibin<=fNbins; ibin++) rej_BvsS->SetBinContent( ibin, 0.0 ); // Init
1613  results->Store(eff_BvsS, "MVA_EFF_BvsS");
1614  results->Store(rej_BvsS);
1615 
1616  Double_t xmin = 0.;
1617  Double_t xmax = 1.000001;
1618 
1619  TH1* eff_s = new TH1F( GetTestvarName() + "_effS", GetTestvarName() + " (signal)", fNbins, xmin, xmax);
1620  for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_s->SetBinContent( ibin, -0.1 ); // Init
1621  TH1* eff_b = new TH1F( GetTestvarName() + "_effB", GetTestvarName() + " (background)", fNbins, xmin, xmax);
1622  for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_b->SetBinContent( ibin, -0.1 ); // Init
1623  results->Store(eff_s, "MVA_S");
1624  results->Store(eff_b, "MVA_B");
1625 
1626  // use root finder
1627 
1628  // make the background-vs-signal efficiency plot
1629  Double_t* tmpCutMin = new Double_t[GetNvar()];
1630  Double_t* tmpCutMax = new Double_t[GetNvar()];
1631  TGraph* tmpBvsS = new TGraph(fNbins+1);
1632  tmpBvsS->SetPoint(0, 0., 0.);
1633 
1634  for (Int_t bini=1; bini<=fNbins; bini++) {
1635  for (UInt_t ivar=0; ivar <GetNvar(); ivar++) {
1636  tmpCutMin[ivar] = fCutMin[ivar][bini-1];
1637  tmpCutMax[ivar] = fCutMax[ivar][bini-1];
1638  }
1639  // find cut value corresponding to a given signal efficiency
1640  Double_t effS, effB;
1641  this->GetEffsfromSelection( &tmpCutMin[0], &tmpCutMax[0], effS, effB);
1642  tmpBvsS->SetPoint(bini, effS, effB);
1643 
1644  eff_s->SetBinContent(bini, effS);
1645  eff_b->SetBinContent(bini, effB);
1646  }
1647  tmpBvsS->SetPoint(fNbins+1, 1., 1.);
1648 
1649  delete [] tmpCutMin;
1650  delete [] tmpCutMax;
1651 
1652  // create splines for histogram
1653  fSpleffBvsS = new TSpline1( "effBvsS", tmpBvsS );
1654  for (Int_t bini=1; bini<=fNbins; bini++) {
1655  Double_t effS = (bini - 0.5)/Float_t(fNbins);
1656  Double_t effB = fSpleffBvsS->Eval( effS );
1657  eff_BvsS->SetBinContent( bini, effB );
1658  rej_BvsS->SetBinContent( bini, 1.0-effB );
1659  }
1660  }
1661 
1662  // must exist...
1663  if (NULL == fSpleffBvsS) return 0.0;
1664 
1665  // now find signal efficiency that corresponds to required background efficiency
1666  Double_t effS = 0, effB = 0, effS_ = 0, effB_ = 0;
1667  Int_t nbins_ = 1000;
1668 
1669  if (computeArea) {
1670 
1671  // compute area of rej-vs-eff plot
1672  Double_t integral = 0;
1673  for (Int_t bini=1; bini<=nbins_; bini++) {
1674 
1675  // get corresponding signal and background efficiencies
1676  effS = (bini - 0.5)/Float_t(nbins_);
1677  effB = fSpleffBvsS->Eval( effS );
1678  integral += (1.0 - effB);
1679  }
1680  integral /= nbins_;
1681 
1682  return integral;
1683  }
1684  else {
1685 
1686  // loop over efficiency bins until the background eff. matches the requirement
1687  for (Int_t bini=1; bini<=nbins_; bini++) {
1688  // get corresponding signal and background efficiencies
1689  effS = (bini - 0.5)/Float_t(nbins_);
1690  effB = fSpleffBvsS->Eval( effS );
1691 
1692  // find signal efficiency that corresponds to required background efficiency
1693  if ((effB - effBref)*(effB_ - effBref) < 0) break;
1694  effS_ = effS;
1695  effB_ = effB;
1696  }
1697 
1698  effS = 0.5*(effS + effS_);
1699  effSerr = 0;
1700  if (Data()->GetNEvtSigTest() > 0)
1701  effSerr = TMath::Sqrt( effS*(1.0 - effS)/Double_t(Data()->GetNEvtSigTest()) );
1702 
1703  return effS;
1704 
1705  }
1706 
1707  return -1;
1708 }
1709 
1710 ////////////////////////////////////////////////////////////////////////////////
1711 /// write specific classifier response
1712 
1713 void TMVA::MethodCuts::MakeClassSpecific( std::ostream& fout, const TString& className ) const
1714 {
1715  fout << " // not implemented for class: \"" << className << "\"" << std::endl;
1716  fout << "};" << std::endl;
1717 }
1718 
1719 ////////////////////////////////////////////////////////////////////////////////
1720 /// get help message text
1721 ///
1722 /// typical length of text line:
1723 /// "|--------------------------------------------------------------|"
1724 
1726 {
1727  TString bold = gConfig().WriteOptionsReference() ? "<b>" : "";
1728  TString resbold = gConfig().WriteOptionsReference() ? "</b>" : "";
1729  TString brk = gConfig().WriteOptionsReference() ? "<br>" : "";
1730 
1731  Log() << Endl;
1732  Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
1733  Log() << Endl;
1734  Log() << "The optimisation of rectangular cuts performed by TMVA maximises " << Endl;
1735  Log() << "the background rejection at given signal efficiency, and scans " << Endl;
1736  Log() << "over the full range of the latter quantity. Three optimisation" << Endl;
1737  Log() << "methods are optional: Monte Carlo sampling (MC), a Genetics" << Endl;
1738  Log() << "Algorithm (GA), and Simulated Annealing (SA). GA and SA are" << Endl;
1739  Log() << "expected to perform best." << Endl;
1740  Log() << Endl;
1741  Log() << "The difficulty to find the optimal cuts strongly increases with" << Endl;
1742  Log() << "the dimensionality (number of input variables) of the problem." << Endl;
1743  Log() << "This behavior is due to the non-uniqueness of the solution space."<< Endl;
1744  Log() << Endl;
1745  Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
1746  Log() << Endl;
1747  Log() << "If the dimensionality exceeds, say, 4 input variables, it is " << Endl;
1748  Log() << "advisable to scrutinize the separation power of the variables," << Endl;
1749  Log() << "and to remove the weakest ones. If some among the input variables" << Endl;
1750  Log() << "can be described by a single cut (e.g., because signal tends to be" << Endl;
1751  Log() << "larger than background), this can be indicated to MethodCuts via" << Endl;
1752  Log() << "the \"Fsmart\" options (see option string). Choosing this option" << Endl;
1753  Log() << "reduces the number of requirements for the variable from 2 (min/max)" << Endl;
1754  Log() << "to a single one (TMVA finds out whether it is to be interpreted as" << Endl;
1755  Log() << "min or max)." << Endl;
1756  Log() << Endl;
1757  Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
1758  Log() << "" << Endl;
1759  Log() << bold << "Monte Carlo sampling:" << resbold << Endl;
1760  Log() << "" << Endl;
1761  Log() << "Apart form the \"Fsmart\" option for the variables, the only way" << Endl;
1762  Log() << "to improve the MC sampling is to increase the sampling rate. This" << Endl;
1763  Log() << "is done via the configuration option \"MC_NRandCuts\". The execution" << Endl;
1764  Log() << "time scales linearly with the sampling rate." << Endl;
1765  Log() << "" << Endl;
1766  Log() << bold << "Genetic Algorithm:" << resbold << Endl;
1767  Log() << "" << Endl;
1768  Log() << "The algorithm terminates if no significant fitness increase has" << Endl;
1769  Log() << "been achieved within the last \"nsteps\" steps of the calculation." << Endl;
1770  Log() << "Wiggles in the ROC curve or constant background rejection of 1" << Endl;
1771  Log() << "indicate that the GA failed to always converge at the true maximum" << Endl;
1772  Log() << "fitness. In such a case, it is recommended to broaden the search " << Endl;
1773  Log() << "by increasing the population size (\"popSize\") and to give the GA " << Endl;
1774  Log() << "more time to find improvements by increasing the number of steps" << Endl;
1775  Log() << "(\"nsteps\")" << Endl;
1776  Log() << " -> increase \"popSize\" (at least >10 * number of variables)" << Endl;
1777  Log() << " -> increase \"nsteps\"" << Endl;
1778  Log() << "" << Endl;
1779  Log() << bold << "Simulated Annealing (SA) algorithm:" << resbold << Endl;
1780  Log() << "" << Endl;
1781  Log() << "\"Increasing Adaptive\" approach:" << Endl;
1782  Log() << "" << Endl;
1783  Log() << "The algorithm seeks local minima and explores their neighborhood, while" << Endl;
1784  Log() << "changing the ambient temperature depending on the number of failures" << Endl;
1785  Log() << "in the previous steps. The performance can be improved by increasing" << Endl;
1786  Log() << "the number of iteration steps (\"MaxCalls\"), or by adjusting the" << Endl;
1787  Log() << "minimal temperature (\"MinTemperature\"). Manual adjustments of the" << Endl;
1788  Log() << "speed of the temperature increase (\"TemperatureScale\" and \"AdaptiveSpeed\")" << Endl;
1789  Log() << "to individual data sets should also help. Summary:" << brk << Endl;
1790  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1791  Log() << " -> adjust \"MinTemperature\"" << brk << Endl;
1792  Log() << " -> adjust \"TemperatureScale\"" << brk << Endl;
1793  Log() << " -> adjust \"AdaptiveSpeed\"" << Endl;
1794  Log() << "" << Endl;
1795  Log() << "\"Decreasing Adaptive\" approach:" << Endl;
1796  Log() << "" << Endl;
1797  Log() << "The algorithm calculates the initial temperature (based on the effect-" << Endl;
1798  Log() << "iveness of large steps) and the multiplier that ensures to reach the" << Endl;
1799  Log() << "minimal temperature with the requested number of iteration steps." << Endl;
1800  Log() << "The performance can be improved by adjusting the minimal temperature" << Endl;
1801  Log() << " (\"MinTemperature\") and by increasing number of steps (\"MaxCalls\"):" << brk << Endl;
1802  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1803  Log() << " -> adjust \"MinTemperature\"" << Endl;
1804  Log() << " " << Endl;
1805  Log() << "Other kernels:" << Endl;
1806  Log() << "" << Endl;
1807  Log() << "Alternative ways of counting the temperature change are implemented. " << Endl;
1808  Log() << "Each of them starts with the maximum temperature (\"MaxTemperature\")" << Endl;
1809  Log() << "and descreases while changing the temperature according to a given" << Endl;
1810  Log() << "prescription:" << brk << Endl;
1811  Log() << "CurrentTemperature =" << brk << Endl;
1812  Log() << " - Sqrt: InitialTemperature / Sqrt(StepNumber+2) * TemperatureScale" << brk << Endl;
1813  Log() << " - Log: InitialTemperature / Log(StepNumber+2) * TemperatureScale" << brk << Endl;
1814  Log() << " - Homo: InitialTemperature / (StepNumber+2) * TemperatureScale" << brk << Endl;
1815  Log() << " - Sin: (Sin(StepNumber / TemperatureScale) + 1) / (StepNumber + 1)*InitialTemperature + Eps" << brk << Endl;
1816  Log() << " - Geo: CurrentTemperature * TemperatureScale" << Endl;
1817  Log() << "" << Endl;
1818  Log() << "Their performance can be improved by adjusting initial temperature" << Endl;
1819  Log() << "(\"InitialTemperature\"), the number of iteration steps (\"MaxCalls\")," << Endl;
1820  Log() << "and the multiplier that scales the termperature descrease" << Endl;
1821  Log() << "(\"TemperatureScale\")" << brk << Endl;
1822  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1823  Log() << " -> adjust \"InitialTemperature\"" << brk << Endl;
1824  Log() << " -> adjust \"TemperatureScale\"" << brk << Endl;
1825  Log() << " -> adjust \"KernelTemperature\"" << Endl;
1826 }
std::vector< Double_t > * fRmsS
Definition: MethodCuts.h:213
Config & gConfig()
Definition: Config.cxx:43
EEffMethod fEffMethod
Definition: MethodCuts.h:184
virtual Int_t Write(const char *name=0, Int_t option=0, Int_t bufsize=0)
Write this object to the current directory.
Definition: TObject.cxx:830
Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
cut evaluation: returns 1.0 if event passed, 0.0 otherwise
Definition: MethodCuts.cxx:444
virtual Int_t FindBin(Double_t x, Double_t y=0, Double_t z=0)
Return Global bin number corresponding to x,y,z.
Definition: TH1.cxx:3440
void GetHelpMessage() const
get help message text
float xmin
Definition: THbookFile.cxx:93
void WriteMonitoringHistosToFile(void) const
write histograms and PDFs to file for monitoring purposes
Random number generator class based on M.
Definition: TRandom3.h:29
virtual Double_t GetBinCenter(Int_t bin) const
Return bin center for 1D histogram.
Definition: TH1.cxx:8251
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
TString fEffMethodS
Definition: MethodCuts.h:183
Double_t ComputeEstimator(std::vector< Double_t > &)
returns estimator for "cut fitness" used by GA there are two requirements: 1) the signal efficiency m...
Definition: MethodCuts.cxx:901
void TestClassification()
nothing to test
Definition: MethodCuts.cxx:837
Collectable string class.
Definition: TObjString.h:32
float Float_t
Definition: RtypesCore.h:53
virtual void SetDirectory(TDirectory *dir)
By default when an histogram is created, it is added to the list of histogram objects in the current ...
Definition: TH1.cxx:8051
MethodCuts(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="MC:150:10000:")
standard constructor
Definition: MethodCuts.cxx:143
Double_t fTestSignalEff
Definition: MethodCuts.h:186
UInt_t GetNvar() const
Definition: MethodBase.h:340
THist< 1, float, THistStatContent, THistStatUncertainty > TH1F
Definition: THist.hxx:302
MsgLogger & Log() const
Definition: Configurable.h:128
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
void MatchParsToCuts(const std::vector< Double_t > &, Double_t *, Double_t *)
translates parameters into cuts
Definition: MethodCuts.cxx:982
Double_t * fTmpCutMin
Definition: MethodCuts.h:200
virtual Double_t GetBinContent(Int_t bin) const
Return content of bin number bin.
Definition: TH1.cxx:4638
EAnalysisType
Definition: Types.h:129
void Clone(Ssiz_t nc)
Make self a distinct copy with capacity of at least tot, where tot cannot be smaller than the current...
Definition: TString.cxx:1176
Double_t Fill(const std::vector< TMVA::Event *> &events, const std::vector< Int_t > &theVars, Int_t theType=-1)
create the search tree from the event collection using ONLY the variables specified in "theVars" ...
UInt_t GetNVariables() const
access the number of variables through the datasetinfo
Definition: DataSet.cxx:225
void DrawProgressBar(Int_t, const TString &comment="")
draws progress bar in color or B&W caution:
Definition: Timer.cxx:186
bool fExitFromTraining
Definition: MethodBase.h:443
Double_t * fCutRangeMax
Definition: MethodCuts.h:190
std::vector< TH1 * > * fVarHistS
Definition: MethodCuts.h:219
Basic string class.
Definition: TString.h:137
tomato 1-D histogram with a float per channel (see TH1 documentation)}
Definition: TH1.h:575
TransformationHandler & GetTransformationHandler(Bool_t takeReroutedIfAvailable=true)
Definition: MethodBase.h:390
Double_t GetCuts(Double_t effS, std::vector< Double_t > &cutMin, std::vector< Double_t > &cutMax) const
retrieve cut values for given signal efficiency
Definition: MethodCuts.cxx:563
Short_t Min(Short_t a, Short_t b)
Definition: TMathBase.h:170
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
int nbins[3]
virtual Double_t GetBinLowEdge(Int_t bin) const
Return bin lower edge for 1D histogram.
Definition: TH1.cxx:8262
std::vector< PDF * > * fVarPdfS
Definition: MethodCuts.h:223
Double_t GetSumOfWeights(void) const
return the sum of event (node) weights
void AddAttr(void *node, const char *, const T &value, Int_t precision=16)
Definition: Tools.h:309
Bool_t AddComment(void *node, const char *comment)
Definition: Tools.cxx:1142
void * AddChild(void *parent, const char *childname, const char *content=0, bool isRootNode=false)
add child node
Definition: Tools.cxx:1134
Short_t Abs(Short_t d)
Definition: TMathBase.h:110
UInt_t fSignalClass
Definition: MethodBase.h:683
std::vector< TH1 * > * fVarHistS_smooth
Definition: MethodCuts.h:221
const TString & GetInputVar(Int_t i) const
Definition: MethodBase.h:345
Tools & gTools()
Definition: Tools.cxx:79
Double_t EstimatorFunction(std::vector< Double_t > &)
returns estimator for "cut fitness" used by GA
Definition: MethodCuts.cxx:888
char GetVarType() const
Definition: VariableInfo.h:69
TStopwatch timer
Definition: pirndm.C:37
std::vector< Double_t > * fMeanB
Definition: MethodCuts.h:212
Double_t Run()
estimator function interface for fitting
Definition: FitterBase.cxx:80
void SetIPythonInteractive(bool *ExitFromTraining, UInt_t *fIPyMaxIter_, UInt_t *fIPyCurrentIter_)
Definition: FitterBase.h:79
std::vector< EFitParameters > * fFitParams
Definition: MethodCuts.h:185
void Init(void)
default initialisation called by all constructors
Definition: MethodCuts.cxx:234
const Event * GetEvent() const
Definition: MethodBase.h:745
DataSet * Data() const
Definition: MethodBase.h:405
void CreateVariablePDFs(void)
for PDF method: create efficiency reference histograms and PDFs
void MakeClassSpecific(std::ostream &, const TString &) const
write specific classifier response
BinarySearchTree * fBinaryTreeS
Definition: MethodCuts.h:194
void * GetChild(void *parent, const char *childname=0)
get child node
Definition: Tools.cxx:1158
void GetEffsfromSelection(Double_t *cutMin, Double_t *cutMax, Double_t &effS, Double_t &effB)
compute signal and background efficiencies from event counting for given cut sample ...
virtual void ParseOptions()
options parser
DataSetInfo & DataInfo() const
Definition: MethodBase.h:406
void AddWeightsXMLTo(void *parent) const
create XML description for LD classification and regression (for arbitrary number of output classes/t...
std::vector< TH1 * > * fVarHistB_smooth
Definition: MethodCuts.h:222
void MatchCutsToPars(std::vector< Double_t > &, Double_t *, Double_t *)
translates cuts into parameters
Definition: PDF.h:71
TSpline * fSpleffBvsS
Definition: MethodBase.h:646
std::vector< PDF * > * fVarPdfB
Definition: MethodCuts.h:224
UInt_t fIPyCurrentIter
Definition: MethodBase.h:444
A doubly linked list.
Definition: TList.h:47
void Train(void)
training method: here the cuts are optimised for the training sample
Definition: MethodCuts.cxx:590
Float_t Max(Types::ESBType sb, UInt_t var)
Results * GetResults(const TString &, Types::ETreeType type, Types::EAnalysisType analysistype)
TString info(resultsName+"/"); switch(type) { case Types::kTraining: info += "kTraining/"; break; cas...
Definition: DataSet.cxx:286
std::vector< Int_t > * fRangeSign
Definition: MethodCuts.h:207
Float_t Min(Types::ESBType sb, UInt_t var)
const char * GetName() const
Definition: MethodBase.h:330
Long64_t GetNEvtSigTest()
return number of signal test events in dataset
Definition: DataSet.cxx:419
void ReadWeightsFromXML(void *wghtnode)
read coefficients from xml weight file
virtual void SetBinContent(Int_t bin, Double_t content)
Set bin content see convention for numbering bins in TH1::GetBin In case the bin number is greater th...
Definition: TH1.cxx:8323
UInt_t fIPyMaxIter
Definition: MethodBase.h:444
unsigned int UInt_t
Definition: RtypesCore.h:42
Double_t fEffRef
Definition: MethodCuts.h:206
char * Form(const char *fmt,...)
void GetEffsfromPDFs(Double_t *cutMin, Double_t *cutMax, Double_t &effS, Double_t &effB)
compute signal and background efficiencies from PDFs for given cut sample
const TString & GetMethodName() const
Definition: MethodBase.h:327
virtual Double_t Eval(Double_t x) const =0
void ReadAttr(void *node, const char *, T &value)
Definition: Tools.h:296
TSpline * fSplTrainEffBvsS
Definition: MethodBase.h:650
float xmax
Definition: THbookFile.cxx:93
virtual TObject * At(Int_t idx) const
Returns the object at position idx. Returns 0 if idx is out of range.
Definition: TList.cxx:311
void DeclareOptions()
define the options (their key words) that can be set in the option string know options: Method <strin...
Definition: MethodCuts.cxx:332
Double_t ** fCutMin
Definition: MethodCuts.h:198
virtual const char * GetPath() const
Returns the full path of the directory.
Definition: TDirectory.cxx:911
void SetNormalised(Bool_t norm)
Definition: MethodBase.h:491
const char * GetNameOfLastTransform() const
returns string for transformation
REAL epsilon
Definition: triangle.c:617
UInt_t GetNVariables() const
Definition: MethodBase.h:341
Float_t GetValue(UInt_t ivar) const
return value of i&#39;th variable
Definition: Event.cxx:233
Double_t * fCutRangeMin
Definition: MethodCuts.h:189
Double_t fEffSMax
Definition: MethodCuts.h:188
virtual Int_t FindBin(Double_t x)
Find bin number corresponding to abscissa x.
Definition: TAxis.cxx:279
UInt_t fBackgroundClass
Definition: MethodBase.h:684
TRandom * fRandom
Definition: MethodCuts.h:208
Bool_t IgnoreEventsWithNegWeightsInTraining() const
Definition: MethodBase.h:680
const std::vector< TMVA::Event * > & GetEventCollection(Types::ETreeType type)
returns the event collection (i.e.
#define ClassImp(name)
Definition: Rtypes.h:279
EFitMethodType fFitMethod
Definition: MethodCuts.h:182
double Double_t
Definition: RtypesCore.h:55
std::vector< Double_t > * fMeanS
Definition: MethodCuts.h:211
Bool_t WriteOptionsReference() const
Definition: Config.h:66
Float_t RMS(Types::ESBType sb, UInt_t var)
Bool_t fNegEffWarning
Definition: MethodCuts.h:227
Bool_t IsNormalised() const
Definition: MethodBase.h:490
TH1 * GetHist(const TString &alias) const
Definition: Results.cxx:127
int type
Definition: TGX11.cxx:120
void ReadWeightsFromStream(std::istream &i)
read the cuts from stream
static RooMathCoreReg dummy
void * GetNextChild(void *prevchild, const char *childname=0)
XML helpers.
Definition: Tools.cxx:1170
void SetCurrentType(Types::ETreeType type) const
Definition: DataSet.h:114
The TH1 histogram class.
Definition: TH1.h:80
Double_t GetEfficiency(const TString &, Types::ETreeType, Double_t &)
you should not use this method at all Int_t Int_t Double_t Double_t Double_t e
Definition: TRolke.cxx:630
TString fFitMethodS
Definition: MethodCuts.h:181
VariableInfo & GetVariableInfo(Int_t i)
Definition: DataSetInfo.h:114
void AddPreDefVal(const T &)
Definition: Configurable.h:174
virtual Double_t Uniform(Double_t x1=1)
Returns a uniform deviate on the interval (0, x1).
Definition: TRandom.cxx:606
void ExitFromTraining()
Definition: MethodBase.h:458
std::vector< Double_t > * fRmsB
Definition: MethodCuts.h:214
const TString & GetOptions() const
Definition: Configurable.h:90
Float_t Mean(Types::ESBType sb, UInt_t var)
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:837
you should not use this method at all Int_t Int_t z
Definition: TRolke.cxx:630
#define REGISTER_METHOD(CLASS)
for example
virtual void SetPoint(Int_t i, Double_t x, Double_t y)
Set x and y values for point number i.
Definition: TGraph.cxx:2150
virtual ~MethodCuts(void)
destructor
Definition: MethodCuts.cxx:284
IPythonInteractive * fInteractive
Definition: MethodBase.h:442
TDirectory * BaseDir() const
returns the ROOT directory where info/histograms etc of the corresponding MVA method instance are sto...
TString GetMethodTypeName() const
Definition: MethodBase.h:328
Short_t Max(Short_t a, Short_t b)
Definition: TMathBase.h:202
A Graph is a graphics object made of two arrays X and Y with npoints each.
Definition: TGraph.h:53
static const Double_t fgMaxAbsCutVal
Definition: MethodCuts.h:144
void ProcessOptions()
process user options sanity check, do not allow the input variables to be normalised, because this only creates problems when interpreting the cuts
Definition: MethodCuts.cxx:375
TList * ParseFormatLine(TString theString, const char *sep=":")
Parse the string and cut into labels separated by ":".
Definition: Tools.cxx:413
Double_t ** fCutMax
Definition: MethodCuts.h:199
#define NULL
Definition: Rtypes.h:82
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Definition: DataSet.h:229
Bool_t IsSignal(const Event *ev) const
void PrintCuts(Double_t effS) const
print cuts
Definition: MethodCuts.cxx:477
Types::EAnalysisType GetAnalysisType() const
Definition: MethodBase.h:433
Double_t * fTmpCutMax
Definition: MethodCuts.h:201
void Store(TObject *obj, const char *alias=0)
Definition: Results.cxx:83
Double_t Sqrt(Double_t x)
Definition: TMath.h:464
Double_t SearchVolume(Volume *, std::vector< const TMVA::BinarySearchTreeNode *> *events=0)
search the whole tree and add up all weigths of events that lie within the given voluem ...
const TString & GetTestvarName() const
Definition: MethodBase.h:331
TString * fAllVarsI
Definition: MethodCuts.h:202
virtual Int_t GetSize() const
Definition: TCollection.h:95
Double_t GetTrainingEfficiency(const TString &)
std::vector< TString > * GetTransformationStringsOfLastTransform() const
returns string for transformation
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
Cuts can only handle classification with 2 classes.
Definition: MethodCuts.cxx:225
std::vector< Interval * > fCutRange
Definition: MethodCuts.h:191
void CheckForUnusedOptions() const
checks for unused options in option string
BinarySearchTree * fBinaryTreeB
Definition: MethodCuts.h:195
Double_t fEffSMin
Definition: MethodCuts.h:187
std::vector< TH1 * > * fVarHistB
Definition: MethodCuts.h:220
TAxis * GetXaxis()
Definition: TH1.h:324
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
Definition: MethodBase.cxx:819
const char * Data() const
Definition: TString.h:349