ROOT  6.07/01
Reference Guide
MethodCuts.cxx
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Matt Jachowski, Peter Speckmayer, Eckhard von Toerne, Helge Voss, Kai Voss
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate Data analysis *
6  * Package: TMVA *
7  * Class : TMVA::MethodCuts *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Implementation (see header for description) *
12  * *
13  * Authors (alphabetical): *
14  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15  * Matt Jachowski <jachowski@stanford.edu> - Stanford University, USA *
16  * Peter Speckmayer <speckmay@mail.cern.ch> - CERN, Switzerland *
17  * Eckhard von Toerne <evt@physik.uni-bonn.de> - U. of Bonn, Germany *
18  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
19  * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
20  * *
21  * Copyright (c) 2005: *
22  * CERN, Switzerland *
23  * U. of Victoria, Canada *
24  * MPI-K Heidelberg, Germany *
25  * *
26  * Redistribution and use in source and binary forms, with or without *
27  * modification, are permitted according to the terms listed in LICENSE *
28  * (http://tmva.sourceforge.net/LICENSE) *
29  **********************************************************************************/
30 
31 ////////////////////////////////////////////////////////////////////////////////
32 
33 /* Begin_Html
34  Multivariate optimisation of signal efficiency for given background
35  efficiency, applying rectangular minimum and maximum requirements.
36 
37  <p>
38  Also implemented is a "decorrelated/diagonalized cuts" approach,
39  which improves over the uncorrelated cuts ansatz by
40  transforming linearly the input variables into a diagonal space,
41  using the square-root of the covariance matrix.
42 
43  <p>
44  <font size="-1">
45  Other optimisation criteria, such as maximising the signal significance-
46  squared, S^2/(S+B), with S and B being the signal and background yields,
47  correspond to a particular point in the optimised background rejection
48  versus signal efficiency curve. This working point requires the knowledge
49  of the expected yields, which is not the case in general. Note also that
50  for rare signals, Poissonian statistics should be used, which modifies
51  the significance criterion.
52  </font>
53 
54  <p>
55  The rectangular cut of a volume in the variable space is performed using
56  a binary tree to sort the training events. This provides a significant
57  reduction in computing time (up to several orders of magnitudes, depending
58  on the complexity of the problem at hand).
59 
60  <p>
61  Technically, optimisation is achieved in TMVA by two methods:
62 
63  <ol>
64  <li>Monte Carlo generation using uniform priors for the lower cut value,
65  and the cut width, thrown within the variable ranges.
66 
67  <li>A Genetic Algorithm (GA) searches for the optimal ("fittest") cut sample.
68  The GA is configurable by many external settings through the option
69  string. For difficult cases (such as many variables), some tuning
70  may be necessary to achieve satisfactory results.
71  </ol>
72 
73  <p>
74  <font size="-1">
75  Attempts to use Minuit fits (Simplex or Migrad) instead have not shown
76  superior results, and often failed due to convergence at local minima.
77  </font>
78 
79  <p>
80  The tests we have performed so far showed that in generic applications,
81  the GA is superior to MC sampling, and hence GA is the default method.
82  It is worthwhile trying both anyway.
83 
84  <b>Decorrelated (or "diagonalized") Cuts</b>
85 
86  <p>
87  See class description for Method Likelihood for a detailed explanation.
88 End_Html */
89 //
90 
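// --------------------------------------------------------------------------
// Usage sketch (illustrative only, assuming the standard TMVA Factory
// interface; file and variable names are placeholders). The option values
// correspond to the options declared in DeclareOptions() below:
//
//   TFile* outFile = TFile::Open( "TMVACuts.root", "RECREATE" );
//   TMVA::Factory factory( "TMVAClassification", outFile, "!V:AnalysisType=Classification" );
//   // ... register input variables and signal/background trees ...
//   factory.BookMethod( TMVA::Types::kCuts, "Cuts",
//                       "H:!V:FitMethod=GA:EffMethod=EffSel:VarProp=FSmart" );
//   factory.TrainAllMethods();
// --------------------------------------------------------------------------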
91 #include "TMVA/MethodCuts.h"
92 
93 #include <iostream>
94 #include <cstdlib>
95 
96 #include "Riostream.h"
97 #include "TH1F.h"
98 #include "TObjString.h"
99 #include "TDirectory.h"
100 #include "TMath.h"
101 #include "TGraph.h"
102 #include "TSpline.h"
103 #include "TRandom3.h"
104 
105 #include "TMVA/BinarySearchTree.h"
106 #include "TMVA/ClassifierFactory.h"
107 #include "TMVA/Config.h"
108 #include "TMVA/DataSet.h"
109 #include "TMVA/DataSetInfo.h"
110 #include "TMVA/GeneticFitter.h"
111 #include "TMVA/Event.h"
112 #include "TMVA/Interval.h"
113 #include "TMVA/FitterBase.h"
114 #include "TMVA/MCFitter.h"
115 #include "TMVA/MethodFDA.h"
116 #include "TMVA/MinuitFitter.h"
117 #include "TMVA/MsgLogger.h"
118 #include "TMVA/PDF.h"
119 #include "TMVA/Results.h"
120 #include "TMVA/SimulatedAnnealingFitter.h"
121 #include "TMVA/Timer.h"
122 #include "TMVA/Tools.h"
124 #include "TMVA/Types.h"
125 #include "TMVA/TSpline1.h"
127 
128 using std::atof;
129 
130 REGISTER_METHOD(Cuts)
131 
132 ClassImp(TMVA::MethodCuts)
133 
134 const Double_t TMVA::MethodCuts::fgMaxAbsCutVal = 1.0e30;
135 
136 ////////////////////////////////////////////////////////////////////////////////
137 /// standard constructor
138 
139 TMVA::MethodCuts::MethodCuts( const TString& jobName,
140  const TString& methodTitle,
141  DataSetInfo& theData,
142  const TString& theOption,
143  TDirectory* theTargetDir ) :
144  MethodBase( jobName, Types::kCuts, methodTitle, theData, theOption, theTargetDir ),
145  fFitMethod ( kUseGeneticAlgorithm ),
146  fEffMethod ( kUseEventSelection ),
147  fFitParams (0),
148  fTestSignalEff(0.7),
149  fEffSMin ( 0 ),
150  fEffSMax ( 0 ),
151  fCutRangeMin( 0 ),
152  fCutRangeMax( 0 ),
153  fBinaryTreeS( 0 ),
154  fBinaryTreeB( 0 ),
155  fCutMin ( 0 ),
156  fCutMax ( 0 ),
157  fTmpCutMin ( 0 ),
158  fTmpCutMax ( 0 ),
159  fAllVarsI ( 0 ),
160  fNpar ( 0 ),
161  fEffRef ( 0 ),
162  fRangeSign ( 0 ),
163  fRandom ( 0 ),
164  fMeanS ( 0 ),
165  fMeanB ( 0 ),
166  fRmsS ( 0 ),
167  fRmsB ( 0 ),
168  fEffBvsSLocal( 0 ),
169  fVarHistS ( 0 ),
170  fVarHistB ( 0 ),
171  fVarHistS_smooth( 0 ),
172  fVarHistB_smooth( 0 ),
173  fVarPdfS ( 0 ),
174  fVarPdfB ( 0 ),
175  fNegEffWarning( kFALSE )
176 {
177 }
178 
179 ////////////////////////////////////////////////////////////////////////////////
180 /// construction from weight file
181 
182 TMVA::MethodCuts::MethodCuts( DataSetInfo& theData,
183  const TString& theWeightFile,
184  TDirectory* theTargetDir ) :
185  MethodBase( Types::kCuts, theData, theWeightFile, theTargetDir ),
186  fFitMethod ( kUseGeneticAlgorithm ),
187  fEffMethod ( kUseEventSelection ),
188  fFitParams (0),
189  fTestSignalEff(0.7),
190  fEffSMin ( 0 ),
191  fEffSMax ( 0 ),
192  fCutRangeMin( 0 ),
193  fCutRangeMax( 0 ),
194  fBinaryTreeS( 0 ),
195  fBinaryTreeB( 0 ),
196  fCutMin ( 0 ),
197  fCutMax ( 0 ),
198  fTmpCutMin ( 0 ),
199  fTmpCutMax ( 0 ),
200  fAllVarsI ( 0 ),
201  fNpar ( 0 ),
202  fEffRef ( 0 ),
203  fRangeSign ( 0 ),
204  fRandom ( 0 ),
205  fMeanS ( 0 ),
206  fMeanB ( 0 ),
207  fRmsS ( 0 ),
208  fRmsB ( 0 ),
209  fEffBvsSLocal( 0 ),
210  fVarHistS ( 0 ),
211  fVarHistB ( 0 ),
212  fVarHistS_smooth( 0 ),
213  fVarHistB_smooth( 0 ),
214  fVarPdfS ( 0 ),
215  fVarPdfB ( 0 ),
216  fNegEffWarning( kFALSE )
217 {
218 }
219 
220 ////////////////////////////////////////////////////////////////////////////////
221 /// Cuts can only handle classification with 2 classes
222 
223 Bool_t TMVA::MethodCuts::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses,
224  UInt_t /*numberTargets*/ )
225 {
226  return (type == Types::kClassification && numberClasses == 2);
227 }
228 
229 ////////////////////////////////////////////////////////////////////////////////
230 /// default initialisation called by all constructors
231 
232 void TMVA::MethodCuts::Init( void )
233 {
234  fVarHistS = fVarHistB = 0;
235  fVarHistS_smooth = fVarHistB_smooth = 0;
236  fVarPdfS = fVarPdfB = 0;
237  fFitParams = 0;
238  fBinaryTreeS = fBinaryTreeB = 0;
239  fEffSMin = 0;
240  fEffSMax = 0;
241 
242  // vector with fit results
243  fNpar = 2*GetNvar();
244  fRangeSign = new std::vector<Int_t> ( GetNvar() );
245  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) (*fRangeSign)[ivar] = +1;
246 
247  fMeanS = new std::vector<Double_t>( GetNvar() );
248  fMeanB = new std::vector<Double_t>( GetNvar() );
249  fRmsS = new std::vector<Double_t>( GetNvar() );
250  fRmsB = new std::vector<Double_t>( GetNvar() );
251 
252  // get the variable specific options, first initialize default
253  fFitParams = new std::vector<EFitParameters>( GetNvar() );
254  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) (*fFitParams)[ivar] = kNotEnforced;
255 
256  fFitMethod = kUseMonteCarlo;
257  fTestSignalEff = -1;
258 
259  // create LUT for cuts
260  fCutMin = new Double_t*[GetNvar()];
261  fCutMax = new Double_t*[GetNvar()];
262  for (UInt_t i=0; i<GetNvar(); i++) {
263  fCutMin[i] = new Double_t[fNbins];
264  fCutMax[i] = new Double_t[fNbins];
265  }
266 
267  // init
268  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
269  for (Int_t ibin=0; ibin<fNbins; ibin++) {
270  fCutMin[ivar][ibin] = 0;
271  fCutMax[ivar][ibin] = 0;
272  }
273  }
274 
275  fTmpCutMin = new Double_t[GetNvar()];
276  fTmpCutMax = new Double_t[GetNvar()];
277 }
278 
279 ////////////////////////////////////////////////////////////////////////////////
280 /// destructor
281 
282 TMVA::MethodCuts::~MethodCuts( void )
283 {
284  delete fRangeSign;
285  delete fMeanS;
286  delete fMeanB;
287  delete fRmsS;
288  delete fRmsB;
289  delete fFitParams;
290  delete fEffBvsSLocal;
291 
292  if (NULL != fCutRangeMin) delete [] fCutRangeMin;
293  if (NULL != fCutRangeMax) delete [] fCutRangeMax;
294  if (NULL != fAllVarsI) delete [] fAllVarsI;
295 
296  for (UInt_t i=0;i<GetNvar();i++) {
297  if (NULL != fCutMin[i] ) delete [] fCutMin[i];
298  if (NULL != fCutMax[i] ) delete [] fCutMax[i];
299  if (NULL != fCutRange[i]) delete fCutRange[i];
300  }
301 
302  if (NULL != fCutMin) delete [] fCutMin;
303  if (NULL != fCutMax) delete [] fCutMax;
304 
305  if (NULL != fTmpCutMin) delete [] fTmpCutMin;
306  if (NULL != fTmpCutMax) delete [] fTmpCutMax;
307 
308  if (NULL != fBinaryTreeS) delete fBinaryTreeS;
309  if (NULL != fBinaryTreeB) delete fBinaryTreeB;
310 }
311 
312 ////////////////////////////////////////////////////////////////////////////////
313 /// define the options (their key words) that can be set in the option string
314 /// known options:
315 /// FitMethod <string> Minimisation method
316 /// available values are: MC Monte Carlo
317 /// GA Genetic Algorithm <default>
318 /// SA Simulated annealing
319 ///
320 /// EffMethod <string> Efficiency selection method
321 /// available values are: EffSel <default>
322 /// EffPDF
323 ///
324 /// VarProp <string> Property of variable 1 for the MC method (taking precedence over the
325 /// global setting). The same values as for the global option are available. Variables 1..10 can be
326 /// set this way.
327 ///
328 /// CutRangeMin/Max <float> user-defined ranges in which cuts are varied
329 
330 void TMVA::MethodCuts::DeclareOptions()
331 {
332  DeclareOptionRef(fFitMethodS = "GA", "FitMethod", "Minimisation Method (GA, SA, and MC are the primary methods to be used; the others have been introduced for testing purposes and are deprecated)");
333  AddPreDefVal(TString("GA"));
334  AddPreDefVal(TString("SA"));
335  AddPreDefVal(TString("MC"));
336  AddPreDefVal(TString("MCEvents"));
337  AddPreDefVal(TString("MINUIT"));
338  AddPreDefVal(TString("EventScan"));
339 
340  // selection type
341  DeclareOptionRef(fEffMethodS = "EffSel", "EffMethod", "Selection Method");
342  AddPreDefVal(TString("EffSel"));
343  AddPreDefVal(TString("EffPDF"));
344 
345  // cut ranges
346  fCutRange.resize(GetNvar());
347  fCutRangeMin = new Double_t[GetNvar()];
348  fCutRangeMax = new Double_t[GetNvar()];
349  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
350  fCutRange[ivar] = 0;
351  fCutRangeMin[ivar] = fCutRangeMax[ivar] = -1;
352  }
353 
354  DeclareOptionRef( fCutRangeMin, GetNvar(), "CutRangeMin", "Minimum of allowed cut range (set per variable)" );
355  DeclareOptionRef( fCutRangeMax, GetNvar(), "CutRangeMax", "Maximum of allowed cut range (set per variable)" );
356 
357  fAllVarsI = new TString[GetNvar()];
358 
359  for (UInt_t i=0; i<GetNvar(); i++) fAllVarsI[i] = "NotEnforced";
360 
361  DeclareOptionRef(fAllVarsI, GetNvar(), "VarProp", "Categorisation of cuts");
362  AddPreDefVal(TString("NotEnforced"));
363  AddPreDefVal(TString("FMax"));
364  AddPreDefVal(TString("FMin"));
365  AddPreDefVal(TString("FSmart"));
366 }
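// Illustrative option strings for the options declared above (values are
// examples only); options specific to the chosen fitter (e.g. the Genetic
// Algorithm settings) are parsed later by the FitterBase object created in
// Train():
//
//   "FitMethod=MC:EffMethod=EffSel"
//   "FitMethod=GA:EffMethod=EffPDF"
//   "FitMethod=GA:VarProp[0]=FMax:CutRangeMin[0]=0:CutRangeMax[0]=100"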
367 
368 ////////////////////////////////////////////////////////////////////////////////
369 /// process user options
370 /// sanity check, do not allow the input variables to be normalised, because this
371 /// only creates problems when interpreting the cuts
372 
373 void TMVA::MethodCuts::ProcessOptions()
374 {
375  if (IsNormalised()) {
376  Log() << kWARNING << "Normalisation of the input variables for cut optimisation is not" << Endl;
377  Log() << kWARNING << "supported because this provides intransparent cut values, and no" << Endl;
378  Log() << kWARNING << "improvement in the performance of the algorithm." << Endl;
379  Log() << kWARNING << "Please remove \"Normalise\" option from booking option string" << Endl;
380  Log() << kWARNING << "==> Will reset normalisation flag to \"False\"" << Endl;
381  SetNormalised( kFALSE );
382  }
383 
384  if (IgnoreEventsWithNegWeightsInTraining()) {
385  Log() << kFATAL << "Mechanism to ignore events with negative weights in training not yet available for method: "
386  << GetMethodTypeName()
387  << " --> Please remove \"IgnoreNegWeightsInTraining\" option from booking string."
388  << Endl;
389  }
390 
391  if (fFitMethodS == "MC" ) fFitMethod = kUseMonteCarlo;
392  else if (fFitMethodS == "MCEvents") fFitMethod = kUseMonteCarloEvents;
393  else if (fFitMethodS == "GA" ) fFitMethod = kUseGeneticAlgorithm;
394  else if (fFitMethodS == "SA" ) fFitMethod = kUseSimulatedAnnealing;
395  else if (fFitMethodS == "MINUIT" ) {
396  fFitMethod = kUseMinuit;
397  Log() << kWARNING << "poor performance of MINUIT in MethodCuts; preferred fit method: GA" << Endl;
398  }
399  else if (fFitMethodS == "EventScan" ) fFitMethod = kUseEventScan;
400  else Log() << kFATAL << "unknown minimisation method: " << fFitMethodS << Endl;
401 
402  if (fEffMethodS == "EFFSEL" ) fEffMethod = kUseEventSelection; // highly recommended
403  else if (fEffMethodS == "EFFPDF" ) fEffMethod = kUsePDFs;
404  else fEffMethod = kUseEventSelection;
405 
406  // options output
407  Log() << kINFO << Form("Use optimization method: \"%s\"",
408  (fFitMethod == kUseMonteCarlo) ? "Monte Carlo" :
409  (fFitMethod == kUseMonteCarloEvents) ? "Monte-Carlo-Event sampling" :
410  (fFitMethod == kUseEventScan) ? "Full Event Scan (slow)" :
411  (fFitMethod == kUseMinuit) ? "MINUIT" : "Genetic Algorithm" ) << Endl;
412  Log() << kINFO << Form("Use efficiency computation method: \"%s\"",
413  (fEffMethod == kUseEventSelection) ? "Event Selection" : "PDF" ) << Endl;
414 
415  // cut ranges
416  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
417  fCutRange[ivar] = new Interval( fCutRangeMin[ivar], fCutRangeMax[ivar] );
418  }
419 
420  // individual options
421  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
422  EFitParameters theFitP = kNotEnforced;
423  if (fAllVarsI[ivar] == "" || fAllVarsI[ivar] == "NotEnforced") theFitP = kNotEnforced;
424  else if (fAllVarsI[ivar] == "FMax" ) theFitP = kForceMax;
425  else if (fAllVarsI[ivar] == "FMin" ) theFitP = kForceMin;
426  else if (fAllVarsI[ivar] == "FSmart" ) theFitP = kForceSmart;
427  else {
428  Log() << kFATAL << "unknown value \'" << fAllVarsI[ivar]
429  << "\' for fit parameter option " << Form("VarProp[%i]",ivar) << Endl;
430  }
431  (*fFitParams)[ivar] = theFitP;
432 
433  if (theFitP != kNotEnforced)
434  Log() << kINFO << "Use \"" << fAllVarsI[ivar]
435  << "\" cuts for variable: " << "'" << (*fInputVars)[ivar] << "'" << Endl;
436  }
437 }
438 
439 ////////////////////////////////////////////////////////////////////////////////
440 /// cut evaluation: returns 1.0 if event passed, 0.0 otherwise
441 
442 Double_t TMVA::MethodCuts::GetMvaValue( Double_t* err, Double_t* errUpper )
443 {
444  // cannot determine error
445  NoErrorCalc(err, errUpper);
446 
447  // sanity check
448  if (fCutMin == NULL || fCutMax == NULL || fNbins == 0) {
449  Log() << kFATAL << "<Eval_Cuts> fCutMin/Max have zero pointer. "
450  << "Did you book Cuts ?" << Endl;
451  }
452 
453  const Event* ev = GetEvent();
454 
455  // sanity check
456  if (fTestSignalEff > 0) {
457  // get efficiency bin
458  Int_t ibin = fEffBvsSLocal->FindBin( fTestSignalEff );
459  if (ibin < 0 ) ibin = 0;
460  else if (ibin >= fNbins) ibin = fNbins - 1;
461 
462  Bool_t passed = kTRUE;
463  for (UInt_t ivar=0; ivar<GetNvar(); ivar++)
464  passed &= ( (ev->GetValue(ivar) > fCutMin[ivar][ibin]) &&
465  (ev->GetValue(ivar) <= fCutMax[ivar][ibin]) );
466 
467  return passed ? 1. : 0. ;
468  }
469  else return 0;
470 }
471 
472 ////////////////////////////////////////////////////////////////////////////////
473 /// print cuts
474 
475 void TMVA::MethodCuts::PrintCuts( Double_t effS ) const
476 {
477  std::vector<Double_t> cutsMin;
478  std::vector<Double_t> cutsMax;
479  Int_t ibin = fEffBvsSLocal->FindBin( effS );
480 
481  Double_t trueEffS = GetCuts( effS, cutsMin, cutsMax );
482 
483  // retrieve variable expressions (could be transformations)
484  std::vector<TString>* varVec = 0;
485  if (GetTransformationHandler().GetNumOfTransformations() == 0) {
486  // no transformation applied, replace by current variables
487  varVec = new std::vector<TString>;
488  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
489  varVec->push_back( DataInfo().GetVariableInfo(ivar).GetLabel() );
490  }
491  }
492  else if (GetTransformationHandler().GetNumOfTransformations() == 1) {
493  // get transformation string
494  varVec = GetTransformationHandler().GetTransformationStringsOfLastTransform();
495  }
496  else {
497  // replace transformation print by current variables and indicated incompleteness
498  varVec = new std::vector<TString>;
499  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
500  varVec->push_back( DataInfo().GetVariableInfo(ivar).GetLabel() + " [transformed]" );
501  }
502  }
503 
504  UInt_t maxL = 0;
505  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
506  if ((UInt_t)(*varVec)[ivar].Length() > maxL) maxL = (*varVec)[ivar].Length();
507  }
508  UInt_t maxLine = 20+maxL+16;
509 
510  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
511  Log() << Endl;
512  Log() << kINFO << "Cut values for requested signal efficiency: " << trueEffS << Endl;
513  Log() << kINFO << "Corresponding background efficiency : " << fEffBvsSLocal->GetBinContent( ibin ) << Endl;
514  if (GetTransformationHandler().GetNumOfTransformations() == 1) {
515  Log() << kINFO << "Transformation applied to input variables : \""
516  << GetTransformationHandler().GetNameOfLastTransform() << "\"" << Endl;
517  }
518  else if (GetTransformationHandler().GetNumOfTransformations() > 1) {
519  Log() << kINFO << "[ More than one (=" << GetTransformationHandler().GetNumOfTransformations() << ") "
520  << " transformations applied in transformation chain; cuts applied on transformed quantities ] " << Endl;
521  }
522  else {
523  Log() << kINFO << "Transformation applied to input variables : None" << Endl;
524  }
525  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
526  Log() << Endl;
527  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
528  Log() << kINFO
529  << "Cut[" << std::setw(2) << ivar << "]: "
530  << std::setw(10) << cutsMin[ivar]
531  << " < "
532  << std::setw(maxL) << (*varVec)[ivar]
533  << " <= "
534  << std::setw(10) << cutsMax[ivar] << Endl;
535  }
536  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
537  Log() << Endl;
538 
539  delete varVec; // yes, ownership has been given to us
540 }
541 
542 ////////////////////////////////////////////////////////////////////////////////
543 /// retrieve cut values for given signal efficiency
544 /// assume vector of correct size !!
545 
546 Double_t TMVA::MethodCuts::GetCuts( Double_t effS, Double_t* cutMin, Double_t* cutMax ) const
547 {
548  std::vector<Double_t> cMin( GetNvar() );
549  std::vector<Double_t> cMax( GetNvar() );
550  Double_t trueEffS = GetCuts( effS, cMin, cMax );
551  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
552  cutMin[ivar] = cMin[ivar];
553  cutMax[ivar] = cMax[ivar];
554  }
555  return trueEffS;
556 }
557 
558 ////////////////////////////////////////////////////////////////////////////////
559 /// retrieve cut values for given signal efficiency
560 
561 Double_t TMVA::MethodCuts::GetCuts( Double_t effS,
562  std::vector<Double_t>& cutMin,
563  std::vector<Double_t>& cutMax ) const
564 {
565  // find corresponding bin
566  Int_t ibin = fEffBvsSLocal->FindBin( effS );
567 
568  // get the true efficiency which is the one on the "left hand" side of the bin
569  Double_t trueEffS = fEffBvsSLocal->GetBinLowEdge( ibin );
570 
571  ibin--; // the 'cut' vector has 0...fNbins indices
572  if (ibin < 0 ) ibin = 0;
573  else if (ibin >= fNbins) ibin = fNbins - 1;
574 
575  cutMin.clear();
576  cutMax.clear();
577  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
578  cutMin.push_back( fCutMin[ivar][ibin] );
579  cutMax.push_back( fCutMax[ivar][ibin] );
580  }
581 
582  return trueEffS;
583 }
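// Illustrative use of the two GetCuts() overloads above on a trained method
// ("cuts" is an assumed pointer to a MethodCuts instance):
//
//   std::vector<Double_t> cutsMin, cutsMax;
//   Double_t trueEffS = cuts->GetCuts( 0.7, cutsMin, cutsMax ); // cuts for ~70% signal efficiency
//   for (UInt_t ivar = 0; ivar < cutsMin.size(); ivar++)
//      std::cout << cutsMin[ivar] << " < var" << ivar << " <= " << cutsMax[ivar] << std::endl;
//   cuts->PrintCuts( 0.7 ); // formatted printout of the same cut set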
584 
585 ////////////////////////////////////////////////////////////////////////////////
586 /// training method: here the cuts are optimised for the training sample
587 
588 void TMVA::MethodCuts::Train( void )
589 {
590  if (fEffMethod == kUsePDFs) CreateVariablePDFs(); // create PDFs for variables
591 
592  // create binary trees (global member variables) for signal and background
593  if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
594  if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
595 
596  // the variables may be transformed by a transformation method: to coherently
597  // treat signal and background one must decide which transformation type shall
598  // be used: our default is signal-type
599 
600  fBinaryTreeS = new BinarySearchTree();
601  fBinaryTreeS->Fill( GetEventCollection(Types::kTraining), fSignalClass );
602  fBinaryTreeB = new BinarySearchTree();
603  fBinaryTreeB->Fill( GetEventCollection(Types::kTraining), fBackgroundClass );
604 
605  for (UInt_t ivar =0; ivar < Data()->GetNVariables(); ivar++) {
606  (*fMeanS)[ivar] = fBinaryTreeS->Mean(Types::kSignal, ivar);
607  (*fRmsS)[ivar] = fBinaryTreeS->RMS (Types::kSignal, ivar);
608  (*fMeanB)[ivar] = fBinaryTreeB->Mean(Types::kBackground, ivar);
609  (*fRmsB)[ivar] = fBinaryTreeB->RMS (Types::kBackground, ivar);
610 
611  // update interval ?
612  Double_t xmin = TMath::Min(fBinaryTreeS->Min(Types::kSignal, ivar),
613  fBinaryTreeB->Min(Types::kBackground, ivar));
614  Double_t xmax = TMath::Max(fBinaryTreeS->Max(Types::kSignal, ivar),
615  fBinaryTreeB->Max(Types::kBackground, ivar));
616 
617  // redefine ranges to be slightly smaller and larger than xmin and xmax, respectively
618  Double_t eps = 0.01*(xmax - xmin);
619  xmin -= eps;
620  xmax += eps;
621 
622  if (TMath::Abs(fCutRange[ivar]->GetMin() - fCutRange[ivar]->GetMax()) < 1.0e-300 ) {
623  fCutRange[ivar]->SetMin( xmin );
624  fCutRange[ivar]->SetMax( xmax );
625  }
626  else if (xmin > fCutRange[ivar]->GetMin()) fCutRange[ivar]->SetMin( xmin );
627  else if (xmax < fCutRange[ivar]->GetMax()) fCutRange[ivar]->SetMax( xmax );
628  }
629 
630  std::vector<TH1F*> signalDist, bkgDist;
631 
632  // this is important: reset the branch addresses of the training tree to the current event
633  delete fEffBvsSLocal;
634  fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
635  TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
636  fEffBvsSLocal->SetDirectory(0); // it's local
637 
638  // init
639  for (Int_t ibin=1; ibin<=fNbins; ibin++) fEffBvsSLocal->SetBinContent( ibin, -0.1 );
640 
641  // --------------------------------------------------------------------------
642  if (fFitMethod == kUseGeneticAlgorithm ||
643  fFitMethod == kUseMonteCarlo ||
644  fFitMethod == kUseMinuit ||
645  fFitMethod == kUseSimulatedAnnealing) {
646 
647  // ranges
648  std::vector<Interval*> ranges;
649 
650  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
651 
652  Int_t nbins = 0;
653  if (DataInfo().GetVariableInfo(ivar).GetVarType() == 'I') {
654  nbins = Int_t(fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin()) + 1;
655  }
656 
657  if ((*fFitParams)[ivar] == kForceSmart) {
658  if ((*fMeanS)[ivar] > (*fMeanB)[ivar]) (*fFitParams)[ivar] = kForceMax;
659  else (*fFitParams)[ivar] = kForceMin;
660  }
661 
662  if ((*fFitParams)[ivar] == kForceMin) {
663  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMin(), nbins ) );
664  ranges.push_back( new Interval( 0, fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
665  }
666  else if ((*fFitParams)[ivar] == kForceMax) {
667  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMax(), nbins ) );
668  ranges.push_back( new Interval( fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(),
669  fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
670  }
671  else {
672  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMax(), nbins ) );
673  ranges.push_back( new Interval( 0, fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
674  }
675  }
676 
677  // create the fitter
678  FitterBase* fitter = NULL;
679 
680  switch (fFitMethod) {
681  case kUseGeneticAlgorithm:
682  fitter = new GeneticFitter( *this, Form("%sFitter_GA", GetName()), ranges, GetOptions() );
683  break;
684  case kUseMonteCarlo:
685  fitter = new MCFitter ( *this, Form("%sFitter_MC", GetName()), ranges, GetOptions() );
686  break;
687  case kUseMinuit:
688  fitter = new MinuitFitter ( *this, Form("%sFitter_MINUIT", GetName()), ranges, GetOptions() );
689  break;
690  case kUseSimulatedAnnealing:
691  fitter = new SimulatedAnnealingFitter( *this, Form("%sFitter_SA", GetName()), ranges, GetOptions() );
692  break;
693  default:
694  Log() << kFATAL << "Wrong fit method: " << fFitMethod << Endl;
695  }
696 
697  fitter->CheckForUnusedOptions();
698 
699  // perform the fit
700  fitter->Run();
701 
702  // clean up
703  for (UInt_t ivar=0; ivar<ranges.size(); ivar++) delete ranges[ivar];
704  delete fitter;
705 
706  }
707  // --------------------------------------------------------------------------
708  else if (fFitMethod == kUseEventScan) {
709 
710  Int_t nevents = Data()->GetNEvents();
711  Int_t ic = 0;
712 
713  // timing of MC
714  Int_t nsamples = Int_t(0.5*nevents*(nevents - 1));
715  Timer timer( nsamples, GetName() );
716 
717  Log() << kINFO << "Running full event scan: " << Endl;
718  for (Int_t ievt1=0; ievt1<nevents; ievt1++) {
719  for (Int_t ievt2=ievt1+1; ievt2<nevents; ievt2++) {
720 
721  EstimatorFunction( ievt1, ievt2 );
722 
723  // what's the time please?
724  ic++;
725  if ((nsamples<10000) || ic%10000 == 0) timer.DrawProgressBar( ic );
726  }
727  }
728  }
729  // --------------------------------------------------------------------------
730  else if (fFitMethod == kUseMonteCarloEvents) {
731 
732  Int_t nsamples = 200000;
733  UInt_t seed = 100;
734  DeclareOptionRef( nsamples, "SampleSize", "Number of Monte-Carlo-Event samples" );
735  DeclareOptionRef( seed, "Seed", "Seed for the random generator (0 takes random seeds)" );
736  ParseOptions();
737 
738  Int_t nevents = Data()->GetNEvents();
739  Int_t ic = 0;
740 
741  // timing of MC
742  Timer timer( nsamples, GetName() );
743 
744  // random generator
745  TRandom3*rnd = new TRandom3( seed );
746 
747  Log() << kINFO << "Running Monte-Carlo-Event sampling over " << nsamples << " events" << Endl;
748  std::vector<Double_t> pars( 2*GetNvar() );
749 
750  for (Int_t itoy=0; itoy<nsamples; itoy++) {
751 
752  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
753 
754  // generate minimum and delta cuts for this variable
755 
756  // retrieve signal events
757  Bool_t isSignal = kFALSE;
758  Int_t ievt1, ievt2;
759  Double_t evt1 = 0., evt2 = 0.;
760  Int_t nbreak = 0;
761  while (!isSignal) {
762  ievt1 = Int_t(rnd->Uniform(0.,1.)*nevents);
763  ievt2 = Int_t(rnd->Uniform(0.,1.)*nevents);
764 
765  const Event *ev1 = GetEvent(ievt1);
766  isSignal = DataInfo().IsSignal(ev1);
767  evt1 = ev1->GetValue( ivar );
768 
769  const Event *ev2 = GetEvent(ievt2);
770  isSignal &= DataInfo().IsSignal(ev2);
771  evt2 = ev2->GetValue( ivar );
772 
773  if (nbreak++ > 10000) Log() << kFATAL << "<MCEvents>: could not find signal events"
774  << " after 10000 trials - do you have signal events in your sample ?"
775  << Endl;
776  isSignal = 1;
777  }
778 
779  // sort
780  if (evt1 > evt2) { Double_t z = evt1; evt1 = evt2; evt2 = z; }
781  pars[2*ivar] = evt1;
782  pars[2*ivar+1] = evt2 - evt1;
783  }
784 
785  // compute estimator
786  EstimatorFunction( pars );
787 
788  // what's the time please?
789  ic++;
790  if ((nsamples<1000) || ic%1000 == 0) timer.DrawProgressBar( ic );
791  }
792 
793  delete rnd;
794  }
795  // --------------------------------------------------------------------------
796  else Log() << kFATAL << "Unknown minimisation method: " << fFitMethod << Endl;
797 
798  if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
799  if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
800 
801  // force cut ranges within limits
802  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
803  for (Int_t ibin=0; ibin<fNbins; ibin++) {
804 
805  if ((*fFitParams)[ivar] == kForceMin && fCutMin[ivar][ibin] > -fgMaxAbsCutVal) {
806  fCutMin[ivar][ibin] = -fgMaxAbsCutVal;
807  }
808  if ((*fFitParams)[ivar] == kForceMax && fCutMax[ivar][ibin] < fgMaxAbsCutVal) {
809  fCutMax[ivar][ibin] = fgMaxAbsCutVal;
810  }
811  }
812  }
813 
814  // some output
815  // the efficiency which is asked for has to be slightly higher than the bin-borders.
816  // if not, then the wrong bin is taken in some cases.
817  Double_t epsilon = 0.0001;
818  for (Double_t eff=0.1; eff<0.95; eff += 0.1) PrintCuts( eff+epsilon );
819 }
820 
821 ////////////////////////////////////////////////////////////////////////////////
822 /// nothing to test
823 
824 void TMVA::MethodCuts::TestClassification()
825 {
826 }
827 
828 ////////////////////////////////////////////////////////////////////////////////
829 /// for full event scan
830 
831 Double_t TMVA::MethodCuts::EstimatorFunction( Int_t ievt1, Int_t ievt2 )
832 {
833  const Event *ev1 = GetEvent(ievt1);
834  if (!DataInfo().IsSignal(ev1)) return -1;
835 
836  const Event *ev2 = GetEvent(ievt2);
837  if (!DataInfo().IsSignal(ev2)) return -1;
838 
839  const Int_t nvar = GetNvar();
840  Double_t* evt1 = new Double_t[nvar];
841  Double_t* evt2 = new Double_t[nvar];
842 
843  for (Int_t ivar=0; ivar<nvar; ivar++) {
844  evt1[ivar] = ev1->GetValue( ivar );
845  evt2[ivar] = ev2->GetValue( ivar );
846  }
847 
848  // determine cuts
849  std::vector<Double_t> pars;
850  for (Int_t ivar=0; ivar<nvar; ivar++) {
851  Double_t cutMin;
852  Double_t cutMax;
853  if (evt1[ivar] < evt2[ivar]) {
854  cutMin = evt1[ivar];
855  cutMax = evt2[ivar];
856  }
857  else {
858  cutMin = evt2[ivar];
859  cutMax = evt1[ivar];
860  }
861 
862  pars.push_back( cutMin );
863  pars.push_back( cutMax - cutMin );
864  }
865 
866  delete [] evt1;
867  delete [] evt2;
868 
869  return ComputeEstimator( pars );
870 }
871 
872 ////////////////////////////////////////////////////////////////////////////////
873 /// returns estimator for "cut fitness" used by GA
874 
875 Double_t TMVA::MethodCuts::EstimatorFunction( std::vector<Double_t>& pars )
876 {
877  return ComputeEstimator( pars );
878 }
879 
880 ////////////////////////////////////////////////////////////////////////////////
881 /// returns estimator for "cut fitness" used by GA
882 /// there are two requirements:
883 /// 1) the signal efficiency must be equal to the required one in the
884 /// efficiency scan
885 /// 2) the background efficiency must be as small as possible
886 /// the requirement 1) has priority over 2)
887 
888 Double_t TMVA::MethodCuts::ComputeEstimator( std::vector<Double_t>& pars )
889 {
890  // caution: the npar gives the _free_ parameters
891  // however: the "pars" array contains all parameters
892 
893  // determine cuts
894  Double_t effS = 0, effB = 0;
895  this->MatchParsToCuts( pars, &fTmpCutMin[0], &fTmpCutMax[0] );
896 
897  // retrieve signal and background efficiencies for given cut
898  switch (fEffMethod) {
899  case kUsePDFs:
900  this->GetEffsfromPDFs (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
901  break;
902  case kUseEventSelection:
903  this->GetEffsfromSelection (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
904  break;
905  default:
906  this->GetEffsfromSelection (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
907  }
908 
909  Double_t eta = 0;
910 
911  // test for an estimator function which optimizes on the whole background-rejection vs. signal-efficiency plot
912 
913  // get the backg-reject. and sig-eff for the parameters given to this function
914  // effS, effB
915 
916  // get best background rejection for given signal efficiency
917  Int_t ibinS = fEffBvsSLocal->FindBin( effS );
918 
919  Double_t effBH = fEffBvsSLocal->GetBinContent( ibinS );
920  Double_t effBH_left = (ibinS > 1 ) ? fEffBvsSLocal->GetBinContent( ibinS-1 ) : effBH;
921  Double_t effBH_right = (ibinS < fNbins) ? fEffBvsSLocal->GetBinContent( ibinS+1 ) : effBH;
922 
923  Double_t average = 0.5*(effBH_left + effBH_right);
924  if (effBH < effB) average = effBH;
925 
926  // if the average of the bin right and left is larger than this one, add the difference to
927  // the current value of the estimator (because you can do at least so much better)
928  eta = ( -TMath::Abs(effBH-average) + (1.0 - (effBH - effB))) / (1.0 + effS);
929  // alternative idea
930  //if (effBH<0) eta = (1.e-6+effB)/(1.0 + effS);
931  //else eta = (effB - effBH) * (1.0 + 10.* effS);
932 
933  // if a point is found which is better than an existing one, ... replace it.
934  // preliminary best event -> backup
935  if (effBH < 0 || effBH > effB) {
936  fEffBvsSLocal->SetBinContent( ibinS, effB );
937  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
938  fCutMin[ivar][ibinS-1] = fTmpCutMin[ivar]; // bin 1 stored in index 0
939  fCutMax[ivar][ibinS-1] = fTmpCutMax[ivar];
940  }
941  }
942 
943  // caution (!) this value is not good for a decision for MC, .. it is designed for GA
944  // but .. it doesn't matter, as MC samplings are independent from the former ones
945  // and the replacement of the best variables by better ones is done about 10 lines above.
946  // ( if (effBH < 0 || effBH > effB) { .... )
947 
948  if (ibinS<=1) {
949  // add penalty for effS=0 bin
950  // to avoid that the minimizer gets stuck in the zero-bin
951  // force it towards higher efficiency
952  Double_t penalty=0.,diff=0.;
953  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
954  diff=(fCutRange[ivar]->GetMax()-fTmpCutMax[ivar])/(fCutRange[ivar]->GetMax()-fCutRange[ivar]->GetMin());
955  penalty+=diff*diff;
956  diff=(fCutRange[ivar]->GetMin()-fTmpCutMin[ivar])/(fCutRange[ivar]->GetMax()-fCutRange[ivar]->GetMin());
957  penalty+=4.*diff*diff;
958  }
959 
960  if (effS<1.e-4) return 10.0+penalty;
961  else return 10.*(1.-10.*effS);
962  }
963  return eta;
964 }
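// Numerical illustration of the replacement logic above (made-up values):
// if the bin belonging to the tested signal efficiency currently holds
// effBH = 0.30 (or still the initial -0.1), and the new cut set yields
// effB = 0.25 at that signal efficiency, then the condition
// (effBH < 0 || effBH > effB) is fulfilled, the bin content is overwritten
// with 0.25, and the corresponding cuts are stored in fCutMin/fCutMax.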
965 
966 ////////////////////////////////////////////////////////////////////////////////
967 /// translates parameters into cuts
968 
969 void TMVA::MethodCuts::MatchParsToCuts( const std::vector<Double_t> & pars,
970  Double_t* cutMin, Double_t* cutMax )
971 {
972  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
973  Int_t ipar = 2*ivar;
974  cutMin[ivar] = ((*fRangeSign)[ivar] > 0) ? pars[ipar] : pars[ipar] - pars[ipar+1];
975  cutMax[ivar] = ((*fRangeSign)[ivar] > 0) ? pars[ipar] + pars[ipar+1] : pars[ipar];
976  }
977 }
978 
979 ////////////////////////////////////////////////////////////////////////////////
980 /// translate the cuts into parameters (obsolete function)
981 
982 void TMVA::MethodCuts::MatchCutsToPars( std::vector<Double_t>& pars,
983  Double_t** cutMinAll, Double_t** cutMaxAll, Int_t ibin )
984 {
985  if (ibin < 1 || ibin > fNbins) Log() << kFATAL << "::MatchCutsToPars: bin error: "
986  << ibin << Endl;
987 
988  const UInt_t nvar = GetNvar();
989  Double_t *cutMin = new Double_t[nvar];
990  Double_t *cutMax = new Double_t[nvar];
991  for (UInt_t ivar=0; ivar<nvar; ivar++) {
992  cutMin[ivar] = cutMinAll[ivar][ibin-1];
993  cutMax[ivar] = cutMaxAll[ivar][ibin-1];
994  }
995 
996  MatchCutsToPars( pars, cutMin, cutMax );
997  delete [] cutMin;
998  delete [] cutMax;
999 }
1000 
1001 ////////////////////////////////////////////////////////////////////////////////
1002 /// translates cuts into parameters
1003 
1004 void TMVA::MethodCuts::MatchCutsToPars( std::vector<Double_t>& pars,
1005  Double_t* cutMin, Double_t* cutMax )
1006 {
1007  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1008  Int_t ipar = 2*ivar;
1009  pars[ipar] = ((*fRangeSign)[ivar] > 0) ? cutMin[ivar] : cutMax[ivar];
1010  pars[ipar+1] = cutMax[ivar] - cutMin[ivar];
1011  }
1012 }
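// Illustration of the parameter encoding used by MatchParsToCuts() and
// MatchCutsToPars() above (numbers are made up): with (*fRangeSign)[ivar] = +1,
// a cut window [cutMin, cutMax] = [0.2, 1.5] is encoded as
// pars = { 0.2, 1.3 }, i.e. the lower cut value and the window width;
// MatchParsToCuts() applies the inverse mapping when a parameter set is evaluated.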
1013 
1014 ////////////////////////////////////////////////////////////////////////////////
1015 /// compute signal and background efficiencies from PDFs
1016 /// for given cut sample
1017 
1018 void TMVA::MethodCuts::GetEffsfromPDFs( Double_t* cutMin, Double_t* cutMax,
1019  Double_t& effS, Double_t& effB )
1020 {
1021  effS = 1.0;
1022  effB = 1.0;
1023  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1024  effS *= (*fVarPdfS)[ivar]->GetIntegral( cutMin[ivar], cutMax[ivar] );
1025  effB *= (*fVarPdfB)[ivar]->GetIntegral( cutMin[ivar], cutMax[ivar] );
1026  }
1027 
1028  // quick fix to prevent efficiencies < 0
1029  if( effS < 0.0 ) {
1030  effS = 0.0;
1031  if( !fNegEffWarning ) Log() << kWARNING << "Negative signal efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
1032  fNegEffWarning = kTRUE;
1033  }
1034  if( effB < 0.0 ) {
1035  effB = 0.0;
1036  if( !fNegEffWarning ) Log() << kWARNING << "Negative background efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
1037  fNegEffWarning = kTRUE;
1038  }
1039 }
1040 
1041 ////////////////////////////////////////////////////////////////////////////////
1042 /// compute signal and background efficiencies from event counting
1043 /// for given cut sample
1044 
1045 void TMVA::MethodCuts::GetEffsfromSelection( Double_t* cutMin, Double_t* cutMax,
1046  Double_t& effS, Double_t& effB)
1047 {
1048  Float_t nTotS = 0, nTotB = 0;
1049  Float_t nSelS = 0, nSelB = 0;
1050 
1051  Volume* volume = new Volume( cutMin, cutMax, GetNvar() );
1052 
1053  // search for all events lying in the volume, and add up their weights
1054  nSelS = fBinaryTreeS->SearchVolume( volume );
1055  nSelB = fBinaryTreeB->SearchVolume( volume );
1056 
1057  delete volume;
1058 
1059  // total number of "events" (sum of weights) as reference to compute efficiency
1060  nTotS = fBinaryTreeS->GetSumOfWeights();
1061  nTotB = fBinaryTreeB->GetSumOfWeights();
1062 
1063  // sanity check
1064  if (nTotS == 0 && nTotB == 0) {
1065  Log() << kFATAL << "<GetEffsfromSelection> fatal error in zero total number of events:"
1066  << " nTotS, nTotB: " << nTotS << " " << nTotB << " ***" << Endl;
1067  }
1068 
1069  // efficiencies
1070  if (nTotS == 0 ) {
1071  effS = 0;
1072  effB = nSelB/nTotB;
1073  Log() << kWARNING << "<ComputeEstimator> zero number of signal events" << Endl;
1074  }
1075  else if (nTotB == 0) {
1076  effB = 0;
1077  effS = nSelS/nTotS;
1078  Log() << kWARNING << "<ComputeEstimator> zero number of background events" << Endl;
1079  }
1080  else {
1081  effS = nSelS/nTotS;
1082  effB = nSelB/nTotB;
1083  }
1084 
1085  // quick fix to prevent efficiencies < 0
1086  if( effS < 0.0 ) {
1087  effS = 0.0;
1088  if( !fNegEffWarning ) Log() << kWARNING << "Negative signal efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
1089  fNegEffWarning = kTRUE;
1090  }
1091  if( effB < 0.0 ) {
1092  effB = 0.0;
1093  if( !fNegEffWarning ) Log() << kWARNING << "Negative background efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
1094  fNegEffWarning = kTRUE;
1095  }
1096 }
1097 
1098 ////////////////////////////////////////////////////////////////////////////////
1099 /// for PDF method: create efficiency reference histograms and PDFs
1100 
1101 void TMVA::MethodCuts::CreateVariablePDFs( void )
1102 {
1103  // create list of histograms and PDFs
1104  fVarHistS = new std::vector<TH1*>( GetNvar() );
1105  fVarHistB = new std::vector<TH1*>( GetNvar() );
1106  fVarHistS_smooth = new std::vector<TH1*>( GetNvar() );
1107  fVarHistB_smooth = new std::vector<TH1*>( GetNvar() );
1108  fVarPdfS = new std::vector<PDF*>( GetNvar() );
1109  fVarPdfB = new std::vector<PDF*>( GetNvar() );
1110 
1111  Int_t nsmooth = 0;
1112 
1113  // get min and max values of all events
1114  Double_t minVal = DBL_MAX;
1115  Double_t maxVal = -DBL_MAX;
1116  for( UInt_t ievt=0; ievt<Data()->GetNEvents(); ievt++ ){
1117  const Event *ev = GetEvent(ievt);
1118  Float_t val = ev->GetValue(ievt);
1119  if( val < minVal ) minVal = val;
1120  if( val > maxVal ) maxVal = val;
1121  }
1122 
1123  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1124 
1125  // ---- signal
1126  TString histTitle = (*fInputVars)[ivar] + " signal training";
1127  TString histName = (*fInputVars)[ivar] + "_sig";
1128  // TString drawOpt = (*fInputVars)[ivar] + ">>h(";
1129  // drawOpt += fNbins;
1130  // drawOpt += ")";
1131 
1132  // selection
1133  // Data().GetTrainingTree()->Draw( drawOpt, "type==1", "goff" );
1134  // (*fVarHistS)[ivar] = (TH1F*)gDirectory->Get("h");
1135  // (*fVarHistS)[ivar]->SetName(histName);
1136  // (*fVarHistS)[ivar]->SetTitle(histTitle);
1137 
1138  (*fVarHistS)[ivar] = new TH1F(histName.Data(), histTitle.Data(), fNbins, minVal, maxVal );
1139 
1140  // ---- background
1141  histTitle = (*fInputVars)[ivar] + " background training";
1142  histName = (*fInputVars)[ivar] + "_bgd";
1143  // drawOpt = (*fInputVars)[ivar] + ">>h(";
1144  // drawOpt += fNbins;
1145  // drawOpt += ")";
1146 
1147  // Data().GetTrainingTree()->Draw( drawOpt, "type==0", "goff" );
1148  // (*fVarHistB)[ivar] = (TH1F*)gDirectory->Get("h");
1149  // (*fVarHistB)[ivar]->SetName(histName);
1150  // (*fVarHistB)[ivar]->SetTitle(histTitle);
1151 
1152 
1153  (*fVarHistB)[ivar] = new TH1F(histName.Data(), histTitle.Data(), fNbins, minVal, maxVal );
1154 
1155  for( UInt_t ievt=0; ievt<Data()->GetNEvents(); ievt++ ){
1156  const Event *ev = GetEvent(ievt);
1157  Float_t val = ev->GetValue(ivar);
1158  if( DataInfo().IsSignal(ev) ){
1159  (*fVarHistS)[ivar]->Fill( val );
1160  }else{
1161  (*fVarHistB)[ivar]->Fill( val );
1162  }
1163  }
1164 
1165 
1166 
1167  // make copy for smoothed histos
1168  (*fVarHistS_smooth)[ivar] = (TH1F*)(*fVarHistS)[ivar]->Clone();
1169  histTitle = (*fInputVars)[ivar] + " signal training smoothed ";
1170  histTitle += nsmooth;
1171  histTitle +=" times";
1172  histName = (*fInputVars)[ivar] + "_sig_smooth";
1173  (*fVarHistS_smooth)[ivar]->SetName(histName);
1174  (*fVarHistS_smooth)[ivar]->SetTitle(histTitle);
1175 
1176  // smooth
1177  (*fVarHistS_smooth)[ivar]->Smooth(nsmooth);
1178 
1179  // ---- background
1180  // histTitle = (*fInputVars)[ivar] + " background training";
1181  // histName = (*fInputVars)[ivar] + "_bgd";
1182  // drawOpt = (*fInputVars)[ivar] + ">>h(";
1183  // drawOpt += fNbins;
1184  // drawOpt += ")";
1185 
1186  // Data().GetTrainingTree()->Draw( drawOpt, "type==0", "goff" );
1187  // (*fVarHistB)[ivar] = (TH1F*)gDirectory->Get("h");
1188  // (*fVarHistB)[ivar]->SetName(histName);
1189  // (*fVarHistB)[ivar]->SetTitle(histTitle);
1190 
1191  // make copy for smoothed histos
1192  (*fVarHistB_smooth)[ivar] = (TH1F*)(*fVarHistB)[ivar]->Clone();
1193  histTitle = (*fInputVars)[ivar]+" background training smoothed ";
1194  histTitle += nsmooth;
1195  histTitle +=" times";
1196  histName = (*fInputVars)[ivar]+"_bgd_smooth";
1197  (*fVarHistB_smooth)[ivar]->SetName(histName);
1198  (*fVarHistB_smooth)[ivar]->SetTitle(histTitle);
1199 
1200  // smooth
1201  (*fVarHistB_smooth)[ivar]->Smooth(nsmooth);
1202 
1203  // create PDFs
1204  (*fVarPdfS)[ivar] = new PDF( TString(GetName()) + " PDF Var Sig " + GetInputVar( ivar ), (*fVarHistS_smooth)[ivar], PDF::kSpline2 );
1205  (*fVarPdfB)[ivar] = new PDF( TString(GetName()) + " PDF Var Bkg " + GetInputVar( ivar ), (*fVarHistB_smooth)[ivar], PDF::kSpline2 );
1206  }
1207 }
1208 
1209 ////////////////////////////////////////////////////////////////////////////////
1210 /// read the cuts from stream
1211 
1212 void TMVA::MethodCuts::ReadWeightsFromStream( std::istream& istr )
1213 {
1214  TString dummy;
1215  UInt_t dummyInt;
1216 
1217  // first the dimensions
1218  istr >> dummy >> dummy;
1219  // coverity[tainted_data_argument]
1220  istr >> dummy >> fNbins;
1221 
1222  // get rid of one read-in here because we read in once already to check for decorrelation
1223  istr >> dummy >> dummy >> dummy >> dummy >> dummy >> dummy >> dummyInt >> dummy ;
1224 
1225  // sanity check
1226  if (dummyInt != Data()->GetNVariables()) {
1227  Log() << kFATAL << "<ReadWeightsFromStream> fatal error: mismatch "
1228  << "in number of variables: " << dummyInt << " != " << Data()->GetNVariables() << Endl;
1229  }
1230  //SetNvar(dummyInt);
1231 
1232  // print some information
1233  if (fFitMethod == kUseMonteCarlo) {
1234  Log() << kINFO << "Read cuts optimised using sample of MC events" << Endl;
1235  }
1236  else if (fFitMethod == kUseMonteCarloEvents) {
1237  Log() << kINFO << "Read cuts optimised using sample of MC events" << Endl;
1238  }
1239  else if (fFitMethod == kUseGeneticAlgorithm) {
1240  Log() << kINFO << "Read cuts optimised using Genetic Algorithm" << Endl;
1241  }
1242  else if (fFitMethod == kUseSimulatedAnnealing) {
1243  Log() << kINFO << "Read cuts optimised using Simulated Annealing algorithm" << Endl;
1244  }
1245  else if (fFitMethod == kUseEventScan) {
1246  Log() << kINFO << "Read cuts optimised using Full Event Scan" << Endl;
1247  }
1248  else {
1249  Log() << kWARNING << "unknown method: " << fFitMethod << Endl;
1250  }
1251  Log() << kINFO << "in " << fNbins << " signal efficiency bins and for " << GetNvar() << " variables" << Endl;
1252 
1253  // now read the cuts
1254  char buffer[200];
1255  istr.getline(buffer,200);
1256  istr.getline(buffer,200);
1257 
1258  Int_t tmpbin;
1259  Float_t tmpeffS, tmpeffB;
1260  if (fEffBvsSLocal != 0) delete fEffBvsSLocal;
1261  fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
1262  TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
1263  fEffBvsSLocal->SetDirectory(0); // it's local
1264 
1265  for (Int_t ibin=0; ibin<fNbins; ibin++) {
1266  istr >> tmpbin >> tmpeffS >> tmpeffB;
1267  fEffBvsSLocal->SetBinContent( ibin+1, tmpeffB );
1268 
1269  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1270  istr >> fCutMin[ivar][ibin] >> fCutMax[ivar][ibin];
1271  }
1272  }
1273 
1274  fEffSMin = fEffBvsSLocal->GetBinCenter(1);
1275  fEffSMax = fEffBvsSLocal->GetBinCenter(fNbins);
1276 }
1277 
1278 ////////////////////////////////////////////////////////////////////////////////
1279 /// write the optimised cuts and the corresponding background
1280 /// efficiencies for all signal efficiency bins to the XML weight file
1281 
1282 void TMVA::MethodCuts::AddWeightsXMLTo( void* parent ) const
1283 {
1284  // write all necessary information to the stream
1285  std::vector<Double_t> cutsMin;
1286  std::vector<Double_t> cutsMax;
1287 
1288  void* wght = gTools().AddChild(parent, "Weights");
1289  gTools().AddAttr( wght, "OptimisationMethod", (Int_t)fEffMethod);
1290  gTools().AddAttr( wght, "FitMethod", (Int_t)fFitMethod );
1291  gTools().AddAttr( wght, "nbins", fNbins );
1292  gTools().AddComment( wght, Form( "Below are the optimised cuts for %i variables: Format: ibin(hist) effS effB cutMin[ivar=0] cutMax[ivar=0] ... cutMin[ivar=n-1] cutMax[ivar=n-1]", GetNvar() ) );
1293 
1294  // NOTE: The signal efficiency written out into
1295  // the weight file does not correspond to the center of the bin within which the
1296  // background rejection is maximised (as before) but to the lower left edge of it.
1297  // This is because the cut optimisation algorithm determines the best background
1298  // rejection for all signal efficiencies belonging to a bin. Since the best background
1299  // rejection is in general obtained for the lowest possible signal efficiency, the
1300  // reference signal efficiency is the lowest value in the bin.
1301 
1302  for (Int_t ibin=0; ibin<fNbins; ibin++) {
1303  Double_t effS = fEffBvsSLocal->GetBinCenter ( ibin + 1 );
1304  Double_t trueEffS = GetCuts( effS, cutsMin, cutsMax );
1305  if (TMath::Abs(trueEffS) < 1e-10) trueEffS = 0;
1306 
1307  void* binxml = gTools().AddChild( wght, "Bin" );
1308  gTools().AddAttr( binxml, "ibin", ibin+1 );
1309  gTools().AddAttr( binxml, "effS", trueEffS );
1310  gTools().AddAttr( binxml, "effB", fEffBvsSLocal->GetBinContent( ibin + 1 ) );
1311  void* cutsxml = gTools().AddChild( binxml, "Cuts" );
1312  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1313  gTools().AddAttr( cutsxml, Form( "cutMin_%i", ivar ), cutsMin[ivar] );
1314  gTools().AddAttr( cutsxml, Form( "cutMax_%i", ivar ), cutsMax[ivar] );
1315  }
1316  }
1317 }
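// Sketch of the resulting "Weights" XML fragment (attribute values are
// illustrative; the exact formatting is determined by TMVA::Tools):
//
//   <Weights OptimisationMethod="1" FitMethod="2" nbins="100">
//     <Bin ibin="1" effS="0" effB="0.001">
//       <Cuts cutMin_0="-1.2" cutMax_0="3.4" cutMin_1="0.1" cutMax_1="0.9"/>
//     </Bin>
//     ...
//   </Weights>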
1318 
1319 ////////////////////////////////////////////////////////////////////////////////
1320 /// read the cuts and efficiencies from the XML weight file
1321 
1322 void TMVA::MethodCuts::ReadWeightsFromXML( void* wghtnode )
1323 {
1324  // delete old min and max
1325  for (UInt_t i=0; i<GetNvar(); i++) {
1326  if (fCutMin[i] != 0) delete [] fCutMin[i];
1327  if (fCutMax[i] != 0) delete [] fCutMax[i];
1328  }
1329  if (fCutMin != 0) delete [] fCutMin;
1330  if (fCutMax != 0) delete [] fCutMax;
1331 
1332  Int_t tmpEffMethod, tmpFitMethod;
1333  gTools().ReadAttr( wghtnode, "OptimisationMethod", tmpEffMethod );
1334  gTools().ReadAttr( wghtnode, "FitMethod", tmpFitMethod );
1335  gTools().ReadAttr( wghtnode, "nbins", fNbins );
1336 
1337  fEffMethod = (EEffMethod)tmpEffMethod;
1338  fFitMethod = (EFitMethodType)tmpFitMethod;
1339 
1340  // print some information
1341  if (fFitMethod == kUseMonteCarlo) {
1342  Log() << kINFO << "Read cuts optimised using sample of MC events" << Endl;
1343  }
1344  else if (fFitMethod == kUseMonteCarloEvents) {
1345  Log() << kINFO << "Read cuts optimised using Monte-Carlo-Event sampling" << Endl;
1346  }
1347  else if (fFitMethod == kUseGeneticAlgorithm) {
1348  Log() << kINFO << "Read cuts optimised using Genetic Algorithm" << Endl;
1349  }
1350  else if (fFitMethod == kUseSimulatedAnnealing) {
1351  Log() << kINFO << "Read cuts optimised using Simulated Annealing algorithm" << Endl;
1352  }
1353  else if (fFitMethod == kUseEventScan) {
1354  Log() << kINFO << "Read cuts optimised using Full Event Scan" << Endl;
1355  }
1356  else {
1357  Log() << kWARNING << "unknown method: " << fFitMethod << Endl;
1358  }
1359  Log() << kINFO << "Reading " << fNbins << " signal efficiency bins for " << GetNvar() << " variables" << Endl;
1360 
1361  delete fEffBvsSLocal;
1362  fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
1363  TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
1364  fEffBvsSLocal->SetDirectory(0); // it's local
1365  for (Int_t ibin=1; ibin<=fNbins; ibin++) fEffBvsSLocal->SetBinContent( ibin, -0.1 ); // Init
1366 
1367  fCutMin = new Double_t*[GetNvar()];
1368  fCutMax = new Double_t*[GetNvar()];
1369  for (UInt_t i=0;i<GetNvar();i++) {
1370  fCutMin[i] = new Double_t[fNbins];
1371  fCutMax[i] = new Double_t[fNbins];
1372  }
1373 
1374  // read efficiencies and cuts
1375  Int_t tmpbin;
1376  Float_t tmpeffS, tmpeffB;
1377  void* ch = gTools().GetChild(wghtnode,"Bin");
1378  while (ch) {
1379 // if (strcmp(gTools().GetName(ch),"Bin") !=0) {
1380 // ch = gTools().GetNextChild(ch);
1381 // continue;
1382 // }
1383 
1384  gTools().ReadAttr( ch, "ibin", tmpbin );
1385  gTools().ReadAttr( ch, "effS", tmpeffS );
1386  gTools().ReadAttr( ch, "effB", tmpeffB );
1387 
1388  // sanity check
1389  if (tmpbin-1 >= fNbins || tmpbin-1 < 0) {
1390  Log() << kFATAL << "Mismatch in bins: " << tmpbin-1 << " >= " << fNbins << Endl;
1391  }
1392 
1393  fEffBvsSLocal->SetBinContent( tmpbin, tmpeffB );
1394  void* ct = gTools().GetChild(ch);
1395  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1396  gTools().ReadAttr( ct, Form( "cutMin_%i", ivar ), fCutMin[ivar][tmpbin-1] );
1397  gTools().ReadAttr( ct, Form( "cutMax_%i", ivar ), fCutMax[ivar][tmpbin-1] );
1398  }
1399  ch = gTools().GetNextChild(ch, "Bin");
1400  }
1401 }
1402 
1403 ////////////////////////////////////////////////////////////////////////////////
1404 /// write histograms and PDFs to file for monitoring purposes
1405 
1406 void TMVA::MethodCuts::WriteMonitoringHistosToFile( void ) const
1407 {
1408  Log() << kINFO << "Write monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
1409 
1410  fEffBvsSLocal->Write();
1411 
1412  // save reference histograms to file
1413  if (fEffMethod == kUsePDFs) {
1414  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1415  (*fVarHistS)[ivar]->Write();
1416  (*fVarHistB)[ivar]->Write();
1417  (*fVarHistS_smooth)[ivar]->Write();
1418  (*fVarHistB_smooth)[ivar]->Write();
1419  (*fVarPdfS)[ivar]->GetPDFHist()->Write();
1420  (*fVarPdfB)[ivar]->GetPDFHist()->Write();
1421  }
1422  }
1423 }
1424 
1425 ////////////////////////////////////////////////////////////////////////////////
1426 /// - overloaded function to create background efficiency (rejection) versus
1427 /// signal efficiency plot (first call of this function)
1428 /// - the function returns the signal efficiency at background efficiency
1429 /// indicated in theString
1430 ///
1431 /// "theString" must have two entries:
1432 /// [0]: "Efficiency"
1433 /// [1]: the value of background efficiency at which the signal efficiency
1434 /// is to be returned
1435 
1436 Double_t TMVA::MethodCuts::GetTrainingEfficiency( const TString& theString )
1437 {
1438  // parse input string for required background efficiency
1439  TList* list = gTools().ParseFormatLine( theString );
1440  // sanity check
1441  if (list->GetSize() != 2) {
1442  Log() << kFATAL << "<GetTrainingEfficiency> wrong number of arguments"
1443  << " in string: " << theString
1444  << " | required format, e.g., Efficiency:0.05" << Endl;
1445  return -1;
1446  }
1447 
1448  Results* results = Data()->GetResults(GetMethodName(), Types::kTesting, GetAnalysisType());
1449 
1450  // that will be the value of the efficiency returned (it does not affect
1451  // the efficiency-vs-bkg plot, which is done anyway)
1452  Float_t effBref = atof( ((TObjString*)list->At(1))->GetString() );
1453 
1454  delete list;
1455 
1456  // first round ? --> create histograms
1457  if (results->GetHist("EFF_BVSS_TR")==0) {
1458 
1459  if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
1460  if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
1461 
1462  fBinaryTreeS = new BinarySearchTree();
1463  fBinaryTreeS->Fill( GetEventCollection(Types::kTraining), fSignalClass );
1464  fBinaryTreeB = new BinarySearchTree();
1465  fBinaryTreeB->Fill( GetEventCollection(Types::kTraining), fBackgroundClass );
1466  // there is no really good equivalent to the fEffS; fEffB (efficiency vs cutvalue)
1467  // for the "Cuts" method (unless we had only one cut). Maybe later I might add here
1468  // histograms for each of the cuts...but this would require also a change in the
1469  // base class, and it is not really necessary, as we get exactly THIS info from the
1470  // "evaluateAllVariables" anyway.
1471 
1472  // now create efficiency curve: background versus signal
1473  TH1* eff_bvss_tr = new TH1F( GetTestvarName() + "_trainingEffBvsS", GetTestvarName() + "", fNbins, 0, 1 );
1474  for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_bvss_tr->SetBinContent( ibin, -0.1 ); // Init
1475  TH1* rej_bvss_tr = new TH1F( GetTestvarName() + "_trainingRejBvsS", GetTestvarName() + "", fNbins, 0, 1 );
1476  for (Int_t ibin=1; ibin<=fNbins; ibin++) rej_bvss_tr->SetBinContent( ibin, 0. ); // Init
1477  results->Store(eff_bvss_tr, "EFF_BVSS_TR");
1478  results->Store(rej_bvss_tr, "REJ_BVSS_TR");
1479 
1480  // use root finder
1481 
1482  // make the background-vs-signal efficiency plot
1483  Double_t* tmpCutMin = new Double_t[GetNvar()];
1484  Double_t* tmpCutMax = new Double_t[GetNvar()];
1485  Int_t nFailedBins=0;
1486  for (Int_t bini=1; bini<=fNbins; bini++) {
1487  for (UInt_t ivar=0; ivar <GetNvar(); ivar++){
1488  tmpCutMin[ivar] = fCutMin[ivar][bini-1];
1489  tmpCutMax[ivar] = fCutMax[ivar][bini-1];
1490  }
1491  // find cut value corresponding to a given signal efficiency
1492  Double_t effS, effB;
1493  this->GetEffsfromSelection( &tmpCutMin[0], &tmpCutMax[0], effS, effB);
1494  // check that effS matches bini
1495  Int_t effBin = eff_bvss_tr->GetXaxis()->FindBin(effS);
1496  if (effBin != bini){
1497  Log()<< kVERBOSE << "unable to fill efficiency bin " << bini<< " " << effBin <<Endl;
1498  nFailedBins++;
1499  }
1500  else{
1501  // and fill histograms
1502  eff_bvss_tr->SetBinContent( bini, effB );
1503  rej_bvss_tr->SetBinContent( bini, 1.0-effB );
1504  }
1505  }
1506  if (nFailedBins>0) Log()<< kWARNING << " unable to fill "<< nFailedBins <<" efficiency bins " <<Endl;
1507 
1508  delete [] tmpCutMin;
1509  delete [] tmpCutMax;
1510 
1511  // create splines for histogram
1512  fSplTrainEffBvsS = new TSpline1( "trainEffBvsS", new TGraph( eff_bvss_tr ) );
1513  }
1514 
1515  // must exist...
1516  if (NULL == fSplTrainEffBvsS) return 0.0;
1517 
1518  // now find signal efficiency that corresponds to required background efficiency
1519  Double_t effS = 0., effB = 0., effS_ = 0., effB_ = 0.;
1520  Int_t nbins_ = 1000;
1521 
1522  // loop over efficiency bins until the background eff. matches the requirement
1523  for (Int_t bini=1; bini<=nbins_; bini++) {
1524  // get corresponding signal and background efficiencies
1525  effS = (bini - 0.5)/Float_t(nbins_);
1526  effB = fSplTrainEffBvsS->Eval( effS );
1527 
1528  // find signal efficiency that corresponds to required background efficiency
1529  if ((effB - effBref)*(effB_ - effBref) < 0) break;
1530  effS_ = effS;
1531  effB_ = effB;
1532  }
1533 
1534  return 0.5*(effS + effS_);
1535 }
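The loop above scans 1000 uniformly spaced points in signal efficiency and stops at the first sign change of (effB - effBref); the returned value is the midpoint of the bracketing pair. Below is a minimal, self-contained sketch of the same bracketing scan, in which a toy monotone curve effB = effS^2 stands in for the TSpline1 evaluation (the toy curve and all names in the sketch are illustrative only, not part of MethodCuts.cxx):

// Illustrative sketch: locate the signal efficiency at which a monotone
// background-efficiency curve crosses a reference value, using the same
// sign-change scan and midpoint return as GetTrainingEfficiency above.
#include <cmath>
#include <cstdio>

double toyEffB(double effS) { return effS * effS; } // stand-in for fSplTrainEffBvsS->Eval(effS)

double signalEffAtBackgroundEff(double effBref, int nbins = 1000)
{
   double effS = 0., effB = 0., effS_ = 0., effB_ = 0.;
   for (int bini = 1; bini <= nbins; bini++) {
      effS = (bini - 0.5) / double(nbins);
      effB = toyEffB(effS);
      if ((effB - effBref) * (effB_ - effBref) < 0) break; // sign change --> bracket found
      effS_ = effS;
      effB_ = effB;
   }
   return 0.5 * (effS + effS_); // midpoint of the bracketing pair
}

int main()
{
   // for effB = effS^2 the signal efficiency at 5% background efficiency is ~sqrt(0.05)
   std::printf("effS(effB=0.05) ~ %.3f (expected %.3f)\n",
               signalEffAtBackgroundEff(0.05), std::sqrt(0.05));
   return 0;
}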
1536 
1537 ////////////////////////////////////////////////////////////////////////////////
1538 /// - overloaded function to create the background efficiency (rejection) versus
1539 /// signal efficiency plot (on the first call of this function)
1540 /// - the function returns the signal efficiency at the background efficiency
1541 /// indicated in theString
1542 ///
1543 /// "theString" must have two entries:
1544 /// [0]: "Efficiency"
1545 /// [1]: the value of background efficiency at which the signal efficiency
1546 /// is to be returned
1547 
1548 Double_t TMVA::MethodCuts::GetEfficiency( const TString& theString, Types::ETreeType type, Double_t& effSerr )
1549 {
1550  Data()->SetCurrentType(type);
1551 
1552  Results* results = Data()->GetResults( GetMethodName(), Types::kTesting, GetAnalysisType() );
1553 
1554  // parse input string for required background efficiency
1555  TList* list = gTools().ParseFormatLine( theString, ":" );
1556 
1557  if (list->GetSize() > 2) {
1558  delete list;
1559  Log() << kFATAL << "<GetEfficiency> wrong number of arguments"
1560  << " in string: " << theString
1561  << " | required format, e.g., Efficiency:0.05, or empty string" << Endl;
1562  return -1;
1563  }
1564 
1565  // sanity check
1566  Bool_t computeArea = (list->GetSize() < 2); // no reference value given --> compute area under the curve
1567 
1568  // that will be the value of the efficiency returned (it does not affect
1569  // the efficiency-vs-bkg plot, which is done anyway)
1570  Float_t effBref = (computeArea?1.:atof( ((TObjString*)list->At(1))->GetString() ));
1571 
1572  delete list;
1573 
1574 
1575  // first round ? --> create histograms
1576  if (results->GetHist("MVA_EFF_BvsS")==0) {
1577 
1578  if (fBinaryTreeS!=0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
1579  if (fBinaryTreeB!=0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
1580 
1581  // the variables may be transformed by a transformation method: to coherently
1582  // treat signal and background one must decide which transformation type shall
1583  // be used: our default is signal-type
1584  fBinaryTreeS = new BinarySearchTree();
1585  fBinaryTreeS->Fill( GetEventCollection(Types::kTesting), fSignalClass );
1586  fBinaryTreeB = new BinarySearchTree();
1587  fBinaryTreeB->Fill( GetEventCollection(Types::kTesting), fBackgroundClass );
1588 
1589  // there is no really good equivalent to fEffS and fEffB (efficiency vs cut value)
1590  // for the "Cuts" method (unless we had only one cut). Histograms for each of the
1591  // cuts could be added later, but this would also require a change in the
1592  // base class, and it is not really necessary, as exactly this information is
1593  // provided by "evaluateAllVariables" anyway.
1594 
1595  // now create efficiency curve: background versus signal
1596  TH1* eff_BvsS = new TH1F( GetTestvarName() + "_effBvsS", GetTestvarName() + "", fNbins, 0, 1 );
1597  for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_BvsS->SetBinContent( ibin, -0.1 ); // Init
1598  TH1* rej_BvsS = new TH1F( GetTestvarName() + "_rejBvsS", GetTestvarName() + "", fNbins, 0, 1 );
1599  for (Int_t ibin=1; ibin<=fNbins; ibin++) rej_BvsS->SetBinContent( ibin, 0.0 ); // Init
1600  results->Store(eff_BvsS, "MVA_EFF_BvsS");
1601  results->Store(rej_BvsS);
1602 
1603  Double_t xmin = 0.;
1604  Double_t xmax = 1.000001;
1605 
1606  TH1* eff_s = new TH1F( GetTestvarName() + "_effS", GetTestvarName() + " (signal)", fNbins, xmin, xmax);
1607  for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_s->SetBinContent( ibin, -0.1 ); // Init
1608  TH1* eff_b = new TH1F( GetTestvarName() + "_effB", GetTestvarName() + " (background)", fNbins, xmin, xmax);
1609  for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_b->SetBinContent( ibin, -0.1 ); // Init
1610  results->Store(eff_s, "MVA_S");
1611  results->Store(eff_b, "MVA_B");
1612 
1613  // use root finder
1614 
1615  // make the background-vs-signal efficiency plot
1616  Double_t* tmpCutMin = new Double_t[GetNvar()];
1617  Double_t* tmpCutMax = new Double_t[GetNvar()];
1618  TGraph* tmpBvsS = new TGraph(fNbins+1);
1619  tmpBvsS->SetPoint(0, 0., 0.);
1620 
1621  for (Int_t bini=1; bini<=fNbins; bini++) {
1622  for (UInt_t ivar=0; ivar <GetNvar(); ivar++) {
1623  tmpCutMin[ivar] = fCutMin[ivar][bini-1];
1624  tmpCutMax[ivar] = fCutMax[ivar][bini-1];
1625  }
1626  // find cut value corresponding to a given signal efficiency
1627  Double_t effS, effB;
1628  this->GetEffsfromSelection( &tmpCutMin[0], &tmpCutMax[0], effS, effB);
1629  tmpBvsS->SetPoint(bini, effS, effB);
1630 
1631  eff_s->SetBinContent(bini, effS);
1632  eff_b->SetBinContent(bini, effB);
1633  }
1634  tmpBvsS->SetPoint(fNbins+1, 1., 1.);
1635 
1636  delete [] tmpCutMin;
1637  delete [] tmpCutMax;
1638 
1639  // create splines for histogram
1640  fSpleffBvsS = new TSpline1( "effBvsS", tmpBvsS );
1641  for (Int_t bini=1; bini<=fNbins; bini++) {
1642  Double_t effS = (bini - 0.5)/Float_t(fNbins);
1643  Double_t effB = fSpleffBvsS->Eval( effS );
1644  eff_BvsS->SetBinContent( bini, effB );
1645  rej_BvsS->SetBinContent( bini, 1.0-effB );
1646  }
1647  }
1648 
1649  // must exist...
1650  if (NULL == fSpleffBvsS) return 0.0;
1651 
1652  // now find signal efficiency that corresponds to required background efficiency
1653  Double_t effS = 0, effB = 0, effS_ = 0, effB_ = 0;
1654  Int_t nbins_ = 1000;
1655 
1656  if (computeArea) {
1657 
1658  // compute area of rej-vs-eff plot
1659  Double_t integral = 0;
1660  for (Int_t bini=1; bini<=nbins_; bini++) {
1661 
1662  // get corresponding signal and background efficiencies
1663  effS = (bini - 0.5)/Float_t(nbins_);
1664  effB = fSpleffBvsS->Eval( effS );
1665  integral += (1.0 - effB);
1666  }
1667  integral /= nbins_;
1668 
1669  return integral;
1670  }
1671  else {
1672 
1673  // loop over efficiency bins until the background eff. matches the requirement
1674  for (Int_t bini=1; bini<=nbins_; bini++) {
1675  // get corresponding signal and background efficiencies
1676  effS = (bini - 0.5)/Float_t(nbins_);
1677  effB = fSpleffBvsS->Eval( effS );
1678 
1679  // find signal efficiency that corresponds to required background efficiency
1680  if ((effB - effBref)*(effB_ - effBref) < 0) break;
1681  effS_ = effS;
1682  effB_ = effB;
1683  }
1684 
1685  effS = 0.5*(effS + effS_);
1686  effSerr = 0;
1687  if (Data()->GetNEvtSigTest() > 0)
1688  effSerr = TMath::Sqrt( effS*(1.0 - effS)/Double_t(Data()->GetNEvtSigTest()) );
1689 
1690  return effS;
1691 
1692  }
1693 
1694  return -1;
1695 }
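When no reference background efficiency is supplied ("computeArea" above, triggered by an empty or one-entry option string), the return value is a midpoint Riemann sum of the background rejection 1 - effB over the signal efficiency, i.e. an estimate of the area under the rejection-versus-efficiency curve; otherwise the same bracketing scan as in GetTrainingEfficiency is used, plus a binomial error on effS. A minimal sketch of the area sum, again with an illustrative toy curve effB = effS^2 in place of the spline (not part of MethodCuts.cxx):

// Illustrative sketch: midpoint Riemann sum of (1 - effB) over effS,
// mirroring the computeArea branch of GetEfficiency above.
#include <cstdio>

int main()
{
   const int nbins = 1000;
   double integral = 0.;
   for (int bini = 1; bini <= nbins; bini++) {
      double effS = (bini - 0.5) / double(nbins);
      double effB = effS * effS; // stand-in for fSpleffBvsS->Eval(effS)
      integral += (1.0 - effB);
   }
   integral /= nbins;
   // for effB = effS^2 the exact area is 1 - 1/3 = 2/3
   std::printf("area under rejection-vs-efficiency curve: %.4f (exact 0.6667)\n", integral);
   return 0;
}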
1696 
1697 ////////////////////////////////////////////////////////////////////////////////
1698 /// write specific classifier response
1699 
1700 void TMVA::MethodCuts::MakeClassSpecific( std::ostream& fout, const TString& className ) const
1701 {
1702  fout << " // not implemented for class: \"" << className << "\"" << std::endl;
1703  fout << "};" << std::endl;
1704 }
1705 
1706 ////////////////////////////////////////////////////////////////////////////////
1707 /// get help message text
1708 ///
1709 /// typical length of text line:
1710 /// "|--------------------------------------------------------------|"
1711 
1712 void TMVA::MethodCuts::GetHelpMessage() const
1713 {
1714  TString bold = gConfig().WriteOptionsReference() ? "<b>" : "";
1715  TString resbold = gConfig().WriteOptionsReference() ? "</b>" : "";
1716  TString brk = gConfig().WriteOptionsReference() ? "<br>" : "";
1717 
1718  Log() << Endl;
1719  Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
1720  Log() << Endl;
1721  Log() << "The optimisation of rectangular cuts performed by TMVA maximises " << Endl;
1722  Log() << "the background rejection at given signal efficiency, and scans " << Endl;
1723  Log() << "over the full range of the latter quantity. Three optimisation" << Endl;
1724  Log() << "methods are available: Monte Carlo sampling (MC), a Genetic" << Endl;
1725  Log() << "Algorithm (GA), and Simulated Annealing (SA). GA and SA are" << Endl;
1726  Log() << "expected to perform best." << Endl;
1727  Log() << Endl;
1728  Log() << "The difficulty of finding the optimal cuts increases strongly with" << Endl;
1729  Log() << "the dimensionality (number of input variables) of the problem." << Endl;
1730  Log() << "This behavior is due to the non-uniqueness of the solution space."<< Endl;
1731  Log() << Endl;
1732  Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
1733  Log() << Endl;
1734  Log() << "If the dimensionality exceeds, say, 4 input variables, it is " << Endl;
1735  Log() << "advisable to scrutinize the separation power of the variables," << Endl;
1736  Log() << "and to remove the weakest ones. If some among the input variables" << Endl;
1737  Log() << "can be described by a single cut (e.g., because signal tends to be" << Endl;
1738  Log() << "larger than background), this can be indicated to MethodCuts via" << Endl;
1739  Log() << "the \"Fsmart\" option (see option string). Choosing this option" << Endl;
1740  Log() << "reduces the number of requirements for the variable from 2 (min/max)" << Endl;
1741  Log() << "to a single one (TMVA finds out whether it is to be interpreted as" << Endl;
1742  Log() << "min or max)." << Endl;
1743  Log() << Endl;
1744  Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
1745  Log() << "" << Endl;
1746  Log() << bold << "Monte Carlo sampling:" << resbold << Endl;
1747  Log() << "" << Endl;
1748  Log() << "Apart from the \"Fsmart\" option for the variables, the only way" << Endl;
1749  Log() << "to improve the MC sampling is to increase the sampling rate. This" << Endl;
1750  Log() << "is done via the configuration option \"MC_NRandCuts\". The execution" << Endl;
1751  Log() << "time scales linearly with the sampling rate." << Endl;
1752  Log() << "" << Endl;
1753  Log() << bold << "Genetic Algorithm:" << resbold << Endl;
1754  Log() << "" << Endl;
1755  Log() << "The algorithm terminates if no significant fitness increase has" << Endl;
1756  Log() << "been achieved within the last \"nsteps\" steps of the calculation." << Endl;
1757  Log() << "Wiggles in the ROC curve or a constant background rejection of 1" << Endl;
1758  Log() << "indicate that the GA failed to converge to the true maximum" << Endl;
1759  Log() << "fitness. In such a case, it is recommended to broaden the search " << Endl;
1760  Log() << "by increasing the population size (\"popSize\") and to give the GA " << Endl;
1761  Log() << "more time to find improvements by increasing the number of steps" << Endl;
1762  Log() << "(\"nsteps\")." << Endl;
1763  Log() << " -> increase \"popSize\" (at least >10 * number of variables)" << Endl;
1764  Log() << " -> increase \"nsteps\"" << Endl;
1765  Log() << "" << Endl;
1766  Log() << bold << "Simulated Annealing (SA) algorithm:" << resbold << Endl;
1767  Log() << "" << Endl;
1768  Log() << "\"Increasing Adaptive\" approach:" << Endl;
1769  Log() << "" << Endl;
1770  Log() << "The algorithm seeks local minima and explores their neighborhood, while" << Endl;
1771  Log() << "changing the ambient temperature depending on the number of failures" << Endl;
1772  Log() << "in the previous steps. The performance can be improved by increasing" << Endl;
1773  Log() << "the number of iteration steps (\"MaxCalls\"), or by adjusting the" << Endl;
1774  Log() << "minimal temperature (\"MinTemperature\"). Manual adjustments of the" << Endl;
1775  Log() << "speed of the temperature increase (\"TemperatureScale\" and \"AdaptiveSpeed\")" << Endl;
1776  Log() << "to individual data sets should also help. Summary:" << brk << Endl;
1777  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1778  Log() << " -> adjust \"MinTemperature\"" << brk << Endl;
1779  Log() << " -> adjust \"TemperatureScale\"" << brk << Endl;
1780  Log() << " -> adjust \"AdaptiveSpeed\"" << Endl;
1781  Log() << "" << Endl;
1782  Log() << "\"Decreasing Adaptive\" approach:" << Endl;
1783  Log() << "" << Endl;
1784  Log() << "The algorithm calculates the initial temperature (based on the" << Endl;
1785  Log() << "effectiveness of large steps) and the multiplier that ensures that the" << Endl;
1786  Log() << "minimal temperature is reached within the requested number of iteration steps." << Endl;
1787  Log() << "The performance can be improved by adjusting the minimal temperature" << Endl;
1788  Log() << "(\"MinTemperature\") and by increasing the number of steps (\"MaxCalls\"):" << brk << Endl;
1789  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1790  Log() << " -> adjust \"MinTemperature\"" << Endl;
1791  Log() << " " << Endl;
1792  Log() << "Other kernels:" << Endl;
1793  Log() << "" << Endl;
1794  Log() << "Alternative ways of computing the temperature change are implemented." << Endl;
1795  Log() << "Each of them starts with the maximum temperature (\"MaxTemperature\")" << Endl;
1796  Log() << "and decreases the temperature according to a given" << Endl;
1797  Log() << "prescription:" << brk << Endl;
1798  Log() << "CurrentTemperature =" << brk << Endl;
1799  Log() << " - Sqrt: InitialTemperature / Sqrt(StepNumber+2) * TemperatureScale" << brk << Endl;
1800  Log() << " - Log: InitialTemperature / Log(StepNumber+2) * TemperatureScale" << brk << Endl;
1801  Log() << " - Homo: InitialTemperature / (StepNumber+2) * TemperatureScale" << brk << Endl;
1802  Log() << " - Sin: (Sin(StepNumber / TemperatureScale) + 1) / (StepNumber + 1)*InitialTemperature + Eps" << brk << Endl;
1803  Log() << " - Geo: CurrentTemperature * TemperatureScale" << Endl;
1804  Log() << "" << Endl;
1805  Log() << "Their performance can be improved by adjusting the initial temperature" << Endl;
1806  Log() << "(\"InitialTemperature\"), the number of iteration steps (\"MaxCalls\")," << Endl;
1807  Log() << "and the multiplier that scales the temperature decrease" << Endl;
1808  Log() << "(\"TemperatureScale\"):" << brk << Endl;
1809  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1810  Log() << " -> adjust \"InitialTemperature\"" << brk << Endl;
1811  Log() << " -> adjust \"TemperatureScale\"" << brk << Endl;
1812  Log() << " -> adjust \"KernelTemperature\"" << Endl;
1813 }
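The cooling prescriptions quoted in the help text above ("Sqrt", "Log", "Homo", "Sin", "Geo") can be written out as plain functions of the step number. The sketch below is illustrative only: the function and parameter names mirror the option names quoted in the help text, and the actual cooling schedules are implemented elsewhere in TMVA's simulated-annealing machinery, not in this class.

// Illustrative sketch of the simulated-annealing cooling prescriptions
// listed in the help text (parameter names follow the quoted options).
#include <cmath>
#include <cstdio>

double coolSqrt(double T0, int step, double scale)             { return T0 / std::sqrt(step + 2.0) * scale; }
double coolLog (double T0, int step, double scale)             { return T0 / std::log(step + 2.0) * scale; }
double coolHomo(double T0, int step, double scale)             { return T0 / (step + 2.0) * scale; }
double coolSin (double T0, int step, double scale, double eps) { return (std::sin(step / scale) + 1.0) / (step + 1.0) * T0 + eps; }
double coolGeo (double Tcur, double scale)                     { return Tcur * scale; } // applied iteratively

int main()
{
   const double T0 = 1.e6, scale = 1.0, eps = 1.e-6;
   double Tgeo = T0;
   for (int step = 0; step < 5; step++) {
      Tgeo = coolGeo(Tgeo, 0.5);
      std::printf("step %d: Sqrt %.4g  Log %.4g  Homo %.4g  Sin %.4g  Geo %.4g\n",
                  step, coolSqrt(T0, step, scale), coolLog(T0, step, scale),
                  coolHomo(T0, step, scale), coolSin(T0, step, scale, eps), Tgeo);
   }
   return 0;
}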