ROOT  6.06/09
Reference Guide
MethodCuts.cxx
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Matt Jachowski, Peter Speckmayer, Eckhard von Toerne, Helge Voss, Kai Voss
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate Data analysis *
6  * Package: TMVA *
7  * Class : TMVA::MethodCuts *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Implementation (see header for description) *
12  * *
13  * Authors (alphabetical): *
14  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15  * Matt Jachowski <jachowski@stanford.edu> - Stanford University, USA *
16  * Peter Speckmayer <speckmay@mail.cern.ch> - CERN, Switzerland *
17  * Eckhard von Toerne <evt@physik.uni-bonn.de> - U. of Bonn, Germany *
18  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
19  * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
20  * *
21  * Copyright (c) 2005: *
22  * CERN, Switzerland *
23  * U. of Victoria, Canada *
24  * MPI-K Heidelberg, Germany *
25  * *
26  * Redistribution and use in source and binary forms, with or without *
27  * modification, are permitted according to the terms listed in LICENSE *
28  * (http://tmva.sourceforge.net/LICENSE) *
29  **********************************************************************************/
30 
31 ////////////////////////////////////////////////////////////////////////////////
32 
33 /* Begin_Html
34  Multivariate optimisation of signal efficiency for given background
35  efficiency, applying rectangular minimum and maximum requirements.
36 
37  <p>
 38  Also implemented is a "decorrelated/diagonalized cuts approach",
39  which improves over the uncorrelated cuts ansatz by
40  transforming linearly the input variables into a diagonal space,
41  using the square-root of the covariance matrix.
42 
43  <p>
44  <font size="-1">
45  Other optimisation criteria, such as maximising the signal significance-
46  squared, S^2/(S+B), with S and B being the signal and background yields,
47  correspond to a particular point in the optimised background rejection
48  versus signal efficiency curve. This working point requires the knowledge
49  of the expected yields, which is not the case in general. Note also that
50  for rare signals, Poissonian statistics should be used, which modifies
51  the significance criterion.
52  </font>
53 
54  <p>
55  The rectangular cut of a volume in the variable space is performed using
56  a binary tree to sort the training events. This provides a significant
57  reduction in computing time (up to several orders of magnitudes, depending
58  on the complexity of the problem at hand).
59 
60  <p>
61  Technically, optimisation is achieved in TMVA by two methods:
62 
63  <ol>
64  <li>Monte Carlo generation using uniform priors for the lower cut value,
65  and the cut width, thrown within the variable ranges.
66 
67  <li>A Genetic Algorithm (GA) searches for the optimal ("fittest") cut sample.
68  The GA is configurable by many external settings through the option
69  string. For difficult cases (such as many variables), some tuning
70  may be necessary to achieve satisfying results
71  </ol>
72 
73  <p>
74  <font size="-1">
 75  Attempts to use Minuit fits (Simplex or Migrad) instead have not shown
76  superior results, and often failed due to convergence at local minima.
77  </font>
78 
79  <p>
80  The tests we have performed so far showed that in generic applications,
81  the GA is superior to MC sampling, and hence GA is the default method.
82  It is worthwhile trying both anyway.
83 
84  <b>Decorrelated (or "diagonalized") Cuts</b>
85 
86  <p>
87  See class description for Method Likelihood for a detailed explanation.
88 End_Html */
89 //
90 
91 #include <iostream>
92 #include <cstdlib>
93 
94 #include "Riostream.h"
95 #include "TH1F.h"
96 #include "TObjString.h"
97 #include "TDirectory.h"
98 #include "TMath.h"
99 #include "TGraph.h"
100 #include "TSpline.h"
101 #include "TRandom3.h"
102 
103 #include "TMVA/ClassifierFactory.h"
104 #include "TMVA/MethodCuts.h"
105 #include "TMVA/GeneticFitter.h"
106 #include "TMVA/MinuitFitter.h"
107 #include "TMVA/MCFitter.h"
109 #include "TMVA/PDF.h"
110 #include "TMVA/Tools.h"
111 #include "TMVA/Timer.h"
112 #include "TMVA/Interval.h"
113 #include "TMVA/TSpline1.h"
114 #include "TMVA/Config.h"
116 #include "TMVA/Results.h"
117 
118 using std::atof;
119 
120 REGISTER_METHOD(Cuts)
121 
122 ClassImp(TMVA::MethodCuts)
123 
124 const Double_t TMVA::MethodCuts::fgMaxAbsCutVal = 1.0e30;
125 
126 ////////////////////////////////////////////////////////////////////////////////
127 /// standard constructor
128 
129 TMVA::MethodCuts::MethodCuts( const TString& jobName,
130  const TString& methodTitle,
131  DataSetInfo& theData,
132  const TString& theOption,
133  TDirectory* theTargetDir ) :
134  MethodBase( jobName, Types::kCuts, methodTitle, theData, theOption, theTargetDir ),
135  fFitMethod ( kUseGeneticAlgorithm ),
136  fEffMethod ( kUseEventSelection ),
137  fFitParams (0),
138  fTestSignalEff(0.7),
139  fEffSMin ( 0 ),
140  fEffSMax ( 0 ),
141  fCutRangeMin( 0 ),
142  fCutRangeMax( 0 ),
143  fBinaryTreeS( 0 ),
144  fBinaryTreeB( 0 ),
145  fCutMin ( 0 ),
146  fCutMax ( 0 ),
147  fTmpCutMin ( 0 ),
148  fTmpCutMax ( 0 ),
149  fAllVarsI ( 0 ),
150  fNpar ( 0 ),
151  fEffRef ( 0 ),
152  fRangeSign ( 0 ),
153  fRandom ( 0 ),
154  fMeanS ( 0 ),
155  fMeanB ( 0 ),
156  fRmsS ( 0 ),
157  fRmsB ( 0 ),
158  fEffBvsSLocal( 0 ),
159  fVarHistS ( 0 ),
160  fVarHistB ( 0 ),
161  fVarHistS_smooth( 0 ),
162  fVarHistB_smooth( 0 ),
163  fVarPdfS ( 0 ),
164  fVarPdfB ( 0 ),
165  fNegEffWarning( kFALSE )
166 {
167 }
168 
169 ////////////////////////////////////////////////////////////////////////////////
170 /// construction from weight file
171 
173  const TString& theWeightFile,
174  TDirectory* theTargetDir ) :
175  MethodBase( Types::kCuts, theData, theWeightFile, theTargetDir ),
176  fFitMethod ( kUseGeneticAlgorithm ),
177  fEffMethod ( kUseEventSelection ),
178  fFitParams (0),
179  fTestSignalEff(0.7),
180  fEffSMin ( 0 ),
181  fEffSMax ( 0 ),
182  fCutRangeMin( 0 ),
183  fCutRangeMax( 0 ),
184  fBinaryTreeS( 0 ),
185  fBinaryTreeB( 0 ),
186  fCutMin ( 0 ),
187  fCutMax ( 0 ),
188  fTmpCutMin ( 0 ),
189  fTmpCutMax ( 0 ),
190  fAllVarsI ( 0 ),
191  fNpar ( 0 ),
192  fEffRef ( 0 ),
193  fRangeSign ( 0 ),
194  fRandom ( 0 ),
195  fMeanS ( 0 ),
196  fMeanB ( 0 ),
197  fRmsS ( 0 ),
198  fRmsB ( 0 ),
199  fEffBvsSLocal( 0 ),
200  fVarHistS ( 0 ),
201  fVarHistB ( 0 ),
202  fVarHistS_smooth( 0 ),
203  fVarHistB_smooth( 0 ),
204  fVarPdfS ( 0 ),
205  fVarPdfB ( 0 ),
206  fNegEffWarning( kFALSE )
207 {
208 }
209 
210 ////////////////////////////////////////////////////////////////////////////////
211 /// Cuts can only handle classification with 2 classes
212 
214  UInt_t /*numberTargets*/ )
215 {
216  return (type == Types::kClassification && numberClasses == 2);
217 }
218 
219 ////////////////////////////////////////////////////////////////////////////////
220 /// default initialisation called by all constructors
221 
223 {
224  fVarHistS = fVarHistB = 0;
225  fVarHistS_smooth = fVarHistB_smooth = 0;
226  fVarPdfS = fVarPdfB = 0;
227  fFitParams = 0;
228  fBinaryTreeS = fBinaryTreeB = 0;
229  fEffSMin = 0;
230  fEffSMax = 0;
231 
232  // vector with fit results
233  fNpar = 2*GetNvar();
234  fRangeSign = new std::vector<Int_t> ( GetNvar() );
235  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) (*fRangeSign)[ivar] = +1;
236 
237  fMeanS = new std::vector<Double_t>( GetNvar() );
238  fMeanB = new std::vector<Double_t>( GetNvar() );
239  fRmsS = new std::vector<Double_t>( GetNvar() );
240  fRmsB = new std::vector<Double_t>( GetNvar() );
241 
242  // get the variable specific options, first initialize default
243  fFitParams = new std::vector<EFitParameters>( GetNvar() );
244  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) (*fFitParams)[ivar] = kNotEnforced;
245 
246  fFitMethod = kUseMonteCarlo;
247  fTestSignalEff = -1;
248 
249  // create LUT for cuts
250  fCutMin = new Double_t*[GetNvar()];
251  fCutMax = new Double_t*[GetNvar()];
252  for (UInt_t i=0; i<GetNvar(); i++) {
253  fCutMin[i] = new Double_t[fNbins];
254  fCutMax[i] = new Double_t[fNbins];
255  }
256 
257  // init
258  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
259  for (Int_t ibin=0; ibin<fNbins; ibin++) {
260  fCutMin[ivar][ibin] = 0;
261  fCutMax[ivar][ibin] = 0;
262  }
263  }
264 
265  fTmpCutMin = new Double_t[GetNvar()];
266  fTmpCutMax = new Double_t[GetNvar()];
267 }
268 
269 ////////////////////////////////////////////////////////////////////////////////
270 /// destructor
271 
273 {
274  delete fRangeSign;
275  delete fMeanS;
276  delete fMeanB;
277  delete fRmsS;
278  delete fRmsB;
279  delete fFitParams;
280  delete fEffBvsSLocal;
281 
282  if (NULL != fCutRangeMin) delete [] fCutRangeMin;
283  if (NULL != fCutRangeMax) delete [] fCutRangeMax;
284  if (NULL != fAllVarsI) delete [] fAllVarsI;
285 
286  for (UInt_t i=0;i<GetNvar();i++) {
287  if (NULL != fCutMin[i] ) delete [] fCutMin[i];
288  if (NULL != fCutMax[i] ) delete [] fCutMax[i];
289  if (NULL != fCutRange[i]) delete fCutRange[i];
290  }
291 
292  if (NULL != fCutMin) delete [] fCutMin;
293  if (NULL != fCutMax) delete [] fCutMax;
294 
295  if (NULL != fTmpCutMin) delete [] fTmpCutMin;
296  if (NULL != fTmpCutMax) delete [] fTmpCutMax;
297 
298  if (NULL != fBinaryTreeS) delete fBinaryTreeS;
299  if (NULL != fBinaryTreeB) delete fBinaryTreeB;
300 }
301 
302 ////////////////////////////////////////////////////////////////////////////////
303 /// define the options (their key words) that can be set in the option string
304 /// know options:
305 /// Method <string> Minimisation method
306 /// available values are: MC Monte Carlo <default>
307 /// GA Genetic Algorithm
308 /// SA Simulated annealing
309 ///
310 /// EffMethod <string> Efficiency selection method
311 /// available values are: EffSel <default>
312 /// EffPDF
313 ///
314 /// VarProp <string> Property of variable 1 for the MC method (taking precedence over the
 315 /// global setting). The same values as for the global option are available. Variables 1..10 can be
316 /// set this way
317 ///
318 /// CutRangeMin/Max <float> user-defined ranges in which cuts are varied
319 
321 {
322  DeclareOptionRef(fFitMethodS = "GA", "FitMethod", "Minimisation Method (GA, SA, and MC are the primary methods to be used; the others have been introduced for testing purposes and are depreciated)");
323  AddPreDefVal(TString("GA"));
324  AddPreDefVal(TString("SA"));
325  AddPreDefVal(TString("MC"));
326  AddPreDefVal(TString("MCEvents"));
327  AddPreDefVal(TString("MINUIT"));
328  AddPreDefVal(TString("EventScan"));
329 
330  // selection type
331  DeclareOptionRef(fEffMethodS = "EffSel", "EffMethod", "Selection Method");
332  AddPreDefVal(TString("EffSel"));
333  AddPreDefVal(TString("EffPDF"));
334 
335  // cut ranges
336  fCutRange.resize(GetNvar());
337  fCutRangeMin = new Double_t[GetNvar()];
338  fCutRangeMax = new Double_t[GetNvar()];
339  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
340  fCutRange[ivar] = 0;
341  fCutRangeMin[ivar] = fCutRangeMax[ivar] = -1;
342  }
343 
344  DeclareOptionRef( fCutRangeMin, GetNvar(), "CutRangeMin", "Minimum of allowed cut range (set per variable)" );
345  DeclareOptionRef( fCutRangeMax, GetNvar(), "CutRangeMax", "Maximum of allowed cut range (set per variable)" );
346 
347  fAllVarsI = new TString[GetNvar()];
348 
349  for (UInt_t i=0; i<GetNvar(); i++) fAllVarsI[i] = "NotEnforced";
350 
351  DeclareOptionRef(fAllVarsI, GetNvar(), "VarProp", "Categorisation of cuts");
352  AddPreDefVal(TString("NotEnforced"));
353  AddPreDefVal(TString("FMax"));
354  AddPreDefVal(TString("FMin"));
355  AddPreDefVal(TString("FSmart"));
356 }
357 
358 ////////////////////////////////////////////////////////////////////////////////
359 /// process user options
360 /// sanity check, do not allow the input variables to be normalised, because this
361 /// only creates problems when interpreting the cuts
362 
364 {
365  if (IsNormalised()) {
366  Log() << kWARNING << "Normalisation of the input variables for cut optimisation is not" << Endl;
367  Log() << kWARNING << "supported because this provides intransparent cut values, and no" << Endl;
368  Log() << kWARNING << "improvement in the performance of the algorithm." << Endl;
369  Log() << kWARNING << "Please remove \"Normalise\" option from booking option string" << Endl;
370  Log() << kWARNING << "==> Will reset normalisation flag to \"False\"" << Endl;
371  SetNormalised( kFALSE );
372  }
373 
374  if (IgnoreEventsWithNegWeightsInTraining()) {
375  Log() << kFATAL << "Mechanism to ignore events with negative weights in training not yet available for method: "
376  << GetMethodTypeName()
377  << " --> Please remove \"IgnoreNegWeightsInTraining\" option from booking string."
378  << Endl;
379  }
380 
381  if (fFitMethodS == "MC" ) fFitMethod = kUseMonteCarlo;
382  else if (fFitMethodS == "MCEvents") fFitMethod = kUseMonteCarloEvents;
383  else if (fFitMethodS == "GA" ) fFitMethod = kUseGeneticAlgorithm;
384  else if (fFitMethodS == "SA" ) fFitMethod = kUseSimulatedAnnealing;
385  else if (fFitMethodS == "MINUIT" ) {
386  fFitMethod = kUseMinuit;
387  Log() << kWARNING << "poor performance of MINUIT in MethodCuts; preferred fit method: GA" << Endl;
388  }
389  else if (fFitMethodS == "EventScan" ) fFitMethod = kUseEventScan;
390  else Log() << kFATAL << "unknown minimisation method: " << fFitMethodS << Endl;
391 
392  if (fEffMethodS == "EFFSEL" ) fEffMethod = kUseEventSelection; // highly recommended
393  else if (fEffMethodS == "EFFPDF" ) fEffMethod = kUsePDFs;
394  else fEffMethod = kUseEventSelection;
395 
396  // options output
397  Log() << kINFO << Form("Use optimization method: \"%s\"",
398  (fFitMethod == kUseMonteCarlo) ? "Monte Carlo" :
399  (fFitMethod == kUseMonteCarlo) ? "Monte-Carlo-Event sampling" :
400  (fFitMethod == kUseEventScan) ? "Full Event Scan (slow)" :
401  (fFitMethod == kUseMinuit) ? "MINUIT" : "Genetic Algorithm" ) << Endl;
402  Log() << kINFO << Form("Use efficiency computation method: \"%s\"",
403  (fEffMethod == kUseEventSelection) ? "Event Selection" : "PDF" ) << Endl;
404 
405  // cut ranges
406  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
407  fCutRange[ivar] = new Interval( fCutRangeMin[ivar], fCutRangeMax[ivar] );
408  }
409 
410  // individual options
411  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
412  EFitParameters theFitP = kNotEnforced;
413  if (fAllVarsI[ivar] == "" || fAllVarsI[ivar] == "NotEnforced") theFitP = kNotEnforced;
414  else if (fAllVarsI[ivar] == "FMax" ) theFitP = kForceMax;
415  else if (fAllVarsI[ivar] == "FMin" ) theFitP = kForceMin;
416  else if (fAllVarsI[ivar] == "FSmart" ) theFitP = kForceSmart;
417  else {
418  Log() << kFATAL << "unknown value \'" << fAllVarsI[ivar]
419  << "\' for fit parameter option " << Form("VarProp[%i]",ivar) << Endl;
420  }
421  (*fFitParams)[ivar] = theFitP;
422 
423  if (theFitP != kNotEnforced)
424  Log() << kINFO << "Use \"" << fAllVarsI[ivar]
425  << "\" cuts for variable: " << "'" << (*fInputVars)[ivar] << "'" << Endl;
426  }
427 }
428 
429 ////////////////////////////////////////////////////////////////////////////////
430 /// cut evaluation: returns 1.0 if event passed, 0.0 otherwise
431 
433 {
434  // cannot determine error
435  NoErrorCalc(err, errUpper);
436 
437  // sanity check
438  if (fCutMin == NULL || fCutMax == NULL || fNbins == 0) {
439  Log() << kFATAL << "<Eval_Cuts> fCutMin/Max have zero pointer. "
440  << "Did you book Cuts ?" << Endl;
441  }
442 
443  const Event* ev = GetEvent();
444 
445  // sanity check
446  if (fTestSignalEff > 0) {
447  // get efficiency bin
448  Int_t ibin = fEffBvsSLocal->FindBin( fTestSignalEff );
449  if (ibin < 0 ) ibin = 0;
450  else if (ibin >= fNbins) ibin = fNbins - 1;
451 
452  Bool_t passed = kTRUE;
453  for (UInt_t ivar=0; ivar<GetNvar(); ivar++)
454  passed &= ( (ev->GetValue(ivar) > fCutMin[ivar][ibin]) &&
455  (ev->GetValue(ivar) <= fCutMax[ivar][ibin]) );
456 
457  return passed ? 1. : 0. ;
458  }
459  else return 0;
460 }
461 
462 ////////////////////////////////////////////////////////////////////////////////
463 /// print cuts
464 
466 {
467  std::vector<Double_t> cutsMin;
468  std::vector<Double_t> cutsMax;
469  Int_t ibin = fEffBvsSLocal->FindBin( effS );
470 
471  Double_t trueEffS = GetCuts( effS, cutsMin, cutsMax );
472 
473  // retrieve variable expressions (could be transformations)
474  std::vector<TString>* varVec = 0;
475  if (GetTransformationHandler().GetNumOfTransformations() == 0) {
476  // no transformation applied, replace by current variables
477  varVec = new std::vector<TString>;
478  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
479  varVec->push_back( DataInfo().GetVariableInfo(ivar).GetLabel() );
480  }
481  }
482  else if (GetTransformationHandler().GetNumOfTransformations() == 1) {
483  // get transformation string
484  varVec = GetTransformationHandler().GetTransformationStringsOfLastTransform();
485  }
486  else {
487  // replace transformation print by current variables and indicated incompleteness
488  varVec = new std::vector<TString>;
489  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
490  varVec->push_back( DataInfo().GetVariableInfo(ivar).GetLabel() + " [transformed]" );
491  }
492  }
493 
494  UInt_t maxL = 0;
495  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
496  if ((UInt_t)(*varVec)[ivar].Length() > maxL) maxL = (*varVec)[ivar].Length();
497  }
498  UInt_t maxLine = 20+maxL+16;
499 
500  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
501  Log() << Endl;
502  Log() << kINFO << "Cut values for requested signal efficiency: " << trueEffS << Endl;
503  Log() << kINFO << "Corresponding background efficiency : " << fEffBvsSLocal->GetBinContent( ibin ) << Endl;
504  if (GetTransformationHandler().GetNumOfTransformations() == 1) {
505  Log() << kINFO << "Transformation applied to input variables : \""
506  << GetTransformationHandler().GetNameOfLastTransform() << "\"" << Endl;
507  }
508  else if (GetTransformationHandler().GetNumOfTransformations() > 1) {
509  Log() << kINFO << "[ More than one (=" << GetTransformationHandler().GetNumOfTransformations() << ") "
510  << " transformations applied in transformation chain; cuts applied on transformed quantities ] " << Endl;
511  }
512  else {
513  Log() << kINFO << "Transformation applied to input variables : None" << Endl;
514  }
515  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
516  Log() << Endl;
517  for (UInt_t ivar=0; ivar<cutsMin.size(); ivar++) {
518  Log() << kINFO
519  << "Cut[" << std::setw(2) << ivar << "]: "
520  << std::setw(10) << cutsMin[ivar]
521  << " < "
522  << std::setw(maxL) << (*varVec)[ivar]
523  << " <= "
524  << std::setw(10) << cutsMax[ivar] << Endl;
525  }
526  for (UInt_t i=0; i<maxLine; i++) Log() << "-";
527  Log() << Endl;
528 
529  delete varVec; // yes, ownership has been given to us
530 }
531 
532 ////////////////////////////////////////////////////////////////////////////////
533 /// retrieve cut values for given signal efficiency
534 /// assume vector of correct size !!
535 
537 {
538  std::vector<Double_t> cMin( GetNvar() );
539  std::vector<Double_t> cMax( GetNvar() );
540  Double_t trueEffS = GetCuts( effS, cMin, cMax );
541  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
542  cutMin[ivar] = cMin[ivar];
543  cutMax[ivar] = cMax[ivar];
544  }
545  return trueEffS;
546 }
547 
548 ////////////////////////////////////////////////////////////////////////////////
549 /// retrieve cut values for given signal efficiency
550 
552  std::vector<Double_t>& cutMin,
553  std::vector<Double_t>& cutMax ) const
554 {
555  // find corresponding bin
556  Int_t ibin = fEffBvsSLocal->FindBin( effS );
557 
558  // get the true efficiency which is the one on the "left hand" side of the bin
559  Double_t trueEffS = fEffBvsSLocal->GetBinLowEdge( ibin );
560 
561  ibin--; // the 'cut' vector has 0...fNbins indices
562  if (ibin < 0 ) ibin = 0;
563  else if (ibin >= fNbins) ibin = fNbins - 1;
564 
565  cutMin.clear();
566  cutMax.clear();
567  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
568  cutMin.push_back( fCutMin[ivar][ibin] );
569  cutMax.push_back( fCutMax[ivar][ibin] );
570  }
571 
572  return trueEffS;
573 }
574 
575 ////////////////////////////////////////////////////////////////////////////////
576 /// training method: here the cuts are optimised for the training sample
577 
579 {
580  if (fEffMethod == kUsePDFs) CreateVariablePDFs(); // create PDFs for variables
581 
582  // create binary trees (global member variables) for signal and background
583  if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
584  if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
585 
586  // the variables may be transformed by a transformation method: to coherently
587  // treat signal and background one must decide which transformation type shall
588  // be used: our default is signal-type
589 
590  fBinaryTreeS = new BinarySearchTree();
591  fBinaryTreeS->Fill( GetEventCollection(Types::kTraining), fSignalClass );
592  fBinaryTreeB = new BinarySearchTree();
593  fBinaryTreeB->Fill( GetEventCollection(Types::kTraining), fBackgroundClass );
594 
595  for (UInt_t ivar =0; ivar < Data()->GetNVariables(); ivar++) {
596  (*fMeanS)[ivar] = fBinaryTreeS->Mean(Types::kSignal, ivar);
597  (*fRmsS)[ivar] = fBinaryTreeS->RMS (Types::kSignal, ivar);
598  (*fMeanB)[ivar] = fBinaryTreeB->Mean(Types::kBackground, ivar);
599  (*fRmsB)[ivar] = fBinaryTreeB->RMS (Types::kBackground, ivar);
600 
601  // update interval ?
602  Double_t xmin = TMath::Min(fBinaryTreeS->Min(Types::kSignal, ivar),
603  fBinaryTreeB->Min(Types::kBackground, ivar));
604  Double_t xmax = TMath::Max(fBinaryTreeS->Max(Types::kSignal, ivar),
605  fBinaryTreeB->Max(Types::kBackground, ivar));
606 
607  // redefine ranges to be slightly smaller and larger than xmin and xmax, respectively
608  Double_t eps = 0.01*(xmax - xmin);
609  xmin -= eps;
610  xmax += eps;
611 
612  if (TMath::Abs(fCutRange[ivar]->GetMin() - fCutRange[ivar]->GetMax()) < 1.0e-300 ) {
613  fCutRange[ivar]->SetMin( xmin );
614  fCutRange[ivar]->SetMax( xmax );
615  }
616  else if (xmin > fCutRange[ivar]->GetMin()) fCutRange[ivar]->SetMin( xmin );
617  else if (xmax < fCutRange[ivar]->GetMax()) fCutRange[ivar]->SetMax( xmax );
618  }
619 
620  std::vector<TH1F*> signalDist, bkgDist;
621 
622  // this is important: reset the branch addresses of the training tree to the current event
623  delete fEffBvsSLocal;
624  fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
625  TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
626  fEffBvsSLocal->SetDirectory(0); // it's local
627 
628  // init
629  for (Int_t ibin=1; ibin<=fNbins; ibin++) fEffBvsSLocal->SetBinContent( ibin, -0.1 );
630 
631  // --------------------------------------------------------------------------
632  if (fFitMethod == kUseGeneticAlgorithm ||
633  fFitMethod == kUseMonteCarlo ||
634  fFitMethod == kUseMinuit ||
635  fFitMethod == kUseSimulatedAnnealing) {
636 
637  // ranges
638  std::vector<Interval*> ranges;
639 
640  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
641 
642  Int_t nbins = 0;
643  if (DataInfo().GetVariableInfo(ivar).GetVarType() == 'I') {
644  nbins = Int_t(fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin()) + 1;
645  }
646 
647  if ((*fFitParams)[ivar] == kForceSmart) {
648  if ((*fMeanS)[ivar] > (*fMeanB)[ivar]) (*fFitParams)[ivar] = kForceMax;
649  else (*fFitParams)[ivar] = kForceMin;
650  }
651 
652  if ((*fFitParams)[ivar] == kForceMin) {
653  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMin(), nbins ) );
654  ranges.push_back( new Interval( 0, fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
655  }
656  else if ((*fFitParams)[ivar] == kForceMax) {
657  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMax(), nbins ) );
658  ranges.push_back( new Interval( fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(),
659  fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
660  }
661  else {
662  ranges.push_back( new Interval( fCutRange[ivar]->GetMin(), fCutRange[ivar]->GetMax(), nbins ) );
663  ranges.push_back( new Interval( 0, fCutRange[ivar]->GetMax() - fCutRange[ivar]->GetMin(), nbins ) );
664  }
665  }
666 
667  // create the fitter
668  FitterBase* fitter = NULL;
669 
670  switch (fFitMethod) {
671  case kUseGeneticAlgorithm:
672  fitter = new GeneticFitter( *this, Form("%sFitter_GA", GetName()), ranges, GetOptions() );
673  break;
674  case kUseMonteCarlo:
675  fitter = new MCFitter ( *this, Form("%sFitter_MC", GetName()), ranges, GetOptions() );
676  break;
677  case kUseMinuit:
678  fitter = new MinuitFitter ( *this, Form("%sFitter_MINUIT", GetName()), ranges, GetOptions() );
679  break;
680  case kUseSimulatedAnnealing:
681  fitter = new SimulatedAnnealingFitter( *this, Form("%sFitter_SA", GetName()), ranges, GetOptions() );
682  break;
683  default:
684  Log() << kFATAL << "Wrong fit method: " << fFitMethod << Endl;
685  }
686 
687  fitter->CheckForUnusedOptions();
688 
689  // perform the fit
690  fitter->Run();
691 
692  // clean up
693  for (UInt_t ivar=0; ivar<ranges.size(); ivar++) delete ranges[ivar];
694  delete fitter;
695 
696  }
697  // --------------------------------------------------------------------------
698  else if (fFitMethod == kUseEventScan) {
699 
700  Int_t nevents = Data()->GetNEvents();
701  Int_t ic = 0;
702 
703  // timing of MC
704  Int_t nsamples = Int_t(0.5*nevents*(nevents - 1));
705  Timer timer( nsamples, GetName() );
706 
707  Log() << kINFO << "Running full event scan: " << Endl;
708  for (Int_t ievt1=0; ievt1<nevents; ievt1++) {
709  for (Int_t ievt2=ievt1+1; ievt2<nevents; ievt2++) {
710 
711  EstimatorFunction( ievt1, ievt2 );
712 
713  // what's the time please?
714  ic++;
715  if ((nsamples<10000) || ic%10000 == 0) timer.DrawProgressBar( ic );
716  }
717  }
718  }
719  // --------------------------------------------------------------------------
720  else if (fFitMethod == kUseMonteCarloEvents) {
721 
722  Int_t nsamples = 200000;
723  UInt_t seed = 100;
724  DeclareOptionRef( nsamples, "SampleSize", "Number of Monte-Carlo-Event samples" );
725  DeclareOptionRef( seed, "Seed", "Seed for the random generator (0 takes random seeds)" );
726  ParseOptions();
727 
728  Int_t nevents = Data()->GetNEvents();
729  Int_t ic = 0;
730 
731  // timing of MC
732  Timer timer( nsamples, GetName() );
733 
734  // random generator
735  TRandom3*rnd = new TRandom3( seed );
736 
737  Log() << kINFO << "Running Monte-Carlo-Event sampling over " << nsamples << " events" << Endl;
738  std::vector<Double_t> pars( 2*GetNvar() );
739 
740  for (Int_t itoy=0; itoy<nsamples; itoy++) {
741 
742  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
743 
744  // generate minimum and delta cuts for this variable
745 
746  // retrieve signal events
747  Bool_t isSignal = kFALSE;
748  Int_t ievt1, ievt2;
749  Double_t evt1 = 0., evt2 = 0.;
750  Int_t nbreak = 0;
751  while (!isSignal) {
752  ievt1 = Int_t(rnd->Uniform(0.,1.)*nevents);
753  ievt2 = Int_t(rnd->Uniform(0.,1.)*nevents);
754 
755  const Event *ev1 = GetEvent(ievt1);
756  isSignal = DataInfo().IsSignal(ev1);
757  evt1 = ev1->GetValue( ivar );
758 
759  const Event *ev2 = GetEvent(ievt2);
760  isSignal &= DataInfo().IsSignal(ev2);
761  evt2 = ev2->GetValue( ivar );
762 
763  if (nbreak++ > 10000) Log() << kFATAL << "<MCEvents>: could not find signal events"
764  << " after 10000 trials - do you have signal events in your sample ?"
765  << Endl;
766  isSignal = 1;
767  }
768 
769  // sort
770  if (evt1 > evt2) { Double_t z = evt1; evt1 = evt2; evt2 = z; }
771  pars[2*ivar] = evt1;
772  pars[2*ivar+1] = evt2 - evt1;
773  }
774 
775  // compute estimator
776  EstimatorFunction( pars );
777 
778  // what's the time please?
779  ic++;
780  if ((nsamples<1000) || ic%1000 == 0) timer.DrawProgressBar( ic );
781  }
782 
783  delete rnd;
784  }
785  // --------------------------------------------------------------------------
786  else Log() << kFATAL << "Unknown minimisation method: " << fFitMethod << Endl;
787 
788  if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
789  if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }
790 
791  // force cut ranges within limits
792  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
793  for (Int_t ibin=0; ibin<fNbins; ibin++) {
794 
795  if ((*fFitParams)[ivar] == kForceMin && fCutMin[ivar][ibin] > -fgMaxAbsCutVal) {
796  fCutMin[ivar][ibin] = -fgMaxAbsCutVal;
797  }
798  if ((*fFitParams)[ivar] == kForceMax && fCutMax[ivar][ibin] < fgMaxAbsCutVal) {
799  fCutMax[ivar][ibin] = fgMaxAbsCutVal;
800  }
801  }
802  }
803 
804  // some output
805  // the efficiency which is asked for has to be slightly higher than the bin-borders.
806  // if not, then the wrong bin is taken in some cases.
807  Double_t epsilon = 0.0001;
808  for (Double_t eff=0.1; eff<0.95; eff += 0.1) PrintCuts( eff+epsilon );
809 }
810 
811 ////////////////////////////////////////////////////////////////////////////////
812 /// nothing to test
813 
815 {
816 }
817 
818 ////////////////////////////////////////////////////////////////////////////////
819 /// for full event scan
820 
822 {
823  const Event *ev1 = GetEvent(ievt1);
824  if (!DataInfo().IsSignal(ev1)) return -1;
825 
826  const Event *ev2 = GetEvent(ievt2);
827  if (!DataInfo().IsSignal(ev2)) return -1;
828 
829  const Int_t nvar = GetNvar();
830  Double_t* evt1 = new Double_t[nvar];
831  Double_t* evt2 = new Double_t[nvar];
832 
833  for (Int_t ivar=0; ivar<nvar; ivar++) {
834  evt1[ivar] = ev1->GetValue( ivar );
835  evt2[ivar] = ev2->GetValue( ivar );
836  }
837 
838  // determine cuts
839  std::vector<Double_t> pars;
840  for (Int_t ivar=0; ivar<nvar; ivar++) {
841  Double_t cutMin;
842  Double_t cutMax;
843  if (evt1[ivar] < evt2[ivar]) {
844  cutMin = evt1[ivar];
845  cutMax = evt2[ivar];
846  }
847  else {
848  cutMin = evt2[ivar];
849  cutMax = evt1[ivar];
850  }
851 
852  pars.push_back( cutMin );
853  pars.push_back( cutMax - cutMin );
854  }
855 
856  delete [] evt1;
857  delete [] evt2;
858 
859  return ComputeEstimator( pars );
860 }
861 
862 ////////////////////////////////////////////////////////////////////////////////
863 /// returns estimator for "cut fitness" used by GA
864 
865 Double_t TMVA::MethodCuts::EstimatorFunction( std::vector<Double_t>& pars )
866 {
867  return ComputeEstimator( pars );
868 }
869 
870 ////////////////////////////////////////////////////////////////////////////////
871 /// returns estimator for "cut fitness" used by GA
872 /// there are two requirements:
873 /// 1) the signal efficiency must be equal to the required one in the
874 /// efficiency scan
875 /// 2) the background efficiency must be as small as possible
876 /// the requirement 1) has priority over 2)
877 
Double_t TMVA::MethodCuts::ComputeEstimator( std::vector<Double_t>& pars )
{
   // caution: the npar gives the _free_ parameters
   // however: the "pars" array contains all parameters

   // determine cuts: decode the flat parameter vector into the per-variable
   // [min,max] windows held in the scratch arrays fTmpCutMin/fTmpCutMax
   Double_t effS = 0, effB = 0;
   this->MatchParsToCuts( pars, &fTmpCutMin[0], &fTmpCutMax[0] );

   // retrieve signal and background efficiencies for given cut
   switch (fEffMethod) {
   case kUsePDFs:
      this->GetEffsfromPDFs      (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
      break;
   case kUseEventSelection:
      this->GetEffsfromSelection (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
      break;
   default:
      // unknown efficiency method: fall back to event counting
      this->GetEffsfromSelection (&fTmpCutMin[0], &fTmpCutMax[0], effS, effB);
   }

   Double_t eta = 0;

   // test for a estimator function which optimizes on the whole background-rejection signal-efficiency plot

   // get the backg-reject. and sig-eff for the parameters given to this function
   // effS, effB

   // get best background rejection for given signal efficiency:
   // fEffBvsSLocal caches, per signal-efficiency bin, the best background
   // efficiency found so far during the scan
   Int_t ibinS = fEffBvsSLocal->FindBin( effS );

   Double_t effBH       = fEffBvsSLocal->GetBinContent( ibinS );
   Double_t effBH_left  = (ibinS > 1     ) ? fEffBvsSLocal->GetBinContent( ibinS-1 ) : effBH;
   Double_t effBH_right = (ibinS < fNbins) ? fEffBvsSLocal->GetBinContent( ibinS+1 ) : effBH;

   Double_t average = 0.5*(effBH_left + effBH_right);
   if (effBH < effB) average = effBH;

   // if the average of the bin right and left is larger than this one, add the difference to
   // the current value of the estimator (because you can do at least so much better)
   eta = ( -TMath::Abs(effBH-average) + (1.0 - (effBH - effB))) / (1.0 + effS);
   // alternative idea
   //if (effBH<0) eta = (1.e-6+effB)/(1.0 + effS);
   //else eta =  (effB - effBH) * (1.0 + 10.* effS);

   // if a point is found which is better than an existing one, ... replace it.
   // preliminary best event -> backup
   // (effBH < 0 marks an uninitialised bin; effBH > effB means this cut set
   // yields lower background at the same signal efficiency)
   if (effBH < 0 || effBH > effB) {
      fEffBvsSLocal->SetBinContent( ibinS, effB );
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
         fCutMin[ivar][ibinS-1] = fTmpCutMin[ivar]; // bin 1 stored in index 0
         fCutMax[ivar][ibinS-1] = fTmpCutMax[ivar];
      }
   }

   // caution (!) this value is not good for a decision for MC, .. it is designed for GA
   // but .. it doesn't matter, as MC samplings are independent from the former ones
   // and the replacement of the best variables by better ones is done about 10 lines above.
   // ( if (effBH < 0 || effBH > effB) { .... )

   if (ibinS<=1) {
      // add penalty for effS=0 bin
      // to avoid that the minimizer gets stuck in the zero-bin
      // force it towards higher efficiency:
      // penalise cut windows far from the allowed variable range (the min-side
      // distance is weighted four times stronger than the max side)
      Double_t penalty=0.,diff=0.;
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
         diff=(fCutRange[ivar]->GetMax()-fTmpCutMax[ivar])/(fCutRange[ivar]->GetMax()-fCutRange[ivar]->GetMin());
         penalty+=diff*diff;
         diff=(fCutRange[ivar]->GetMin()-fTmpCutMin[ivar])/(fCutRange[ivar]->GetMax()-fCutRange[ivar]->GetMin());
         penalty+=4.*diff*diff;
      }

      if (effS<1.e-4) return 10.0+penalty;
      else return 10.*(1.-10.*effS);
   }
   return eta;
}
955 
956 ////////////////////////////////////////////////////////////////////////////////
957 /// translates parameters into cuts
958 
959 void TMVA::MethodCuts::MatchParsToCuts( const std::vector<Double_t> & pars,
960  Double_t* cutMin, Double_t* cutMax )
961 {
962  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
963  Int_t ipar = 2*ivar;
964  cutMin[ivar] = ((*fRangeSign)[ivar] > 0) ? pars[ipar] : pars[ipar] - pars[ipar+1];
965  cutMax[ivar] = ((*fRangeSign)[ivar] > 0) ? pars[ipar] + pars[ipar+1] : pars[ipar];
966  }
967 }
968 
969 ////////////////////////////////////////////////////////////////////////////////
970 /// translate the cuts into parameters (obsolete function)
971 
972 void TMVA::MethodCuts::MatchCutsToPars( std::vector<Double_t>& pars,
973  Double_t** cutMinAll, Double_t** cutMaxAll, Int_t ibin )
974 {
975  if (ibin < 1 || ibin > fNbins) Log() << kFATAL << "::MatchCutsToPars: bin error: "
976  << ibin << Endl;
977 
978  const UInt_t nvar = GetNvar();
979  Double_t *cutMin = new Double_t[nvar];
980  Double_t *cutMax = new Double_t[nvar];
981  for (UInt_t ivar=0; ivar<nvar; ivar++) {
982  cutMin[ivar] = cutMinAll[ivar][ibin-1];
983  cutMax[ivar] = cutMaxAll[ivar][ibin-1];
984  }
985 
986  MatchCutsToPars( pars, cutMin, cutMax );
987  delete [] cutMin;
988  delete [] cutMax;
989 }
990 
991 ////////////////////////////////////////////////////////////////////////////////
992 /// translates cuts into parameters
993 
994 void TMVA::MethodCuts::MatchCutsToPars( std::vector<Double_t>& pars,
995  Double_t* cutMin, Double_t* cutMax )
996 {
997  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
998  Int_t ipar = 2*ivar;
999  pars[ipar] = ((*fRangeSign)[ivar] > 0) ? cutMin[ivar] : cutMax[ivar];
1000  pars[ipar+1] = cutMax[ivar] - cutMin[ivar];
1001  }
1002 }
1003 
1004 ////////////////////////////////////////////////////////////////////////////////
1005 /// compute signal and background efficiencies from PDFs
1006 /// for given cut sample
1007 
                                        Double_t& effS, Double_t& effB )
{
   // Efficiency is the product of the per-variable PDF integrals over each
   // cut window; this factorisation implicitly assumes uncorrelated inputs.
   effS = 1.0;
   effB = 1.0;
   for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
      effS *= (*fVarPdfS)[ivar]->GetIntegral( cutMin[ivar], cutMax[ivar] );
      effB *= (*fVarPdfB)[ivar]->GetIntegral( cutMin[ivar], cutMax[ivar] );
   }

   // quick fix to prevent from efficiencies < 0 (can happen with negative
   // event weights entering the PDFs); warn only on first occurrence
   if( effS < 0.0 ) {
      effS = 0.0;
      if( !fNegEffWarning ) Log() << kWARNING << "Negative signal efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
      fNegEffWarning = kTRUE;
   }
   if( effB < 0.0 ) {
      effB = 0.0;
      if( !fNegEffWarning ) Log() << kWARNING << "Negative background efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
      fNegEffWarning = kTRUE;
   }
}
1030 
1031 ////////////////////////////////////////////////////////////////////////////////
1032 /// compute signal and background efficiencies from event counting
1033 /// for given cut sample
1034 
                                             Double_t& effS, Double_t& effB)
{
   // Count (weighted) signal and background events inside the cut box using
   // the binary search trees, and normalise by the respective total weights.
   Float_t nTotS = 0, nTotB = 0;
   Float_t nSelS = 0, nSelB = 0;

   // rectangular search volume spanned by the cut windows
   Volume* volume = new Volume( cutMin, cutMax, GetNvar() );

   // search for all events lying in the volume, and add up their weights
   nSelS = fBinaryTreeS->SearchVolume( volume );
   nSelB = fBinaryTreeB->SearchVolume( volume );

   delete volume;

   // total number of "events" (sum of weights) as reference to compute efficiency
   nTotS = fBinaryTreeS->GetSumOfWeights();
   nTotB = fBinaryTreeB->GetSumOfWeights();

   // sanity check
   if (nTotS == 0 && nTotB == 0) {
      Log() << kFATAL << "<GetEffsfromSelection> fatal error in zero total number of events:"
            << " nTotS, nTotB: " << nTotS << " " << nTotB << " ***" << Endl;
   }

   // efficiencies (guard each ratio against a zero denominator)
   if (nTotS == 0 ) {
      effS = 0;
      effB = nSelB/nTotB;
      Log() << kWARNING << "<ComputeEstimator> zero number of signal events" << Endl;
   }
   else if (nTotB == 0) {
      effB = 0;
      effS = nSelS/nTotS;
      Log() << kWARNING << "<ComputeEstimator> zero number of background events" << Endl;
   }
   else {
      effS = nSelS/nTotS;
      effB = nSelB/nTotB;
   }

   // quick fix to prevent from efficiencies < 0 (possible with negative
   // event weights); warn only on first occurrence
   if( effS < 0.0 ) {
      effS = 0.0;
      if( !fNegEffWarning ) Log() << kWARNING << "Negative signal efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
      fNegEffWarning = kTRUE;
   }
   if( effB < 0.0 ) {
      effB = 0.0;
      if( !fNegEffWarning ) Log() << kWARNING << "Negative background efficiency found and set to 0. This is probably due to many events with negative weights in a certain cut-region." << Endl;
      fNegEffWarning = kTRUE;
   }
}
1087 
1088 ////////////////////////////////////////////////////////////////////////////////
1089 /// for PDF method: create efficiency reference histograms and PDFs
1090 
1092 {
1093  // create list of histograms and PDFs
1094  fVarHistS = new std::vector<TH1*>( GetNvar() );
1095  fVarHistB = new std::vector<TH1*>( GetNvar() );
1096  fVarHistS_smooth = new std::vector<TH1*>( GetNvar() );
1097  fVarHistB_smooth = new std::vector<TH1*>( GetNvar() );
1098  fVarPdfS = new std::vector<PDF*>( GetNvar() );
1099  fVarPdfB = new std::vector<PDF*>( GetNvar() );
1100 
1101  Int_t nsmooth = 0;
1102 
1103  // get min and max values of all events
1104  Double_t minVal = DBL_MAX;
1105  Double_t maxVal = -DBL_MAX;
1106  for( UInt_t ievt=0; ievt<Data()->GetNEvents(); ievt++ ){
1107  const Event *ev = GetEvent(ievt);
1108  Float_t val = ev->GetValue(ievt);
1109  if( val > minVal ) minVal = val;
1110  if( val < maxVal ) maxVal = val;
1111  }
1112 
1113  for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1114 
1115  // ---- signal
1116  TString histTitle = (*fInputVars)[ivar] + " signal training";
1117  TString histName = (*fInputVars)[ivar] + "_sig";
1118  // TString drawOpt = (*fInputVars)[ivar] + ">>h(";
1119  // drawOpt += fNbins;
1120  // drawOpt += ")";
1121 
1122  // selection
1123  // Data().GetTrainingTree()->Draw( drawOpt, "type==1", "goff" );
1124  // (*fVarHistS)[ivar] = (TH1F*)gDirectory->Get("h");
1125  // (*fVarHistS)[ivar]->SetName(histName);
1126  // (*fVarHistS)[ivar]->SetTitle(histTitle);
1127 
1128  (*fVarHistS)[ivar] = new TH1F(histName.Data(), histTitle.Data(), fNbins, minVal, maxVal );
1129 
1130  // ---- background
1131  histTitle = (*fInputVars)[ivar] + " background training";
1132  histName = (*fInputVars)[ivar] + "_bgd";
1133  // drawOpt = (*fInputVars)[ivar] + ">>h(";
1134  // drawOpt += fNbins;
1135  // drawOpt += ")";
1136 
1137  // Data().GetTrainingTree()->Draw( drawOpt, "type==0", "goff" );
1138  // (*fVarHistB)[ivar] = (TH1F*)gDirectory->Get("h");
1139  // (*fVarHistB)[ivar]->SetName(histName);
1140  // (*fVarHistB)[ivar]->SetTitle(histTitle);
1141 
1142 
1143  (*fVarHistB)[ivar] = new TH1F(histName.Data(), histTitle.Data(), fNbins, minVal, maxVal );
1144 
1145  for( UInt_t ievt=0; ievt<Data()->GetNEvents(); ievt++ ){
1146  const Event *ev = GetEvent(ievt);
1147  Float_t val = ev->GetValue(ievt);
1148  if( DataInfo().IsSignal(ev) ){
1149  (*fVarHistS)[ivar]->Fill( val );
1150  }else{
1151  (*fVarHistB)[ivar]->Fill( val );
1152  }
1153  }
1154 
1155 
1156 
1157  // make copy for smoothed histos
1158  (*fVarHistS_smooth)[ivar] = (TH1F*)(*fVarHistS)[ivar]->Clone();
1159  histTitle = (*fInputVars)[ivar] + " signal training smoothed ";
1160  histTitle += nsmooth;
1161  histTitle +=" times";
1162  histName = (*fInputVars)[ivar] + "_sig_smooth";
1163  (*fVarHistS_smooth)[ivar]->SetName(histName);
1164  (*fVarHistS_smooth)[ivar]->SetTitle(histTitle);
1165 
1166  // smooth
1167  (*fVarHistS_smooth)[ivar]->Smooth(nsmooth);
1168 
1169  // ---- background
1170  // histTitle = (*fInputVars)[ivar] + " background training";
1171  // histName = (*fInputVars)[ivar] + "_bgd";
1172  // drawOpt = (*fInputVars)[ivar] + ">>h(";
1173  // drawOpt += fNbins;
1174  // drawOpt += ")";
1175 
1176  // Data().GetTrainingTree()->Draw( drawOpt, "type==0", "goff" );
1177  // (*fVarHistB)[ivar] = (TH1F*)gDirectory->Get("h");
1178  // (*fVarHistB)[ivar]->SetName(histName);
1179  // (*fVarHistB)[ivar]->SetTitle(histTitle);
1180 
1181  // make copy for smoothed histos
1182  (*fVarHistB_smooth)[ivar] = (TH1F*)(*fVarHistB)[ivar]->Clone();
1183  histTitle = (*fInputVars)[ivar]+" background training smoothed ";
1184  histTitle += nsmooth;
1185  histTitle +=" times";
1186  histName = (*fInputVars)[ivar]+"_bgd_smooth";
1187  (*fVarHistB_smooth)[ivar]->SetName(histName);
1188  (*fVarHistB_smooth)[ivar]->SetTitle(histTitle);
1189 
1190  // smooth
1191  (*fVarHistB_smooth)[ivar]->Smooth(nsmooth);
1192 
1193  // create PDFs
1194  (*fVarPdfS)[ivar] = new PDF( TString(GetName()) + " PDF Var Sig " + GetInputVar( ivar ), (*fVarHistS_smooth)[ivar], PDF::kSpline2 );
1195  (*fVarPdfB)[ivar] = new PDF( TString(GetName()) + " PDF Var Bkg " + GetInputVar( ivar ), (*fVarHistB_smooth)[ivar], PDF::kSpline2 );
1196  }
1197 }
1198 
1199 ////////////////////////////////////////////////////////////////////////////////
1200 /// read the cuts from stream
1201 
{
   // Read the optimised cuts from a plain-text weight stream: header with
   // the number of bins and variables, then per-bin lines with
   // (ibin, effS, effB) followed by the per-variable cut windows.
   TString dummy;
   UInt_t  dummyInt;

   // first the dimensions
   istr >> dummy >> dummy;
   // coverity[tainted_data_argument]
   istr >> dummy >> fNbins;

   // get rid of one read-in here because we read in once all ready to check for decorrelation
   istr >> dummy >> dummy >> dummy >> dummy >> dummy >> dummy >> dummyInt >> dummy ;

   // sanity check: variable count in the file must match the current dataset
   if (dummyInt != Data()->GetNVariables()) {
      Log() << kFATAL << "<ReadWeightsFromStream> fatal error: mismatch "
            << "in number of variables: " << dummyInt << " != " << Data()->GetNVariables() << Endl;
   }
   //SetNvar(dummyInt);

   // print some information about the fit method the cuts were obtained with
   if (fFitMethod == kUseMonteCarlo) {
      Log() << kINFO << "Read cuts optimised using sample of MC events" << Endl;
   }
   else if (fFitMethod == kUseMonteCarloEvents) {
      // NOTE(review): same message as kUseMonteCarlo; the XML reader prints
      // "MC-Event events" here — presumably these were meant to differ
      Log() << kINFO << "Read cuts optimised using sample of MC events" << Endl;
   }
   else if (fFitMethod == kUseGeneticAlgorithm) {
      Log() << kINFO << "Read cuts optimised using Genetic Algorithm" << Endl;
   }
   else if (fFitMethod == kUseSimulatedAnnealing) {
      Log() << kINFO << "Read cuts optimised using Simulated Annealing algorithm" << Endl;
   }
   else if (fFitMethod == kUseEventScan) {
      Log() << kINFO << "Read cuts optimised using Full Event Scan" << Endl;
   }
   else {
      Log() << kWARNING << "unknown method: " << fFitMethod << Endl;
   }
   Log() << kINFO << "in " << fNbins << " signal efficiency bins and for " << GetNvar() << " variables" << Endl;

   // now read the cuts; skip the two remaining header lines first
   char buffer[200];
   istr.getline(buffer,200);
   istr.getline(buffer,200);

   Int_t   tmpbin;
   Float_t tmpeffS, tmpeffB;
   // (re)create the local background-vs-signal efficiency histogram
   if (fEffBvsSLocal != 0) delete fEffBvsSLocal;
   fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
                             TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
   fEffBvsSLocal->SetDirectory(0); // it's local

   for (Int_t ibin=0; ibin<fNbins; ibin++) {
      istr >> tmpbin >> tmpeffS >> tmpeffB;
      fEffBvsSLocal->SetBinContent( ibin+1, tmpeffB );

      for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
         istr >> fCutMin[ivar][ibin] >> fCutMax[ivar][ibin];
      }
   }

   // cache the signal-efficiency range covered by the scan
   fEffSMin = fEffBvsSLocal->GetBinCenter(1);
   fEffSMax = fEffBvsSLocal->GetBinCenter(fNbins);
}
1267 
////////////////////////////////////////////////////////////////////////////////
/// create XML description of the optimised cuts: one "Bin" node per signal
/// efficiency bin, carrying the signal and background efficiencies and the
/// per-variable cut windows

void TMVA::MethodCuts::AddWeightsXMLTo( void* parent ) const
{
   // write all necessary information to the stream
   std::vector<Double_t> cutsMin;
   std::vector<Double_t> cutsMax;

   void* wght = gTools().AddChild(parent, "Weights");
   gTools().AddAttr( wght, "OptimisationMethod", (Int_t)fEffMethod);
   gTools().AddAttr( wght, "FitMethod",          (Int_t)fFitMethod );
   gTools().AddAttr( wght, "nbins",              fNbins );
   gTools().AddComment( wght, Form( "Below are the optimised cuts for %i variables: Format: ibin(hist) effS effB cutMin[ivar=0] cutMax[ivar=0] ... cutMin[ivar=n-1] cutMax[ivar=n-1]", GetNvar() ) );

   // NOTE: The signal efficiency written out into
   // the weight file does not correspond to the center of the bin within which the
   // background rejection is maximised (as before) but to the lower left edge of it.
   // This is because the cut optimisation algorithm determines the best background
   // rejection for all signal efficiencies belonging into a bin. Since the best background
   // rejection is in general obtained for the lowest possible signal efficiency, the
   // reference signal efficiency is the lowest value in the bin.

   for (Int_t ibin=0; ibin<fNbins; ibin++) {
      Double_t effS     = fEffBvsSLocal->GetBinCenter ( ibin + 1 );
      Double_t trueEffS = GetCuts( effS, cutsMin, cutsMax );
      // clamp numerical noise to an exact zero
      if (TMath::Abs(trueEffS) < 1e-10) trueEffS = 0;

      void* binxml = gTools().AddChild( wght, "Bin" );
      gTools().AddAttr( binxml, "ibin", ibin+1   );
      gTools().AddAttr( binxml, "effS", trueEffS );
      gTools().AddAttr( binxml, "effB", fEffBvsSLocal->GetBinContent( ibin + 1 ) );
      void* cutsxml = gTools().AddChild( binxml, "Cuts" );
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
         gTools().AddAttr( cutsxml, Form( "cutMin_%i", ivar ), cutsMin[ivar] );
         gTools().AddAttr( cutsxml, Form( "cutMax_%i", ivar ), cutsMax[ivar] );
      }
   }
}
1308 
1309 ////////////////////////////////////////////////////////////////////////////////
1310 /// read coefficients from xml weight file
1311 
{
   // Restore the optimised cuts from the XML weight node written by
   // AddWeightsXMLTo: method attributes, then one "Bin" child per signal
   // efficiency bin with the per-variable cut windows.

   // delete old min and max arrays before reading the new dimensions
   for (UInt_t i=0; i<GetNvar(); i++) {
      if (fCutMin[i] != 0) delete [] fCutMin[i];
      if (fCutMax[i] != 0) delete [] fCutMax[i];
   }
   if (fCutMin != 0) delete [] fCutMin;
   if (fCutMax != 0) delete [] fCutMax;

   Int_t tmpEffMethod, tmpFitMethod;
   gTools().ReadAttr( wghtnode, "OptimisationMethod", tmpEffMethod );
   gTools().ReadAttr( wghtnode, "FitMethod",          tmpFitMethod );
   gTools().ReadAttr( wghtnode, "nbins",              fNbins );

   fEffMethod = (EEffMethod)tmpEffMethod;
   fFitMethod = (EFitMethodType)tmpFitMethod;

   // print some information about the fit method the cuts were obtained with
   if (fFitMethod == kUseMonteCarlo) {
      Log() << kINFO << "Read cuts optimised using sample of MC events" << Endl;
   }
   else if (fFitMethod == kUseMonteCarloEvents) {
      Log() << kINFO << "Read cuts optimised using sample of MC-Event events" << Endl;
   }
   else if (fFitMethod == kUseGeneticAlgorithm) {
      Log() << kINFO << "Read cuts optimised using Genetic Algorithm" << Endl;
   }
   else if (fFitMethod == kUseSimulatedAnnealing) {
      Log() << kINFO << "Read cuts optimised using Simulated Annealing algorithm" << Endl;
   }
   else if (fFitMethod == kUseEventScan) {
      Log() << kINFO << "Read cuts optimised using Full Event Scan" << Endl;
   }
   else {
      Log() << kWARNING << "unknown method: " << fFitMethod << Endl;
   }
   Log() << kINFO << "Reading " << fNbins << " signal efficiency bins for " << GetNvar() << " variables" << Endl;

   // recreate the local background-vs-signal efficiency histogram;
   // -0.1 marks bins that have not been filled
   delete fEffBvsSLocal;
   fEffBvsSLocal = new TH1F( GetTestvarName() + "_effBvsSLocal",
                             TString(GetName()) + " efficiency of B vs S", fNbins, 0.0, 1.0 );
   fEffBvsSLocal->SetDirectory(0); // it's local
   for (Int_t ibin=1; ibin<=fNbins; ibin++) fEffBvsSLocal->SetBinContent( ibin, -0.1 ); // Init

   // allocate fresh cut arrays: [variable][bin]
   fCutMin = new Double_t*[GetNvar()];
   fCutMax = new Double_t*[GetNvar()];
   for (UInt_t i=0;i<GetNvar();i++) {
      fCutMin[i] = new Double_t[fNbins];
      fCutMax[i] = new Double_t[fNbins];
   }

   // read efficiencies and cuts, one "Bin" node at a time
   Int_t   tmpbin;
   Float_t tmpeffS, tmpeffB;
   void* ch = gTools().GetChild(wghtnode,"Bin");
   while (ch) {
//       if (strcmp(gTools().GetName(ch),"Bin") !=0) {
//          ch = gTools().GetNextChild(ch);
//          continue;
//       }

      gTools().ReadAttr( ch, "ibin", tmpbin  );
      gTools().ReadAttr( ch, "effS", tmpeffS );
      gTools().ReadAttr( ch, "effB", tmpeffB );

      // sanity check: bin index read from file must fit the allocated arrays
      if (tmpbin-1 >= fNbins || tmpbin-1 < 0) {
         Log() << kFATAL << "Mismatch in bins: " << tmpbin-1 << " >= " << fNbins << Endl;
      }

      fEffBvsSLocal->SetBinContent( tmpbin, tmpeffB );
      void* ct = gTools().GetChild(ch);
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
         gTools().ReadAttr( ct, Form( "cutMin_%i", ivar ), fCutMin[ivar][tmpbin-1] );
         gTools().ReadAttr( ct, Form( "cutMax_%i", ivar ), fCutMax[ivar][tmpbin-1] );
      }
      ch = gTools().GetNextChild(ch, "Bin");
   }
}
1392 
1393 ////////////////////////////////////////////////////////////////////////////////
1394 /// write histograms and PDFs to file for monitoring purposes
1395 
{
   // Write the local efficiency histogram and, for the PDF method, the
   // per-variable reference histograms and PDFs to the method's base directory.
   Log() << kINFO << "Write monitoring histograms to file: " << BaseDir()->GetPath() << Endl;

   fEffBvsSLocal->Write();

   // save reference histograms to file (these exist only when the PDF
   // efficiency method was used — see CreateVariablePDFs)
   if (fEffMethod == kUsePDFs) {
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
         (*fVarHistS)[ivar]->Write();
         (*fVarHistB)[ivar]->Write();
         (*fVarHistS_smooth)[ivar]->Write();
         (*fVarHistB_smooth)[ivar]->Write();
         (*fVarPdfS)[ivar]->GetPDFHist()->Write();
         (*fVarPdfB)[ivar]->GetPDFHist()->Write();
      }
   }
}
1414 
1415 ////////////////////////////////////////////////////////////////////////////////
1416 /// - overloaded function to create background efficiency (rejection) versus
1417 /// signal efficiency plot (first call of this function)
1418 /// - the function returns the signal efficiency at background efficiency
1419 /// indicated in theString
1420 ///
1421 /// "theString" must have two entries:
1422 /// [0]: "Efficiency"
1423 /// [1]: the value of background efficiency at which the signal efficiency
1424 /// is to be returned
1425 
{
   // parse input string for required background efficiency
   TList* list = gTools().ParseFormatLine( theString );
   // sanity check
   if (list->GetSize() != 2) {
      // NOTE(review): "list" is not deleted on this early return — leaks
      Log() << kFATAL << "<GetTrainingEfficiency> wrong number of arguments"
            << " in string: " << theString
            << " | required format, e.g., Efficiency:0.05" << Endl;
      return -1;
   }

   // NOTE(review): the results container is fetched for Types::kTesting even
   // though this method computes the *training* efficiency — confirm intended
   Results* results = Data()->GetResults(GetMethodName(), Types::kTesting, GetAnalysisType());

   // that will be the value of the efficiency returned (does not affect
   // the efficiency-vs-bkg plot which is done anyway.
   Float_t effBref = atof( ((TObjString*)list->At(1))->GetString() );

   delete list;

   // first round ? --> create histograms
   if (results->GetHist("EFF_BVSS_TR")==0) {

      // rebuild the binary search trees from the *training* event collection
      if (fBinaryTreeS != 0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
      if (fBinaryTreeB != 0) { delete fBinaryTreeB; fBinaryTreeB = 0; }

      fBinaryTreeS = new BinarySearchTree();
      fBinaryTreeS->Fill( GetEventCollection(Types::kTraining), fSignalClass );
      fBinaryTreeB = new BinarySearchTree();
      fBinaryTreeB->Fill( GetEventCollection(Types::kTraining), fBackgroundClass );
      // there is no really good equivalent to the fEffS; fEffB (efficiency vs cutvalue)
      // for the "Cuts" method (unless we had only one cut). Maybe later I might add here
      // histograms for each of the cuts...but this would require also a change in the
      // base class, and it is not really necessary, as we get exactly THIS info from the
      // "evaluateAllVariables" anyway.

      // now create efficiency curve: background versus signal
      // (-0.1 marks unfilled bins)
      TH1* eff_bvss_tr = new TH1F( GetTestvarName() + "_trainingEffBvsS", GetTestvarName() + "", fNbins, 0, 1 );
      for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_bvss_tr->SetBinContent( ibin, -0.1 ); // Init
      TH1* rej_bvss_tr = new TH1F( GetTestvarName() + "_trainingRejBvsS", GetTestvarName() + "", fNbins, 0, 1 );
      for (Int_t ibin=1; ibin<=fNbins; ibin++) rej_bvss_tr->SetBinContent( ibin, 0. ); // Init
      results->Store(eff_bvss_tr, "EFF_BVSS_TR");
      results->Store(rej_bvss_tr, "REJ_BVSS_TR");

      // use root finder

      // make the background-vs-signal efficiency plot from the stored cuts
      Double_t* tmpCutMin = new Double_t[GetNvar()];
      Double_t* tmpCutMax = new Double_t[GetNvar()];
      Int_t nFailedBins=0;
      for (Int_t bini=1; bini<=fNbins; bini++) {
         for (UInt_t ivar=0; ivar <GetNvar(); ivar++){
            tmpCutMin[ivar] = fCutMin[ivar][bini-1];
            tmpCutMax[ivar] = fCutMax[ivar][bini-1];
         }
         // find cut value corresponding to a given signal efficiency
         Double_t effS, effB;
         this->GetEffsfromSelection( &tmpCutMin[0], &tmpCutMax[0], effS, effB);
         // check that effS matches bini: only fill if the measured signal
         // efficiency actually falls into the bin the cuts were stored for
         Int_t effBin = eff_bvss_tr->GetXaxis()->FindBin(effS);
         if (effBin != bini){
            Log()<< kVERBOSE << "unable to fill efficiency bin " << bini<< " " << effBin <<Endl;
            nFailedBins++;
         }
         else{
            // and fill histograms
            eff_bvss_tr->SetBinContent( bini, effB     );
            rej_bvss_tr->SetBinContent( bini, 1.0-effB );
         }
      }
      if (nFailedBins>0) Log()<< kWARNING  << " unable to fill "<< nFailedBins <<" efficiency bins " <<Endl;

      delete [] tmpCutMin;
      delete [] tmpCutMax;

      // create splines for histogram
      fSplTrainEffBvsS = new TSpline1( "trainEffBvsS", new TGraph( eff_bvss_tr ) );
   }

   // must exist...
   if (NULL == fSplTrainEffBvsS) return 0.0;

   // now find signal efficiency that corresponds to required background efficiency
   Double_t effS = 0., effB, effS_ = 0., effB_ = 0.;
   Int_t    nbins_ = 1000;

   // loop over efficiency bins until the background eff. matches the requirement;
   // the sign change of (effB - effBref) brackets the crossing point
   for (Int_t bini=1; bini<=nbins_; bini++) {
      // get corresponding signal and background efficiencies
      effS = (bini - 0.5)/Float_t(nbins_);
      effB = fSplTrainEffBvsS->Eval( effS );

      // find signal efficiency that corresponds to required background efficiency
      if ((effB - effBref)*(effB_ - effBref) < 0) break;
      effS_ = effS;
      effB_ = effB;
   }

   // interpolate between the bracketing bins
   return 0.5*(effS + effS_);
}
1526 
1527 ////////////////////////////////////////////////////////////////////////////////
1528 /// - overloaded function to create background efficiency (rejection) versus
1529 /// signal efficiency plot (first call of this function)
1530 /// - the function returns the signal efficiency at background efficiency
1531 /// indicated in theString
1532 ///
1533 /// "theString" must have two entries:
1534 /// [0]: "Efficiency"
1535 /// [1]: the value of background efficiency at which the signal efficiency
1536 /// is to be returned
1537 
{
   Data()->SetCurrentType(type);

   Results* results = Data()->GetResults( GetMethodName(), Types::kTesting, GetAnalysisType() );

   // parse input string for required background efficiency
   TList* list  = gTools().ParseFormatLine( theString, ":" );

   if (list->GetSize() > 2) {
      delete list;
      Log() << kFATAL << "<GetEfficiency> wrong number of arguments"
            << " in string: " << theString
            << " | required format, e.g., Efficiency:0.05, or empty string" << Endl;
      return -1;
   }

   // sanity check: with fewer than two entries the area under the
   // rejection-vs-efficiency curve is computed instead of a point value
   Bool_t computeArea = (list->GetSize() < 2); // the area is computed

   // that will be the value of the efficiency returned (does not affect
   // the efficiency-vs-bkg plot which is done anyway.
   Float_t effBref = (computeArea?1.:atof( ((TObjString*)list->At(1))->GetString() ));

   delete list;


   // first round ? --> create histograms
   if (results->GetHist("MVA_EFF_BvsS")==0) {

      // rebuild the binary search trees from the *testing* event collection
      if (fBinaryTreeS!=0) { delete fBinaryTreeS; fBinaryTreeS = 0; }
      if (fBinaryTreeB!=0) { delete fBinaryTreeB; fBinaryTreeB = 0; }

      // the variables may be transformed by a transformation method: to coherently
      // treat signal and background one must decide which transformation type shall
      // be used: our default is signal-type
      fBinaryTreeS = new BinarySearchTree();
      fBinaryTreeS->Fill( GetEventCollection(Types::kTesting), fSignalClass );
      fBinaryTreeB = new BinarySearchTree();
      fBinaryTreeB->Fill( GetEventCollection(Types::kTesting), fBackgroundClass );

      // there is no really good equivalent to the fEffS; fEffB (efficiency vs cutvalue)
      // for the "Cuts" method (unless we had only one cut). Maybe later I might add here
      // histograms for each of the cuts...but this would require also a change in the
      // base class, and it is not really necessary, as we get exactly THIS info from the
      // "evaluateAllVariables" anyway.

      // now create efficiency curve: background versus signal
      // (-0.1 marks unfilled bins)
      TH1* eff_BvsS = new TH1F( GetTestvarName() + "_effBvsS", GetTestvarName() + "", fNbins, 0, 1 );
      for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_BvsS->SetBinContent( ibin, -0.1 ); // Init
      TH1* rej_BvsS = new TH1F( GetTestvarName() + "_rejBvsS", GetTestvarName() + "", fNbins, 0, 1 );
      for (Int_t ibin=1; ibin<=fNbins; ibin++) rej_BvsS->SetBinContent( ibin, 0.0 ); // Init
      results->Store(eff_BvsS, "MVA_EFF_BvsS");
      results->Store(rej_BvsS);

      // xmax slightly above 1 so that effS == 1 falls into the last bin
      Double_t xmin = 0.;
      Double_t xmax = 1.000001;

      TH1* eff_s = new TH1F( GetTestvarName() + "_effS", GetTestvarName() + " (signal)",     fNbins, xmin, xmax);
      for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_s->SetBinContent( ibin, -0.1 ); // Init
      TH1* eff_b = new TH1F( GetTestvarName() + "_effB", GetTestvarName() + " (background)", fNbins, xmin, xmax);
      for (Int_t ibin=1; ibin<=fNbins; ibin++) eff_b->SetBinContent( ibin, -0.1 ); // Init
      results->Store(eff_s, "MVA_S");
      results->Store(eff_b, "MVA_B");

      // use root finder

      // make the background-vs-signal efficiency plot: evaluate the stored
      // cuts of every bin and collect the (effS, effB) points in a graph
      Double_t* tmpCutMin = new Double_t[GetNvar()];
      Double_t* tmpCutMax = new Double_t[GetNvar()];
      TGraph* tmpBvsS = new TGraph(fNbins+1);
      tmpBvsS->SetPoint(0, 0., 0.);

      for (Int_t bini=1; bini<=fNbins; bini++) {
         for (UInt_t ivar=0; ivar <GetNvar(); ivar++) {
            tmpCutMin[ivar] = fCutMin[ivar][bini-1];
            tmpCutMax[ivar] = fCutMax[ivar][bini-1];
         }
         // find cut value corresponding to a given signal efficiency
         Double_t effS, effB;
         this->GetEffsfromSelection( &tmpCutMin[0], &tmpCutMax[0], effS, effB);
         tmpBvsS->SetPoint(bini, effS, effB);

         eff_s->SetBinContent(bini, effS);
         eff_b->SetBinContent(bini, effB);
      }
      tmpBvsS->SetPoint(fNbins+1, 1., 1.);

      delete [] tmpCutMin;
      delete [] tmpCutMax;

      // create spline from the graph and fill the efficiency histograms
      // at the bin centres from the smoothed curve
      fSpleffBvsS = new TSpline1( "effBvsS", tmpBvsS );
      for (Int_t bini=1; bini<=fNbins; bini++) {
         Double_t effS = (bini - 0.5)/Float_t(fNbins);
         Double_t effB = fSpleffBvsS->Eval( effS );
         eff_BvsS->SetBinContent( bini, effB     );
         rej_BvsS->SetBinContent( bini, 1.0-effB );
      }
   }

   // must exist...
   if (NULL == fSpleffBvsS) return 0.0;

   // now find signal efficiency that corresponds to required background efficiency
   Double_t effS = 0, effB = 0, effS_ = 0, effB_ = 0;
   Int_t    nbins_ = 1000;

   if (computeArea) {

      // compute area of rej-vs-eff plot (Riemann sum over nbins_ samples)
      Double_t integral = 0;
      for (Int_t bini=1; bini<=nbins_; bini++) {

         // get corresponding signal and background efficiencies
         effS = (bini - 0.5)/Float_t(nbins_);
         effB = fSpleffBvsS->Eval( effS );
         integral += (1.0 - effB);
      }
      integral /= nbins_;

      return integral;
   }
   else {

      // loop over efficiency bins until the background eff. matches the requirement;
      // the sign change of (effB - effBref) brackets the crossing point
      for (Int_t bini=1; bini<=nbins_; bini++) {
         // get corresponding signal and background efficiencies
         effS = (bini - 0.5)/Float_t(nbins_);
         effB = fSpleffBvsS->Eval( effS );

         // find signal efficiency that corresponds to required background efficiency
         if ((effB - effBref)*(effB_ - effBref) < 0) break;
         effS_ = effS;
         effB_ = effB;
      }

      // interpolate between the bracketing bins; the binomial error uses the
      // number of signal test events
      effS = 0.5*(effS + effS_);
      effSerr = 0;
      if (Data()->GetNEvtSigTest() > 0)
         effSerr = TMath::Sqrt( effS*(1.0 - effS)/Double_t(Data()->GetNEvtSigTest()) );

      return effS;

   }

   return -1; // not reached: both branches above return
}
1686 
1687 ////////////////////////////////////////////////////////////////////////////////
1688 /// write specific classifier response
1689 
void TMVA::MethodCuts::MakeClassSpecific( std::ostream& fout, const TString& className ) const
{
   // Cut optimisation has no standalone C++ response class:
   // emit only a stub comment and close the class body.
   fout << "   // not implemented for class: \"" << className << "\"" << std::endl;
   fout << "};" << std::endl;
}
1695 
1696 ////////////////////////////////////////////////////////////////////////////////
1697 /// get help message text
1698 ///
1699 /// typical length of text line:
1700 /// "|--------------------------------------------------------------|"
1701 
1703 {
1704  TString bold = gConfig().WriteOptionsReference() ? "<b>" : "";
1705  TString resbold = gConfig().WriteOptionsReference() ? "</b>" : "";
1706  TString brk = gConfig().WriteOptionsReference() ? "<br>" : "";
1707 
1708  Log() << Endl;
1709  Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
1710  Log() << Endl;
1711  Log() << "The optimisation of rectangular cuts performed by TMVA maximises " << Endl;
1712  Log() << "the background rejection at given signal efficiency, and scans " << Endl;
1713  Log() << "over the full range of the latter quantity. Three optimisation" << Endl;
1714  Log() << "methods are optional: Monte Carlo sampling (MC), a Genetics" << Endl;
1715  Log() << "Algorithm (GA), and Simulated Annealing (SA). GA and SA are" << Endl;
1716  Log() << "expected to perform best." << Endl;
1717  Log() << Endl;
1718  Log() << "The difficulty to find the optimal cuts strongly increases with" << Endl;
1719  Log() << "the dimensionality (number of input variables) of the problem." << Endl;
1720  Log() << "This behavior is due to the non-uniqueness of the solution space."<< Endl;
1721  Log() << Endl;
1722  Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
1723  Log() << Endl;
1724  Log() << "If the dimensionality exceeds, say, 4 input variables, it is " << Endl;
1725  Log() << "advisable to scrutinize the separation power of the variables," << Endl;
1726  Log() << "and to remove the weakest ones. If some among the input variables" << Endl;
1727  Log() << "can be described by a single cut (e.g., because signal tends to be" << Endl;
1728  Log() << "larger than background), this can be indicated to MethodCuts via" << Endl;
1729  Log() << "the \"Fsmart\" options (see option string). Choosing this option" << Endl;
1730  Log() << "reduces the number of requirements for the variable from 2 (min/max)" << Endl;
1731  Log() << "to a single one (TMVA finds out whether it is to be interpreted as" << Endl;
1732  Log() << "min or max)." << Endl;
1733  Log() << Endl;
1734  Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
1735  Log() << "" << Endl;
1736  Log() << bold << "Monte Carlo sampling:" << resbold << Endl;
1737  Log() << "" << Endl;
1738  Log() << "Apart form the \"Fsmart\" option for the variables, the only way" << Endl;
1739  Log() << "to improve the MC sampling is to increase the sampling rate. This" << Endl;
1740  Log() << "is done via the configuration option \"MC_NRandCuts\". The execution" << Endl;
1741  Log() << "time scales linearly with the sampling rate." << Endl;
1742  Log() << "" << Endl;
1743  Log() << bold << "Genetic Algorithm:" << resbold << Endl;
1744  Log() << "" << Endl;
1745  Log() << "The algorithm terminates if no significant fitness increase has" << Endl;
1746  Log() << "been achieved within the last \"nsteps\" steps of the calculation." << Endl;
1747  Log() << "Wiggles in the ROC curve or constant background rejection of 1" << Endl;
1748  Log() << "indicate that the GA failed to always converge at the true maximum" << Endl;
1749  Log() << "fitness. In such a case, it is recommended to broaden the search " << Endl;
1750  Log() << "by increasing the population size (\"popSize\") and to give the GA " << Endl;
1751  Log() << "more time to find improvements by increasing the number of steps" << Endl;
1752  Log() << "(\"nsteps\")" << Endl;
1753  Log() << " -> increase \"popSize\" (at least >10 * number of variables)" << Endl;
1754  Log() << " -> increase \"nsteps\"" << Endl;
1755  Log() << "" << Endl;
1756  Log() << bold << "Simulated Annealing (SA) algorithm:" << resbold << Endl;
1757  Log() << "" << Endl;
1758  Log() << "\"Increasing Adaptive\" approach:" << Endl;
1759  Log() << "" << Endl;
1760  Log() << "The algorithm seeks local minima and explores their neighborhood, while" << Endl;
1761  Log() << "changing the ambient temperature depending on the number of failures" << Endl;
1762  Log() << "in the previous steps. The performance can be improved by increasing" << Endl;
1763  Log() << "the number of iteration steps (\"MaxCalls\"), or by adjusting the" << Endl;
1764  Log() << "minimal temperature (\"MinTemperature\"). Manual adjustments of the" << Endl;
1765  Log() << "speed of the temperature increase (\"TemperatureScale\" and \"AdaptiveSpeed\")" << Endl;
1766  Log() << "to individual data sets should also help. Summary:" << brk << Endl;
1767  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1768  Log() << " -> adjust \"MinTemperature\"" << brk << Endl;
1769  Log() << " -> adjust \"TemperatureScale\"" << brk << Endl;
1770  Log() << " -> adjust \"AdaptiveSpeed\"" << Endl;
1771  Log() << "" << Endl;
1772  Log() << "\"Decreasing Adaptive\" approach:" << Endl;
1773  Log() << "" << Endl;
1774  Log() << "The algorithm calculates the initial temperature (based on the effect-" << Endl;
1775  Log() << "iveness of large steps) and the multiplier that ensures to reach the" << Endl;
1776  Log() << "minimal temperature with the requested number of iteration steps." << Endl;
1777  Log() << "The performance can be improved by adjusting the minimal temperature" << Endl;
1778  Log() << " (\"MinTemperature\") and by increasing number of steps (\"MaxCalls\"):" << brk << Endl;
1779  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1780  Log() << " -> adjust \"MinTemperature\"" << Endl;
1781  Log() << " " << Endl;
1782  Log() << "Other kernels:" << Endl;
1783  Log() << "" << Endl;
1784  Log() << "Alternative ways of counting the temperature change are implemented. " << Endl;
1785  Log() << "Each of them starts with the maximum temperature (\"MaxTemperature\")" << Endl;
1786  Log() << "and descreases while changing the temperature according to a given" << Endl;
1787  Log() << "prescription:" << brk << Endl;
1788  Log() << "CurrentTemperature =" << brk << Endl;
1789  Log() << " - Sqrt: InitialTemperature / Sqrt(StepNumber+2) * TemperatureScale" << brk << Endl;
1790  Log() << " - Log: InitialTemperature / Log(StepNumber+2) * TemperatureScale" << brk << Endl;
1791  Log() << " - Homo: InitialTemperature / (StepNumber+2) * TemperatureScale" << brk << Endl;
1792  Log() << " - Sin: (Sin(StepNumber / TemperatureScale) + 1) / (StepNumber + 1)*InitialTemperature + Eps" << brk << Endl;
1793  Log() << " - Geo: CurrentTemperature * TemperatureScale" << Endl;
1794  Log() << "" << Endl;
1795  Log() << "Their performance can be improved by adjusting initial temperature" << Endl;
1796  Log() << "(\"InitialTemperature\"), the number of iteration steps (\"MaxCalls\")," << Endl;
1797  Log() << "and the multiplier that scales the termperature descrease" << Endl;
1798  Log() << "(\"TemperatureScale\")" << brk << Endl;
1799  Log() << " -> increase \"MaxCalls\"" << brk << Endl;
1800  Log() << " -> adjust \"InitialTemperature\"" << brk << Endl;
1801  Log() << " -> adjust \"TemperatureScale\"" << brk << Endl;
1802  Log() << " -> adjust \"KernelTemperature\"" << Endl;
1803 }
virtual Int_t Write(const char *name=0, Int_t option=0, Int_t bufsize=0)
Write this object to the current directory.
Definition: TObject.cxx:823
Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
cut evaluation: returns 1.0 if event passed, 0.0 otherwise
Definition: MethodCuts.cxx:432
float xmin
Definition: THbookFile.cxx:93
Random number generator class based on M.
Definition: TRandom3.h:29
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
TH1 * GetHist(const TString &alias) const
Definition: Results.cxx:107
Double_t ComputeEstimator(std::vector< Double_t > &)
returns estimator for "cut fitness" used by GA there are two requirements: 1) the signal efficiency m...
Definition: MethodCuts.cxx:878
void TestClassification()
nothing to test
Definition: MethodCuts.cxx:814
Collectable string class.
Definition: TObjString.h:32
float Float_t
Definition: RtypesCore.h:53
void CheckForUnusedOptions() const
checks for unused options in option string
Config & gConfig()
void MatchParsToCuts(const std::vector< Double_t > &, Double_t *, Double_t *)
translates parameters into cuts
Definition: MethodCuts.cxx:959
EAnalysisType
Definition: Types.h:124
void Clone(Ssiz_t nc)
Make self a distinct copy with capacity of at least tot, where tot cannot be smaller than the current...
Definition: TString.cxx:1175
THist< 1, float > TH1F
Definition: THist.h:315
Basic string class.
Definition: TString.h:137
1-D histogram with a float per channel (see TH1 documentation)}
Definition: TH1.h:570
Short_t Min(Short_t a, Short_t b)
Definition: TMathBase.h:170
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
void AddWeightsXMLTo(void *parent) const
create XML description for LD classification and regression (for arbitrary number of output classes/t...
const Bool_t kFALSE
Definition: Rtypes.h:92
int nbins[3]
void AddAttr(void *node, const char *, const T &value, Int_t precision=16)
Definition: Tools.h:308
Bool_t AddComment(void *node, const char *comment)
Definition: Tools.cxx:1142
void * AddChild(void *parent, const char *childname, const char *content=0, bool isRootNode=false)
add child node
Definition: Tools.cxx:1134
Short_t Abs(Short_t d)
Definition: TMathBase.h:110
virtual TObject * At(Int_t idx) const
Returns the object at position idx. Returns 0 if idx is out of range.
Definition: TList.cxx:310
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
Definition: Event.cxx:231
const char * Data() const
Definition: TString.h:349
Tools & gTools()
Definition: Tools.cxx:79
Double_t EstimatorFunction(std::vector< Double_t > &)
returns estimator for "cut fitness" used by GA
Definition: MethodCuts.cxx:865
TStopwatch timer
Definition: pirndm.C:37
Double_t Run()
estimator function interface for fitting
Definition: FitterBase.cxx:73
void Init(void)
default initialisation called by all constructors
Definition: MethodCuts.cxx:222
void CreateVariablePDFs(void)
for PDF method: create efficiency reference histograms and PDFs
void * GetChild(void *parent, const char *childname=0)
get child node
Definition: Tools.cxx:1158
Double_t GetCuts(Double_t effS, std::vector< Double_t > &cutMin, std::vector< Double_t > &cutMax) const
retrieve cut values for given signal efficiency
Definition: MethodCuts.cxx:551
void GetEffsfromSelection(Double_t *cutMin, Double_t *cutMax, Double_t &effS, Double_t &effB)
compute signal and background efficiencies from event counting for given cut sample ...
std::vector< std::vector< double > > Data
void MatchCutsToPars(std::vector< Double_t > &, Double_t *, Double_t *)
translates cuts into parameters
Definition: MethodCuts.cxx:994
Definition: PDF.h:71
A doubly linked list.
Definition: TList.h:47
void Train(void)
training method: here the cuts are optimised for the training sample
Definition: MethodCuts.cxx:578
std::string GetMethodName(TCppMethod_t)
Definition: Cppyy.cxx:707
MethodCuts(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="MC:150:10000:", TDirectory *theTargetFile=0)
void PrintCuts(Double_t effS) const
print cuts
Definition: MethodCuts.cxx:465
void MakeClassSpecific(std::ostream &, const TString &) const
write specific classifier response
void ReadWeightsFromXML(void *wghtnode)
read coefficients from xml weight file
virtual void SetBinContent(Int_t bin, Double_t content)
Set bin content see convention for numbering bins in TH1::GetBin In case the bin number is greater th...
Definition: TH1.cxx:8543
unsigned int UInt_t
Definition: RtypesCore.h:42
char * Form(const char *fmt,...)
void GetEffsfromPDFs(Double_t *cutMin, Double_t *cutMax, Double_t &effS, Double_t &effB)
compute signal and background efficiencies from PDFs for given cut sample
void ReadAttr(void *node, const char *, T &value)
Definition: Tools.h:295
float xmax
Definition: THbookFile.cxx:93
void DeclareOptions()
define the options (their key words) that can be set in the option string know options: Method
Definition: MethodCuts.cxx:320
REAL epsilon
Definition: triangle.c:617
virtual Int_t FindBin(Double_t x)
Find bin number corresponding to abscissa x.
Definition: TAxis.cxx:264
virtual Int_t GetSize() const
Definition: TCollection.h:95
#define ClassImp(name)
Definition: Rtypes.h:279
double Double_t
Definition: RtypesCore.h:55
void WriteMonitoringHistosToFile(void) const
write histograms and PDFs to file for monitoring purposes
Describe directory structure in memory.
Definition: TDirectory.h:41
int type
Definition: TGX11.cxx:120
void ReadWeightsFromStream(std::istream &i)
read the cuts from stream
static RooMathCoreReg dummy
void * GetNextChild(void *prevchild, const char *childname=0)
XML helpers.
Definition: Tools.cxx:1170
The TH1 histogram class.
Definition: TH1.h:80
Double_t GetEfficiency(const TString &, Types::ETreeType, Double_t &)
virtual Double_t Uniform(Double_t x1=1)
Returns a uniform deviate on the interval (0, x1).
Definition: TRandom.cxx:606
void GetHelpMessage() const
get help message text
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:837
#define REGISTER_METHOD(CLASS)
for example
Abstract ClassifierFactory template that handles arbitrary types.
virtual void SetPoint(Int_t i, Double_t x, Double_t y)
Set x and y values for point number i.
Definition: TGraph.cxx:2127
virtual ~MethodCuts(void)
destructor
Definition: MethodCuts.cxx:272
Short_t Max(Short_t a, Short_t b)
Definition: TMathBase.h:202
A Graph is a graphics object made of two arrays X and Y with npoints each.
Definition: TGraph.h:53
void ProcessOptions()
process user options sanity check, do not allow the input variables to be normalised, because this only creates problems when interpreting the cuts
Definition: MethodCuts.cxx:363
TList * ParseFormatLine(TString theString, const char *sep=":")
Parse the string and cut into labels separated by ":".
Definition: Tools.cxx:413
#define NULL
Definition: Rtypes.h:82
void DrawProgressBar(Int_t, const TString &comment="")
draws progress bar in color or B&W caution:
Definition: Timer.cxx:183
Bool_t WriteOptionsReference() const
Definition: Config.h:66
void Store(TObject *obj, const char *alias=0)
Definition: Results.cxx:63
Double_t Sqrt(Double_t x)
Definition: TMath.h:464
Double_t GetTrainingEfficiency(const TString &)
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
Cuts can only handle classification with 2 classes.
Definition: MethodCuts.cxx:213
Definition: math.cpp:60
TAxis * GetXaxis()
Definition: TH1.h:319