DataSetFactory.cxx
// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Eckhard von Toerne, Helge Voss

/*****************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
 * Package: TMVA                                                             *
 * Class  : DataSetFactory                                                   *
 * Web    : http://tmva.sourceforge.net                                      *
 *                                                                           *
 * Description:                                                              *
 *      Implementation (see header for description)                         *
 *                                                                           *
 * Authors (alphabetical):                                                   *
 *      Andreas Hoecker    <Andreas.Hocker@cern.ch>   - CERN, Switzerland    *
 *      Peter Speckmayer   <Peter.Speckmayer@cern.ch> - CERN, Switzerland    *
 *      Joerg Stelzer      <Joerg.Stelzer@cern.ch>    - MSU, USA             *
 *      Eckhard von Toerne <evt@physik.uni-bonn.de>   - U. of Bonn, Germany  *
 *      Helge Voss         <Helge.Voss@cern.ch>  - MPI-K Heidelberg, Germany *
 *                                                                           *
 * Copyright (c) 2009:                                                       *
 *      CERN, Switzerland                                                    *
 *      MPI-K Heidelberg, Germany                                            *
 *      U. of Bonn, Germany                                                  *
 * Redistribution and use in source and binary forms, with or without       *
 * modification, are permitted according to the terms listed in LICENSE     *
 * (http://tmva.sourceforge.net/LICENSE)                                     *
 *****************************************************************************/

/*! \class TMVA::DataSetFactory
\ingroup TMVA

Class that contains all the data information

*/
#include <assert.h>

#include <map>
#include <vector>
#include <iomanip>
#include <iostream>

#include <algorithm>
#include <functional>
#include <numeric>

#include "TMVA/DataSetFactory.h"

#include "TEventList.h"
#include "TFile.h"
#include "TH1.h"
#include "TH2.h"
#include "TProfile.h"
#include "TRandom3.h"
#include "TMatrixF.h"
#include "TVectorF.h"
#include "TMath.h"
#include "TROOT.h"

#include "TMVA/MsgLogger.h"
#include "TMVA/Configurable.h"
#include "TMVA/VariableIdentityTransform.h"
#include "TMVA/VariableDecorrTransform.h"
#include "TMVA/VariablePCATransform.h"
#include "TMVA/DataSet.h"
#include "TMVA/DataSetInfo.h"
#include "TMVA/DataInputHandler.h"
#include "TMVA/Event.h"

#include "TMVA/Types.h"
#include "TMVA/VariableInfo.h"

using namespace std;

//TMVA::DataSetFactory* TMVA::DataSetFactory::fgInstance = 0;

namespace TMVA {
   // calculate the largest common divider (greatest common divisor)
   // this function is not happy if numbers are negative!
   Int_t LargestCommonDivider(Int_t a, Int_t b)
   {
      if (a<b) {Int_t tmp = a; a=b; b=tmp; } // achieve a>=b
      if (b==0) return a;
      Int_t fullFits = a/b;
      return LargestCommonDivider(b,a-b*fullFits);
   }
}
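// Editorial note (not part of the original source): LargestCommonDivider is
// Euclid's algorithm by repeated subtraction of full multiples. For example,
// LargestCommonDivider(36,24) recurses as (36,24) -> (24,12) -> (12,0) -> 12,
// i.e. it returns the greatest common divisor of its two non-negative inputs.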


////////////////////////////////////////////////////////////////////////////////
/// constructor

TMVA::DataSetFactory::DataSetFactory() :
   fVerbose(kFALSE),
   fVerboseLevel(TString("Info")),
   fScaleWithPreselEff(0),
   fCurrentTree(0),
   fCurrentEvtIdx(0),
   fInputFormulas(0),
   fLogger( new MsgLogger("DataSetFactory", kINFO) )
{
}

////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DataSetFactory::~DataSetFactory()
{
   std::vector<TTreeFormula*>::const_iterator formIt;

   for (formIt = fInputFormulas.begin()    ; formIt != fInputFormulas.end()    ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fTargetFormulas.begin()   ; formIt != fTargetFormulas.end()   ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fCutFormulas.begin()      ; formIt != fCutFormulas.end()      ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fWeightFormula.begin()    ; formIt != fWeightFormula.end()    ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fSpectatorFormulas.begin(); formIt != fSpectatorFormulas.end(); formIt++) if (*formIt) delete *formIt;

   delete fLogger;
}

////////////////////////////////////////////////////////////////////////////////
/// steering the creation of a new dataset

TMVA::DataSet* TMVA::DataSetFactory::CreateDataSet( TMVA::DataSetInfo& dsi,
                                                    TMVA::DataInputHandler& dataInput )
{
   // build the first dataset from the data input
   DataSet * ds = BuildInitialDataSet( dsi, dataInput );

   if (ds->GetNEvents() > 1) {
      CalcMinMax(ds,dsi);

      // from the final dataset build the correlation matrix
      for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
         const TString className = dsi.GetClassInfo(cl)->GetName();
         dsi.SetCorrelationMatrix( className, CalcCorrelationMatrix( ds, cl ) );
         dsi.PrintCorrelationMatrix( className );
      }
      //Log() << kHEADER << Endl;
      Log() << kHEADER << Form("[%s] : ",dsi.GetName()) << " " << Endl << Endl;
   }

   return ds;
}

////////////////////////////////////////////////////////////////////////////////

TMVA::DataSet* TMVA::DataSetFactory::BuildDynamicDataSet( TMVA::DataSetInfo& dsi )
{
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "Build DataSet consisting of one Event with dynamically changing variables" << Endl;
   DataSet* ds = new DataSet(dsi);

   // create a DataSet with one Event which uses dynamic variables
   // (pointers to variables)
   if(dsi.GetNClasses()==0){
      dsi.AddClass( "data" );
      dsi.GetClassInfo( "data" )->SetNumber(0);
   }

   std::vector<Float_t*>* evdyn = new std::vector<Float_t*>(0);

   std::vector<VariableInfo>& varinfos = dsi.GetVariableInfos();

   if (varinfos.empty())
      Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Dynamic data set cannot be built, since no variable information is present. Apparently no variables have been set. This should not happen, please contact the TMVA authors." << Endl;

   std::vector<VariableInfo>::iterator it = varinfos.begin(), itEnd=varinfos.end();
   for (;it!=itEnd;++it) {
      Float_t* external=(Float_t*)(*it).GetExternalLink();
      if (external==0)
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "The link to the external variable is NULL while I am trying to build a dynamic data set. In this case fTmpEvent from MethodBase HAS TO BE USED in the method to get useful values in variables." << Endl;
      else evdyn->push_back (external);
   }

   std::vector<VariableInfo>& spectatorinfos = dsi.GetSpectatorInfos();
   it = spectatorinfos.begin();
   for (;it!=spectatorinfos.end();it++) evdyn->push_back( (Float_t*)(*it).GetExternalLink() );

   TMVA::Event * ev = new Event((const std::vector<Float_t*>*&)evdyn, varinfos.size());
   std::vector<Event*>* newEventVector = new std::vector<Event*>;
   newEventVector->push_back(ev);

   ds->SetEventCollection(newEventVector, Types::kTraining);
   ds->SetCurrentType( Types::kTraining );
   ds->SetCurrentEvent( 0 );

   delete newEventVector;
   return ds;
}

////////////////////////////////////////////////////////////////////////////////
/// if no entries, then create a DataSet with one Event which uses
/// dynamic variables (pointers to variables)

TMVA::DataSet*
TMVA::DataSetFactory::BuildInitialDataSet( DataSetInfo& dsi,
                                           DataInputHandler& dataInput )
{
   if (dataInput.GetEntries()==0) return BuildDynamicDataSet( dsi );
   // -------------------------------------------------------------------------

   // register the classes in the datasetinfo-object
   // information comes from the trees in the dataInputHandler-object
   std::vector< TString >* classList = dataInput.GetClassList();
   for (std::vector<TString>::iterator it = classList->begin(); it < classList->end(); it++) {
      dsi.AddClass( (*it) );
   }
   delete classList;

   EvtStatsPerClass eventCounts(dsi.GetNClasses());
   TString normMode;
   TString splitMode;
   TString mixMode;
   UInt_t  splitSeed;

   InitOptions( dsi, eventCounts, normMode, splitSeed, splitMode, mixMode );
   // ======= build event-vector from input, apply preselection ===============
   EventVectorOfClassesOfTreeType tmpEventVector;
   BuildEventVector( dsi, dataInput, tmpEventVector, eventCounts );

   DataSet* ds = MixEvents( dsi, tmpEventVector, eventCounts,
                            splitMode, mixMode, normMode, splitSeed );

   const Bool_t showCollectedOutput = kFALSE;
   if (showCollectedOutput) {
      Int_t maxL = dsi.GetClassNameMaxLength();
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Collected:" << Endl;
      for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " training entries: " << ds->GetNClassEvents( 0, cl ) << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " testing entries: " << ds->GetNClassEvents( 1, cl ) << Endl;
      }
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " " << Endl;
   }

   return ds;
}

////////////////////////////////////////////////////////////////////////////////
/// checks a TTreeFormula for problems

Bool_t TMVA::DataSetFactory::CheckTTreeFormula( TTreeFormula* ttf,
                                                const TString& expression,
                                                Bool_t& hasDollar )
{
   Bool_t worked = kTRUE;

   if( ttf->GetNdim() <= 0 )
      Log() << kFATAL << "Expression " << expression.Data()
            << " could not be resolved to a valid formula. " << Endl;
   if( ttf->GetNdata() == 0 ){
      Log() << kWARNING << "Expression: " << expression.Data()
            << " does not provide data for this event. "
            << "This event is not taken into account. --> please check if you use as a variable "
            << "an entry of an array which is not filled for some events "
            << "(e.g. arr[4] when arr has only 3 elements)." << Endl;
      Log() << kWARNING << "If you want to take the event into account you can do something like: "
            << "\"Alt$(arr[4],0)\" where in cases where arr doesn't have a 4th element, "
            << " 0 is taken as an alternative." << Endl;
      worked = kFALSE;
   }
   if( expression.Contains("$") )
      hasDollar = kTRUE;
   else
   {
      for (int i = 0, iEnd = ttf->GetNcodes (); i < iEnd; ++i)
      {
         TLeaf* leaf = ttf->GetLeaf (i);
         if (!leaf->IsOnTerminalBranch())
            hasDollar = kTRUE;
      }
   }
   return worked;
}


////////////////////////////////////////////////////////////////////////////////
/// While the data gets copied into the local training and testing
/// trees, the input tree can change (for instance when changing from
/// signal to background tree, or using TChains as input). The
/// TTreeFormulas that hold the input expressions need to be
/// re-associated with the new tree, which is done here

void TMVA::DataSetFactory::ChangeToNewTree( TreeInfo& tinfo, const DataSetInfo & dsi )
{
   TTree *tr = tinfo.GetTree()->GetTree();

   tr->SetBranchStatus("*",1);

   Bool_t hasDollar = kFALSE;

   // 1) the input variable formulas
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform input variables" << Endl;
   std::vector<TTreeFormula*>::const_iterator formIt, formItEnd;
   for (formIt = fInputFormulas.begin(), formItEnd=fInputFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fInputFormulas.clear();
   TTreeFormula* ttf = 0;

   for (UInt_t i=0; i<dsi.GetNVariables(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetVariableInfo(i).GetInternalName().Data() ),
                              dsi.GetVariableInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetVariableInfo(i).GetExpression(), hasDollar );
      fInputFormulas.push_back( ttf );
   }

   //
   // targets
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform regression targets" << Endl;
   for (formIt = fTargetFormulas.begin(), formItEnd = fTargetFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fTargetFormulas.clear();
   for (UInt_t i=0; i<dsi.GetNTargets(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetTargetInfo(i).GetInternalName().Data() ),
                              dsi.GetTargetInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetTargetInfo(i).GetExpression(), hasDollar );
      fTargetFormulas.push_back( ttf );
   }

   //
   // spectators
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform spectator variables" << Endl;
   for (formIt = fSpectatorFormulas.begin(), formItEnd = fSpectatorFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fSpectatorFormulas.clear();
   for (UInt_t i=0; i<dsi.GetNSpectators(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetSpectatorInfo(i).GetInternalName().Data() ),
                              dsi.GetSpectatorInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetSpectatorInfo(i).GetExpression(), hasDollar );
      fSpectatorFormulas.push_back( ttf );
   }

   //
   // the cuts (one per class, if non-existent: formula pointer = 0)
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform cuts" << Endl;
   for (formIt = fCutFormulas.begin(), formItEnd = fCutFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fCutFormulas.clear();
   for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
      const TCut& tmpCut = dsi.GetClassInfo(clIdx)->GetCut();
      const TString tmpCutExp(tmpCut.GetTitle());
      ttf = 0;
      if (tmpCutExp!="") {
         ttf = new TTreeFormula( Form("CutClass%i",clIdx), tmpCutExp, tr );
         Bool_t worked = CheckTTreeFormula( ttf, tmpCutExp, hasDollar );
         if( !worked ){
            Log() << kWARNING << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
                  << "\" cut \"" << dsi.GetClassInfo(clIdx)->GetCut() << "\"" << Endl;
         }
      }
      fCutFormulas.push_back( ttf );
   }

   //
   // the weights (one per class, if non-existent: formula pointer = 0)
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform weights" << Endl;
   for (formIt = fWeightFormula.begin(), formItEnd = fWeightFormula.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fWeightFormula.clear();
   for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
      const TString tmpWeight = dsi.GetClassInfo(clIdx)->GetWeight();

      if (dsi.GetClassInfo(clIdx)->GetName() != tinfo.GetClassName() ) { // if the tree is of another class
         fWeightFormula.push_back( 0 );
         continue;
      }

      ttf = 0;
      if (tmpWeight!="") {
         ttf = new TTreeFormula( "FormulaWeight", tmpWeight, tr );
         Bool_t worked = CheckTTreeFormula( ttf, tmpWeight, hasDollar );
         if( !worked ){
            Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName()) << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
                  << "\" weight \"" << dsi.GetClassInfo(clIdx)->GetWeight() << "\"" << Endl;
         }
      }
      else {
         ttf = 0;
      }
      fWeightFormula.push_back( ttf );
   }
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches" << Endl;
   // now enable only branches that are needed in any input formula, target, cut, weight

   if (!hasDollar) {
      tr->SetBranchStatus("*",0);
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: input variables" << Endl;
      // input vars
      for (formIt = fInputFormulas.begin(); formIt!=fInputFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++) {
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
         }
      }
      // targets
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: targets" << Endl;
      for (formIt = fTargetFormulas.begin(); formIt!=fTargetFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // spectators
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: spectators" << Endl;
      for (formIt = fSpectatorFormulas.begin(); formIt!=fSpectatorFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // cuts
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: cuts" << Endl;
      for (formIt = fCutFormulas.begin(); formIt!=fCutFormulas.end(); formIt++) {
         ttf = *formIt;
         if (!ttf) continue;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // weights
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: weights" << Endl;
      for (formIt = fWeightFormula.begin(); formIt!=fWeightFormula.end(); formIt++) {
         ttf = *formIt;
         if (!ttf) continue;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
   }
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "tree initialized" << Endl;
   return;
}

////////////////////////////////////////////////////////////////////////////////
/// compute min and max values of all variables, targets and spectators

void TMVA::DataSetFactory::CalcMinMax( DataSet* ds, TMVA::DataSetInfo& dsi )
{
   const UInt_t nvar  = ds->GetNVariables();
   const UInt_t ntgts = ds->GetNTargets();
   const UInt_t nvis  = ds->GetNSpectators();

   Float_t *min   = new Float_t[nvar];
   Float_t *max   = new Float_t[nvar];
   Float_t *tgmin = new Float_t[ntgts];
   Float_t *tgmax = new Float_t[ntgts];
   Float_t *vmin  = new Float_t[nvis];
   Float_t *vmax  = new Float_t[nvis];

   for (UInt_t ivar=0; ivar<nvar ; ivar++) { min[ivar]   = FLT_MAX; max[ivar]   = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) { tgmin[ivar] = FLT_MAX; tgmax[ivar] = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<nvis;  ivar++) { vmin[ivar]  = FLT_MAX; vmax[ivar]  = -FLT_MAX; }

   // perform event loop

   for (Int_t i=0; i<ds->GetNEvents(); i++) {
      const Event * ev = ds->GetEvent(i);
      for (UInt_t ivar=0; ivar<nvar; ivar++) {
         Double_t v = ev->GetValue(ivar);
         if (v<min[ivar]) min[ivar] = v;
         if (v>max[ivar]) max[ivar] = v;
      }
      for (UInt_t itgt=0; itgt<ntgts; itgt++) {
         Double_t v = ev->GetTarget(itgt);
         if (v<tgmin[itgt]) tgmin[itgt] = v;
         if (v>tgmax[itgt]) tgmax[itgt] = v;
      }
      for (UInt_t ivis=0; ivis<nvis; ivis++) {
         Double_t v = ev->GetSpectator(ivis);
         if (v<vmin[ivis]) vmin[ivis] = v;
         if (v>vmax[ivis]) vmax[ivis] = v;
      }
   }

   for (UInt_t ivar=0; ivar<nvar; ivar++) {
      dsi.GetVariableInfo(ivar).SetMin(min[ivar]);
      dsi.GetVariableInfo(ivar).SetMax(max[ivar]);
      if( TMath::Abs(max[ivar]-min[ivar]) <= FLT_MIN )
         Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Variable " << dsi.GetVariableInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
   }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) {
      dsi.GetTargetInfo(ivar).SetMin(tgmin[ivar]);
      dsi.GetTargetInfo(ivar).SetMax(tgmax[ivar]);
      if( TMath::Abs(tgmax[ivar]-tgmin[ivar]) <= FLT_MIN )
         Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Target " << dsi.GetTargetInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
   }
   for (UInt_t ivar=0; ivar<nvis; ivar++) {
      dsi.GetSpectatorInfo(ivar).SetMin(vmin[ivar]);
      dsi.GetSpectatorInfo(ivar).SetMax(vmax[ivar]);
      // if( TMath::Abs(vmax[ivar]-vmin[ivar]) <= FLT_MIN )
      //    Log() << kWARNING << "Spectator variable " << dsi.GetSpectatorInfo(ivar).GetExpression().Data() << " is constant." << Endl;
   }
   delete [] min;
   delete [] max;
   delete [] tgmin;
   delete [] tgmax;
   delete [] vmin;
   delete [] vmax;
}

////////////////////////////////////////////////////////////////////////////////
/// computes the correlation matrix of the input variables in the data set,
/// using only events of the given class

TMatrixD* TMVA::DataSetFactory::CalcCorrelationMatrix( DataSet* ds, const UInt_t classNumber )
{
   // first compute variance-covariance
   TMatrixD* mat = CalcCovarianceMatrix( ds, classNumber );

   // now the correlation
   UInt_t nvar = ds->GetNVariables(), ivar, jvar;

   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         if (ivar != jvar) {
            Double_t d = (*mat)(ivar, ivar)*(*mat)(jvar, jvar);
            if (d > 0) (*mat)(ivar, jvar) /= sqrt(d);
            else {
               Log() << kWARNING << Form("Dataset[%s] : ",DataSetInfo().GetName())<< "<GetCorrelationMatrix> Zero variances for variables "
                     << "(" << ivar << ", " << jvar << ") = " << d
                     << Endl;
               (*mat)(ivar, jvar) = 0;
            }
         }
      }
   }

   for (ivar=0; ivar<nvar; ivar++) (*mat)(ivar, ivar) = 1.0;

   return mat;
}
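// Editorial note (not part of the original source): the loop above converts the
// covariance matrix C into a correlation matrix via
//    rho(i,j) = C(i,j) / sqrt( C(i,i) * C(j,j) ).
// For example, with C = [[4,3],[3,9]] one gets rho(0,1) = 3/sqrt(4*9) = 0.5,
// and the diagonal is afterwards set to exactly 1.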

////////////////////////////////////////////////////////////////////////////////
/// compute covariance matrix

TMatrixD* TMVA::DataSetFactory::CalcCovarianceMatrix( DataSet* ds, const UInt_t classNumber )
{
   UInt_t nvar = ds->GetNVariables();
   UInt_t ivar = 0, jvar = 0;

   TMatrixD* mat = new TMatrixD( nvar, nvar );

   // init matrices
   TVectorD vec(nvar);
   TMatrixD mat2(nvar, nvar);
   for (ivar=0; ivar<nvar; ivar++) {
      vec(ivar) = 0;
      for (jvar=0; jvar<nvar; jvar++) mat2(ivar, jvar) = 0;
   }

   // perform event loop
   Double_t ic = 0;
   for (Int_t i=0; i<ds->GetNEvents(); i++) {

      const Event * ev = ds->GetEvent(i);
      if (ev->GetClass() != classNumber ) continue;

      Double_t weight = ev->GetWeight();
      ic += weight; // count used events

      for (ivar=0; ivar<nvar; ivar++) {

         Double_t xi = ev->GetValue(ivar);
         vec(ivar) += xi*weight;
         mat2(ivar, ivar) += (xi*xi*weight);

         for (jvar=ivar+1; jvar<nvar; jvar++) {
            Double_t xj = ev->GetValue(jvar);
            mat2(ivar, jvar) += (xi*xj*weight);
         }
      }
   }

   for (ivar=0; ivar<nvar; ivar++)
      for (jvar=ivar+1; jvar<nvar; jvar++)
         mat2(jvar, ivar) = mat2(ivar, jvar); // symmetric matrix


   // variance-covariance
   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         (*mat)(ivar, jvar) = mat2(ivar, jvar)/ic - vec(ivar)*vec(jvar)/(ic*ic);
      }
   }

   return mat;
}
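// Editorial note (not part of the original source): with W = ic = sum of event
// weights w_k, the final loop above evaluates the standard weighted covariance
//    C(i,j) = (1/W) * sum_k w_k x_ki x_kj
//           - [ (1/W) * sum_k w_k x_ki ] * [ (1/W) * sum_k w_k x_kj ],
// i.e. C(i,j) = <x_i x_j> - <x_i><x_j> with weighted means.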

// --------------------------------------- new versions

////////////////////////////////////////////////////////////////////////////////
/// the dataset splitting

void
TMVA::DataSetFactory::InitOptions( TMVA::DataSetInfo& dsi,
                                   EvtStatsPerClass& nEventRequests,
                                   TString& normMode,
                                   UInt_t& splitSeed,
                                   TString& splitMode,
                                   TString& mixMode)
{
   Configurable splitSpecs( dsi.GetSplitOptions() );
   splitSpecs.SetConfigName("DataSetFactory");
   splitSpecs.SetConfigDescription( "Configuration options given in the \"PrepareForTrainingAndTesting\" call; these options define the creation of the data sets used for training and expert validation by TMVA" );

   splitMode = "Random";    // the splitting mode
   splitSpecs.DeclareOptionRef( splitMode, "SplitMode",
                                "Method of picking training and testing events (default: random)" );
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));

   mixMode = "SameAsSplitMode";    // the mixing mode
   splitSpecs.DeclareOptionRef( mixMode, "MixMode",
                                "Method of mixing events of different classes into one dataset (default: SameAsSplitMode)" );
   splitSpecs.AddPreDefVal(TString("SameAsSplitMode"));
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));

   splitSeed = 100;
   splitSpecs.DeclareOptionRef( splitSeed, "SplitSeed",
                                "Seed for random event shuffling" );

   normMode = "EqualNumEvents";  // the weight normalisation modes
   splitSpecs.DeclareOptionRef( normMode, "NormMode",
                                "Overall renormalisation of event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)" );
   splitSpecs.AddPreDefVal(TString("None"));
   splitSpecs.AddPreDefVal(TString("NumEvents"));
   splitSpecs.AddPreDefVal(TString("EqualNumEvents"));

   splitSpecs.DeclareOptionRef(fScaleWithPreselEff=kFALSE,"ScaleWithPreselEff","Scale the number of requested events by the eff. of the preselection cuts (or not)" );

   // the number of events

   // fill in the numbers
   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      TString clName = dsi.GetClassInfo(cl)->GetName();
      TString titleTrain =  TString().Format("Number of training events of class %s (default: 0 = all)",clName.Data()).Data();
      TString titleTest  =  TString().Format("Number of test events of class %s (default: 0 = all)",clName.Data()).Data();
      TString titleSplit =  TString().Format("Split in training and test events of class %s (default: 0 = deactivated)",clName.Data()).Data();

      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTrainingEventsRequested, TString("nTrain_")+clName, titleTrain );
      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTestingEventsRequested , TString("nTest_")+clName , titleTest  );
      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).TrainTestSplitRequested , TString("TrainTestSplit_")+clName , titleSplit );
   }

   splitSpecs.DeclareOptionRef( fVerbose, "V", "Verbosity (default: false)" );

   splitSpecs.DeclareOptionRef( fVerboseLevel=TString("Info"), "VerboseLevel", "VerboseLevel (Debug/Verbose/Info)" );
   splitSpecs.AddPreDefVal(TString("Debug"));
   splitSpecs.AddPreDefVal(TString("Verbose"));
   splitSpecs.AddPreDefVal(TString("Info"));

   splitSpecs.ParseOptions();
   splitSpecs.CheckForUnusedOptions();

   // output logging verbosity
   if (Verbose()) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Debug")   == 0) fLogger->SetMinType( kDEBUG );
   if (fVerboseLevel.CompareTo("Verbose") == 0) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Info")    == 0) fLogger->SetMinType( kINFO );

   // put all to upper case
   splitMode.ToUpper(); mixMode.ToUpper(); normMode.ToUpper();
   // adjust mixmode if the "same as splitmode" option has been set
   Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
         << "\tSplitmode is: \"" << splitMode << "\" the mixmode is: \"" << mixMode << "\"" << Endl;
   if (mixMode=="SAMEASSPLITMODE") mixMode = splitMode;
   else if (mixMode!=splitMode)
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "DataSet splitmode="<<splitMode
            <<" differs from mixmode="<<mixMode<<Endl;
}
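// Editorial usage example (not part of the original source): the option string
// parsed above is the one passed when preparing the data set, typically via
// DataLoader::PrepareTrainingAndTestTree. A sketch, where "dataloader" and
// "mycut" are assumed to be set up by the caller:
//
//    dataloader->PrepareTrainingAndTestTree( mycut,
//       "nTrain_Signal=1000:nTrain_Background=1000:SplitMode=Random:"
//       "NormMode=NumEvents:SplitSeed=100:!V" );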

////////////////////////////////////////////////////////////////////////////////
/// build empty event vectors
/// distributes events between kTraining/kTesting/kMaxTreeType

void
TMVA::DataSetFactory::BuildEventVector( TMVA::DataSetInfo& dsi,
                                        TMVA::DataInputHandler& dataInput,
                                        EventVectorOfClassesOfTreeType& eventsmap,
                                        EvtStatsPerClass& eventCounts)
{
   const UInt_t nclasses = dsi.GetNClasses();

   eventsmap[ Types::kTraining ]    = EventVectorOfClasses(nclasses);
   eventsmap[ Types::kTesting ]     = EventVectorOfClasses(nclasses);
   eventsmap[ Types::kMaxTreeType ] = EventVectorOfClasses(nclasses);

   // create the type, weight and boostweight branches
   const UInt_t nvars = dsi.GetNVariables();
   const UInt_t ntgts = dsi.GetNTargets();
   const UInt_t nvis  = dsi.GetNSpectators();

   for (size_t i=0; i<nclasses; i++) {
      eventCounts[i].varAvLength = new Float_t[nvars];
      for (UInt_t ivar=0; ivar<nvars; ivar++)
         eventCounts[i].varAvLength[ivar] = 0;
   }

   // Bool_t haveArrayVariable = kFALSE;
   Bool_t *varIsArray = new Bool_t[nvars];

   // If there are NaNs in the tree:
   // => warn if used variables/cuts/weights contain nan (no problem if event is cut out)
   // => fatal if cut value is nan or (event not cut out and nans somewhere)
   // Count & collect all these warnings/errors and output them at the end.
   std::map<TString, int> nanInfWarnings;
   std::map<TString, int> nanInfErrors;

   // if we work with chains we need to remember the current tree:
   // if the chain jumps to a new tree we have to reset the formulas
   for (UInt_t cl=0; cl<nclasses; cl++) {

      //Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create training and testing trees -- looping over class \"" << dsi.GetClassInfo(cl)->GetName() << "\" ..." << Endl;

      EventStats& classEventCounts = eventCounts[cl];

      // info output for weights
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tWeight expression for class \'" << dsi.GetClassInfo(cl)->GetName() << "\': \""
            << dsi.GetClassInfo(cl)->GetWeight() << "\"" << Endl;

      // used for chains only
      TString currentFileName("");

      std::vector<TreeInfo>::const_iterator treeIt(dataInput.begin(dsi.GetClassInfo(cl)->GetName()));
      for (;treeIt!=dataInput.end(dsi.GetClassInfo(cl)->GetName()); treeIt++) {

         // read first the variables
         std::vector<Float_t> vars(nvars);
         std::vector<Float_t> tgts(ntgts);
         std::vector<Float_t> vis(nvis);
         TreeInfo currentInfo = *treeIt;

         Log() << kDEBUG << "Building event vectors " << currentInfo.GetTreeType() << Endl;

         EventVector& event_v = eventsmap[currentInfo.GetTreeType()].at(cl);

         Bool_t isChain = (TString("TChain") == currentInfo.GetTree()->ClassName());
         currentInfo.GetTree()->LoadTree(0);
         ChangeToNewTree( currentInfo, dsi );

         // count number of events in tree before cut
         classEventCounts.nInitialEvents += currentInfo.GetTree()->GetEntries();

         // loop over events in ntuple
         const UInt_t nEvts = currentInfo.GetTree()->GetEntries();
         for (Long64_t evtIdx = 0; evtIdx < nEvts; evtIdx++) {
            currentInfo.GetTree()->LoadTree(evtIdx);

            // may need to reload tree in case of chains
            if (isChain) {
               if (currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName() != currentFileName) {
                  currentFileName = currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName();
                  ChangeToNewTree( currentInfo, dsi );
               }
            }
            currentInfo.GetTree()->GetEntry(evtIdx);
            Int_t sizeOfArrays = 1;
            Int_t prevArrExpr = 0;

            // ======= evaluate all formulas =================

            // first we check if some of the formulas are arrays
            for (UInt_t ivar=0; ivar<nvars; ivar++) {
               Int_t ndata = fInputFormulas[ivar]->GetNdata();
               classEventCounts.varAvLength[ivar] += ndata;
               if (ndata == 1) continue;
               // haveArrayVariable = kTRUE;
               varIsArray[ivar] = kTRUE;
               if (sizeOfArrays == 1) {
                  sizeOfArrays = ndata;
                  prevArrExpr = ivar;
               }
               else if (sizeOfArrays!=ndata) {
                  Log() << kERROR << Form("Dataset[%s] : ",dsi.GetName())<< "ERROR while preparing training and testing trees:" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   multiple array-type expressions of different length were encountered" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   location of error: event " << evtIdx
                        << " in tree " << currentInfo.GetTree()->GetName()
                        << " of file " << currentInfo.GetTree()->GetCurrentFile()->GetName() << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   expression " << fInputFormulas[ivar]->GetTitle() << " has "
                        << Form("Dataset[%s] : ",dsi.GetName()) << ndata << " entries, while" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   expression " << fInputFormulas[prevArrExpr]->GetTitle() << " has "
                        << Form("Dataset[%s] : ",dsi.GetName())<< fInputFormulas[prevArrExpr]->GetNdata() << " entries" << Endl;
                  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "Need to abort" << Endl;
               }
            }

            // now we read the information
            for (Int_t idata = 0; idata<sizeOfArrays; idata++) {
               Bool_t contains_NaN_or_inf = kFALSE;

               auto checkNanInf = [&](std::map<TString, int> &msgMap, Float_t value, const char *what, const char *formulaTitle) {
                  if (TMath::IsNaN(value)) {
                     contains_NaN_or_inf = kTRUE;
                     ++msgMap[TString::Format("Dataset[%s] : %s expression resolves to indeterminate value (NaN): %s", dsi.GetName(), what, formulaTitle)];
                  } else if (!TMath::Finite(value)) {
                     contains_NaN_or_inf = kTRUE;
                     ++msgMap[TString::Format("Dataset[%s] : %s expression resolves to infinite value (+inf or -inf): %s", dsi.GetName(), what, formulaTitle)];
                  }
               };

               TTreeFormula* formula = 0;

               // the cut expression
               Double_t cutVal = 1.;
               formula = fCutFormulas[cl];
               if (formula) {
                  Int_t ndata = formula->GetNdata();
                  cutVal = (ndata==1 ?
                            formula->EvalInstance(0) :
                            formula->EvalInstance(idata));
                  checkNanInf(nanInfErrors, cutVal, "Cut", formula->GetTitle());
               }

               // if event is cut out, add to warnings, else add to errors.
               auto &nanMessages = cutVal < 0.5 ? nanInfWarnings : nanInfErrors;

               // the input variables
               for (UInt_t ivar=0; ivar<nvars; ivar++) {
                  formula = fInputFormulas[ivar];
                  Int_t ndata = formula->GetNdata();
                  vars[ivar] = (ndata == 1 ?
                                formula->EvalInstance(0) :
                                formula->EvalInstance(idata));
                  checkNanInf(nanMessages, vars[ivar], "Input", formula->GetTitle());
               }

               // the targets
               for (UInt_t itrgt=0; itrgt<ntgts; itrgt++) {
                  formula = fTargetFormulas[itrgt];
                  Int_t ndata = formula->GetNdata();
                  tgts[itrgt] = (ndata == 1 ?
                                 formula->EvalInstance(0) :
                                 formula->EvalInstance(idata));
                  checkNanInf(nanMessages, tgts[itrgt], "Target", formula->GetTitle());
               }

               // the spectators
               for (UInt_t itVis=0; itVis<nvis; itVis++) {
                  formula = fSpectatorFormulas[itVis];
                  Int_t ndata = formula->GetNdata();
                  vis[itVis] = (ndata == 1 ?
                                formula->EvalInstance(0) :
                                formula->EvalInstance(idata));
                  checkNanInf(nanMessages, vis[itVis], "Spectator", formula->GetTitle());
               }


               // the weight
               Float_t weight = currentInfo.GetWeight(); // multiply by tree weight
               formula = fWeightFormula[cl];
               if (formula!=0) {
                  Int_t ndata = formula->GetNdata();
                  weight *= (ndata == 1 ?
                             formula->EvalInstance() :
                             formula->EvalInstance(idata));
                  checkNanInf(nanMessages, weight, "Weight", formula->GetTitle());
               }

               // Count the events before rejection due to cut or NaN
               // value (weighted and unweighted)
               classEventCounts.nEvBeforeCut++;
               if (!TMath::IsNaN(weight))
                  classEventCounts.nWeEvBeforeCut += weight;

               // apply the cut, skip rest if cut is not fulfilled
               if (cutVal<0.5) continue;

               // global flag if negative weights exist -> can be used
               // by classifiers who may require special data
               // treatment (also print warning)
               if (weight < 0) classEventCounts.nNegWeights++;

               // now read the event-values (variables and regression targets)

               if (contains_NaN_or_inf) {
                  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "NaN or +-inf in Event " << evtIdx << Endl;
                  if (sizeOfArrays>1) Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< " rejected" << Endl;
                  continue;
               }

               // Count the events after rejection due to cut or NaN value
               // (weighted and unweighted)
               classEventCounts.nEvAfterCut++;
               classEventCounts.nWeEvAfterCut += weight;

               // event accepted, fill temporary ntuple
               event_v.push_back(new Event(vars, tgts , vis, cl , weight));
            }
         }
         currentInfo.GetTree()->ResetBranchAddresses();
      }
   }

   if (!nanInfWarnings.empty()) {
      Log() << kWARNING << "Found events with NaN and/or +-inf values" << Endl;
      for (const auto &warning : nanInfWarnings) {
         auto &log = Log() << kWARNING << warning.first;
         if (warning.second > 1) log << " (" << warning.second << " times)";
         log << Endl;
      }
      Log() << kWARNING << "These NaN and/or +-infs were all removed by the specified cut, continuing." << Endl;
      Log() << Endl;
   }

   if (!nanInfErrors.empty()) {
      Log() << kWARNING << "Found events with NaN and/or +-inf values (not removed by cut)" << Endl;
      for (const auto &error : nanInfErrors) {
         auto &log = Log() << kWARNING << error.first;
         if (error.second > 1) log << " (" << error.second << " times)";
         log << Endl;
      }
      Log() << kFATAL << "How am I supposed to train a NaN or +-inf?!" << Endl;
   }

   // for output format, get the maximum class name length
   Int_t maxL = dsi.GetClassNameMaxLength();

   Log() << kHEADER << Form("[%s] : ",dsi.GetName()) << "Number of events in input trees" << Endl;
   Log() << kDEBUG << "(after possible flattening of arrays):" << Endl;


   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      Log() << kDEBUG //<< Form("[%s] : ",dsi.GetName())
            << "    "
            << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
            << " -- number of events       : "
            << std::setw(5) << eventCounts[cl].nEvBeforeCut
            << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvBeforeCut << Endl;
   }

   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "    " << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
            <<" tree -- total number of entries: "
            << std::setw(5) << dataInput.GetEntries(dsi.GetClassInfo(cl)->GetName()) << Endl;
   }

   if (fScaleWithPreselEff)
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tPreselection: (will affect number of requested training and testing events)" << Endl;
   else
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tPreselection: (will NOT affect number of requested training and testing events)" << Endl;

   if (dsi.HasCuts()) {
      for (UInt_t cl = 0; cl< dsi.GetNClasses(); cl++) {
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    " << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " requirement: \"" << dsi.GetClassInfo(cl)->GetCut() << "\"" << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " -- number of events passed: "
               << std::setw(5) << eventCounts[cl].nEvAfterCut
               << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvAfterCut << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " -- efficiency             : "
               << std::setw(6) << eventCounts[cl].nWeEvAfterCut/eventCounts[cl].nWeEvBeforeCut << Endl;
      }
   }
   else Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
              << "    No preselection cuts applied on event classes" << Endl;

   delete[] varIsArray;

}

////////////////////////////////////////////////////////////////////////////////
/// Select and distribute unassigned events to kTraining and kTesting

TMVA::DataSet*
TMVA::DataSetFactory::MixEvents( DataSetInfo& dsi,
                                 EventVectorOfClassesOfTreeType& tmpEventVector,
                                 EvtStatsPerClass& eventCounts,
                                 const TString& splitMode,
                                 const TString& mixMode,
                                 const TString& normMode,
                                 UInt_t splitSeed)
{
   TMVA::RandomGenerator rndm( splitSeed );

   // ==== splitting of undefined events to kTraining and kTesting

   // if splitMode contains "RANDOM", then shuffle the undefined events
   if (splitMode.Contains( "RANDOM" ) /*&& !emptyUndefined*/ ) {
      // random shuffle the undefined events of each class
      for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
         EventVector& unspecifiedEvents = tmpEventVector[Types::kMaxTreeType].at(cls);
         if( ! unspecifiedEvents.empty() ) {
            Log() << kDEBUG << "randomly shuffling "
                  << unspecifiedEvents.size()
                  << " events of class " << cls
                  << " which are not yet associated to testing or training" << Endl;
            std::random_shuffle( unspecifiedEvents.begin(),
                                 unspecifiedEvents.end(),
                                 rndm );
         }
      }
   }

   // check for each class the number of training and testing events, the requested number and the available number
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "SPLITTING ========" << Endl;
   for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "---- class " << cls << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "check number of training/testing events, requested and available number of events for class " << cls << Endl;

      // check if enough or too many events are already in the training/testing eventvectors of the class cls
      EventVector& eventVectorTraining  = tmpEventVector[ Types::kTraining ].at(cls);
      EventVector& eventVectorTesting   = tmpEventVector[ Types::kTesting ].at(cls);
      EventVector& eventVectorUndefined = tmpEventVector[ Types::kMaxTreeType ].at(cls);

      Int_t availableTraining  = eventVectorTraining.size();
      Int_t availableTesting   = eventVectorTesting.size();
      Int_t availableUndefined = eventVectorUndefined.size();

      Float_t presel_scale;
      if (fScaleWithPreselEff) {
         presel_scale = eventCounts[cls].cutScaling();
         if (presel_scale < 1)
            Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " you have opted for scaling the number of requested training/testing events\n to be scaled by the preselection efficiency"<< Endl;
      }else{
         presel_scale = 1.; // this scaling was too confusing to most people, including me! Sorry... (Helge)
         if (eventCounts[cls].cutScaling() < 1)
            Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " you have opted for interpreting the requested number of training/testing events\n to be the number of events AFTER your preselection cuts" << Endl;

      }

      // If TrainTestSplit_<class> is set, set number of requested training events to split*num_all_events
      // Requested number of testing events is set to zero and therefore takes all other events
      // The option TrainTestSplit_<class> overrides nTrain_<class> or nTest_<class>
      if(eventCounts[cls].TrainTestSplitRequested < 1.0 && eventCounts[cls].TrainTestSplitRequested > 0.0){
         eventCounts[cls].nTrainingEventsRequested = Int_t(eventCounts[cls].TrainTestSplitRequested*(availableTraining+availableTesting+availableUndefined));
         eventCounts[cls].nTestingEventsRequested = Int_t(0);
      }
      else if(eventCounts[cls].TrainTestSplitRequested != 0.0) Log() << kFATAL << Form("The option TrainTestSplit_<class> has to be in range (0, 1] but is set to %f.",eventCounts[cls].TrainTestSplitRequested) << Endl;
      Int_t requestedTraining = Int_t(eventCounts[cls].nTrainingEventsRequested * presel_scale);
      Int_t requestedTesting  = Int_t(eventCounts[cls].nTestingEventsRequested  * presel_scale);

      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in training trees    : " << availableTraining  << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in testing trees     : " << availableTesting   << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in unspecified trees : " << availableUndefined << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "requested for training      : " << requestedTraining << Endl;

      if(presel_scale<1)
         Log() << "    ( " << eventCounts[cls].nTrainingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      else
         Log() << Endl;
      Log() << kDEBUG << "requested for testing       : " << requestedTesting;
      if(presel_scale<1)
         Log() << "    ( " << eventCounts[cls].nTestingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      else
         Log() << Endl;

      // nomenclature r  = available training
      //              s  = available testing
      //              u  = available undefined
      //              R  = requested training
      //              S  = requested testing
      //              nR = to be used to select training events
      //              nS = to be used to select test events
      // we have the constraint: nR + nS < r+s+u,
      // since we cannot use more events than we have
      // free events: Nfree = u-Thet(R-r)-Thet(S-s)
      // nomenclature: Thet(x) = x, if x>0 else 0
      // nR = max(R,r) + 0.5 * Nfree
      // nS = max(S,s) + 0.5 * Nfree
      // nR + nS = R+S + u-R+r-S+s = u+r+s = ok! for R>r
      // nR + nS = r+S + u-S+s     = u+r+s = ok! for r>R

      // three different cases might occur here
      //
      // Case a
      // requestedTraining and requestedTesting >0
      // free events: Nfree = u-Thet(R-r)-Thet(S-s)
      // nR = Max(R,r) + 0.5 * Nfree
      // nS = Max(S,s) + 0.5 * Nfree
      //
      // Case b
      // exactly one of requestedTraining or requestedTesting >0
      // assume training R >0
      // nR = max(R,r)
      // nS = s+u+r-nR
      // and s=nS
      //
      // Case c
      // requestedTraining=0, requestedTesting=0
      // Nfree = u-|r-s|
      // if NFree >=0
      // R = Max(r,s) + 0.5 * Nfree = S
      // else if r>s
      // R = r; S=s+u
      // else
      // R = r+u; S=s
      //
      // Next steps:
      // Determination of Event numbers R,S, nR, nS
      // distribute undefined events according to nR, nS
      // finally determine actual sub samples from nR and nS to be used in training / testing
      //

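      // Editorial worked example (not part of the original source), Case c:
      // with r=100 training, s=50 testing, u=200 undefined events and nothing
      // requested (R=S=0), Nfree = u - |r-s| = 150 >= 0, so both samples are
      // balanced to (r+s+u)/2 = 175 events each: the 200 undefined events are
      // split 75/125 between training and testing.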
      Int_t useForTesting(0),useForTraining(0);
      Int_t allAvailable(availableUndefined + availableTraining + availableTesting);

      if( (requestedTraining == 0) && (requestedTesting == 0)){

         // Case C: balance the number of training and testing events

         if ( availableUndefined >= TMath::Abs(availableTraining - availableTesting) ) {
            // enough unspecified are available to equal training and testing
            useForTraining = useForTesting = allAvailable/2;
         } else {
            // all unspecified are assigned to the smaller of training / testing
            useForTraining = availableTraining;
            useForTesting  = availableTesting;
            if (availableTraining < availableTesting)
               useForTraining += availableUndefined;
            else
               useForTesting += availableUndefined;
         }
         requestedTraining = useForTraining;
         requestedTesting  = useForTesting;
      }

      else if (requestedTesting == 0){
         // case B
         useForTraining = TMath::Max(requestedTraining,availableTraining);
         if (allAvailable < useForTraining) {
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested for training ("
                  << requestedTraining << ") than available ("
                  << allAvailable << ")!" << Endl;
         }
         useForTesting = allAvailable - useForTraining; // the rest
         requestedTesting = useForTesting;
      }

      else if (requestedTraining == 0){ // case B)
         useForTesting = TMath::Max(requestedTesting,availableTesting);
         if (allAvailable < useForTesting) {
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested for testing ("
                  << requestedTesting << ") than available ("
                  << allAvailable << ")!" << Endl;
         }
         useForTraining = allAvailable - useForTesting; // the rest
         requestedTraining = useForTraining;
      }

      else {
         // Case A
         // requestedTraining R and requestedTesting S >0
         // free events: Nfree = u-Thet(R-r)-Thet(S-s)
         // nR = Max(R,r) + 0.5 * Nfree
         // nS = Max(S,s) + 0.5 * Nfree
         Int_t stillNeedForTraining = TMath::Max(requestedTraining-availableTraining,0);
         Int_t stillNeedForTesting  = TMath::Max(requestedTesting-availableTesting,0);

         int NFree = availableUndefined - stillNeedForTraining - stillNeedForTesting;
         if (NFree < 0) NFree = 0;
         useForTraining = TMath::Max(requestedTraining,availableTraining) + NFree/2;
         useForTesting  = allAvailable - useForTraining; // the rest
      }

      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "determined event sample size to select training sample from="<<useForTraining<<Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "determined event sample size to select test sample from="<<useForTesting<<Endl;



      // associate undefined events
      if( splitMode == "ALTERNATE" ){
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "split 'ALTERNATE'" << Endl;
         Int_t nTraining = availableTraining;
         Int_t nTesting  = availableTesting;
         for( EventVector::iterator it = eventVectorUndefined.begin(), itEnd = eventVectorUndefined.end(); it != itEnd; ){
            ++nTraining;
            if( nTraining <= requestedTraining ){
               eventVectorTraining.insert( eventVectorTraining.end(), (*it) );
               ++it;
            }
            if( it != itEnd ){
               ++nTesting;
               eventVectorTesting.insert( eventVectorTesting.end(), (*it) );
               ++it;
            }
         }
      } else {
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "split '" << splitMode << "'" << Endl;

         // test if enough events are available
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableundefined : " << availableUndefined << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "useForTraining     : " << useForTraining << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "useForTesting      : " << useForTesting << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableTraining  : " << availableTraining << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableTesting   : " << availableTesting << Endl;

         if( availableUndefined<(useForTraining-availableTraining) ||
             availableUndefined<(useForTesting -availableTesting ) ||
             availableUndefined<(useForTraining+useForTesting-availableTraining-availableTesting ) ){
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested than available!" << Endl;
         }

         // select the events
         if (useForTraining>availableTraining){
            eventVectorTraining.insert( eventVectorTraining.end(), eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining - availableTraining );
            eventVectorUndefined.erase( eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining - availableTraining );
         }
         if (useForTesting>availableTesting){
            eventVectorTesting.insert( eventVectorTesting.end(), eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTesting - availableTesting );
         }
      }
      eventVectorUndefined.clear();

      // finally shorten the event vectors to the requested size by removing random events
      if (splitMode.Contains( "RANDOM" )){
         UInt_t sizeTraining = eventVectorTraining.size();
         if( sizeTraining > UInt_t(requestedTraining) ){
            std::vector<UInt_t> indicesTraining( sizeTraining );
            // make indices
            std::generate( indicesTraining.begin(), indicesTraining.end(), TMVA::Increment<UInt_t>(0) );
            // shuffle indices
            std::random_shuffle( indicesTraining.begin(), indicesTraining.end(), rndm );
            // erase indices of not needed events
            indicesTraining.erase( indicesTraining.begin()+sizeTraining-UInt_t(requestedTraining), indicesTraining.end() );
            // delete all events with the given indices
            for( std::vector<UInt_t>::iterator it = indicesTraining.begin(), itEnd = indicesTraining.end(); it != itEnd; ++it ){
               delete eventVectorTraining.at( (*it) ); // delete event
               eventVectorTraining.at( (*it) ) = NULL; // set pointer to NULL
            }
            // now remove and erase all events with pointer==NULL
            eventVectorTraining.erase( std::remove( eventVectorTraining.begin(), eventVectorTraining.end(), (void*)NULL ), eventVectorTraining.end() );
         }

         UInt_t sizeTesting = eventVectorTesting.size();
         if( sizeTesting > UInt_t(requestedTesting) ){
            std::vector<UInt_t> indicesTesting( sizeTesting );
            // make indices
            std::generate( indicesTesting.begin(), indicesTesting.end(), TMVA::Increment<UInt_t>(0) );
            // shuffle indices
            std::random_shuffle( indicesTesting.begin(), indicesTesting.end(), rndm );
            // erase indices of not needed events
            indicesTesting.erase( indicesTesting.begin()+sizeTesting-UInt_t(requestedTesting), indicesTesting.end() );
            // delete all events with the given indices
            for( std::vector<UInt_t>::iterator it = indicesTesting.begin(), itEnd = indicesTesting.end(); it != itEnd; ++it ){
               delete eventVectorTesting.at( (*it) ); // delete event
               eventVectorTesting.at( (*it) ) = NULL; // set pointer to NULL
            }
            // now remove and erase all events with pointer==NULL
            eventVectorTesting.erase( std::remove( eventVectorTesting.begin(), eventVectorTesting.end(), (void*)NULL ), eventVectorTesting.end() );
         }
      }
      else { // erase at end
         if( eventVectorTraining.size() < UInt_t(requestedTraining) )
            Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "DataSetFactory/requested number of training samples larger than size of eventVectorTraining.\n"
                  << "There is probably an issue. Please contact the TMVA developers." << Endl;
         std::for_each( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end(), DeleteFunctor<Event>() );
         eventVectorTraining.erase(eventVectorTraining.begin()+requestedTraining,eventVectorTraining.end());

         if( eventVectorTesting.size() < UInt_t(requestedTesting) )
            Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "DataSetFactory/requested number of testing samples larger than size of eventVectorTesting.\n"
                  << "There is probably an issue. Please contact the TMVA developers." << Endl;
         std::for_each( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end(), DeleteFunctor<Event>() );
         eventVectorTesting.erase(eventVectorTesting.begin()+requestedTesting,eventVectorTesting.end());
      }
   }

1267  TMVA::DataSetFactory::RenormEvents( dsi, tmpEventVector, eventCounts, normMode );
1268 
1269  Int_t trainingSize = 0;
1270  Int_t testingSize = 0;
1271 
1272  // sum up number of training and testing events
1273  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1274  trainingSize += tmpEventVector[Types::kTraining].at(cls).size();
1275  testingSize += tmpEventVector[Types::kTesting].at(cls).size();
1276  }
1277 
1278  // --- collect all training (testing) events into the training (testing) eventvector
1279 
1280  // create event vectors reserve enough space
1281  EventVector* trainingEventVector = new EventVector();
1282  EventVector* testingEventVector = new EventVector();
1283 
1284  trainingEventVector->reserve( trainingSize );
1285  testingEventVector->reserve( testingSize );
1286 
1287 
1288  // collect the events
1289 
1290  // mixing of kTraining and kTesting data sets
1291  Log() << kDEBUG << " MIXING ============= " << Endl;
1292 
1293  if( mixMode == "ALTERNATE" ){
1294  // Inform user if he tries to use alternate mixmode for
1295  // event classes with different number of events, this works but the alternation stops at the last event of the smaller class
1296  for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
1297  if (tmpEventVector[Types::kTraining].at(cls).size() != tmpEventVector[Types::kTraining].at(0).size()){
1298  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Training sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1299  }
1300  if (tmpEventVector[Types::kTesting].at(cls).size() != tmpEventVector[Types::kTesting].at(0).size()){
1301  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Testing sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1302  }
1303  }
1304  typedef EventVector::iterator EvtVecIt;
1305  EvtVecIt itEvent, itEventEnd;
1306 
1307  // insert first class
1308  Log() << kDEBUG << "insert class 0 into training and test vector" << Endl;
1309  trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(0).begin(), tmpEventVector[Types::kTraining].at(0).end() );
1310  testingEventVector->insert( testingEventVector->end(), tmpEventVector[Types::kTesting].at(0).begin(), tmpEventVector[Types::kTesting].at(0).end() );
1311 
1312  // insert other classes
1313  EvtVecIt itTarget;
1314  for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
1315  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "insert class " << cls << Endl;
1316  // training vector
1317  itTarget = trainingEventVector->begin() - 1; // start one before begin (note: decrementing begin() is formally undefined behaviour for std::vector iterators)
1318  // loop over source
1319  for( itEvent = tmpEventVector[Types::kTraining].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTraining].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1320  // if( std::distance( itTarget, trainingEventVector->end()) < Int_t(cls+1) ) {
1321  if( (trainingEventVector->end() - itTarget) < Int_t(cls+1) ) {
1322  itTarget = trainingEventVector->end();
1323  trainingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
1324  break;
1325  }else{
1326  itTarget += cls+1;
1327  trainingEventVector->insert( itTarget, (*itEvent) ); // fill event
1328  }
1329  }
1330  // testing vector
1331  itTarget = testingEventVector->begin() - 1;
1332  // loop over source
1333  for( itEvent = tmpEventVector[Types::kTesting].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTesting].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1334  // if( std::distance( itTarget, testingEventVector->end()) < Int_t(cls+1) ) {
1335  if( ( testingEventVector->end() - itTarget ) < Int_t(cls+1) ) {
1336  itTarget = testingEventVector->end();
1337  testingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
1338  break;
1339  }else{
1340  itTarget += cls+1;
1341  testingEventVector->insert( itTarget, (*itEvent) ); // fill event
1342  }
1343  }
1344  }
1345 
1346  // debugging output: class numbers of all events in training and testing vectors
1347  // std::cout << std::endl;
1348  // std::cout << "TRAINING VECTOR" << std::endl;
1349  // std::transform( trainingEventVector->begin(), trainingEventVector->end(), ostream_iterator<Int_t>(std::cout, "|"), std::mem_fun(&TMVA::Event::GetClass) );
1350 
1351  // std::cout << std::endl;
1352  // std::cout << "TESTING VECTOR" << std::endl;
1353  // std::transform( testingEventVector->begin(), testingEventVector->end(), ostream_iterator<Int_t>(std::cout, "|"), std::mem_fun(&TMVA::Event::GetClass) );
1354  // std::cout << std::endl;
1355 
1356  }else{
1357  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1358  trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(cls).begin(), tmpEventVector[Types::kTraining].at(cls).end() );
1359  testingEventVector->insert ( testingEventVector->end(), tmpEventVector[Types::kTesting].at(cls).begin(), tmpEventVector[Types::kTesting].at(cls).end() );
1360  }
1361  }
1362 
1363  // std::cout << "trainingEventVector " << trainingEventVector->size() << std::endl;
1364  // std::cout << "testingEventVector " << testingEventVector->size() << std::endl;
1365 
1366  // std::transform( trainingEventVector->begin(), trainingEventVector->end(), ostream_iterator<Int_t>(std::cout, "> \n"), std::mem_fun(&TMVA::Event::GetNVariables) );
1367  // std::transform( testingEventVector->begin(), testingEventVector->end(), ostream_iterator<Int_t>(std::cout, "> \n"), std::mem_fun(&TMVA::Event::GetNVariables) );
1368 
1369  // delete the tmpEventVector (but not the events therein)
1370  tmpEventVector[Types::kTraining].clear();
1371  tmpEventVector[Types::kTesting].clear();
1372 
1373  tmpEventVector[Types::kMaxTreeType].clear();
1374 
1375  if (mixMode == "RANDOM") {
1376  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "shuffling events"<<Endl;
1377 
1378  std::random_shuffle( trainingEventVector->begin(), trainingEventVector->end(), rndm );
1379  std::random_shuffle( testingEventVector->begin(), testingEventVector->end(), rndm );
1380  }
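 // Side note (not TMVA code): std::random_shuffle is deprecated in C++14 and
 // removed in C++17. A sketch of an equivalent shuffle using <random>,
 // assuming one wanted to seed it with the splitSeed passed to MixEvents
 // (the resulting sequence would differ from the TRandom3-based rndm functor):
 //
 //   std::mt19937 gen(splitSeed);
 //   std::shuffle(trainingEventVector->begin(), trainingEventVector->end(), gen);
 //   std::shuffle(testingEventVector->begin(),  testingEventVector->end(),  gen);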
1381 
1382  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "trainingEventVector " << trainingEventVector->size() << Endl;
1383  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "testingEventVector " << testingEventVector->size() << Endl;
1384 
1385  // create dataset
1386  DataSet* ds = new DataSet(dsi);
1387 
1388  // Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create internal training tree" << Endl;
1389  ds->SetEventCollection(trainingEventVector, Types::kTraining );
1390  // Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create internal testing tree" << Endl;
1391  ds->SetEventCollection(testingEventVector, Types::kTesting );
1392 
1393 
1394  if (ds->GetNTrainingEvents() < 1){
1395  Log() << kFATAL << "Dataset " << std::string(dsi.GetName()) << " does not have any training events; stopping here so that this can be fixed first " << Endl;
1396  }
1397 
1398  if (ds->GetNTestEvents() < 1) {
1399  Log() << kERROR << "Dataset " << std::string(dsi.GetName()) << " does not have any testing events; this will likely cause problems later, but for now we continue " << Endl;
1400  }
1401 
1402  delete trainingEventVector;
1403  delete testingEventVector;
1404  return ds;
1405 
1406 }
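 // Usage sketch (illustrative, not part of this file): the SplitMode, MixMode
 // and NormMode options handled above are typically selected through the
 // option string of TMVA::Factory::PrepareTrainingAndTestTree, e.g.
 //
 //   factory->PrepareTrainingAndTestTree( "",   // no preselection cut
 //       "nTrain_Signal=1000:nTrain_Background=1000:"
 //       "SplitMode=Random:MixMode=Random:NormMode=NumEvents:!V" );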
1407 
1408 ////////////////////////////////////////////////////////////////////////////////
1409 /// renormalisation of the TRAINING event weights
1410 /// - none: use the weights as supplied by the user (we store however the relative weight for later use)
1411 /// - numEvents: renormalise each class such that its summed training weights equal
1412 /// the number of training events of that class
1413 /// - equalNumEvents: reweight the training events such that the summed weights of all
1414 /// backgr. classes (class > 0) equal that of the signal (class 0), namely the number of training events of class 0
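///
/// Sketch of the resulting factors, as derived from the code below
/// (N_cls = number of training events of class cls, Sum_cls(w) = their summed original weights):
/// - NumEvents:      w -> w * N_cls / Sum_cls(w)
/// - EqualNumEvents: w -> w * N_0 / Sum_cls(w)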
1415 
1416 void
1418  EventVectorOfClassesOfTreeType& tmpEventVector,
1419  const EvtStatsPerClass& eventCounts,
1420  const TString& normMode )
1421 {
1422 
1423 
1424  // ---------------------------------
1425  // compute sizes and sums of weights
1426  // (the rescaling info is printed further below)
1427  Int_t trainingSize = 0;
1428  Int_t testingSize = 0;
1429 
1430  ValuePerClass trainingSumWeightsPerClass( dsi.GetNClasses() );
1431  ValuePerClass testingSumWeightsPerClass( dsi.GetNClasses() );
1432 
1433  NumberPerClass trainingSizePerClass( dsi.GetNClasses() );
1434  NumberPerClass testingSizePerClass( dsi.GetNClasses() );
1435 
1436  Double_t trainingSumSignalWeights = 0;
1437  Double_t trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1438  Double_t testingSumSignalWeights = 0;
1439  Double_t testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1440 
1441 
1442 
1443  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1444  trainingSizePerClass.at(cls) = tmpEventVector[Types::kTraining].at(cls).size();
1445  testingSizePerClass.at(cls) = tmpEventVector[Types::kTesting].at(cls).size();
1446 
1447  trainingSize += trainingSizePerClass.at(cls);
1448  testingSize += testingSizePerClass.at(cls);
1449 
1450  // the functional solution
1451  // sum up the weights in Double_t although the individual weights are Float_t, to prevent rounding issues when adding many floating-point numbers
1452  //
1453  // accumulate --> does what the name says
1454  // begin() and end() denote the range of the vector to be accumulated
1455  // Double_t(0) tells accumulate the type and the starting value
1456  // compose_binary creates a BinaryFunction of ...
1457  // std::plus<Double_t>() knows how to sum up two doubles
1458  // null<Double_t>() leaves the first argument (the running sum) unchanged and returns it
1459  // std::mem_fun knows how to call a member function (type and member-function given as argument) and return the result
1460  //
1461  // all together sums up all the event-weights of the events in the vector and returns it
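 // Equivalent modern formulation (illustrative only, assuming C++11 is available):
 //
 //   Double_t sum = std::accumulate(
 //       tmpEventVector[Types::kTraining].at(cls).begin(),
 //       tmpEventVector[Types::kTraining].at(cls).end(), Double_t(0),
 //       [](Double_t s, const Event* ev) { return s + ev->GetOriginalWeight(); });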
1462  trainingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(),
1463  tmpEventVector[Types::kTraining].at(cls).end(),
1464  Double_t(0),
1465  compose_binary( std::plus<Double_t>(),
1466  null<Double_t>(),
1467  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1468 
1469  testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),
1470  tmpEventVector[Types::kTesting].at(cls).end(),
1471  Double_t(0),
1472  compose_binary( std::plus<Double_t>(),
1473  null<Double_t>(),
1474  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1475 
1476  if ( cls == dsi.GetSignalClassIndex()){
1477  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1478  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1479  }else{
1480  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1481  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1482  }
1483  }
1484 
1485  // ---------------------------------
1486  // compute renormalization factors
1487 
1488  ValuePerClass renormFactor( dsi.GetNClasses() );
1489 
1490 
1491  // for information purposes
1492  dsi.SetNormalization( normMode );
1493  // !! these will be overwritten later by the 'rescaled' ones if
1494  // NormMode != None !!!
1495  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1496  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1497  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1498  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1499 
1500 
1501  if (normMode == "NONE") {
1502  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "No weight renormalisation applied: use original global and event weights" << Endl;
1503  return;
1504  }
1505  //changed by Helge 27.5.2013: the idea behind this mode is to have the SAME amount of
1506  //effective TRAINING data for signal and background; the previous implementation did not achieve this.
1507  // Testing events are irrelevant for this purpose and might actually skew the whole normalisation.
1508  else if (normMode == "NUMEVENTS") {
1509  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1510  << "\tWeight renormalisation mode: \"NumEvents\": renormalises all event classes " << Endl;
1511  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1512  << " such that the effective (weighted) number of events in each class equals the respective " << Endl;
1513  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1514  << " number of events (entries) that you requested in PrepareTrainingAndTestTree(\"\",\"nTrain_Signal=.. )" << Endl;
1515  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1516  << " ... i.e. such that Sum[i=1..N_j]{w_i} = N_j, j=0,1,2..." << Endl;
1517  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1518  << " ... (note that N_j is the sum of TRAINING events (nTrain_j...with j=Signal,Background.." << Endl;
1519  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1520  << " ..... Testing events are not renormalised nor included in the renormalisation factor! )"<< Endl;
1521 
1522  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1523  // renormFactor.at(cls) = ( (trainingSizePerClass.at(cls) + testingSizePerClass.at(cls))/
1524  // (trainingSumWeightsPerClass.at(cls) + testingSumWeightsPerClass.at(cls)) );
1525  //changed by Helge 27.5.2013
1526  renormFactor.at(cls) = ((Float_t)trainingSizePerClass.at(cls)) /
1527  (trainingSumWeightsPerClass.at(cls));
1528  }
1529  }
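 // Worked example (hypothetical numbers): a class with 1000 training events
 // whose original weights sum to 250 gets renormFactor = 1000/250 = 4, so
 // that after rescaling its weights sum to the 1000 training events used.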
1530  else if (normMode == "EQUALNUMEVENTS") {
1531  //changed by Helge 27.5.2013: the idea behind this mode is to have the SAME amount of
1532  //effective TRAINING data for signal and background.
1533  //The previous implementation normalised each data source to its own number of entries,
1534  //and did so for training and testing together, which defeated that purpose.
1535 
1536  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Weight renormalisation mode: \"EqualNumEvents\": renormalises all event classes ..." << Endl;
1537  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " such that the effective (weighted) number of events in each class is the same " << Endl;
1538  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " (and equals the number of events (entries) given for class=0 )" << Endl;
1539  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ..." << Endl;
1540  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "... (note that N_j is the sum of TRAINING events" << Endl;
1541  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " ..... Testing events are not renormalised nor included in the renormalisation factor!)" << Endl;
1542 
1543  // normalize to size of first class
1544  UInt_t referenceClass = 0;
1545  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ) {
1546  renormFactor.at(cls) = Float_t(trainingSizePerClass.at(referenceClass))/
1547  (trainingSumWeightsPerClass.at(cls));
1548  }
1549  }
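 // Worked example (hypothetical numbers): with 500 training events in the
 // reference class 0 and summed weights of 2000 in class 1, class 1 gets
 // renormFactor = 500/2000 = 0.25; each class then carries an effective
 // (weighted) training size of 500.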
1550  else {
1551  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "<PrepareForTrainingAndTesting> Unknown NormMode: " << normMode << Endl;
1552  }
1553 
1554  // ---------------------------------
1555  // now apply the normalization factors
1556  Int_t maxL = dsi.GetClassNameMaxLength();
1557  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls<clsEnd; ++cls) {
1558  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1559  << "--> Rescale " << setiosflags(ios::left) << std::setw(maxL)
1560  << dsi.GetClassInfo(cls)->GetName() << " event weights by factor: " << renormFactor.at(cls) << Endl;
1561  for (EventVector::iterator it = tmpEventVector[Types::kTraining].at(cls).begin(),
1562  itEnd = tmpEventVector[Types::kTraining].at(cls).end(); it != itEnd; ++it){
1563  (*it)->SetWeight ((*it)->GetWeight() * renormFactor.at(cls));
1564  }
1565 
1566  }
1567 
1568 
1569  // print out the result
1570  // (this repeats the summation code from above; it could be factored out)
1571  //
1572 
1573  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1574  << "Number of training and testing events" << Endl;
1575  Log() << kDEBUG << "\tafter rescaling:" << Endl;
1576  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1577  << "---------------------------------------------------------------------------" << Endl;
1578 
1579  trainingSumSignalWeights = 0;
1580  trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1581  testingSumSignalWeights = 0;
1582  testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1583 
1584  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1585 
1586  trainingSumWeightsPerClass.at(cls) = (std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(), // accumulate --> start at begin
1587  tmpEventVector[Types::kTraining].at(cls).end(), // until end()
1588  Double_t(0), // values are of type double
1589  compose_binary( std::plus<Double_t>(), // define addition for doubles
1590  null<Double_t>(), // take the argument, don't do anything and return it
1591  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) )); // take the value from GetOriginalWeight
1592 
1593  testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),
1594  tmpEventVector[Types::kTesting].at(cls).end(),
1595  Double_t(0),
1596  compose_binary( std::plus<Double_t>(),
1597  null<Double_t>(),
1598  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1599 
1600 
1601  if ( cls == dsi.GetSignalClassIndex()){
1602  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1603  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1604  }else{
1605  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1606  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1607  }
1608 
1609  // output statistics
1610 
1611  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1612  << setiosflags(ios::left) << std::setw(maxL)
1613  << dsi.GetClassInfo(cls)->GetName() << " -- "
1614  << "training events : " << trainingSizePerClass.at(cls) << Endl;
1615  Log() << kDEBUG << "\t(sum of weights: " << trainingSumWeightsPerClass.at(cls) << ")"
1616  << " - requested were " << eventCounts[cls].nTrainingEventsRequested << " events" << Endl;
1617  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1618  << setiosflags(ios::left) << std::setw(maxL)
1619  << dsi.GetClassInfo(cls)->GetName() << " -- "
1620  << "testing events : " << testingSizePerClass.at(cls) << Endl;
1621  Log() << kDEBUG << "\t(sum of weights: " << testingSumWeightsPerClass.at(cls) << ")"
1622  << " - requested were " << eventCounts[cls].nTestingEventsRequested << " events" << Endl;
1623  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1624  << setiosflags(ios::left) << std::setw(maxL)
1625  << dsi.GetClassInfo(cls)->GetName() << " -- "
1626  << "training and testing events: "
1627  << (trainingSizePerClass.at(cls)+testingSizePerClass.at(cls)) << Endl;
1628  Log() << kDEBUG << "\t(sum of weights: "
1629  << (trainingSumWeightsPerClass.at(cls)+testingSumWeightsPerClass.at(cls)) << ")" << Endl;
1630  if(eventCounts[cls].nEvAfterCut<eventCounts[cls].nEvBeforeCut) {
1631  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << setiosflags(ios::left) << std::setw(maxL)
1632  << dsi.GetClassInfo(cls)->GetName() << " -- "
1633  << "due to the preselection a scaling factor has been applied to the numbers of requested events: "
1634  << eventCounts[cls].cutScaling() << Endl;
1635  }
1636  }
1637  Log() << kINFO << Endl;
1638 
1639  // for information purposes
1640  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1641  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1642  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1643  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1644 
1645 
1646 }
1647 