DataSetFactory.cxx
// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Eckhard von Toerne, Helge Voss

/*****************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
 * Package: TMVA                                                             *
 * Class  : DataSetFactory                                                   *
 * Web    : http://tmva.sourceforge.net                                      *
 *                                                                           *
 * Description:                                                              *
 *      Implementation (see header for description)                          *
 *                                                                           *
 * Authors (alphabetical):                                                   *
 *      Andreas Hoecker    <Andreas.Hocker@cern.ch>   - CERN, Switzerland    *
 *      Peter Speckmayer   <Peter.Speckmayer@cern.ch> - CERN, Switzerland    *
 *      Joerg Stelzer      <Joerg.Stelzer@cern.ch>    - MSU, USA             *
 *      Eckhard von Toerne <evt@physik.uni-bonn.de>   - U. of Bonn, Germany  *
 *      Helge Voss         <Helge.Voss@cern.ch>  - MPI-K Heidelberg, Germany *
 *                                                                           *
 * Copyright (c) 2009:                                                       *
 *      CERN, Switzerland                                                    *
 *      MPI-K Heidelberg, Germany                                            *
 *      U. of Bonn, Germany                                                  *
 * Redistribution and use in source and binary forms, with or without        *
 * modification, are permitted according to the terms listed in LICENSE      *
 * (http://tmva.sourceforge.net/LICENSE)                                     *
 *****************************************************************************/

#include <assert.h>

#include <map>
#include <vector>
#include <iomanip>
#include <iostream>

#include <algorithm>
#include <functional>
#include <numeric>

#include "TMVA/DataSetFactory.h"

#include "TEventList.h"
#include "TFile.h"
#include "TH1.h"
#include "TH2.h"
#include "TProfile.h"
#include "TRandom3.h"
#include "TMatrixF.h"
#include "TVectorF.h"
#include "TMath.h"
#include "TROOT.h"

#ifndef ROOT_TMVA_MsgLogger
#include "TMVA/MsgLogger.h"
#endif
#ifndef ROOT_TMVA_Configurable
#include "TMVA/Configurable.h"
#endif
#ifndef ROOT_TMVA_VariableIdentityTransform
#include "TMVA/VariableIdentityTransform.h"
#endif
#ifndef ROOT_TMVA_VariableDecorrTransform
#include "TMVA/VariableDecorrTransform.h"
#endif
#ifndef ROOT_TMVA_VariablePCATransform
#include "TMVA/VariablePCATransform.h"
#endif
#ifndef ROOT_TMVA_DataSet
#include "TMVA/DataSet.h"
#endif
#ifndef ROOT_TMVA_DataSetInfo
#include "TMVA/DataSetInfo.h"
#endif
#ifndef ROOT_TMVA_DataInputHandler
#include "TMVA/DataInputHandler.h"
#endif
#ifndef ROOT_TMVA_Event
#include "TMVA/Event.h"
#endif

#include "TMVA/Types.h"
#include "TMVA/VariableInfo.h"

using namespace std;

//TMVA::DataSetFactory* TMVA::DataSetFactory::fgInstance = 0;

namespace TMVA {
   // calculate the largest common divisor
   // this function is not happy if numbers are negative!
   Int_t LargestCommonDivider(Int_t a, Int_t b)
   {
      if (a<b) { Int_t tmp = a; a=b; b=tmp; } // achieve a>=b
      if (b==0) return a;
      Int_t fullFits = a/b;
      return LargestCommonDivider(b,a-b*fullFits);
   }
}
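
// A short worked example of the recursion above, for illustration:
//   LargestCommonDivider(12, 18): a<b, so swap to a=18, b=12
//   fullFits = 18/12 = 1, recurse on (12, 18-12*1) = (12, 6)
//   fullFits = 12/6  = 2, recurse on (6, 12-6*2)   = (6, 0) -> returns 6
// This is Euclid's algorithm written with an explicit remainder step.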


////////////////////////////////////////////////////////////////////////////////
/// constructor

TMVA::DataSetFactory::DataSetFactory() :
   fVerbose(kFALSE),
   fVerboseLevel(TString("Info")),
   fScaleWithPreselEff(0),
   fCurrentTree(0),
   fCurrentEvtIdx(0),
   fInputFormulas(0),
   fLogger( new MsgLogger("DataSetFactory", kINFO) )
{
}


////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DataSetFactory::~DataSetFactory()
{
   std::vector<TTreeFormula*>::const_iterator formIt;

   for (formIt = fInputFormulas.begin()    ; formIt != fInputFormulas.end()    ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fTargetFormulas.begin()   ; formIt != fTargetFormulas.end()   ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fCutFormulas.begin()      ; formIt != fCutFormulas.end()      ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fWeightFormula.begin()    ; formIt != fWeightFormula.end()    ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fSpectatorFormulas.begin(); formIt != fSpectatorFormulas.end(); formIt++) if (*formIt) delete *formIt;

   delete fLogger;
}

////////////////////////////////////////////////////////////////////////////////
/// steering the creation of a new dataset

TMVA::DataSet* TMVA::DataSetFactory::CreateDataSet( TMVA::DataSetInfo& dsi,
                                                    TMVA::DataInputHandler& dataInput )
{
   // build the first dataset from the data input
   DataSet * ds = BuildInitialDataSet( dsi, dataInput );

   if (ds->GetNEvents() > 1) {
      CalcMinMax(ds,dsi);

      // from the final dataset build the correlation matrix
      for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
         const TString className = dsi.GetClassInfo(cl)->GetName();
         dsi.SetCorrelationMatrix( className, CalcCorrelationMatrix( ds, cl ) );
         dsi.PrintCorrelationMatrix( className );
      }
      //Log() << kHEADER << Endl;
      Log() << kHEADER << Form("[%s] : ",dsi.GetName()) << " " << Endl << Endl;
   }

   return ds;
}

////////////////////////////////////////////////////////////////////////////////

TMVA::DataSet* TMVA::DataSetFactory::BuildDynamicDataSet( TMVA::DataSetInfo& dsi )
{
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "Build DataSet consisting of one Event with dynamically changing variables" << Endl;
   DataSet* ds = new DataSet(dsi);

   // create a DataSet with one Event which uses dynamic variables
   // (pointers to variables)
   if(dsi.GetNClasses()==0){
      dsi.AddClass( "data" );
      dsi.GetClassInfo( "data" )->SetNumber(0);
   }

   std::vector<Float_t*>* evdyn = new std::vector<Float_t*>(0);

   std::vector<VariableInfo>& varinfos = dsi.GetVariableInfos();

   if (varinfos.empty())
      Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Dynamic data set cannot be built, since no variable information is present. Apparently no variables have been set. This should not happen, please contact the TMVA authors." << Endl;

   std::vector<VariableInfo>::iterator it = varinfos.begin(), itEnd=varinfos.end();
   for (;it!=itEnd;++it) {
      Float_t* external=(Float_t*)(*it).GetExternalLink();
      if (external==0)
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "The link to the external variable is NULL while I am trying to build a dynamic data set. In this case fTmpEvent from MethodBase HAS TO BE USED in the method to get useful values in variables." << Endl;
      else evdyn->push_back (external);
   }

   std::vector<VariableInfo>& spectatorinfos = dsi.GetSpectatorInfos();
   it = spectatorinfos.begin();
   for (;it!=spectatorinfos.end();it++) evdyn->push_back( (Float_t*)(*it).GetExternalLink() );

   TMVA::Event * ev = new Event((const std::vector<Float_t*>*&)evdyn, varinfos.size());
   std::vector<Event*>* newEventVector = new std::vector<Event*>;
   newEventVector->push_back(ev);

   ds->SetEventCollection(newEventVector, Types::kTraining);
   ds->SetCurrentType( Types::kTraining );
   ds->SetCurrentEvent( 0 );

   return ds;
}
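
// Context: this dynamic, pointer-backed event is the mechanism that
// application-time classes such as TMVA::Reader build on, where the user binds
// external variables once, e.g.
//   Float_t x;
//   reader.AddVariable( "x", &x );   // "x" is a hypothetical variable name
// and the bound pointers are then re-read for every evaluated event.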

////////////////////////////////////////////////////////////////////////////////
/// if no entries, then create a DataSet with one Event which uses
/// dynamic variables (pointers to variables)

TMVA::DataSet*
TMVA::DataSetFactory::BuildInitialDataSet( DataSetInfo& dsi,
                                           DataInputHandler& dataInput )
{
   if (dataInput.GetEntries()==0) return BuildDynamicDataSet( dsi );
   // -------------------------------------------------------------------------

   // register the classes in the datasetinfo-object
   // information comes from the trees in the dataInputHandler-object
   std::vector< TString >* classList = dataInput.GetClassList();
   for (std::vector<TString>::iterator it = classList->begin(); it < classList->end(); it++) {
      dsi.AddClass( (*it) );
   }
   delete classList;

   EvtStatsPerClass eventCounts(dsi.GetNClasses());
   TString normMode;
   TString splitMode;
   TString mixMode;
   UInt_t  splitSeed;

   InitOptions( dsi, eventCounts, normMode, splitSeed, splitMode, mixMode );
   // ======= build event-vector from input, apply preselection ===============
   EventVectorOfClassesOfTreeType tmpEventVector;
   BuildEventVector( dsi, dataInput, tmpEventVector, eventCounts );

   DataSet* ds = MixEvents( dsi, tmpEventVector, eventCounts,
                            splitMode, mixMode, normMode, splitSeed );

   const Bool_t showCollectedOutput = kFALSE;
   if (showCollectedOutput) {
      Int_t maxL = dsi.GetClassNameMaxLength();
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Collected:" << Endl;
      for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " training entries: " << ds->GetNClassEvents( 0, cl ) << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " testing entries: " << ds->GetNClassEvents( 1, cl ) << Endl;
      }
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " " << Endl;
   }

   return ds;
}

////////////////////////////////////////////////////////////////////////////////
/// checks a TTreeFormula for problems

Bool_t TMVA::DataSetFactory::CheckTTreeFormula( TTreeFormula* ttf,
                                                const TString& expression,
                                                Bool_t& hasDollar )
{
   Bool_t worked = kTRUE;

   if( ttf->GetNdim() <= 0 )
      Log() << kFATAL << "Expression " << expression.Data()
            << " could not be resolved to a valid formula. " << Endl;
   if( ttf->GetNdata() == 0 ){
      Log() << kWARNING << "Expression: " << expression.Data()
            << " does not provide data for this event. "
            << "This event is not taken into account. --> please check if you use as a variable "
            << "an entry of an array which is not filled for some events "
            << "(e.g. arr[4] when arr has only 3 elements)." << Endl;
      Log() << kWARNING << "If you want to take the event into account you can do something like: "
            << "\"Alt$(arr[4],0)\", where 0 is taken as an alternative if arr doesn't have a 4th element." << Endl;
      worked = kFALSE;
   }
   if( expression.Contains("$") )
      hasDollar = kTRUE;
   else
   {
      for (int i = 0, iEnd = ttf->GetNcodes (); i < iEnd; ++i)
      {
         TLeaf* leaf = ttf->GetLeaf (i);
         if (!leaf->IsOnTerminalBranch())
            hasDollar = kTRUE;
      }
   }
   return worked;
}
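
// Example of the Alt$ guard suggested in the warning above (TTreeFormula syntax):
//   "Alt$(arr[4],0)"
// evaluates arr[4] when the array is long enough and falls back to 0 otherwise,
// so events with shorter arrays are kept instead of being skipped.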


////////////////////////////////////////////////////////////////////////////////
/// While the data gets copied into the local training and testing
/// trees, the input tree can change (for instance when changing from
/// signal to background tree, or using TChains as input). The
/// TTreeFormulas that hold the input expressions need to be
/// reassociated with the new tree, which is done here.

void TMVA::DataSetFactory::ChangeToNewTree( TreeInfo& tinfo, const DataSetInfo & dsi )
{
   TTree *tr = tinfo.GetTree()->GetTree();

   tr->SetBranchStatus("*",1);

   Bool_t hasDollar = kFALSE;

   // 1) the input variable formulas
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform input variables" << Endl;
   std::vector<TTreeFormula*>::const_iterator formIt, formItEnd;
   for (formIt = fInputFormulas.begin(), formItEnd=fInputFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fInputFormulas.clear();
   TTreeFormula* ttf = 0;

   for (UInt_t i=0; i<dsi.GetNVariables(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetVariableInfo(i).GetInternalName().Data() ),
                              dsi.GetVariableInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetVariableInfo(i).GetExpression(), hasDollar );
      fInputFormulas.push_back( ttf );
   }

   //
   // targets
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform regression targets" << Endl;
   for (formIt = fTargetFormulas.begin(), formItEnd = fTargetFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fTargetFormulas.clear();
   for (UInt_t i=0; i<dsi.GetNTargets(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetTargetInfo(i).GetInternalName().Data() ),
                              dsi.GetTargetInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetTargetInfo(i).GetExpression(), hasDollar );
      fTargetFormulas.push_back( ttf );
   }

   //
   // spectators
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform spectator variables" << Endl;
   for (formIt = fSpectatorFormulas.begin(), formItEnd = fSpectatorFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fSpectatorFormulas.clear();
   for (UInt_t i=0; i<dsi.GetNSpectators(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetSpectatorInfo(i).GetInternalName().Data() ),
                              dsi.GetSpectatorInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetSpectatorInfo(i).GetExpression(), hasDollar );
      fSpectatorFormulas.push_back( ttf );
   }

   //
   // the cuts (one per class, if non-existent: formula pointer = 0)
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform cuts" << Endl;
   for (formIt = fCutFormulas.begin(), formItEnd = fCutFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fCutFormulas.clear();
   for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
      const TCut& tmpCut = dsi.GetClassInfo(clIdx)->GetCut();
      const TString tmpCutExp(tmpCut.GetTitle());
      ttf = 0;
      if (tmpCutExp!="") {
         ttf = new TTreeFormula( Form("CutClass%i",clIdx), tmpCutExp, tr );
         Bool_t worked = CheckTTreeFormula( ttf, tmpCutExp, hasDollar );
         if( !worked ){
            Log() << kWARNING << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
                  << "\" cut \"" << dsi.GetClassInfo(clIdx)->GetCut() << "\"" << Endl;
         }
      }
      fCutFormulas.push_back( ttf );
   }

   //
   // the weights (one per class, if non-existent: formula pointer = 0)
   //
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "transform weights" << Endl;
   for (formIt = fWeightFormula.begin(), formItEnd = fWeightFormula.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fWeightFormula.clear();
   for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
      const TString tmpWeight = dsi.GetClassInfo(clIdx)->GetWeight();

      if (dsi.GetClassInfo(clIdx)->GetName() != tinfo.GetClassName() ) { // if the tree is of another class
         fWeightFormula.push_back( 0 );
         continue;
      }

      ttf = 0;
      if (tmpWeight!="") {
         ttf = new TTreeFormula( "FormulaWeight", tmpWeight, tr );
         Bool_t worked = CheckTTreeFormula( ttf, tmpWeight, hasDollar );
         if( !worked ){
            Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName()) << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
                  << "\" weight \"" << dsi.GetClassInfo(clIdx)->GetWeight() << "\"" << Endl;
         }
      }
      else {
         ttf = 0;
      }
      fWeightFormula.push_back( ttf );
   }
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches" << Endl;
   // now enable only branches that are needed in any input formula, target, cut, weight

   if (!hasDollar) {
      tr->SetBranchStatus("*",0);
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: input variables" << Endl;
      // input vars
      for (formIt = fInputFormulas.begin(); formIt!=fInputFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++) {
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
         }
      }
      // targets
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: targets" << Endl;
      for (formIt = fTargetFormulas.begin(); formIt!=fTargetFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // spectators
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: spectators" << Endl;
      for (formIt = fSpectatorFormulas.begin(); formIt!=fSpectatorFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // cuts
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: cuts" << Endl;
      for (formIt = fCutFormulas.begin(); formIt!=fCutFormulas.end(); formIt++) {
         ttf = *formIt;
         if (!ttf) continue;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // weights
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "enable branches: weights" << Endl;
      for (formIt = fWeightFormula.begin(); formIt!=fWeightFormula.end(); formIt++) {
         ttf = *formIt;
         if (!ttf) continue;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
   }
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName()) << "tree initialized" << Endl;
   return;
}
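
// Note on the hasDollar guard above: expressions containing "$" specials such as
//   "Alt$(arr[4],0)"   // hypothetical variable expression
// (or leaves on non-terminal branches) may touch branches that GetLeaf() does
// not enumerate, so in that case all branches are left enabled instead of
// selectively re-enabling only those referenced by the formulas.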

////////////////////////////////////////////////////////////////////////////////
/// compute min and max values of all variables, targets and spectators

void TMVA::DataSetFactory::CalcMinMax( DataSet* ds, TMVA::DataSetInfo& dsi )
{
   const UInt_t nvar  = ds->GetNVariables();
   const UInt_t ntgts = ds->GetNTargets();
   const UInt_t nvis  = ds->GetNSpectators();

   Float_t *min = new Float_t[nvar];
   Float_t *max = new Float_t[nvar];
   Float_t *tgmin = new Float_t[ntgts];
   Float_t *tgmax = new Float_t[ntgts];
   Float_t *vmin = new Float_t[nvis];
   Float_t *vmax = new Float_t[nvis];

   for (UInt_t ivar=0; ivar<nvar ; ivar++) { min[ivar] = FLT_MAX; max[ivar] = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) { tgmin[ivar] = FLT_MAX; tgmax[ivar] = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<nvis; ivar++) { vmin[ivar] = FLT_MAX; vmax[ivar] = -FLT_MAX; }

   // perform event loop

   for (Int_t i=0; i<ds->GetNEvents(); i++) {
      const Event * ev = ds->GetEvent(i);
      for (UInt_t ivar=0; ivar<nvar; ivar++) {
         Double_t v = ev->GetValue(ivar);
         if (v<min[ivar]) min[ivar] = v;
         if (v>max[ivar]) max[ivar] = v;
      }
      for (UInt_t itgt=0; itgt<ntgts; itgt++) {
         Double_t v = ev->GetTarget(itgt);
         if (v<tgmin[itgt]) tgmin[itgt] = v;
         if (v>tgmax[itgt]) tgmax[itgt] = v;
      }
      for (UInt_t ivis=0; ivis<nvis; ivis++) {
         Double_t v = ev->GetSpectator(ivis);
         if (v<vmin[ivis]) vmin[ivis] = v;
         if (v>vmax[ivis]) vmax[ivis] = v;
      }
   }

   for (UInt_t ivar=0; ivar<nvar; ivar++) {
      dsi.GetVariableInfo(ivar).SetMin(min[ivar]);
      dsi.GetVariableInfo(ivar).SetMax(max[ivar]);
      if( TMath::Abs(max[ivar]-min[ivar]) <= FLT_MIN )
         Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Variable " << dsi.GetVariableInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
   }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) {
      dsi.GetTargetInfo(ivar).SetMin(tgmin[ivar]);
      dsi.GetTargetInfo(ivar).SetMax(tgmax[ivar]);
      if( TMath::Abs(tgmax[ivar]-tgmin[ivar]) <= FLT_MIN )
         Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName()) << "Target " << dsi.GetTargetInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
   }
   for (UInt_t ivar=0; ivar<nvis; ivar++) {
      dsi.GetSpectatorInfo(ivar).SetMin(vmin[ivar]);
      dsi.GetSpectatorInfo(ivar).SetMax(vmax[ivar]);
      // if( TMath::Abs(vmax[ivar]-vmin[ivar]) <= FLT_MIN )
      //    Log() << kWARNING << "Spectator variable " << dsi.GetSpectatorInfo(ivar).GetExpression().Data() << " is constant." << Endl;
   }
   delete [] min;
   delete [] max;
   delete [] tgmin;
   delete [] tgmax;
   delete [] vmin;
   delete [] vmax;
}

////////////////////////////////////////////////////////////////////////////////
/// computes correlation matrix for variables "theVars" in tree;
/// "theType" defines the required event "type"
/// ("type" variable must be present in tree)

TMatrixD* TMVA::DataSetFactory::CalcCorrelationMatrix( DataSet* ds, const UInt_t classNumber )
{
   // first compute variance-covariance
   TMatrixD* mat = CalcCovarianceMatrix( ds, classNumber );

   // now the correlation
   UInt_t nvar = ds->GetNVariables(), ivar, jvar;

   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         if (ivar != jvar) {
            Double_t d = (*mat)(ivar, ivar)*(*mat)(jvar, jvar);
            if (d > 0) (*mat)(ivar, jvar) /= sqrt(d);
            else {
               Log() << kWARNING << Form("Dataset[%s] : ",DataSetInfo().GetName())<< "<GetCorrelationMatrix> Zero variances for variables "
                     << "(" << ivar << ", " << jvar << ") = " << d
                     << Endl;
               (*mat)(ivar, jvar) = 0;
            }
         }
      }
   }

   for (ivar=0; ivar<nvar; ivar++) (*mat)(ivar, ivar) = 1.0;

   return mat;
}
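
// The normalisation applied above, written out (with C the covariance matrix):
//   rho(i,j) = C(i,j) / sqrt( C(i,i) * C(j,j) )
// The diagonal is set to 1, and pairs with vanishing variance are zeroed out.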

////////////////////////////////////////////////////////////////////////////////
/// compute covariance matrix

TMatrixD* TMVA::DataSetFactory::CalcCovarianceMatrix( DataSet* ds, const UInt_t classNumber )
{
   UInt_t nvar = ds->GetNVariables();
   UInt_t ivar = 0, jvar = 0;

   TMatrixD* mat = new TMatrixD( nvar, nvar );

   // init matrices
   TVectorD vec(nvar);
   TMatrixD mat2(nvar, nvar);
   for (ivar=0; ivar<nvar; ivar++) {
      vec(ivar) = 0;
      for (jvar=0; jvar<nvar; jvar++) mat2(ivar, jvar) = 0;
   }

   // perform event loop
   Double_t ic = 0;
   for (Int_t i=0; i<ds->GetNEvents(); i++) {

      const Event * ev = ds->GetEvent(i);
      if (ev->GetClass() != classNumber ) continue;

      Double_t weight = ev->GetWeight();
      ic += weight; // count used events

      for (ivar=0; ivar<nvar; ivar++) {

         Double_t xi = ev->GetValue(ivar);
         vec(ivar) += xi*weight;
         mat2(ivar, ivar) += (xi*xi*weight);

         for (jvar=ivar+1; jvar<nvar; jvar++) {
            Double_t xj = ev->GetValue(jvar);
            mat2(ivar, jvar) += (xi*xj*weight);
         }
      }
   }

   for (ivar=0; ivar<nvar; ivar++)
      for (jvar=ivar+1; jvar<nvar; jvar++)
         mat2(jvar, ivar) = mat2(ivar, jvar); // symmetric matrix


   // variance-covariance
   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         (*mat)(ivar, jvar) = mat2(ivar, jvar)/ic - vec(ivar)*vec(jvar)/(ic*ic);
      }
   }

   return mat;
}
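
// In formula form, with W = sum_k w_k the summed event weights of the class:
//   C(i,j) = (1/W) * sum_k w_k x_i^k x_j^k
//          - (1/W^2) * (sum_k w_k x_i^k) * (sum_k w_k x_j^k)
// i.e. the weighted version of E[x_i x_j] - E[x_i]*E[x_j], accumulated in the
// loop above via mat2 (second moments), vec (first moments) and ic (= W).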

// --------------------------------------- new versions

////////////////////////////////////////////////////////////////////////////////
/// the dataset splitting

void
TMVA::DataSetFactory::InitOptions( TMVA::DataSetInfo& dsi,
                                   EvtStatsPerClass& nEventRequests,
                                   TString& normMode,
                                   UInt_t& splitSeed,
                                   TString& splitMode,
                                   TString& mixMode)
{
   Configurable splitSpecs( dsi.GetSplitOptions() );
   splitSpecs.SetConfigName("DataSetFactory");
   splitSpecs.SetConfigDescription( "Configuration options given in the \"PrepareForTrainingAndTesting\" call; these options define the creation of the data sets used for training and expert validation by TMVA" );

   splitMode = "Random";    // the splitting mode
   splitSpecs.DeclareOptionRef( splitMode, "SplitMode",
                                "Method of picking training and testing events (default: random)" );
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));

   mixMode = "SameAsSplitMode";    // the mixing mode
   splitSpecs.DeclareOptionRef( mixMode, "MixMode",
                                "Method of mixing events of different classes into one dataset (default: SameAsSplitMode)" );
   splitSpecs.AddPreDefVal(TString("SameAsSplitMode"));
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));

   splitSeed = 100;
   splitSpecs.DeclareOptionRef( splitSeed, "SplitSeed",
                                "Seed for random event shuffling" );

   normMode = "EqualNumEvents";  // the weight normalisation modes
   splitSpecs.DeclareOptionRef( normMode, "NormMode",
                                "Overall renormalisation of event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)" );
   splitSpecs.AddPreDefVal(TString("None"));
   splitSpecs.AddPreDefVal(TString("NumEvents"));
   splitSpecs.AddPreDefVal(TString("EqualNumEvents"));

   splitSpecs.DeclareOptionRef(fScaleWithPreselEff=kFALSE,"ScaleWithPreselEff","Scale the number of requested events by the eff. of the preselection cuts (or not)" );

   // the number of events

   // fill in the numbers
   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      TString clName = dsi.GetClassInfo(cl)->GetName();
      TString titleTrain = TString().Format("Number of training events of class %s (default: 0 = all)",clName.Data()).Data();
      TString titleTest  = TString().Format("Number of test events of class %s (default: 0 = all)",clName.Data()).Data();

      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTrainingEventsRequested, TString("nTrain_")+clName, titleTrain );
      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTestingEventsRequested , TString("nTest_")+clName , titleTest  );
   }

   splitSpecs.DeclareOptionRef( fVerbose, "V", "Verbosity (default: true)" );

   splitSpecs.DeclareOptionRef( fVerboseLevel=TString("Info"), "VerboseLevel", "VerboseLevel (Debug/Verbose/Info)" );
   splitSpecs.AddPreDefVal(TString("Debug"));
   splitSpecs.AddPreDefVal(TString("Verbose"));
   splitSpecs.AddPreDefVal(TString("Info"));

   splitSpecs.ParseOptions();
   splitSpecs.CheckForUnusedOptions();

   // output logging verbosity
   if (Verbose()) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Debug")   == 0) fLogger->SetMinType( kDEBUG );
   if (fVerboseLevel.CompareTo("Verbose") == 0) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Info")    == 0) fLogger->SetMinType( kINFO );

   // put all to upper case
   splitMode.ToUpper(); mixMode.ToUpper(); normMode.ToUpper();
   // adjust mixmode if same as splitmode option has been set
   Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
         << "\tSplitmode is: \"" << splitMode << "\" the mixmode is: \"" << mixMode << "\"" << Endl;
   if (mixMode=="SAMEASSPLITMODE") mixMode = splitMode;
   else if (mixMode!=splitMode)
      Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "DataSet splitmode="<<splitMode
            <<" differs from mixmode="<<mixMode<<Endl;
}
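
// These options arrive via the option string of Factory::PrepareTrainingAndTestTree,
// e.g. (as in the standard TMVA classification example):
//   factory->PrepareTrainingAndTestTree( mycut,
//      "nTrain_Signal=1000:nTrain_Background=1000:SplitMode=Random:NormMode=NumEvents:!V" );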


////////////////////////////////////////////////////////////////////////////////
/// build empty event vectors
/// distributes events between kTraining/kTesting/kMaxTreeType

void
TMVA::DataSetFactory::BuildEventVector( TMVA::DataSetInfo& dsi,
                                        TMVA::DataInputHandler& dataInput,
                                        EventVectorOfClassesOfTreeType& eventsmap,
                                        EvtStatsPerClass& eventCounts)
{
   const UInt_t nclasses = dsi.GetNClasses();

   eventsmap[ Types::kTraining ]    = EventVectorOfClasses(nclasses);
   eventsmap[ Types::kTesting ]     = EventVectorOfClasses(nclasses);
   eventsmap[ Types::kMaxTreeType ] = EventVectorOfClasses(nclasses);

   // create the type, weight and boostweight branches
   const UInt_t nvars = dsi.GetNVariables();
   const UInt_t ntgts = dsi.GetNTargets();
   const UInt_t nvis  = dsi.GetNSpectators();

   for (size_t i=0; i<nclasses; i++) {
      eventCounts[i].varAvLength = new Float_t[nvars];
      for (UInt_t ivar=0; ivar<nvars; ivar++)
         eventCounts[i].varAvLength[ivar] = 0;
   }

   // Bool_t haveArrayVariable = kFALSE;
   Bool_t *varIsArray = new Bool_t[nvars];

   // if we work with chains we need to remember the current tree if
   // the chain jumps to a new tree we have to reset the formulas
   for (UInt_t cl=0; cl<nclasses; cl++) {

      //Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create training and testing trees -- looping over class \"" << dsi.GetClassInfo(cl)->GetName() << "\" ..." << Endl;

      EventStats& classEventCounts = eventCounts[cl];

      // info output for weights
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tWeight expression for class \'" << dsi.GetClassInfo(cl)->GetName() << "\': \""
            << dsi.GetClassInfo(cl)->GetWeight() << "\"" << Endl;

      // used for chains only
      TString currentFileName("");

      std::vector<TreeInfo>::const_iterator treeIt(dataInput.begin(dsi.GetClassInfo(cl)->GetName()));
      for (;treeIt!=dataInput.end(dsi.GetClassInfo(cl)->GetName()); treeIt++) {

         // read first the variables
         std::vector<Float_t> vars(nvars);
         std::vector<Float_t> tgts(ntgts);
         std::vector<Float_t> vis(nvis);
         TreeInfo currentInfo = *treeIt;

         Log() << kDEBUG << "Building event vectors " << currentInfo.GetTreeType() << Endl;

         EventVector& event_v = eventsmap[currentInfo.GetTreeType()].at(cl);

         Bool_t isChain = (TString("TChain") == currentInfo.GetTree()->ClassName());
         currentInfo.GetTree()->LoadTree(0);
         ChangeToNewTree( currentInfo, dsi );

         // count number of events in tree before cut
         classEventCounts.nInitialEvents += currentInfo.GetTree()->GetEntries();

         // loop over events in ntuple
         const UInt_t nEvts = currentInfo.GetTree()->GetEntries();
         for (Long64_t evtIdx = 0; evtIdx < nEvts; evtIdx++) {
            currentInfo.GetTree()->LoadTree(evtIdx);

            // may need to reload tree in case of chains
            if (isChain) {
               if (currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName() != currentFileName) {
                  currentFileName = currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName();
                  ChangeToNewTree( currentInfo, dsi );
               }
            }
            currentInfo.GetTree()->GetEntry(evtIdx);
            Int_t sizeOfArrays = 1;
            Int_t prevArrExpr = 0;

            // ======= evaluate all formulas =================

            // first we check if some of the formulas are arrays
            for (UInt_t ivar=0; ivar<nvars; ivar++) {
               Int_t ndata = fInputFormulas[ivar]->GetNdata();
               classEventCounts.varAvLength[ivar] += ndata;
               if (ndata == 1) continue;
               // haveArrayVariable = kTRUE;
               varIsArray[ivar] = kTRUE;
               if (sizeOfArrays == 1) {
                  sizeOfArrays = ndata;
                  prevArrExpr = ivar;
               }
               else if (sizeOfArrays!=ndata) {
                  Log() << kERROR << Form("Dataset[%s] : ",dsi.GetName())<< "ERROR while preparing training and testing trees:" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   multiple array-type expressions of different length were encountered" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   location of error: event " << evtIdx
                        << " in tree " << currentInfo.GetTree()->GetName()
                        << " of file " << currentInfo.GetTree()->GetCurrentFile()->GetName() << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   expression " << fInputFormulas[ivar]->GetTitle() << " has "
                        << Form("Dataset[%s] : ",dsi.GetName()) << ndata << " entries, while" << Endl;
                  Log() << Form("Dataset[%s] : ",dsi.GetName())<< "   expression " << fInputFormulas[prevArrExpr]->GetTitle() << " has "
                        << Form("Dataset[%s] : ",dsi.GetName())<< fInputFormulas[prevArrExpr]->GetNdata() << " entries" << Endl;
                  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "Need to abort" << Endl;
               }
            }

            // now we read the information
            for (Int_t idata = 0; idata<sizeOfArrays; idata++) {
               Bool_t containsNaN = kFALSE;

               TTreeFormula* formula = 0;

               // the cut expression
               Float_t cutVal = 1;
               formula = fCutFormulas[cl];
               if (formula) {
                  Int_t ndata = formula->GetNdata();
                  cutVal = (ndata==1 ?
                            formula->EvalInstance(0) :
                            formula->EvalInstance(idata));
                  if (TMath::IsNaN(cutVal)) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "Cut expression resolves to invalid value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }

               // the input variables
               for (UInt_t ivar=0; ivar<nvars; ivar++) {
                  formula = fInputFormulas[ivar];
                  Int_t ndata = formula->GetNdata();
                  vars[ivar] = (ndata == 1 ?
                                formula->EvalInstance(0) :
                                formula->EvalInstance(idata));
                  if (TMath::IsNaN(vars[ivar])) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "Input expression resolves to invalid value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }

               // the targets
               for (UInt_t itrgt=0; itrgt<ntgts; itrgt++) {
                  formula = fTargetFormulas[itrgt];
                  Int_t ndata = formula->GetNdata();
                  tgts[itrgt] = (ndata == 1 ?
                                 formula->EvalInstance(0) :
                                 formula->EvalInstance(idata));
                  if (TMath::IsNaN(tgts[itrgt])) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "Target expression resolves to invalid value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }

               // the spectators
               for (UInt_t itVis=0; itVis<nvis; itVis++) {
                  formula = fSpectatorFormulas[itVis];
                  Int_t ndata = formula->GetNdata();
                  vis[itVis] = (ndata == 1 ?
                                formula->EvalInstance(0) :
                                formula->EvalInstance(idata));
                  if (TMath::IsNaN(vis[itVis])) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "Spectator expression resolves to invalid value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }


               // the weight
               Float_t weight = currentInfo.GetWeight(); // multiply by tree weight
               formula = fWeightFormula[cl];
               if (formula!=0) {
                  Int_t ndata = formula->GetNdata();
                  weight *= (ndata == 1 ?
                             formula->EvalInstance() :
                             formula->EvalInstance(idata));
                  if (TMath::IsNaN(weight)) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "Weight expression resolves to invalid value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }

               // Count the events before rejection due to cut or NaN
               // value (weighted and unweighted)
               classEventCounts.nEvBeforeCut++;
               if (!TMath::IsNaN(weight))
                  classEventCounts.nWeEvBeforeCut += weight;

               // apply the cut, skip rest if cut is not fulfilled
               if (cutVal<0.5) continue;

               // global flag if negative weights exist -> can be used
               // by classifiers who may require special data
               // treatment (also print warning)
               if (weight < 0) classEventCounts.nNegWeights++;

               // now read the event-values (variables and regression targets)

               if (containsNaN) {
                  Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "Event " << evtIdx;
                  if (sizeOfArrays>1) Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< " rejected" << Endl;
                  continue;
               }

               // Count the events after rejection due to cut or NaN value
               // (weighted and unweighted)
               classEventCounts.nEvAfterCut++;
               classEventCounts.nWeEvAfterCut += weight;

               // event accepted, fill temporary ntuple
               event_v.push_back(new Event(vars, tgts , vis, cl , weight));
            }
         }
         currentInfo.GetTree()->ResetBranchAddresses();
      }
   }

   // for output format, get the maximum class name length
   Int_t maxL = dsi.GetClassNameMaxLength();

   Log() << kHEADER << Form("[%s] : ",dsi.GetName()) << "Number of events in input trees" << Endl;
   Log() << kDEBUG << "(after possible flattening of arrays):" << Endl;


   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      Log() << kDEBUG //<< Form("[%s] : ",dsi.GetName())
            << "    "
            << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
            << "      -- number of events       : "
            << std::setw(5) << eventCounts[cl].nEvBeforeCut
            << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvBeforeCut << Endl;
   }

   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "    " << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
            <<" tree -- total number of entries: "
            << std::setw(5) << dataInput.GetEntries(dsi.GetClassInfo(cl)->GetName()) << Endl;
   }

   if (fScaleWithPreselEff)
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tPreselection: (will affect number of requested training and testing events)" << Endl;
   else
      Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
            << "\tPreselection: (will NOT affect number of requested training and testing events)" << Endl;

   if (dsi.HasCuts()) {
      for (UInt_t cl = 0; cl< dsi.GetNClasses(); cl++) {
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    " << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " requirement: \"" << dsi.GetClassInfo(cl)->GetCut() << "\"" << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << "      -- number of events passed: "
               << std::setw(5) << eventCounts[cl].nEvAfterCut
               << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvAfterCut << Endl;
         Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << "      -- efficiency             : "
               << std::setw(6) << eventCounts[cl].nWeEvAfterCut/eventCounts[cl].nWeEvBeforeCut << Endl;
      }
   }
   else Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
              << "    No preselection cuts applied on event classes" << Endl;

   delete[] varIsArray;

}
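
// Note on the cut handling in the event loop above: the per-class cut
// TTreeFormula evaluates to 1.0 (pass) or 0.0 (fail), so the test
//   if (cutVal<0.5) continue;
// is simply a robust boolean comparison on a Float_t result.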

////////////////////////////////////////////////////////////////////////////////
/// Select and distribute unassigned events to kTraining and kTesting
///Bool_t emptyUndefined = kTRUE;

TMVA::DataSet*
TMVA::DataSetFactory::MixEvents( DataSetInfo& dsi,
                                 EventVectorOfClassesOfTreeType& tmpEventVector,
                                 EvtStatsPerClass& eventCounts,
                                 const TString& splitMode,
                                 const TString& mixMode,
                                 const TString& normMode,
                                 UInt_t splitSeed)
{
   // check if the vectors of all classes are empty
   //for( Int_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
   //   emptyUndefined &= tmpEventVector[Types::kMaxTreeType].at(cls).empty();
   //}

   TMVA::RandomGenerator rndm( splitSeed );

   // ==== splitting of undefined events to kTraining and kTesting

   // if splitMode contains "RANDOM", then shuffle the undefined events
   if (splitMode.Contains( "RANDOM" ) /*&& !emptyUndefined*/ ) {
      // random shuffle the undefined events of each class
      for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
         EventVector& unspecifiedEvents = tmpEventVector[Types::kMaxTreeType].at(cls);
         if( ! unspecifiedEvents.empty() ) {
            Log() << kDEBUG << "randomly shuffling "
                  << unspecifiedEvents.size()
                  << " events of class " << cls
                  << " which are not yet associated to testing or training" << Endl;
            std::random_shuffle( unspecifiedEvents.begin(),
                                 unspecifiedEvents.end(),
                                 rndm );
         }
      }
   }

   // check for each class the number of training and testing events, the requested number and the available number
   Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "SPLITTING ========" << Endl;
   for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "---- class " << cls << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "check number of training/testing events, requested and available number of events and for class " << cls << Endl;

      // check if enough or too many events are already in the training/testing eventvectors of the class cls
      EventVector& eventVectorTraining  = tmpEventVector[ Types::kTraining ].at(cls);
      EventVector& eventVectorTesting   = tmpEventVector[ Types::kTesting ].at(cls);
      EventVector& eventVectorUndefined = tmpEventVector[ Types::kMaxTreeType ].at(cls);

      Int_t availableTraining  = eventVectorTraining.size();
      Int_t availableTesting   = eventVectorTesting.size();
      Int_t availableUndefined = eventVectorUndefined.size();

      Float_t presel_scale;
      if (fScaleWithPreselEff) {
         presel_scale = eventCounts[cls].cutScaling();
         if (presel_scale < 1)
            Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " you have opted for scaling the number of requested training/testing events\n to be scaled by the preselection efficiency"<< Endl;
      }else{
         presel_scale = 1.; // this scaling was too confusing to most people, including me! Sorry... (Helge)
         if (eventCounts[cls].cutScaling() < 1)
            Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " you have opted for interpreting the requested number of training/testing events\n to be the number of events AFTER your preselection cuts" << Endl;

      }
      Int_t requestedTraining = Int_t(eventCounts[cls].nTrainingEventsRequested * presel_scale);
      Int_t requestedTesting  = Int_t(eventCounts[cls].nTestingEventsRequested  * presel_scale);

      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in training trees    : " << availableTraining  << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in testing trees     : " << availableTesting   << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "events in unspecified trees : " << availableUndefined << Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "requested for training      : " << requestedTraining;

      if(presel_scale<1)
         Log() << " ( " << eventCounts[cls].nTrainingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      else
         Log() << Endl;
      Log() << kDEBUG << "requested for testing       : " << requestedTesting;
      if(presel_scale<1)
         Log() << " ( " << eventCounts[cls].nTestingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      else
         Log() << Endl;

      // nomenclature r  = available training
      //              s  = available testing
      //              u  = available undefined
      //              R  = requested training
      //              S  = requested testing
      //              nR = to be used to select training events
      //              nS = to be used to select test events
      // we have the constraint: nR + nS < r+s+u,
      // since we can not use more events than we have
      // free events: Nfree = u - Thet(R-r) - Thet(S-s)
      // nomenclature: Thet(x) = x, if x>0 else 0
      // nR = max(R,r) + 0.5 * Nfree
      // nS = max(S,s) + 0.5 * Nfree
      // nR + nS = R+S + u-R+r-S+s = u+r+s = ok! for R>r
      // nR + nS = r+S + u-S+s     = u+r+s = ok! for r>R

      // three different cases might occur here
      //
      // Case a
      // requestedTraining and requestedTesting >0
      // free events: Nfree = u-Thet(R-r)-Thet(S-s)
      // nR = Max(R,r) + 0.5 * Nfree
      // nS = Max(S,s) + 0.5 * Nfree
      //
      // Case b
      // exactly one of requestedTraining or requestedTesting >0
      // assume training R >0
      // nR = max(R,r)
      // nS = s+u+r-nR
      // and s=nS
      //
      // Case c
      // requestedTraining=0, requestedTesting=0
      // Nfree = u-|r-s|
      // if NFree >=0
      //    R = Max(r,s) + 0.5 * Nfree = S
      // else if r>s
      //    R = r; S = s+u
      // else
      //    R = r+u; S = s
      //
      // Next steps:
      // Determination of Event numbers R,S, nR, nS
      // distribute undefined events according to nR, nS
      // finally determine actual sub samples from nR and nS to be used in training / testing
      //
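      // Worked example for Case c (nothing requested): with r=300 training,
      // s=100 testing and u=600 undefined events, Nfree = 600 - |300-100| = 400 >= 0,
      // so both samples are balanced to R = S = max(300,100) + 0.5*400 = 500,
      // which is exactly allAvailable/2 = (300+100+600)/2.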
      Int_t useForTesting(0),useForTraining(0);
      Int_t allAvailable(availableUndefined + availableTraining + availableTesting);

      if( (requestedTraining == 0) && (requestedTesting == 0)){

         // Case C: balance the number of training and testing events

         if ( availableUndefined >= TMath::Abs(availableTraining - availableTesting) ) {
            // enough unspecified are available to equal training and testing
            useForTraining = useForTesting = allAvailable/2;
         } else {
            // all unspecified are assigned to the smaller of training / testing
            useForTraining = availableTraining;
            useForTesting  = availableTesting;
            if (availableTraining < availableTesting)
               useForTraining += availableUndefined;
            else
               useForTesting += availableUndefined;
         }
         requestedTraining = useForTraining;
         requestedTesting  = useForTesting;
      }

      else if (requestedTesting == 0){
         // case B
         useForTraining = TMath::Max(requestedTraining,availableTraining);
         if (allAvailable < useForTraining) {
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested for training ("
                  << requestedTraining << ") than available ("
                  << allAvailable << ")!" << Endl;
         }
         useForTesting    = allAvailable - useForTraining; // the rest
         requestedTesting = useForTesting;
      }

      else if (requestedTraining == 0){ // case B)
         useForTesting = TMath::Max(requestedTesting,availableTesting);
         if (allAvailable < useForTesting) {
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested for testing ("
                  << requestedTesting << ") than available ("
                  << allAvailable << ")!" << Endl;
         }
         useForTraining    = allAvailable - useForTesting; // the rest
         requestedTraining = useForTraining;
      }

      else {
         // Case A
         // requestedTraining R and requestedTesting S >0
         // free events: Nfree = u-Thet(R-r)-Thet(S-s)
         // nR = Max(R,r) + 0.5 * Nfree
         // nS = Max(S,s) + 0.5 * Nfree
         Int_t stillNeedForTraining = TMath::Max(requestedTraining-availableTraining,0);
         Int_t stillNeedForTesting  = TMath::Max(requestedTesting-availableTesting,0);

         int NFree = availableUndefined - stillNeedForTraining - stillNeedForTesting;
         if (NFree < 0) NFree = 0;
         useForTraining = TMath::Max(requestedTraining,availableTraining) + NFree/2;
         useForTesting  = allAvailable - useForTraining; // the rest
      }

      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "determined event sample size to select training sample from="<<useForTraining<<Endl;
      Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "determined event sample size to select test sample from="<<useForTesting<<Endl;



      // associate undefined events
      if( splitMode == "ALTERNATE" ){
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "split 'ALTERNATE'" << Endl;
         Int_t nTraining = availableTraining;
         Int_t nTesting  = availableTesting;
         for( EventVector::iterator it = eventVectorUndefined.begin(), itEnd = eventVectorUndefined.end(); it != itEnd; ){
            ++nTraining;
            if( nTraining <= requestedTraining ){
               eventVectorTraining.insert( eventVectorTraining.end(), (*it) );
               ++it;
            }
            if( it != itEnd ){
               ++nTesting;
               eventVectorTesting.insert( eventVectorTesting.end(), (*it) );
               ++it;
            }
         }
      } else {
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "split '" << splitMode << "'" << Endl;

         // test if enough events are available
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableundefined  : " << availableUndefined << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "useForTraining      : " << useForTraining << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "useForTesting       : " << useForTesting << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableTraining   : " << availableTraining << Endl;
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "availableTesting    : " << availableTesting << Endl;

         if( availableUndefined<(useForTraining-availableTraining) ||
             availableUndefined<(useForTesting -availableTesting ) ||
             availableUndefined<(useForTraining+useForTesting-availableTraining-availableTesting ) ){
            Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "More events requested than available!" << Endl;
         }

         // select the events
         if (useForTraining>availableTraining){
            eventVectorTraining.insert( eventVectorTraining.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTraining- availableTraining );
            eventVectorUndefined.erase( eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining- availableTraining);
         }
         if (useForTesting>availableTesting){
            eventVectorTesting.insert( eventVectorTesting.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTesting- availableTesting );
         }
      }
      eventVectorUndefined.clear();

      // finally shorten the event vectors to the requested size by removing random events
      if (splitMode.Contains( "RANDOM" )){
         UInt_t sizeTraining = eventVectorTraining.size();
         if( sizeTraining > UInt_t(requestedTraining) ){
            std::vector<UInt_t> indicesTraining( sizeTraining );
            // make indices
            std::generate( indicesTraining.begin(), indicesTraining.end(), TMVA::Increment<UInt_t>(0) );
            // shuffle indices
            std::random_shuffle( indicesTraining.begin(), indicesTraining.end(), rndm );
            // erase indices of not needed events
            indicesTraining.erase( indicesTraining.begin()+sizeTraining-UInt_t(requestedTraining), indicesTraining.end() );
            // delete all events with the given indices
            for( std::vector<UInt_t>::iterator it = indicesTraining.begin(), itEnd = indicesTraining.end(); it != itEnd; ++it ){
               delete eventVectorTraining.at( (*it) ); // delete event
               eventVectorTraining.at( (*it) ) = NULL; // set pointer to NULL
            }
            // now remove and erase all events with pointer==NULL
            eventVectorTraining.erase( std::remove( eventVectorTraining.begin(), eventVectorTraining.end(), (void*)NULL ), eventVectorTraining.end() );
         }

         UInt_t sizeTesting = eventVectorTesting.size();
         if( sizeTesting > UInt_t(requestedTesting) ){
            std::vector<UInt_t> indicesTesting( sizeTesting );
            // make indices
            std::generate( indicesTesting.begin(), indicesTesting.end(), TMVA::Increment<UInt_t>(0) );
            // shuffle indices
            std::random_shuffle( indicesTesting.begin(), indicesTesting.end(), rndm );
            // erase indices of not needed events
            indicesTesting.erase( indicesTesting.begin()+sizeTesting-UInt_t(requestedTesting), indicesTesting.end() );
            // delete all events with the given indices
            for( std::vector<UInt_t>::iterator it = indicesTesting.begin(), itEnd = indicesTesting.end(); it != itEnd; ++it ){
               delete eventVectorTesting.at( (*it) ); // delete event
               eventVectorTesting.at( (*it) ) = NULL; // set pointer to NULL
            }
            // now remove and erase all events with pointer==NULL
            eventVectorTesting.erase( std::remove( eventVectorTesting.begin(), eventVectorTesting.end(), (void*)NULL ), eventVectorTesting.end() );
         }
      }
      else { // erase at end
         if( eventVectorTraining.size() < UInt_t(requestedTraining) )
            Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "DataSetFactory/requested number of training samples larger than size of eventVectorTraining.\n"
                  << "There is probably an issue. Please contact the TMVA developers." << Endl;
         std::for_each( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end(), DeleteFunctor<Event>() );
         eventVectorTraining.erase(eventVectorTraining.begin()+requestedTraining,eventVectorTraining.end());

         if( eventVectorTesting.size() < UInt_t(requestedTesting) )
            Log() << kWARNING << Form("Dataset[%s] : ",dsi.GetName())<< "DataSetFactory/requested number of testing samples larger than size of eventVectorTesting.\n"
                  << "There is probably an issue. Please contact the TMVA developers." << Endl;
         std::for_each( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end(), DeleteFunctor<Event>() );
         eventVectorTesting.erase(eventVectorTesting.begin()+requestedTesting,eventVectorTesting.end());
      }
   }

   TMVA::DataSetFactory::RenormEvents( dsi, tmpEventVector, eventCounts, normMode );

   Int_t trainingSize = 0;
   Int_t testingSize  = 0;

   // sum up number of training and testing events
   for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
      trainingSize += tmpEventVector[Types::kTraining].at(cls).size();
      testingSize  += tmpEventVector[Types::kTesting].at(cls).size();
   }

   // --- collect all training (testing) events into the training (testing) eventvector

   // create event vectors and reserve enough space
   EventVector* trainingEventVector = new EventVector();
   EventVector* testingEventVector  = new EventVector();

   trainingEventVector->reserve( trainingSize );
   testingEventVector->reserve( testingSize );


   // collect the events

   // mixing of kTraining and kTesting data sets
   Log() << kDEBUG << " MIXING ============= " << Endl;

   if( mixMode == "ALTERNATE" ){
      // Inform the user when alternate mixing is requested for event classes
      // with different numbers of events: this works, but the alternation
      // stops at the last event of the smaller class.
      for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
         if (tmpEventVector[Types::kTraining].at(cls).size() != tmpEventVector[Types::kTraining].at(0).size()){
            Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Training sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
         }
         if (tmpEventVector[Types::kTesting].at(cls).size() != tmpEventVector[Types::kTesting].at(0).size()){
            Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Testing sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
         }
      }
      typedef EventVector::iterator EvtVecIt;
      EvtVecIt itEvent, itEventEnd;

      // insert first class
      Log() << kDEBUG << "insert class 0 into training and test vector" << Endl;
      trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(0).begin(), tmpEventVector[Types::kTraining].at(0).end() );
      testingEventVector->insert( testingEventVector->end(), tmpEventVector[Types::kTesting].at(0).begin(), tmpEventVector[Types::kTesting].at(0).end() );

      // insert other classes
      EvtVecIt itTarget;
      for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
         Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "insert class " << cls << Endl;
         // training vector
         itTarget = trainingEventVector->begin() - 1; // start one before begin
         // loop over source
         for( itEvent = tmpEventVector[Types::kTraining].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTraining].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
            // if( std::distance( itTarget, trainingEventVector->end()) < Int_t(cls+1) ) {
            if( (trainingEventVector->end() - itTarget) < Int_t(cls+1) ) {
               itTarget = trainingEventVector->end();
               trainingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
               break;
            }else{
               itTarget += cls+1;
               trainingEventVector->insert( itTarget, (*itEvent) ); // fill event
            }
         }
         // testing vector
         itTarget = testingEventVector->begin() - 1;
         // loop over source
         for( itEvent = tmpEventVector[Types::kTesting].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTesting].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
            // if( std::distance( itTarget, testingEventVector->end()) < Int_t(cls+1) ) {
            if( ( testingEventVector->end() - itTarget ) < Int_t(cls+1) ) {
               itTarget = testingEventVector->end();
               testingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
               break;
            }else{
               itTarget += cls+1;
               testingEventVector->insert( itTarget, (*itEvent) ); // fill event
            }
         }
      }

      // debugging output: classnumbers of all events in training and testing vectors
      // std::cout << std::endl;
      // std::cout << "TRAINING VECTOR" << std::endl;
      // std::transform( trainingEventVector->begin(), trainingEventVector->end(), ostream_iterator<Int_t>(std::cout, "|"), std::mem_fun(&TMVA::Event::GetClass) );

      // std::cout << std::endl;
      // std::cout << "TESTING VECTOR" << std::endl;
      // std::transform( testingEventVector->begin(), testingEventVector->end(), ostream_iterator<Int_t>(std::cout, "|"), std::mem_fun(&TMVA::Event::GetClass) );
      // std::cout << std::endl;

   }else{
      for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
         trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(cls).begin(), tmpEventVector[Types::kTraining].at(cls).end() );
         testingEventVector->insert ( testingEventVector->end(),  tmpEventVector[Types::kTesting].at(cls).begin(),  tmpEventVector[Types::kTesting].at(cls).end()  );
      }
   }
1347 
1348  // std::cout << "trainingEventVector " << trainingEventVector->size() << std::endl;
1349  // std::cout << "testingEventVector " << testingEventVector->size() << std::endl;
1350 
1351  // std::transform( trainingEventVector->begin(), trainingEventVector->end(), ostream_iterator<Int_t>(std::cout, "> \n"), std::mem_fun(&TMVA::Event::GetNVariables) );
1352  // std::transform( testingEventVector->begin(), testingEventVector->end(), ostream_iterator<Int_t>(std::cout, "> \n"), std::mem_fun(&TMVA::Event::GetNVariables) );
1353 
1354  // delete the tmpEventVector (but not the events therein)
1355  tmpEventVector[Types::kTraining].clear();
1356  tmpEventVector[Types::kTesting].clear();
1357 
1358  tmpEventVector[Types::kMaxTreeType].clear();
1359 
1360  if (mixMode == "RANDOM") {
1361  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "shuffling events"<<Endl;
1362 
1363  std::random_shuffle( trainingEventVector->begin(), trainingEventVector->end(), rndm );
1364  std::random_shuffle( testingEventVector->begin(), testingEventVector->end(), rndm );
1365  }
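 // Editorial note: std::random_shuffle (used above with the random-number
 // functor 'rndm') is deprecated since C++14 and removed in C++17. A sketch
 // of a modern replacement; std::mt19937 produces a different sequence than
 // the original generator, so the resulting split would not be identical:
#if 0
 #include <algorithm>
 #include <random>
 #include <vector>

 void ShuffleEvents( std::vector<TMVA::Event*>& events, UInt_t seed )
 {
    std::mt19937 rng( seed );  // a fixed seed keeps the shuffle reproducible
    std::shuffle( events.begin(), events.end(), rng );
 }
#endif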
1366 
1367  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "trainingEventVector " << trainingEventVector->size() << Endl;
1368  Log() << kDEBUG << Form("Dataset[%s] : ",dsi.GetName())<< "testingEventVector " << testingEventVector->size() << Endl;
1369 
1370  // create dataset
1371  DataSet* ds = new DataSet(dsi);
1372 
1373  // Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create internal training tree" << Endl;
1374  ds->SetEventCollection(trainingEventVector, Types::kTraining );
1375  // Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Create internal testing tree" << Endl;
1376  ds->SetEventCollection(testingEventVector, Types::kTesting );
1377 
1378 
1379  if (ds->GetNTrainingEvents() < 1){
1380  Log() << kFATAL << "Dataset " << std::string(dsi.GetName()) << " does not have any training events; I'd better stop here and let you fix that first " << Endl;
1381  }
1382 
1383  if (ds->GetNTestEvents() < 1) {
1384  Log() << kERROR << "Dataset " << std::string(dsi.GetName()) << " does not have any testing events; this will likely cause problems later, but for now I continue " << Endl;
1385  }
1386 
1387 
1388  return ds;
1389 
1390 }
1391 
1392 ////////////////////////////////////////////////////////////////////////////////
1393 /// =============================================================================
1394 /// renormalisation of the TRAINING event weights
1395 /// - none: use the weights as supplied by the user (the relative weight is nevertheless stored for later use)
1396 /// - numEvents: renormalise such that the sum of the training weights in each
1397 ///   class equals the number of training events requested for that class
1398 /// - equalNumEvents: reweight the training events such that the effective (weighted)
1399 ///   number of events is the same in all classes, and equal to the number of training events of the first class
1400 /// =============================================================================
1401 
1402 void
1403 TMVA::DataSetFactory::RenormEvents( DataSetInfo& dsi,
1404  EventVectorOfClassesOfTreeType& tmpEventVector,
1405  const EvtStatsPerClass& eventCounts,
1406  const TString& normMode )
1407 {
1408 
1409 
1410  // print rescaling info
1411  // ---------------------------------
1412  // compute sizes and sums of weights
1413  Int_t trainingSize = 0;
1414  Int_t testingSize = 0;
1415 
1416  ValuePerClass trainingSumWeightsPerClass( dsi.GetNClasses() );
1417  ValuePerClass testingSumWeightsPerClass( dsi.GetNClasses() );
1418 
1419  NumberPerClass trainingSizePerClass( dsi.GetNClasses() );
1420  NumberPerClass testingSizePerClass( dsi.GetNClasses() );
1421 
1422  Double_t trainingSumSignalWeights = 0;
1423  Double_t trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1424  Double_t testingSumSignalWeights = 0;
1425  Double_t testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1426 
1427 
1428 
1429  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1430  trainingSizePerClass.at(cls) = tmpEventVector[Types::kTraining].at(cls).size();
1431  testingSizePerClass.at(cls) = tmpEventVector[Types::kTesting].at(cls).size();
1432 
1433  trainingSize += trainingSizePerClass.at(cls);
1434  testingSize += testingSizePerClass.at(cls);
1435 
1436  // the functional solution
1437  // sum the weights in Double_t, although the individual weights are Float_t, to prevent rounding issues when adding many floating-point numbers
1438  //
1439  // accumulate --> does what the name says
1440  // begin() and end() denote the range of the vector to be accumulated
1441  // Double_t(0) tells accumulate the element type and the starting value
1442  // compose_binary creates a binary function object out of ...
1443  // std::plus<Double_t>(), which knows how to sum two doubles,
1444  // null<Double_t>(), which leaves the first argument (the running sum) unchanged and returns it, and
1445  // std::mem_fun, which knows how to call a member function (type and member function given as arguments) and return the result
1446  //
1447  // taken together, this sums up all the event weights of the events in the vector and returns the total
1448  trainingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(),
1449  tmpEventVector[Types::kTraining].at(cls).end(),
1450  Double_t(0),
1451  compose_binary( std::plus<Double_t>(),
1452  null<Double_t>(),
1453  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1454 
1455  testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),
1456  tmpEventVector[Types::kTesting].at(cls).end(),
1457  Double_t(0),
1458  compose_binary( std::plus<Double_t>(),
1459  null<Double_t>(),
1460  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
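 // Editorial sketch: with C++11 the compose_binary/null/mem_fun construction
 // above can be written as a lambda; a self-contained equivalent of one
 // per-class weight sum (the helper name is illustrative only):
#if 0
 #include <numeric>
 #include <vector>

 Double_t SumOfOriginalWeights( const std::vector<TMVA::Event*>& events )
 {
    return std::accumulate( events.begin(), events.end(), Double_t(0),
                            []( Double_t sum, const TMVA::Event* ev ) {
                               return sum + ev->GetOriginalWeight();
                            } );
 }
#endif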
1461 
1462  if ( cls == dsi.GetSignalClassIndex()){
1463  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1464  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1465  }else{
1466  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1467  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1468  }
1469  }
1470 
1471  // ---------------------------------
1472  // compute renormalization factors
1473 
1474  ValuePerClass renormFactor( dsi.GetNClasses() );
1475 
1476 
1477  // for information purposes
1478  dsi.SetNormalization( normMode );
1479  // !! these will be overwritten later by the 'rescaled' ones if
1480  // NormMode != None !!!
1481  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1482  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1483  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1484  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1485 
1486 
1487  if (normMode == "NONE") {
1488  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "No weight renormalisation applied: use original global and event weights" << Endl;
1489  return;
1490  }
1491  //changed by Helge 27.5.2013: the idea is to have the SAME amount of effective TRAINING data
1492  //for signal and background.
1493  // Testing events are irrelevant for this and might actually skew the whole normalisation!
1494  else if (normMode == "NUMEVENTS") {
1495  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1496  << "\tWeight renormalisation mode: \"NumEvents\": renormalises all event classes " << Endl;
1497  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1498  << " such that the effective (weighted) number of events in each class equals the respective " << Endl;
1499  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1500  << " number of events (entries) that you demanded in PrepareTrainingAndTestTree(\"\",\"nTrain_Signal=.. )" << Endl;
1501  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1502  << " ... i.e. such that Sum[i=1..N_j]{w_i} = N_j, j=0,1,2..." << Endl;
1503  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1504  << " ... (note that N_j is the sum of TRAINING events (nTrain_j...with j=Signal,Background.." << Endl;
1505  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1506  << " ..... Testing events are not renormalised nor included in the renormalisation factor! )"<< Endl;
1507 
1508  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1509  // renormFactor.at(cls) = ( (trainingSizePerClass.at(cls) + testingSizePerClass.at(cls))/
1510  // (trainingSumWeightsPerClass.at(cls) + testingSumWeightsPerClass.at(cls)) );
1511  //changed by Helge 27.5.2013
1512  renormFactor.at(cls) = ((Float_t)trainingSizePerClass.at(cls) )/
1513  (trainingSumWeightsPerClass.at(cls)) ;
1514  }
1515  }
1516  else if (normMode == "EQUALNUMEVENTS") {
1517  //changed by Helge 27.5.2013: the idea is to have the SAME amount of effective TRAINING data
1518  //for signal and background.
1519  //Previously, each data source was normalised to its own number of entries, even for
1520  //training+testing combined, which served no clear purpose.
1521 
1522  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "Weight renormalisation mode: \"EqualNumEvents\": renormalises all event classes ..." << Endl;
1523  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " such that the effective (weighted) number of events in each class is the same " << Endl;
1524  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " (and equals the number of events (entries) given for class=0 )" << Endl;
1525  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ..." << Endl;
1526  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << "... (note that N_j is the sum of TRAINING events" << Endl;
1527  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << " ..... Testing events are not renormalised nor included in the renormalisation factor!)" << Endl;
1528 
1529  // normalize to size of first class
1530  UInt_t referenceClass = 0;
1531  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ) {
1532  renormFactor.at(cls) = Float_t(trainingSizePerClass.at(referenceClass))/
1533  (trainingSumWeightsPerClass.at(cls));
1534  }
1535  }
1536  else {
1537  Log() << kFATAL << Form("Dataset[%s] : ",dsi.GetName())<< "<PrepareForTrainingAndTesting> Unknown NormMode: " << normMode << Endl;
1538  }
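 // Summary of the branches above (editorial note): for "NumEvents" the
 // per-class factor is
 //    renormFactor_j = N_train_j / Sum[i=1..N_train_j]{w_i} ,
 // while for "EqualNumEvents" every class is scaled to the training size of
 // the first (reference) class:
 //    renormFactor_j = N_train_0 / Sum[i=1..N_train_j]{w_i} .
 // Only TRAINING events enter these factors, and only training event weights
 // are rescaled below; testing weights stay untouched.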
1539 
1540  // ---------------------------------
1541  // now apply the normalization factors
1542  Int_t maxL = dsi.GetClassNameMaxLength();
1543  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls<clsEnd; ++cls) {
1544  Log() << kDEBUG //<< Form("Dataset[%s] : ",dsi.GetName())
1545  << "--> Rescale " << setiosflags(ios::left) << std::setw(maxL)
1546  << dsi.GetClassInfo(cls)->GetName() << " event weights by factor: " << renormFactor.at(cls) << Endl;
1547  for (EventVector::iterator it = tmpEventVector[Types::kTraining].at(cls).begin(),
1548  itEnd = tmpEventVector[Types::kTraining].at(cls).end(); it != itEnd; ++it){
1549  (*it)->SetWeight ((*it)->GetWeight() * renormFactor.at(cls));
1550  }
1551 
1552  }
1553 
1554 
1555  // print out the result
1556  // (same code as before; this could be factored out into a helper)
1557  //
1558 
1559  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1560  << "Number of training and testing events" << Endl;
1561  Log() << kDEBUG << "\tafter rescaling:" << Endl;
1562  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1563  << "---------------------------------------------------------------------------" << Endl;
1564 
1565  trainingSumSignalWeights = 0;
1566  trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1567  testingSumSignalWeights = 0;
1568  testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1569 
1570  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1571 
1572  trainingSumWeightsPerClass.at(cls) = (std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(), // accumulate --> start at begin
1573  tmpEventVector[Types::kTraining].at(cls).end(), // until end()
1574  Double_t(0), // values are of type double
1575  compose_binary( std::plus<Double_t>(), // define addition for doubles
1576  null<Double_t>(), // take the argument, don't do anything and return it
1577  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) )); // take the value from GetOriginalWeight
1578 
1579  testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),
1580  tmpEventVector[Types::kTesting].at(cls).end(),
1581  Double_t(0),
1582  compose_binary( std::plus<Double_t>(),
1583  null<Double_t>(),
1584  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1585 
1586 
1587  if ( cls == dsi.GetSignalClassIndex()){
1588  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1589  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1590  }else{
1591  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1592  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1593  }
1594 
1595  // output statistics
1596 
1597  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1598  << setiosflags(ios::left) << std::setw(maxL)
1599  << dsi.GetClassInfo(cls)->GetName() << " -- "
1600  << "training events : " << trainingSizePerClass.at(cls) << Endl;
1601  Log() << kDEBUG << "\t(sum of weights: " << trainingSumWeightsPerClass.at(cls) << ")"
1602  << " - requested were " << eventCounts[cls].nTrainingEventsRequested << " events" << Endl;
1603  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1604  << setiosflags(ios::left) << std::setw(maxL)
1605  << dsi.GetClassInfo(cls)->GetName() << " -- "
1606  << "testing events : " << testingSizePerClass.at(cls) << Endl;
1607  Log() << kDEBUG << "\t(sum of weights: " << testingSumWeightsPerClass.at(cls) << ")"
1608  << " - requested were " << eventCounts[cls].nTestingEventsRequested << " events" << Endl;
1609  Log() << kINFO //<< Form("Dataset[%s] : ",dsi.GetName())
1610  << setiosflags(ios::left) << std::setw(maxL)
1611  << dsi.GetClassInfo(cls)->GetName() << " -- "
1612  << "training and testing events: "
1613  << (trainingSizePerClass.at(cls)+testingSizePerClass.at(cls)) << Endl;
1614  Log() << kDEBUG << "\t(sum of weights: "
1615  << (trainingSumWeightsPerClass.at(cls)+testingSumWeightsPerClass.at(cls)) << ")" << Endl;
1616  if(eventCounts[cls].nEvAfterCut<eventCounts[cls].nEvBeforeCut) {
1617  Log() << kINFO << Form("Dataset[%s] : ",dsi.GetName()) << setiosflags(ios::left) << std::setw(maxL)
1618  << dsi.GetClassInfo(cls)->GetName() << " -- "
1619  << "due to the preselection a scaling factor has been applied to the numbers of requested events: "
1620  << eventCounts[cls].cutScaling() << Endl;
1621  }
1622  }
1623  Log() << kINFO << Endl;
1624 
1625  // for information purposes
1626  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1627  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1628  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1629  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1630 
1631 
1632 }
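 // Editorial sketch: how a user selects the normalisation handled by
 // RenormEvents. The option string is the one referred to in the log messages
 // above; the numbers are placeholders, and in recent releases the call lives
 // on TMVA::DataLoader rather than TMVA::Factory:
#if 0
 factory->PrepareTrainingAndTestTree( TCut(""),
    "nTrain_Signal=1000:nTrain_Background=1000:SplitMode=Random:"
    "NormMode=EqualNumEvents:!V" );
#endif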
1633 