DataSetFactory.cxx
// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Eckhard von Toerne, Helge Voss

/*****************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
 * Package: TMVA                                                            *
 * Class  : DataSetFactory                                                  *
 * Web    : http://tmva.sourceforge.net                                     *
 *                                                                          *
 * Description:                                                             *
 *      Implementation (see header for description)                        *
 *                                                                          *
 * Authors (alphabetical):                                                  *
 *      Andreas Hoecker  <Andreas.Hocker@cern.ch>   - CERN, Switzerland     *
 *      Peter Speckmayer <Peter.Speckmayer@cern.ch> - CERN, Switzerland     *
 *      Joerg Stelzer    <Joerg.Stelzer@cern.ch>    - MSU, USA              *
 *      Eckhard von Toerne <evt@physik.uni-bonn.de> - U. of Bonn, Germany   *
 *      Helge Voss       <Helge.Voss@cern.ch>       - MPI-K Heidelberg, Germany *
 *                                                                          *
 * Copyright (c) 2009:                                                      *
 *      CERN, Switzerland                                                   *
 *      MPI-K Heidelberg, Germany                                           *
 *      U. of Bonn, Germany                                                 *
 * Redistribution and use in source and binary forms, with or without      *
 * modification, are permitted according to the terms listed in LICENSE    *
 * (http://tmva.sourceforge.net/LICENSE)                                   *
 *****************************************************************************/

#include <assert.h>

#include <map>
#include <vector>
#include <iomanip>
#include <iostream>

#include <algorithm>
#include <functional>
#include <numeric>

#include "TMVA/DataSetFactory.h"

#include "TEventList.h"
#include "TFile.h"
#include "TH1.h"
#include "TH2.h"
#include "TProfile.h"
#include "TRandom3.h"
#include "TMatrixF.h"
#include "TVectorF.h"
#include "TMath.h"
#include "TROOT.h"

#ifndef ROOT_TMVA_MsgLogger
#include "TMVA/MsgLogger.h"
#endif
#ifndef ROOT_TMVA_Configurable
#include "TMVA/Configurable.h"
#endif
#ifndef ROOT_TMVA_VariableIdentityTransform
#include "TMVA/VariableIdentityTransform.h"
#endif
#ifndef ROOT_TMVA_VariableDecorrTransform
#include "TMVA/VariableDecorrTransform.h"
#endif
#ifndef ROOT_TMVA_VariablePCATransform
#include "TMVA/VariablePCATransform.h"
#endif
#ifndef ROOT_TMVA_DataSet
#include "TMVA/DataSet.h"
#endif
#ifndef ROOT_TMVA_DataSetInfo
#include "TMVA/DataSetInfo.h"
#endif
#ifndef ROOT_TMVA_DataInputHandler
#include "TMVA/DataInputHandler.h"
#endif
#ifndef ROOT_TMVA_Event
#include "TMVA/Event.h"
#endif

using namespace std;

TMVA::DataSetFactory* TMVA::DataSetFactory::fgInstance = 0;

namespace TMVA {
   // calculate the largest common divider
   // this function is not happy if numbers are negative!
   Int_t LargestCommonDivider(Int_t a, Int_t b)
   {
      if (a<b) {Int_t tmp = a; a=b; b=tmp; } // achieve a>=b
      if (b==0) return a;
      Int_t fullFits = a/b;
      return LargestCommonDivider(b,a-b*fullFits);
   }
}
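
// Worked example (illustration only): LargestCommonDivider(12, 18)
//   swap to achieve a>=b  -> a=18, b=12
//   fullFits = 18/12 = 1  -> recurse with (12, 18-12*1) = (12, 6)
//   fullFits = 12/6  = 2  -> recurse with (6, 12-6*2)   = (6, 0)
//   b == 0                -> return 6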

////////////////////////////////////////////////////////////////////////////////
/// constructor

TMVA::DataSetFactory::DataSetFactory() :
   fVerbose(kFALSE),
   fVerboseLevel(TString("Info")),
   fScaleWithPreselEff(0),
   fCurrentTree(0),
   fCurrentEvtIdx(0),
   fInputFormulas(0),
   fLogger( new MsgLogger("DataSetFactory", kINFO) )
{
}

////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DataSetFactory::~DataSetFactory()
{
   std::vector<TTreeFormula*>::const_iterator formIt;

   for (formIt = fInputFormulas.begin()    ; formIt != fInputFormulas.end()    ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fTargetFormulas.begin()   ; formIt != fTargetFormulas.end()   ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fCutFormulas.begin()      ; formIt != fCutFormulas.end()      ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fWeightFormula.begin()    ; formIt != fWeightFormula.end()    ; formIt++) if (*formIt) delete *formIt;
   for (formIt = fSpectatorFormulas.begin(); formIt != fSpectatorFormulas.end(); formIt++) if (*formIt) delete *formIt;

   delete fLogger;
}

////////////////////////////////////////////////////////////////////////////////
/// steering the creation of a new dataset

TMVA::DataSet* TMVA::DataSetFactory::CreateDataSet( TMVA::DataSetInfo& dsi,
                                                    TMVA::DataInputHandler& dataInput )
{
   // build the first dataset from the data input
   DataSet* ds = BuildInitialDataSet( dsi, dataInput );

   if (ds->GetNEvents() > 1) {
      CalcMinMax(ds,dsi);

      // from the final dataset build the correlation matrix
      for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
         const TString className = dsi.GetClassInfo(cl)->GetName();
         dsi.SetCorrelationMatrix( className, CalcCorrelationMatrix( ds, cl ) );
         dsi.PrintCorrelationMatrix( className );
      }
      Log() << kINFO << " " << Endl;
   }

   return ds;
}

////////////////////////////////////////////////////////////////////////////////

TMVA::DataSet* TMVA::DataSetFactory::BuildDynamicDataSet( TMVA::DataSetInfo& dsi )
{
   Log() << kDEBUG << "Build DataSet consisting of one Event with dynamically changing variables" << Endl;
   DataSet* ds = new DataSet(dsi);

   // create a DataSet with one Event which uses dynamic variables
   // (pointers to variables)
   if(dsi.GetNClasses()==0){
      dsi.AddClass( "data" );
      dsi.GetClassInfo( "data" )->SetNumber(0);
   }

   std::vector<Float_t*>* evdyn = new std::vector<Float_t*>(0);

   std::vector<VariableInfo>& varinfos = dsi.GetVariableInfos();

   if (varinfos.empty())
      Log() << kFATAL << "Dynamic data set cannot be built, since no variable information is present. Apparently no variables have been set. This should not happen, please contact the TMVA authors." << Endl;

   std::vector<VariableInfo>::iterator it = varinfos.begin(), itEnd=varinfos.end();
   for (;it!=itEnd;++it) {
      Float_t* external=(Float_t*)(*it).GetExternalLink();
      if (external==0)
         Log() << kDEBUG << "The link to the external variable is NULL while I am trying to build a dynamic data set. In this case fTmpEvent from MethodBase HAS TO BE USED in the method to get useful values in variables." << Endl;
      evdyn->push_back (external);
   }

   std::vector<VariableInfo>& spectatorinfos = dsi.GetSpectatorInfos();
   it = spectatorinfos.begin();
   for (;it!=spectatorinfos.end();it++) evdyn->push_back( (Float_t*)(*it).GetExternalLink() );

   TMVA::Event * ev = new Event((const std::vector<Float_t*>*&)evdyn, varinfos.size());
   std::vector<Event*>* newEventVector = new std::vector<Event*>;
   newEventVector->push_back(ev);

   ds->SetEventCollection(newEventVector, Types::kTraining);
   ds->SetCurrentType( Types::kTraining );
   ds->SetCurrentEvent( 0 );

   return ds;
}
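
// Illustration (not part of this file): the external Float_t* links read
// above are typically registered by the application before the dynamic data
// set is evaluated, e.g. via TMVA::Reader:
//
//   Float_t var1;
//   TMVA::Reader reader;
//   reader.AddVariable( "var1", &var1 );  // &var1 becomes the external link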

////////////////////////////////////////////////////////////////////////////////
/// if no entries, then create a DataSet with one Event which uses
/// dynamic variables (pointers to variables)

TMVA::DataSet* TMVA::DataSetFactory::BuildInitialDataSet( DataSetInfo& dsi,
                                                          DataInputHandler& dataInput )
{
   if (dataInput.GetEntries()==0) return BuildDynamicDataSet( dsi );
   // -------------------------------------------------------------------------

   // register the classes in the datasetinfo-object
   // information comes from the trees in the dataInputHandler-object
   std::vector< TString >* classList = dataInput.GetClassList();
   for (std::vector<TString>::iterator it = classList->begin(); it < classList->end(); it++) {
      dsi.AddClass( (*it) );
   }
   delete classList;

   EvtStatsPerClass eventCounts(dsi.GetNClasses());
   TString normMode;
   TString splitMode;
   TString mixMode;
   UInt_t splitSeed;
   InitOptions( dsi, eventCounts, normMode, splitSeed, splitMode, mixMode );

   // ======= build event-vector from input, apply preselection ===============
   EventVectorOfClassesOfTreeType tmpEventVector;
   BuildEventVector( dsi, dataInput, tmpEventVector, eventCounts );

   DataSet* ds = MixEvents( dsi, tmpEventVector, eventCounts,
                            splitMode, mixMode, normMode, splitSeed );

   const Bool_t showCollectedOutput = kFALSE;
   if (showCollectedOutput) {
      Int_t maxL = dsi.GetClassNameMaxLength();
      Log() << kINFO << "Collected:" << Endl;
      for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
         Log() << kINFO << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " training entries: " << ds->GetNClassEvents( 0, cl ) << Endl;
         Log() << kINFO << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " testing entries: " << ds->GetNClassEvents( 1, cl ) << Endl;
      }
      Log() << kINFO << " " << Endl;
   }

   return ds;
}

////////////////////////////////////////////////////////////////////////////////
/// checks a TTreeFormula for problems

Bool_t TMVA::DataSetFactory::CheckTTreeFormula( TTreeFormula* ttf,
                                                const TString& expression,
                                                Bool_t& hasDollar )
{
   Bool_t worked = kTRUE;

   if( ttf->GetNdim() <= 0 )
      Log() << kFATAL << "Expression " << expression.Data()
            << " could not be resolved to a valid formula. " << Endl;
   if( ttf->GetNdata() == 0 ){
      Log() << kWARNING << "Expression: " << expression.Data()
            << " does not provide data for this event. "
            << "This event is not taken into account. --> please check if you use as a variable "
            << "an entry of an array which is not filled for some events "
            << "(e.g. arr[4] when arr has only 3 elements)." << Endl;
      Log() << kWARNING << "If you want to take the event into account you can do something like: "
            << "\"Alt$(arr[4],0)\" where in cases where arr doesn't have a 4th element, "
            << " 0 is taken as an alternative." << Endl;
      worked = kFALSE;
   }
   if( expression.Contains("$") )
      hasDollar = kTRUE;
   else
   {
      for (int i = 0, iEnd = ttf->GetNcodes (); i < iEnd; ++i)
      {
         TLeaf* leaf = ttf->GetLeaf (i);
         if (!leaf->IsOnTerminalBranch())
            hasDollar = kTRUE;
      }
   }
   return worked;
}
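
// Illustration (user side; "factory" stands for an existing TMVA::Factory
// instance and "arr" for a hypothetical array branch): guarding a possibly
// missing array element when declaring an input variable, as the warning
// above suggests:
//
//   factory->AddVariable( "Alt$(arr[4],0)", 'F' );  // 0 if arr[4] is absent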


////////////////////////////////////////////////////////////////////////////////
/// While the data gets copied into the local training and testing
/// trees, the input tree can change (for instance when changing from
/// signal to background tree, or when using TChains as input). The
/// TTreeFormulas that hold the input expressions need to be
/// reassociated with the new tree, which is done here

void TMVA::DataSetFactory::ChangeToNewTree( TreeInfo& tinfo, const DataSetInfo & dsi )
{
   TTree *tr = tinfo.GetTree()->GetTree();

   tr->SetBranchStatus("*",1);

   Bool_t hasDollar = kFALSE;

   // 1) the input variable formulas
   Log() << kDEBUG << "transform input variables" << Endl;
   std::vector<TTreeFormula*>::const_iterator formIt, formItEnd;
   for (formIt = fInputFormulas.begin(), formItEnd=fInputFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fInputFormulas.clear();
   TTreeFormula* ttf = 0;

   for (UInt_t i=0; i<dsi.GetNVariables(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetVariableInfo(i).GetInternalName().Data() ),
                              dsi.GetVariableInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetVariableInfo(i).GetExpression(), hasDollar );
      fInputFormulas.push_back( ttf );
   }

   //
   // targets
   //
   Log() << kDEBUG << "transform regression targets" << Endl;
   for (formIt = fTargetFormulas.begin(), formItEnd = fTargetFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fTargetFormulas.clear();
   for (UInt_t i=0; i<dsi.GetNTargets(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetTargetInfo(i).GetInternalName().Data() ),
                              dsi.GetTargetInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetTargetInfo(i).GetExpression(), hasDollar );
      fTargetFormulas.push_back( ttf );
   }

   //
   // spectators
   //
   Log() << kDEBUG << "transform spectator variables" << Endl;
   for (formIt = fSpectatorFormulas.begin(), formItEnd = fSpectatorFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fSpectatorFormulas.clear();
   for (UInt_t i=0; i<dsi.GetNSpectators(); i++) {
      ttf = new TTreeFormula( Form( "Formula%s", dsi.GetSpectatorInfo(i).GetInternalName().Data() ),
                              dsi.GetSpectatorInfo(i).GetExpression().Data(), tr );
      CheckTTreeFormula( ttf, dsi.GetSpectatorInfo(i).GetExpression(), hasDollar );
      fSpectatorFormulas.push_back( ttf );
   }

   //
   // the cuts (one per class, if non-existent: formula pointer = 0)
   //
   Log() << kDEBUG << "transform cuts" << Endl;
   for (formIt = fCutFormulas.begin(), formItEnd = fCutFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fCutFormulas.clear();
   for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
      const TCut& tmpCut = dsi.GetClassInfo(clIdx)->GetCut();
      const TString tmpCutExp(tmpCut.GetTitle());
      ttf = 0;
      if (tmpCutExp!="") {
         ttf = new TTreeFormula( Form("CutClass%i",clIdx), tmpCutExp, tr );
         Bool_t worked = CheckTTreeFormula( ttf, tmpCutExp, hasDollar );
         if( !worked ){
            Log() << kWARNING << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
                  << "\" cut \"" << dsi.GetClassInfo(clIdx)->GetCut() << "\"" << Endl;
         }
      }
      fCutFormulas.push_back( ttf );
   }

   //
   // the weights (one per class, if non-existent: formula pointer = 0)
   //
   Log() << kDEBUG << "transform weights" << Endl;
   for (formIt = fWeightFormula.begin(), formItEnd = fWeightFormula.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
   fWeightFormula.clear();
   for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
      const TString tmpWeight = dsi.GetClassInfo(clIdx)->GetWeight();

      if (dsi.GetClassInfo(clIdx)->GetName() != tinfo.GetClassName() ) { // if the tree is of another class
         fWeightFormula.push_back( 0 );
         continue;
      }

      ttf = 0;
      if (tmpWeight!="") {
         ttf = new TTreeFormula( "FormulaWeight", tmpWeight, tr );
         Bool_t worked = CheckTTreeFormula( ttf, tmpWeight, hasDollar );
         if( !worked ){
            Log() << kWARNING << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
                  << "\" weight \"" << dsi.GetClassInfo(clIdx)->GetWeight() << "\"" << Endl;
         }
      }
      else {
         ttf = 0;
      }
      fWeightFormula.push_back( ttf );
   }
   Log() << kDEBUG << "enable branches" << Endl;
   // now enable only branches that are needed in any input formula, target, cut, weight

   if (!hasDollar) {
      tr->SetBranchStatus("*",0);
      Log() << kDEBUG << "enable branches: input variables" << Endl;
      // input vars
      for (formIt = fInputFormulas.begin(); formIt!=fInputFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++) {
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
         }
      }
      // targets
      Log() << kDEBUG << "enable branches: targets" << Endl;
      for (formIt = fTargetFormulas.begin(); formIt!=fTargetFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // spectators
      Log() << kDEBUG << "enable branches: spectators" << Endl;
      for (formIt = fSpectatorFormulas.begin(); formIt!=fSpectatorFormulas.end(); formIt++) {
         ttf = *formIt;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // cuts
      Log() << kDEBUG << "enable branches: cuts" << Endl;
      for (formIt = fCutFormulas.begin(); formIt!=fCutFormulas.end(); formIt++) {
         ttf = *formIt;
         if (!ttf) continue;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
      // weights
      Log() << kDEBUG << "enable branches: weights" << Endl;
      for (formIt = fWeightFormula.begin(); formIt!=fWeightFormula.end(); formIt++) {
         ttf = *formIt;
         if (!ttf) continue;
         for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
            tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
      }
   }
   Log() << kDEBUG << "tree initialized" << Endl;
   return;
}

////////////////////////////////////////////////////////////////////////////////
/// determine the minimum and maximum values of all variables, targets and spectators

void TMVA::DataSetFactory::CalcMinMax( DataSet* ds, TMVA::DataSetInfo& dsi )
{
   const UInt_t nvar  = ds->GetNVariables();
   const UInt_t ntgts = ds->GetNTargets();
   const UInt_t nvis  = ds->GetNSpectators();

   Float_t *min   = new Float_t[nvar];
   Float_t *max   = new Float_t[nvar];
   Float_t *tgmin = new Float_t[ntgts];
   Float_t *tgmax = new Float_t[ntgts];
   Float_t *vmin  = new Float_t[nvis];
   Float_t *vmax  = new Float_t[nvis];

   for (UInt_t ivar=0; ivar<nvar ; ivar++) { min[ivar]   = FLT_MAX; max[ivar]   = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) { tgmin[ivar] = FLT_MAX; tgmax[ivar] = -FLT_MAX; }
   for (UInt_t ivar=0; ivar<nvis;  ivar++) { vmin[ivar]  = FLT_MAX; vmax[ivar]  = -FLT_MAX; }

   // perform event loop

   for (Int_t i=0; i<ds->GetNEvents(); i++) {
      const Event * ev = ds->GetEvent(i);
      for (UInt_t ivar=0; ivar<nvar; ivar++) {
         Double_t v = ev->GetValue(ivar);
         if (v<min[ivar]) min[ivar] = v;
         if (v>max[ivar]) max[ivar] = v;
      }
      for (UInt_t itgt=0; itgt<ntgts; itgt++) {
         Double_t v = ev->GetTarget(itgt);
         if (v<tgmin[itgt]) tgmin[itgt] = v;
         if (v>tgmax[itgt]) tgmax[itgt] = v;
      }
      for (UInt_t ivis=0; ivis<nvis; ivis++) {
         Double_t v = ev->GetSpectator(ivis);
         if (v<vmin[ivis]) vmin[ivis] = v;
         if (v>vmax[ivis]) vmax[ivis] = v;
      }
   }

   for (UInt_t ivar=0; ivar<nvar; ivar++) {
      dsi.GetVariableInfo(ivar).SetMin(min[ivar]);
      dsi.GetVariableInfo(ivar).SetMax(max[ivar]);
      if( TMath::Abs(max[ivar]-min[ivar]) <= FLT_MIN )
         Log() << kFATAL << "Variable " << dsi.GetVariableInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
   }
   for (UInt_t ivar=0; ivar<ntgts; ivar++) {
      dsi.GetTargetInfo(ivar).SetMin(tgmin[ivar]);
      dsi.GetTargetInfo(ivar).SetMax(tgmax[ivar]);
      if( TMath::Abs(tgmax[ivar]-tgmin[ivar]) <= FLT_MIN )
         Log() << kFATAL << "Target " << dsi.GetTargetInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
   }
   for (UInt_t ivar=0; ivar<nvis; ivar++) {
      dsi.GetSpectatorInfo(ivar).SetMin(vmin[ivar]);
      dsi.GetSpectatorInfo(ivar).SetMax(vmax[ivar]);
      // if( TMath::Abs(vmax[ivar]-vmin[ivar]) <= FLT_MIN )
      //    Log() << kWARNING << "Spectator variable " << dsi.GetSpectatorInfo(ivar).GetExpression().Data() << " is constant." << Endl;
   }
   delete [] min;
   delete [] max;
   delete [] tgmin;
   delete [] tgmax;
   delete [] vmin;
   delete [] vmax;
}

////////////////////////////////////////////////////////////////////////////////
/// computes correlation matrix for variables "theVars" in tree;
/// "theType" defines the required event "type"
/// ("type" variable must be present in tree)

TMatrixD* TMVA::DataSetFactory::CalcCorrelationMatrix( DataSet* ds, const UInt_t classNumber )
{
   // first compute variance-covariance
   TMatrixD* mat = CalcCovarianceMatrix( ds, classNumber );

   // now the correlation
   UInt_t nvar = ds->GetNVariables(), ivar, jvar;

   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         if (ivar != jvar) {
            Double_t d = (*mat)(ivar, ivar)*(*mat)(jvar, jvar);
            if (d > 0) (*mat)(ivar, jvar) /= sqrt(d);
            else {
               Log() << kWARNING << "<GetCorrelationMatrix> Zero variances for variables "
                     << "(" << ivar << ", " << jvar << ") = " << d
                     << Endl;
               (*mat)(ivar, jvar) = 0;
            }
         }
      }
   }

   for (ivar=0; ivar<nvar; ivar++) (*mat)(ivar, ivar) = 1.0;

   return mat;
}
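
// For reference, the conversion applied above is the standard one:
//    rho(i,j) = cov(i,j) / sqrt( cov(i,i) * cov(j,j) ),   rho(i,i) = 1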

////////////////////////////////////////////////////////////////////////////////
/// compute covariance matrix

TMatrixD* TMVA::DataSetFactory::CalcCovarianceMatrix( DataSet* ds, const UInt_t classNumber )
{
   UInt_t nvar = ds->GetNVariables();
   UInt_t ivar = 0, jvar = 0;

   TMatrixD* mat = new TMatrixD( nvar, nvar );

   // init matrices
   TVectorD vec(nvar);
   TMatrixD mat2(nvar, nvar);
   for (ivar=0; ivar<nvar; ivar++) {
      vec(ivar) = 0;
      for (jvar=0; jvar<nvar; jvar++) mat2(ivar, jvar) = 0;
   }

   // perform event loop
   Double_t ic = 0;
   for (Int_t i=0; i<ds->GetNEvents(); i++) {

      const Event * ev = ds->GetEvent(i);
      if (ev->GetClass() != classNumber ) continue;

      Double_t weight = ev->GetWeight();
      ic += weight; // count used events

      for (ivar=0; ivar<nvar; ivar++) {

         Double_t xi = ev->GetValue(ivar);
         vec(ivar) += xi*weight;
         mat2(ivar, ivar) += (xi*xi*weight);

         for (jvar=ivar+1; jvar<nvar; jvar++) {
            Double_t xj = ev->GetValue(jvar);
            mat2(ivar, jvar) += (xi*xj*weight);
         }
      }
   }

   for (ivar=0; ivar<nvar; ivar++)
      for (jvar=ivar+1; jvar<nvar; jvar++)
         mat2(jvar, ivar) = mat2(ivar, jvar); // symmetric matrix


   // variance-covariance
   for (ivar=0; ivar<nvar; ivar++) {
      for (jvar=0; jvar<nvar; jvar++) {
         (*mat)(ivar, jvar) = mat2(ivar, jvar)/ic - vec(ivar)*vec(jvar)/(ic*ic);
      }
   }

   return mat;
}
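
// For reference: with W = ic the sum of event weights, S_i = vec(i) the
// weighted sum of variable i, and S_ij = mat2(i,j) the weighted sum of
// products, the estimator above reads
//    cov(i,j) = S_ij / W - ( S_i / W ) * ( S_j / W )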

// --------------------------------------- new versions

////////////////////////////////////////////////////////////////////////////////
/// the dataset splitting

void
TMVA::DataSetFactory::InitOptions( TMVA::DataSetInfo& dsi,
                                   EvtStatsPerClass& nEventRequests,
                                   TString& normMode,
                                   UInt_t& splitSeed,
                                   TString& splitMode,
                                   TString& mixMode )
{
   Configurable splitSpecs( dsi.GetSplitOptions() );
   splitSpecs.SetConfigName("DataSetFactory");
   splitSpecs.SetConfigDescription( "Configuration options given in the \"PrepareForTrainingAndTesting\" call; these options define the creation of the data sets used for training and expert validation by TMVA" );

   splitMode = "Random"; // the splitting mode
   splitSpecs.DeclareOptionRef( splitMode, "SplitMode",
                                "Method of picking training and testing events (default: random)" );
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));

   mixMode = "SameAsSplitMode"; // the mixing mode
   splitSpecs.DeclareOptionRef( mixMode, "MixMode",
                                "Method of mixing events of different classes into one dataset (default: SameAsSplitMode)" );
   splitSpecs.AddPreDefVal(TString("SameAsSplitMode"));
   splitSpecs.AddPreDefVal(TString("Random"));
   splitSpecs.AddPreDefVal(TString("Alternate"));
   splitSpecs.AddPreDefVal(TString("Block"));

   splitSeed = 100;
   splitSpecs.DeclareOptionRef( splitSeed, "SplitSeed",
                                "Seed for random event shuffling" );

   normMode = "EqualNumEvents"; // the weight normalisation modes
   splitSpecs.DeclareOptionRef( normMode, "NormMode",
                                "Overall renormalisation of event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)" );
   splitSpecs.AddPreDefVal(TString("None"));
   splitSpecs.AddPreDefVal(TString("NumEvents"));
   splitSpecs.AddPreDefVal(TString("EqualNumEvents"));

   splitSpecs.DeclareOptionRef(fScaleWithPreselEff=kFALSE,"ScaleWithPreselEff","Scale the number of requested events by the eff. of the preselection cuts (or not)" );

   // the number of events

   // fill in the numbers
   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      TString clName = dsi.GetClassInfo(cl)->GetName();
      TString titleTrain = TString().Format("Number of training events of class %s (default: 0 = all)",clName.Data()).Data();
      TString titleTest  = TString().Format("Number of test events of class %s (default: 0 = all)",clName.Data()).Data();

      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTrainingEventsRequested, TString("nTrain_")+clName, titleTrain );
      splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTestingEventsRequested , TString("nTest_")+clName , titleTest  );
   }

   splitSpecs.DeclareOptionRef( fVerbose, "V", "Verbosity (default: false)" );

   splitSpecs.DeclareOptionRef( fVerboseLevel=TString("Info"), "VerboseLevel", "VerboseLevel (Debug/Verbose/Info)" );
   splitSpecs.AddPreDefVal(TString("Debug"));
   splitSpecs.AddPreDefVal(TString("Verbose"));
   splitSpecs.AddPreDefVal(TString("Info"));

   splitSpecs.ParseOptions();
   splitSpecs.CheckForUnusedOptions();

   // output logging verbosity
   if (Verbose()) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Debug")   == 0) fLogger->SetMinType( kDEBUG );
   if (fVerboseLevel.CompareTo("Verbose") == 0) fLogger->SetMinType( kVERBOSE );
   if (fVerboseLevel.CompareTo("Info")    == 0) fLogger->SetMinType( kINFO );

   // put all to upper case
   splitMode.ToUpper(); mixMode.ToUpper(); normMode.ToUpper();
   // adjust mixmode if the "same as splitmode" option has been set
   Log() << kINFO << "Splitmode is: \"" << splitMode << "\" the mixmode is: \"" << mixMode << "\"" << Endl;
   if (mixMode=="SAMEASSPLITMODE") mixMode = splitMode;
   else if (mixMode!=splitMode)
      Log() << kINFO << "DataSet splitmode="<<splitMode
            <<" differs from mixmode="<<mixMode<<Endl;
}
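
// Illustration (user side; "factory" and "myCut" are hypothetical): the
// options parsed above originate from the option string of the
// PrepareTrainingAndTestTree() call, e.g.
//
//   factory->PrepareTrainingAndTestTree( myCut,
//      "nTrain_Signal=1000:nTrain_Background=1000:SplitMode=Random:NormMode=NumEvents:!V" );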


////////////////////////////////////////////////////////////////////////////////
/// build empty event vectors
/// distributes events between kTraining/kTesting/kMaxTreeType

void
TMVA::DataSetFactory::BuildEventVector( TMVA::DataSetInfo& dsi,
                                        TMVA::DataInputHandler& dataInput,
                                        EventVectorOfClassesOfTreeType& eventsmap,
                                        EvtStatsPerClass& eventCounts)
{
   const UInt_t nclasses = dsi.GetNClasses();

   eventsmap[ Types::kTraining ]    = EventVectorOfClasses(nclasses);
   eventsmap[ Types::kTesting ]     = EventVectorOfClasses(nclasses);
   eventsmap[ Types::kMaxTreeType ] = EventVectorOfClasses(nclasses);

   // create the type, weight and boostweight branches
   const UInt_t nvars = dsi.GetNVariables();
   const UInt_t ntgts = dsi.GetNTargets();
   const UInt_t nvis  = dsi.GetNSpectators();

   for (size_t i=0; i<nclasses; i++) {
      eventCounts[i].varAvLength = new Float_t[nvars];
      for (UInt_t ivar=0; ivar<nvars; ivar++)
         eventCounts[i].varAvLength[ivar] = 0;
   }

   // Bool_t haveArrayVariable = kFALSE;
   Bool_t *varIsArray = new Bool_t[nvars];

   // if we work with chains we need to remember the current tree; if
   // the chain jumps to a new tree we have to reset the formulas
   for (UInt_t cl=0; cl<nclasses; cl++) {

      Log() << kINFO << "Create training and testing trees -- looping over class \""
            << dsi.GetClassInfo(cl)->GetName() << "\" ..." << Endl;

      EventStats& classEventCounts = eventCounts[cl];

      // info output for weights
      Log() << kINFO << "Weight expression for class \'" << dsi.GetClassInfo(cl)->GetName() << "\': \""
            << dsi.GetClassInfo(cl)->GetWeight() << "\"" << Endl;

      // used for chains only
      TString currentFileName("");

      std::vector<TreeInfo>::const_iterator treeIt(dataInput.begin(dsi.GetClassInfo(cl)->GetName()));
      for (;treeIt!=dataInput.end(dsi.GetClassInfo(cl)->GetName()); treeIt++) {

         // read first the variables
         std::vector<Float_t> vars(nvars);
         std::vector<Float_t> tgts(ntgts);
         std::vector<Float_t> vis(nvis);
         TreeInfo currentInfo = *treeIt;

         Log() << kDEBUG << "Building event vectors " << currentInfo.GetTreeType() << Endl;

         EventVector& event_v = eventsmap[currentInfo.GetTreeType()].at(cl);

         Bool_t isChain = (TString("TChain") == currentInfo.GetTree()->ClassName());
         currentInfo.GetTree()->LoadTree(0);
         ChangeToNewTree( currentInfo, dsi );

         // count number of events in tree before cut
         classEventCounts.nInitialEvents += currentInfo.GetTree()->GetEntries();

         // loop over events in ntuple
         const UInt_t nEvts = currentInfo.GetTree()->GetEntries();
         for (Long64_t evtIdx = 0; evtIdx < nEvts; evtIdx++) {
            currentInfo.GetTree()->LoadTree(evtIdx);

            // may need to reload tree in case of chains
            if (isChain) {
               if (currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName() != currentFileName) {
                  currentFileName = currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName();
                  ChangeToNewTree( currentInfo, dsi );
               }
            }
            currentInfo.GetTree()->GetEntry(evtIdx);
            Int_t sizeOfArrays = 1;
            Int_t prevArrExpr = 0;

            // ======= evaluate all formulas =================

            // first we check if some of the formulas are arrays
            for (UInt_t ivar=0; ivar<nvars; ivar++) {
               Int_t ndata = fInputFormulas[ivar]->GetNdata();
               classEventCounts.varAvLength[ivar] += ndata;
               if (ndata == 1) continue;
               // haveArrayVariable = kTRUE;
               varIsArray[ivar] = kTRUE;
               if (sizeOfArrays == 1) {
                  sizeOfArrays = ndata;
                  prevArrExpr = ivar;
               }
               else if (sizeOfArrays!=ndata) {
                  Log() << kERROR << "ERROR while preparing training and testing trees:" << Endl;
                  Log() << "   multiple array-type expressions of different length were encountered" << Endl;
                  Log() << "   location of error: event " << evtIdx
                        << " in tree " << currentInfo.GetTree()->GetName()
                        << " of file " << currentInfo.GetTree()->GetCurrentFile()->GetName() << Endl;
                  Log() << "   expression " << fInputFormulas[ivar]->GetTitle() << " has "
                        << ndata << " entries, while" << Endl;
                  Log() << "   expression " << fInputFormulas[prevArrExpr]->GetTitle() << " has "
                        << fInputFormulas[prevArrExpr]->GetNdata() << " entries" << Endl;
                  Log() << kFATAL << "Need to abort" << Endl;
               }
            }

            // now we read the information
            for (Int_t idata = 0; idata<sizeOfArrays; idata++) {
               Bool_t containsNaN = kFALSE;

               TTreeFormula* formula = 0;

               // the cut expression
               Float_t cutVal = 1;
               formula = fCutFormulas[cl];
               if (formula) {
                  Int_t ndata = formula->GetNdata();
                  cutVal = (ndata==1 ?
                            formula->EvalInstance(0) :
                            formula->EvalInstance(idata));
                  if (TMath::IsNaN(cutVal)) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << "Cut expression resolves to infinite value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }

               // the input variable
               for (UInt_t ivar=0; ivar<nvars; ivar++) {
                  formula = fInputFormulas[ivar];
                  Int_t ndata = formula->GetNdata();
                  vars[ivar] = (ndata == 1 ?
                                formula->EvalInstance(0) :
                                formula->EvalInstance(idata));
                  if (TMath::IsNaN(vars[ivar])) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << "Input expression resolves to infinite value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }

               // the targets
               for (UInt_t itrgt=0; itrgt<ntgts; itrgt++) {
                  formula = fTargetFormulas[itrgt];
                  Int_t ndata = formula->GetNdata();
                  tgts[itrgt] = (ndata == 1 ?
                                 formula->EvalInstance(0) :
                                 formula->EvalInstance(idata));
                  if (TMath::IsNaN(tgts[itrgt])) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << "Target expression resolves to infinite value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }

               // the spectators
               for (UInt_t itVis=0; itVis<nvis; itVis++) {
                  formula = fSpectatorFormulas[itVis];
                  Int_t ndata = formula->GetNdata();
                  vis[itVis] = (ndata == 1 ?
                                formula->EvalInstance(0) :
                                formula->EvalInstance(idata));
                  if (TMath::IsNaN(vis[itVis])) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << "Spectator expression resolves to infinite value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }


               // the weight
               Float_t weight = currentInfo.GetWeight(); // multiply by tree weight
               formula = fWeightFormula[cl];
               if (formula!=0) {
                  Int_t ndata = formula->GetNdata();
                  weight *= (ndata == 1 ?
                             formula->EvalInstance() :
                             formula->EvalInstance(idata));
                  if (TMath::IsNaN(weight)) {
                     containsNaN = kTRUE;
                     Log() << kWARNING << "Weight expression resolves to infinite value (NaN): "
                           << formula->GetTitle() << Endl;
                  }
               }

               // Count the events before rejection due to cut or NaN
               // value (weighted and unweighted)
               classEventCounts.nEvBeforeCut++;
               if (!TMath::IsNaN(weight))
                  classEventCounts.nWeEvBeforeCut += weight;

               // apply the cut, skip rest if cut is not fulfilled
               if (cutVal<0.5) continue;

               // global flag if negative weights exist -> can be used
               // by classifiers who may require special data
               // treatment (also print warning)
               if (weight < 0) classEventCounts.nNegWeights++;

               // now read the event-values (variables and regression targets)

               if (containsNaN) {
                  Log() << kWARNING << "Event " << evtIdx;
                  if (sizeOfArrays>1) Log() << kWARNING << " rejected" << Endl;
                  continue;
               }

               // Count the events after rejection due to cut or NaN value
               // (weighted and unweighted)
               classEventCounts.nEvAfterCut++;
               classEventCounts.nWeEvAfterCut += weight;

               // event accepted, fill temporary ntuple
               event_v.push_back(new Event(vars, tgts , vis, cl , weight));
            }
         }
         currentInfo.GetTree()->ResetBranchAddresses();
      }
   }

   // for output format, get the maximum class name length
   Int_t maxL = dsi.GetClassNameMaxLength();

   Log() << kINFO << "Number of events in input trees (after possible flattening of arrays):" << Endl;
   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      Log() << kINFO << "    "
            << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
            << "      -- number of events       : "
            << std::setw(5) << eventCounts[cl].nEvBeforeCut
            << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvBeforeCut << Endl;
   }

   for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
      Log() << kINFO << "    " << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
            <<" tree -- total number of entries: "
            << std::setw(5) << dataInput.GetEntries(dsi.GetClassInfo(cl)->GetName()) << Endl;
   }

   if (fScaleWithPreselEff)
      Log() << kINFO << "Preselection: (will affect number of requested training and testing events)" << Endl;
   else
      Log() << kINFO << "Preselection: (will NOT affect number of requested training and testing events)" << Endl;

   if (dsi.HasCuts()) {
      for (UInt_t cl = 0; cl< dsi.GetNClasses(); cl++) {
         Log() << kINFO << "    " << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << " requirement: \"" << dsi.GetClassInfo(cl)->GetCut() << "\"" << Endl;
         Log() << kINFO << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << "      -- number of events passed: "
               << std::setw(5) << eventCounts[cl].nEvAfterCut
               << "  / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvAfterCut << Endl;
         Log() << kINFO << "    "
               << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
               << "      -- efficiency             : "
               << std::setw(6) << eventCounts[cl].nWeEvAfterCut/eventCounts[cl].nWeEvBeforeCut << Endl;
      }
   }
   else Log() << kINFO << "    No preselection cuts applied on event classes" << Endl;

   delete[] varIsArray;

}

////////////////////////////////////////////////////////////////////////////////
/// Select and distribute unassigned events to kTraining and kTesting
///Bool_t emptyUndefined = kTRUE;

TMVA::DataSet*
TMVA::DataSetFactory::MixEvents( DataSetInfo& dsi,
                                 EventVectorOfClassesOfTreeType& tmpEventVector,
                                 EvtStatsPerClass& eventCounts,
                                 const TString& splitMode,
                                 const TString& mixMode,
                                 const TString& normMode,
                                 UInt_t splitSeed)
{
   // check if the vectors of all classes are empty
   //for( Int_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
   //   emptyUndefined &= tmpEventVector[Types::kMaxTreeType].at(cls).empty();
   //}

   TMVA::RandomGenerator rndm( splitSeed );

   // ==== splitting of undefined events to kTraining and kTesting

   // if splitMode contains "RANDOM", then shuffle the undefined events
   if (splitMode.Contains( "RANDOM" ) /*&& !emptyUndefined*/ ) {
      // random shuffle the undefined events of each class
      for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
         EventVector& unspecifiedEvents = tmpEventVector[Types::kMaxTreeType].at(cls);
         if( ! unspecifiedEvents.empty() ) {
            Log() << kDEBUG << "randomly shuffling "
                  << unspecifiedEvents.size()
                  << " events of class " << cls
                  << " which are not yet associated to testing or training" << Endl;
            std::random_shuffle( unspecifiedEvents.begin(),
                                 unspecifiedEvents.end(),
                                 rndm );
         }
      }
   }

   // check for each class the number of training and testing events, the requested number and the available number
   Log() << kDEBUG << "SPLITTING ========" << Endl;
   for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
      Log() << kDEBUG << "---- class " << cls << Endl;
      Log() << kDEBUG << "check number of training/testing events, requested and available number of events for class " << cls << Endl;

      // check if enough or too many events are already in the training/testing eventvectors of the class cls
      EventVector& eventVectorTraining  = tmpEventVector[ Types::kTraining ].at(cls);
      EventVector& eventVectorTesting   = tmpEventVector[ Types::kTesting ].at(cls);
      EventVector& eventVectorUndefined = tmpEventVector[ Types::kMaxTreeType ].at(cls);

      Int_t availableTraining  = eventVectorTraining.size();
      Int_t availableTesting   = eventVectorTesting.size();
      Int_t availableUndefined = eventVectorUndefined.size();

      Float_t presel_scale;
      if (fScaleWithPreselEff) {
         presel_scale = eventCounts[cls].cutScaling();
         if (presel_scale < 1)
            Log() << kINFO << " you have opted for scaling the number of requested training/testing events\n by the preselection efficiency"<< Endl;
      }else{
         presel_scale = 1.; // this scaling was too confusing to most people, including me! Sorry... (Helge)
         if (eventCounts[cls].cutScaling() < 1)
            Log() << kINFO << " you have opted for interpreting the requested number of training/testing events\n to be the number of events AFTER your preselection cuts" << Endl;

      }
      Int_t requestedTraining = Int_t(eventCounts[cls].nTrainingEventsRequested * presel_scale);
      Int_t requestedTesting  = Int_t(eventCounts[cls].nTestingEventsRequested  * presel_scale);

      Log() << kDEBUG << "events in training trees    : " << availableTraining  << Endl;
      Log() << kDEBUG << "events in testing trees     : " << availableTesting   << Endl;
      Log() << kDEBUG << "events in unspecified trees : " << availableUndefined << Endl;
      Log() << kDEBUG << "requested for training      : " << requestedTraining;

      if(presel_scale<1)
         Log() << " ( " << eventCounts[cls].nTrainingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      else
         Log() << Endl;
      Log() << kDEBUG << "requested for testing       : " << requestedTesting;
      if(presel_scale<1)
         Log() << " ( " << eventCounts[cls].nTestingEventsRequested
               << " * " << presel_scale << " preselection efficiency)" << Endl;
      else
         Log() << Endl;

      // nomenclature r  = available training
      //              s  = available testing
      //              u  = available undefined
      //              R  = requested training
      //              S  = requested testing
      //              nR = to be used to select training events
      //              nS = to be used to select test events
      // we have the constraint: nR + nS < r+s+u,
      // since we can not use more events than we have
      // free events: Nfree = u-Thet(R-r)-Thet(S-s)
      // nomenclature: Thet(x) = x, if x>0 else 0
      // nR = max(R,r) + 0.5 * Nfree
      // nS = max(S,s) + 0.5 * Nfree
      // nR + nS = R+S + u-R+r-S+s = u+r+s = ok! for R>r
      // nR + nS = r+S + u-S+s     = u+r+s = ok! for r>R

      // three different cases might occur here
      //
      // Case a
      // requestedTraining and requestedTesting >0
      // free events: Nfree = u-Thet(R-r)-Thet(S-s)
      // nR = Max(R,r) + 0.5 * Nfree
      // nS = Max(S,s) + 0.5 * Nfree
      //
      // Case b
      // exactly one of requestedTraining or requestedTesting >0
      // assume training R >0
      // nR = max(R,r)
      // nS = s+u+r-nR
      // and s=nS
      //
      // Case c
      // requestedTraining=0, requestedTesting=0
      // Nfree = u-|r-s|
      // if NFree >=0
      // R = Max(r,s) + 0.5 * Nfree = S
      // else if r>s
      // R = r; S=s+u
      // else
      // R = r+u; S=s
      //
      // Next steps:
      // Determination of Event numbers R,S, nR, nS
      // distribute undefined events according to nR, nS
      // finally determine actual sub samples from nR and nS to be used in training / testing
      //
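      //
      // worked example for Case a (illustration): r=100, s=50, u=200, R=150, S=150
      //   Nfree = 200 - Thet(150-100) - Thet(150-50) = 200 - 50 - 100 = 50
      //   nR = Max(150,100) + 25 = 175,  nS = Max(150,50) + 25 = 175
      //   nR + nS = 350 = r+s+u, i.e. all available events are used
      //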
      Int_t useForTesting(0),useForTraining(0);
      Int_t allAvailable(availableUndefined + availableTraining + availableTesting);

      if( (requestedTraining == 0) && (requestedTesting == 0)){

         // Case C: balance the number of training and testing events

         if ( availableUndefined >= TMath::Abs(availableTraining - availableTesting) ) {
            // enough unspecified are available to equal training and testing
            useForTraining = useForTesting = allAvailable/2;
         } else {
            // all unspecified are assigned to the smaller of training / testing
            useForTraining = availableTraining;
            useForTesting  = availableTesting;
            if (availableTraining < availableTesting)
               useForTraining += availableUndefined;
            else
               useForTesting += availableUndefined;
         }
         requestedTraining = useForTraining;
         requestedTesting  = useForTesting;
      }

      else if (requestedTesting == 0){
         // case B
         useForTraining = TMath::Max(requestedTraining,availableTraining);
         if (allAvailable < useForTraining) {
            Log() << kFATAL << "More events requested for training ("
                  << requestedTraining << ") than available ("
                  << allAvailable << ")!" << Endl;
         }
         useForTesting = allAvailable - useForTraining; // the rest
         requestedTesting = useForTesting;
      }

      else if (requestedTraining == 0){ // case B)
         useForTesting = TMath::Max(requestedTesting,availableTesting);
         if (allAvailable < useForTesting) {
            Log() << kFATAL << "More events requested for testing ("
                  << requestedTesting << ") than available ("
                  << allAvailable << ")!" << Endl;
         }
         useForTraining = allAvailable - useForTesting; // the rest
         requestedTraining = useForTraining;
      }

      else {
         // Case A
         // requestedTraining R and requestedTesting S >0
         // free events: Nfree = u-Thet(R-r)-Thet(S-s)
         // nR = Max(R,r) + 0.5 * Nfree
         // nS = Max(S,s) + 0.5 * Nfree
         Int_t stillNeedForTraining = TMath::Max(requestedTraining-availableTraining,0);
         Int_t stillNeedForTesting  = TMath::Max(requestedTesting-availableTesting,0);

         int NFree = availableUndefined - stillNeedForTraining - stillNeedForTesting;
         if (NFree <0) NFree = 0;
         useForTraining = TMath::Max(requestedTraining,availableTraining) + NFree/2;
         useForTesting  = allAvailable - useForTraining; // the rest
      }

      Log() << kDEBUG << "determined event sample size to select training sample from="<<useForTraining<<Endl;
      Log() << kDEBUG << "determined event sample size to select test sample from="<<useForTesting<<Endl;



      // associate undefined events
      if( splitMode == "ALTERNATE" ){
         Log() << kDEBUG << "split 'ALTERNATE'" << Endl;
         Int_t nTraining = availableTraining;
         Int_t nTesting  = availableTesting;
         for( EventVector::iterator it = eventVectorUndefined.begin(), itEnd = eventVectorUndefined.end(); it != itEnd; ){
            ++nTraining;
            if( nTraining <= requestedTraining ){
               eventVectorTraining.insert( eventVectorTraining.end(), (*it) );
               ++it;
            }
            if( it != itEnd ){
               ++nTesting;
               eventVectorTesting.insert( eventVectorTesting.end(), (*it) );
               ++it;
            }
         }
      } else {
         Log() << kDEBUG << "split '" << splitMode << "'" << Endl;

         // test if enough events are available
         Log() << kDEBUG << "availableundefined : " << availableUndefined << Endl;
         Log() << kDEBUG << "useForTraining     : " << useForTraining << Endl;
         Log() << kDEBUG << "useForTesting      : " << useForTesting << Endl;
         Log() << kDEBUG << "availableTraining  : " << availableTraining << Endl;
         Log() << kDEBUG << "availableTesting   : " << availableTesting << Endl;

         if( availableUndefined<(useForTraining-availableTraining) ||
             availableUndefined<(useForTesting -availableTesting ) ||
             availableUndefined<(useForTraining+useForTesting-availableTraining-availableTesting ) ){
            Log() << kFATAL << "More events requested than available!" << Endl;
         }

         // select the events
         if (useForTraining>availableTraining){
            eventVectorTraining.insert( eventVectorTraining.end(), eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining - availableTraining );
            eventVectorUndefined.erase( eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining - availableTraining );
         }
         if (useForTesting>availableTesting){
            eventVectorTesting.insert( eventVectorTesting.end(), eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTesting - availableTesting );
         }
      }
      eventVectorUndefined.clear();

      // finally shorten the event vectors to the requested size by removing random events
      if (splitMode.Contains( "RANDOM" )){
         UInt_t sizeTraining = eventVectorTraining.size();
         if( sizeTraining > UInt_t(requestedTraining) ){
            std::vector<UInt_t> indicesTraining( sizeTraining );
            // make indices
            std::generate( indicesTraining.begin(), indicesTraining.end(), TMVA::Increment<UInt_t>(0) );
            // shuffle indices
            std::random_shuffle( indicesTraining.begin(), indicesTraining.end(), rndm );
            // erase indices of not needed events
            indicesTraining.erase( indicesTraining.begin()+sizeTraining-UInt_t(requestedTraining), indicesTraining.end() );
            // delete all events with the given indices
            for( std::vector<UInt_t>::iterator it = indicesTraining.begin(), itEnd = indicesTraining.end(); it != itEnd; ++it ){
               delete eventVectorTraining.at( (*it) ); // delete event
               eventVectorTraining.at( (*it) ) = NULL; // set pointer to NULL
            }
            // now remove and erase all events with pointer==NULL
            eventVectorTraining.erase( std::remove( eventVectorTraining.begin(), eventVectorTraining.end(), (void*)NULL ), eventVectorTraining.end() );
         }

         UInt_t sizeTesting = eventVectorTesting.size();
         if( sizeTesting > UInt_t(requestedTesting) ){
            std::vector<UInt_t> indicesTesting( sizeTesting );
            // make indices
            std::generate( indicesTesting.begin(), indicesTesting.end(), TMVA::Increment<UInt_t>(0) );
            // shuffle indices
            std::random_shuffle( indicesTesting.begin(), indicesTesting.end(), rndm );
            // erase indices of not needed events
            indicesTesting.erase( indicesTesting.begin()+sizeTesting-UInt_t(requestedTesting), indicesTesting.end() );
            // delete all events with the given indices
            for( std::vector<UInt_t>::iterator it = indicesTesting.begin(), itEnd = indicesTesting.end(); it != itEnd; ++it ){
               delete eventVectorTesting.at( (*it) ); // delete event
               eventVectorTesting.at( (*it) ) = NULL; // set pointer to NULL
            }
            // now remove and erase all events with pointer==NULL
            eventVectorTesting.erase( std::remove( eventVectorTesting.begin(), eventVectorTesting.end(), (void*)NULL ), eventVectorTesting.end() );
         }
      }
      else { // erase at end
         if( eventVectorTraining.size() < UInt_t(requestedTraining) )
            Log() << kWARNING << "DataSetFactory/requested number of training samples larger than size of eventVectorTraining.\n"
                  << "There is probably an issue. Please contact the TMVA developers." << Endl;
         std::for_each( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end(), DeleteFunctor<Event>() );
         eventVectorTraining.erase(eventVectorTraining.begin()+requestedTraining,eventVectorTraining.end());

         if( eventVectorTesting.size() < UInt_t(requestedTesting) )
            Log() << kWARNING << "DataSetFactory/requested number of testing samples larger than size of eventVectorTesting.\n"
                  << "There is probably an issue. Please contact the TMVA developers." << Endl;
         std::for_each( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end(), DeleteFunctor<Event>() );
         eventVectorTesting.erase(eventVectorTesting.begin()+requestedTesting,eventVectorTesting.end());
      }
   }

   TMVA::DataSetFactory::RenormEvents( dsi, tmpEventVector, eventCounts, normMode );

   Int_t trainingSize = 0;
   Int_t testingSize  = 0;

   // sum up number of training and testing events
   for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
      trainingSize += tmpEventVector[Types::kTraining].at(cls).size();
      testingSize  += tmpEventVector[Types::kTesting].at(cls).size();
   }

   // --- collect all training (testing) events into the training (testing) eventvector

   // create event vectors, reserve enough space
   EventVector* trainingEventVector = new EventVector();
   EventVector* testingEventVector  = new EventVector();

   trainingEventVector->reserve( trainingSize );
   testingEventVector->reserve( testingSize );


   // collect the events

   // mixing of kTraining and kTesting data sets
   Log() << kDEBUG << " MIXING ============= " << Endl;

   if( mixMode == "ALTERNATE" ){
      // Inform the user if they try to use alternate mixmode for
      // event classes with different numbers of events: this works, but the
      // alternation stops at the last event of the smaller class
      for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
         if (tmpEventVector[Types::kTraining].at(cls).size() != tmpEventVector[Types::kTraining].at(0).size()){
            Log() << kINFO << "Training sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
         }
         if (tmpEventVector[Types::kTesting].at(cls).size() != tmpEventVector[Types::kTesting].at(0).size()){
            Log() << kINFO << "Testing sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
         }
      }
      typedef EventVector::iterator EvtVecIt;
      EvtVecIt itEvent, itEventEnd;

      // insert first class
      Log() << kDEBUG << "insert class 0 into training and test vector" << Endl;
      trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(0).begin(), tmpEventVector[Types::kTraining].at(0).end() );
      testingEventVector->insert( testingEventVector->end(), tmpEventVector[Types::kTesting].at(0).begin(), tmpEventVector[Types::kTesting].at(0).end() );

      // insert other classes
      EvtVecIt itTarget;
      for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
         Log() << kDEBUG << "insert class " << cls << Endl;
         // training vector
         itTarget = trainingEventVector->begin() - 1; // start one before begin
         // loop over source
         for( itEvent = tmpEventVector[Types::kTraining].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTraining].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
//          if( std::distance( itTarget, trainingEventVector->end()) < Int_t(cls+1) ) {
            if( (trainingEventVector->end() - itTarget) < Int_t(cls+1) ) {
               itTarget = trainingEventVector->end();
               trainingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
               break;
            }else{
               itTarget += cls+1;
               trainingEventVector->insert( itTarget, (*itEvent) ); // fill event
            }
         }
         // testing vector
         itTarget = testingEventVector->begin() - 1;
         // loop over source
         for( itEvent = tmpEventVector[Types::kTesting].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTesting].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
//          if( std::distance( itTarget, testingEventVector->end()) < Int_t(cls+1) ) {
            if( ( testingEventVector->end() - itTarget ) < Int_t(cls+1) ) {
               itTarget = testingEventVector->end();
               testingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
               break;
            }else{
               itTarget += cls+1;
               testingEventVector->insert( itTarget, (*itEvent) ); // fill event
            }
         }
      }

      // debugging output: classnumbers of all events in training and testing vectors
      //   std::cout << std::endl;
      //   std::cout << "TRAINING VECTOR" << std::endl;
      //   std::transform( trainingEventVector->begin(), trainingEventVector->end(), ostream_iterator<Int_t>(std::cout, "|"), std::mem_fun(&TMVA::Event::GetClass) );

      //   std::cout << std::endl;
      //   std::cout << "TESTING VECTOR" << std::endl;
      //   std::transform( testingEventVector->begin(), testingEventVector->end(), ostream_iterator<Int_t>(std::cout, "|"), std::mem_fun(&TMVA::Event::GetClass) );
      //   std::cout << std::endl;

   }else{
      for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
         trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(cls).begin(), tmpEventVector[Types::kTraining].at(cls).end() );
         testingEventVector->insert ( testingEventVector->end(),  tmpEventVector[Types::kTesting].at(cls).begin(),  tmpEventVector[Types::kTesting].at(cls).end() );
      }
   }

   //  std::cout << "trainingEventVector " << trainingEventVector->size() << std::endl;
   //  std::cout << "testingEventVector  " << testingEventVector->size() << std::endl;

   //  std::transform( trainingEventVector->begin(), trainingEventVector->end(), ostream_iterator<Int_t>(std::cout, "> \n"), std::mem_fun(&TMVA::Event::GetNVariables) );
   //  std::transform( testingEventVector->begin(), testingEventVector->end(), ostream_iterator<Int_t>(std::cout, "> \n"), std::mem_fun(&TMVA::Event::GetNVariables) );

   // delete the tmpEventVector (but not the events therein)
   tmpEventVector[Types::kTraining].clear();
   tmpEventVector[Types::kTesting].clear();

   tmpEventVector[Types::kMaxTreeType].clear();

   if (mixMode == "RANDOM") {
      Log() << kDEBUG << "shuffling events"<<Endl;

      std::random_shuffle( trainingEventVector->begin(), trainingEventVector->end(), rndm );
      std::random_shuffle( testingEventVector->begin(),  testingEventVector->end(),  rndm );
   }

   Log() << kDEBUG << "trainingEventVector " << trainingEventVector->size() << Endl;
   Log() << kDEBUG << "testingEventVector  " << testingEventVector->size() << Endl;

   // create dataset
   DataSet* ds = new DataSet(dsi);

   Log() << kINFO << "Create internal training tree" << Endl;
   ds->SetEventCollection(trainingEventVector, Types::kTraining );
   Log() << kINFO << "Create internal testing tree" << Endl;
   ds->SetEventCollection(testingEventVector, Types::kTesting );


   if (ds->GetNTrainingEvents() < 1){
      Log() << kFATAL << "Dataset " << std::string(dsi.GetName()) << " does not have any training events; I'd better stop here and let you fix that one first " << Endl;
   }

   if (ds->GetNTestEvents() < 1) {
      Log() << kERROR << "Dataset " << std::string(dsi.GetName()) << " does not have any testing events; I guess that will cause problems later... but for now, I continue " << Endl;
   }


   return ds;

}
1377 
1378 ////////////////////////////////////////////////////////////////////////////////
1379 /// =============================================================================
1380 /// Renormalisation of the TRAINING event weights:
1381 /// - None: use the weights as supplied by the user (the relative weight is stored for later use)
1382 /// - NumEvents: reweight each class such that its effective (weighted)
1383 ///  number of training events equals the requested number of events
1384 /// - EqualNumEvents: reweight each class such that its effective (weighted)
1385 ///  number of training events equals that of class 0 (signal); a worked example follows
1386 /// =============================================================================
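///
/// A short worked example (illustrative numbers only): suppose class 0
/// (signal) has 100 training events whose weights sum to 250, and class 1
/// (background) has 400 training events whose weights sum to 800.
/// With NormMode=NumEvents each class is scaled to its own event count:
///  signal factor = 100/250 = 0.4, background factor = 400/800 = 0.5.
/// With NormMode=EqualNumEvents both classes are scaled to the event count
/// of class 0: signal factor = 100/250 = 0.4, background factor = 100/800
///  = 0.125, so both classes end up with an effective (weighted) size of 100.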
1387 
1388 void
1389 TMVA::DataSetFactory::RenormEvents( DataSetInfo& dsi,
1390  EventVectorOfClassesOfTreeType& tmpEventVector,
1391  const EvtStatsPerClass& eventCounts,
1392  const TString& normMode )
1393 {
1394 
1395 
1396  // print rescaling info
1397  // ---------------------------------
1398  // compute sizes and sums of weights
1399  Int_t trainingSize = 0;
1400  Int_t testingSize = 0;
1401 
1402  ValuePerClass trainingSumWeightsPerClass( dsi.GetNClasses() );
1403  ValuePerClass testingSumWeightsPerClass( dsi.GetNClasses() );
1404 
1405  NumberPerClass trainingSizePerClass( dsi.GetNClasses() );
1406  NumberPerClass testingSizePerClass( dsi.GetNClasses() );
1407 
1408  Double_t trainingSumSignalWeights = 0;
1409  Double_t trainingSumBackgrWeights = 0; // "Backgr." includes all classes that are not signal
1410  Double_t testingSumSignalWeights = 0;
1411  Double_t testingSumBackgrWeights = 0; // "Backgr." includes all classes that are not signal
1412 
1413 
1414 
1415  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1416  trainingSizePerClass.at(cls) = tmpEventVector[Types::kTraining].at(cls).size();
1417  testingSizePerClass.at(cls) = tmpEventVector[Types::kTesting].at(cls).size();
1418 
1419  trainingSize += trainingSizePerClass.at(cls); // .at(cls), not .back(): .back() would count only the last class
1420  testingSize += testingSizePerClass.at(cls);
1421 
1422  // the functional solution:
1423  // sum the weights in Double_t although the individual weights are Float_t,
1424  //  to avoid rounding issues when adding up many floating-point values
1425  //
1426  // std::accumulate sums over the range [begin(), end());
1427  //  Double_t(0) fixes both the accumulator type and its starting value
1428  // compose_binary combines the following three functors into one binary function:
1429  //  std::plus<Double_t>() adds two doubles,
1430  //  null<Double_t>() passes its argument (the running sum) through unchanged,
1431  //  std::mem_fun calls the given member function on each event and returns the result
1432  //
1433  // together these sum the original weights of all events in the vector
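 // (For reference: since C++11 the same sum can be written with a lambda
 // instead of the compose_binary/null/std::mem_fun machinery; a minimal
 // sketch, not used by this file:
 //
 // Double_t sumW = std::accumulate(
 //  tmpEventVector[Types::kTraining].at(cls).begin(),
 //  tmpEventVector[Types::kTraining].at(cls).end(),
 //  Double_t(0),
 //  [](Double_t sum, const TMVA::Event* ev) { return sum + ev->GetOriginalWeight(); } );
 // )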
1434  trainingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(),
1435  tmpEventVector[Types::kTraining].at(cls).end(),
1436  Double_t(0),
1437  compose_binary( std::plus<Double_t>(),
1438  null<Double_t>(),
1439  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1440 
1441  testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),
1442  tmpEventVector[Types::kTesting].at(cls).end(),
1443  Double_t(0),
1444  compose_binary( std::plus<Double_t>(),
1445  null<Double_t>(),
1446  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1447 
1448  if ( cls == dsi.GetSignalClassIndex()){
1449  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1450  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1451  }else{
1452  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1453  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1454  }
1455  }
1456 
1457  // ---------------------------------
1458  // compute renormalization factors
1459 
1460  ValuePerClass renormFactor( dsi.GetNClasses() );
1461 
1462 
1463  // for information purposes
1464  dsi.SetNormalization( normMode );
1465  // note: these values are overwritten further down by the rescaled ones
1466  // if NormMode != None
1467  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1468  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1469  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1470  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1471 
1472 
1473  if (normMode == "NONE") {
1474  Log() << kINFO << "No weight renormalisation applied: use original global and event weights" << Endl;
1475  return;
1476  }
1477  // changed by Helge 27.5.2013: the intention is to have the SAME amount of
1478  // effective TRAINING data for signal and background; testing events are
1479  // irrelevant for this and would actually skew the normalisation
1480  else if (normMode == "NUMEVENTS") {
1481  Log() << kINFO << "Weight renormalisation mode: \"NumEvents\": renormalises all event classes " << Endl;
1482  Log() << kINFO << " such that the effective (weighted) number of events in each class equals the respective " << Endl;
1483  Log() << kINFO << " number of events (entries) that you demanded in PrepareTrainingAndTestTree(\"\",\"nTrain_Signal=.. )" << Endl;
1484  Log() << kINFO << " ... i.e. such that Sum[i=1..N_j]{w_i} = N_j, j=0,1,2..." << Endl;
1485  Log() << kINFO << " ... (note that N_j is the number of TRAINING events, nTrain_j, with j=Signal,Background,...;" << Endl;
1486  Log() << kINFO << " testing events are neither renormalised nor included in the renormalisation factor)" << Endl;
1487 
1488  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1489  // renormFactor.at(cls) = ( (trainingSizePerClass.at(cls) + testingSizePerClass.at(cls))/
1490  // (trainingSumWeightsPerClass.at(cls) + testingSumWeightsPerClass.at(cls)) );
1491  //changed by Helge 27.5.2013
1492  renormFactor.at(cls) = ((Float_t)trainingSizePerClass.at(cls) )/
1493  (trainingSumWeightsPerClass.at(cls)) ;
1494  }
1495  }
1496  else if (normMode == "EQUALNUMEVENTS") {
1497  // changed by Helge 27.5.2013: as above, the intention is to have the SAME
1498  // amount of effective TRAINING data for signal and background. Previously
1499  // each data source was normalised to its own number of entries, and even
1500  // for training+testing combined, which defeats that purpose.
1501 
1502  Log() << kINFO << "Weight renormalisation mode: \"EqualNumEvents\": renormalises all event classes ..." << Endl;
1503  Log() << kINFO << " such that the effective (weighted) number of events in each class is the same " << Endl;
1504  Log() << kINFO << " (and equals the number of events (entries) given for class=0 )" << Endl;
1505  Log() << kINFO << "... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ..." << Endl;
1506  Log() << kINFO << "... (note that N_j is the number of TRAINING events;" << Endl;
1507  Log() << kINFO << " testing events are neither renormalised nor included in the renormalisation factor!)" << Endl;
1508 
1509  // normalize to size of first class
1510  UInt_t referenceClass = 0;
1511  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ) {
1512  renormFactor.at(cls) = Float_t(trainingSizePerClass.at(referenceClass))/
1513  (trainingSumWeightsPerClass.at(cls));
1514  }
1515  }
1516  else {
1517  Log() << kFATAL << "<PrepareForTrainingAndTesting> Unknown NormMode: " << normMode << Endl;
1518  }
1519 
1520  // ---------------------------------
1521  // now apply the normalization factors
1522  Int_t maxL = dsi.GetClassNameMaxLength();
1523  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls<clsEnd; ++cls) {
1524  Log() << kINFO << "--> Rescale " << setiosflags(ios::left) << std::setw(maxL)
1525  << dsi.GetClassInfo(cls)->GetName() << " event weights by factor: " << renormFactor.at(cls) << Endl;
1526  for (EventVector::iterator it = tmpEventVector[Types::kTraining].at(cls).begin(),
1527  itEnd = tmpEventVector[Types::kTraining].at(cls).end(); it != itEnd; ++it){
1528  (*it)->SetWeight ((*it)->GetWeight() * renormFactor.at(cls));
1529  }
1530 
1531  }
1532 
1533 
1534  // print out the result
1535  // (this repeats the accumulation code from above and could be factored out)
1536  //
1537 
1538  Log() << kINFO << "Number of training and testing events after rescaling:" << Endl;
1539  Log() << kINFO << "------------------------------------------------------" << Endl;
1540 
1541  trainingSumSignalWeights = 0;
1542  trainingSumBackgrWeights = 0; // "Backgr." includes all classes that are not signal
1543  testingSumSignalWeights = 0;
1544  testingSumBackgrWeights = 0; // "Backgr." includes all classes that are not signal
1545 
1546  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1547 
1548  trainingSumWeightsPerClass.at(cls) = (std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(), // accumulate --> start at begin
1549  tmpEventVector[Types::kTraining].at(cls).end(), // until end()
1550  Double_t(0), // values are of type double
1551  compose_binary( std::plus<Double_t>(), // define addition for doubles
1552  null<Double_t>(), // pass the running sum through unchanged
1553  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) )); // take the value from GetOriginalWeight
1554 
1555  testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),
1556  tmpEventVector[Types::kTesting].at(cls).end(),
1557  Double_t(0),
1558  compose_binary( std::plus<Double_t>(),
1559  null<Double_t>(),
1560  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1561 
1562 
1563  if ( cls == dsi.GetSignalClassIndex()){
1564  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1565  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1566  }else{
1567  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1568  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1569  }
1570 
1571  // output statistics
1572 
1573  Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
1574  << dsi.GetClassInfo(cls)->GetName() << " -- "
1575  << "training events : " << trainingSizePerClass.at(cls)
1576  << " (sum of weights: " << trainingSumWeightsPerClass.at(cls) << ")"
1577  << " - requested were " << eventCounts[cls].nTrainingEventsRequested << " events" << Endl;
1578  Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
1579  << dsi.GetClassInfo(cls)->GetName() << " -- "
1580  << "testing events : " << testingSizePerClass.at(cls)
1581  << " (sum of weights: " << testingSumWeightsPerClass.at(cls) << ")"
1582  << " - requested were " << eventCounts[cls].nTestingEventsRequested << " events" << Endl;
1583  Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
1584  << dsi.GetClassInfo(cls)->GetName() << " -- "
1585  << "training and testing events: "
1586  << (trainingSizePerClass.at(cls)+testingSizePerClass.at(cls))
1587  << " (sum of weights: "
1588  << (trainingSumWeightsPerClass.at(cls)+testingSumWeightsPerClass.at(cls)) << ")" << Endl;
1589  if(eventCounts[cls].nEvAfterCut<eventCounts[cls].nEvBeforeCut) {
1590  Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
1591  << dsi.GetClassInfo(cls)->GetName() << " -- "
1592  << "due to the preselection a scaling factor has been applied to the numbers of requested events: "
1593  << eventCounts[cls].cutScaling() << Endl;
1594  }
1595  }
1596 
1597  // for information purposes
1598  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1599  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1600  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1601  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1602 
1603 
1604 }
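 // From user code this renormalisation is steered via the split options of
 // PrepareTrainingAndTestTree; a minimal sketch using the standard TMVA
 // Factory interface (cut and option values are illustrative):
 //
 // TCut noCut = "";
 // factory->PrepareTrainingAndTestTree( noCut,
 //  "nTrain_Signal=1000:nTrain_Background=1000:"
 //  "SplitMode=Random:NormMode=NumEvents:!V" );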
1605 
1606 
1607 