ROOT 6.07/01 Reference Guide
DataSetFactory.cxx
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Eckhard von Toerne, Helge Voss
3 
4 /*****************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : DataSetFactory *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Implementation (see header for description) *
12  * *
13  * Authors (alphabetical): *
14  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15  * Peter Speckmayer <Peter.Speckmayer@cern.ch> - CERN, Switzerland *
16  * Joerg Stelzer <Joerg.Stelzer@cern.ch> - MSU, USA *
17  * Eckhard von Toerne <evt@physik.uni-bonn.de> - U. of Bonn, Germany *
18  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
19  * *
20  * Copyright (c) 2009: *
21  * CERN, Switzerland *
22  * MPI-K Heidelberg, Germany *
23  * U. of Bonn, Germany *
24  * Redistribution and use in source and binary forms, with or without *
25  * modification, are permitted according to the terms listed in LICENSE *
26  * (http://tmva.sourceforge.net/LICENSE) *
27  *****************************************************************************/
28 
29 #include <assert.h>
30 
31 #include <map>
32 #include <vector>
33 #include <iomanip>
34 #include <iostream>
35 
36 #include <algorithm>
37 #include <functional>
38 #include <numeric>
39 
40 #include "TMVA/DataSetFactory.h"
41 
42 #include "TEventList.h"
43 #include "TFile.h"
44 #include "TH1.h"
45 #include "TH2.h"
46 #include "TProfile.h"
47 #include "TRandom3.h"
48 #include "TMatrixF.h"
49 #include "TVectorF.h"
50 #include "TMath.h"
51 #include "TROOT.h"
52 
53 #ifndef ROOT_TMVA_MsgLogger
54 #include "TMVA/MsgLogger.h"
55 #endif
56 #ifndef ROOT_TMVA_Configurable
57 #include "TMVA/Configurable.h"
58 #endif
59 #ifndef ROOT_TMVA_VariableIdentityTransform
60 #include "TMVA/VariableIdentityTransform.h"
61 #endif
62 #ifndef ROOT_TMVA_VariableDecorrTransform
63 #include "TMVA/VariableDecorrTransform.h"
64 #endif
65 #ifndef ROOT_TMVA_VariablePCATransform
66 #include "TMVA/VariablePCATransform.h"
67 #endif
68 #ifndef ROOT_TMVA_DataSet
69 #include "TMVA/DataSet.h"
70 #endif
71 #ifndef ROOT_TMVA_DataSetInfo
72 #include "TMVA/DataSetInfo.h"
73 #endif
74 #ifndef ROOT_TMVA_DataInputHandler
75 #include "TMVA/DataInputHandler.h"
76 #endif
77 #ifndef ROOT_TMVA_Event
78 #include "TMVA/Event.h"
79 #endif
80 
81 #include "TMVA/Types.h"
82 #include "TMVA/VariableInfo.h"
83 
84 using namespace std;
85 
86 TMVA::DataSetFactory* TMVA::DataSetFactory::fgInstance = 0;
87 
88 namespace TMVA {
89  // calculate the largest common divider
90  // this function is not happy if numbers are negative!
91  Int_t LargestCommonDivider(Int_t a, Int_t b)
92  {
93  if (a<b) {Int_t tmp = a; a=b; b=tmp; } // achieve a>=b
94  if (b==0) return a;
95  Int_t fullFits = a/b;
96  return LargestCommonDivider(b,a-b*fullFits);
97  }
98 }
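// Example (editor's sketch, not part of the original file): for two positive
// integers the recursion above performs Euclid's algorithm, e.g.
//    TMVA::LargestCommonDivider(24, 36);  // -> 12  via (36,24) -> (24,12) -> (12,0)
//    TMVA::LargestCommonDivider(7, 5);    // -> 1   (coprime numbers)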
99 
100 ////////////////////////////////////////////////////////////////////////////////
101 /// constructor
102 
103 TMVA::DataSetFactory::DataSetFactory() :
104  fVerbose(kFALSE),
105  fVerboseLevel(TString("Info")),
106  fScaleWithPreselEff(0),
107  fCurrentTree(0),
108  fCurrentEvtIdx(0),
109  fInputFormulas(0),
110  fLogger( new MsgLogger("DataSetFactory", kINFO) )
111 {
112 }
113 
114 ////////////////////////////////////////////////////////////////////////////////
115 /// destructor
116 
117 TMVA::DataSetFactory::~DataSetFactory()
118 {
119  std::vector<TTreeFormula*>::const_iterator formIt;
120 
121  for (formIt = fInputFormulas.begin() ; formIt!=fInputFormulas.end() ; formIt++) if (*formIt) delete *formIt;
122  for (formIt = fTargetFormulas.begin() ; formIt!=fTargetFormulas.end() ; formIt++) if (*formIt) delete *formIt;
123  for (formIt = fCutFormulas.begin() ; formIt!=fCutFormulas.end() ; formIt++) if (*formIt) delete *formIt;
124  for (formIt = fWeightFormula.begin() ; formIt!=fWeightFormula.end() ; formIt++) if (*formIt) delete *formIt;
125  for (formIt = fSpectatorFormulas.begin(); formIt!=fSpectatorFormulas.end(); formIt++) if (*formIt) delete *formIt;
126 
127  delete fLogger;
128 }
129 
130 ////////////////////////////////////////////////////////////////////////////////
131 /// steering the creation of a new dataset
132 
133 TMVA::DataSet* TMVA::DataSetFactory::CreateDataSet( TMVA::DataSetInfo& dsi,
134  TMVA::DataInputHandler& dataInput )
135 {
136  // build the first dataset from the data input
137  DataSet * ds = BuildInitialDataSet( dsi, dataInput );
138 
139  if (ds->GetNEvents() > 1) {
140  CalcMinMax(ds,dsi);
141 
142  // from the final dataset, build the correlation matrix
143  for (UInt_t cl = 0; cl< dsi.GetNClasses(); cl++) {
144  const TString className = dsi.GetClassInfo(cl)->GetName();
145  dsi.SetCorrelationMatrix( className, CalcCorrelationMatrix( ds, cl ) );
146  dsi.PrintCorrelationMatrix( className );
147  }
148  Log() << kINFO << " " << Endl;
149  }
150 
151  return ds;
152 }
153 
154 ////////////////////////////////////////////////////////////////////////////////
155 
156 TMVA::DataSet* TMVA::DataSetFactory::BuildDynamicDataSet( TMVA::DataSetInfo& dsi )
157 {
158  Log() << kDEBUG << "Build DataSet consisting of one Event with dynamically changing variables" << Endl;
159  DataSet* ds = new DataSet(dsi);
160 
161  // create a DataSet with one Event which uses dynamic variables
162  // (pointers to variables)
163  if(dsi.GetNClasses()==0){
164  dsi.AddClass( "data" );
165  dsi.GetClassInfo( "data" )->SetNumber(0);
166  }
167 
168  std::vector<Float_t*>* evdyn = new std::vector<Float_t*>(0);
169 
170  std::vector<VariableInfo>& varinfos = dsi.GetVariableInfos();
171 
172  if (varinfos.empty())
173  Log() << kFATAL << "Dynamic data set cannot be built, since no variable information is present. Apparently no variables have been set. This should not happen; please contact the TMVA authors." << Endl;
174 
175  std::vector<VariableInfo>::iterator it = varinfos.begin(), itEnd=varinfos.end();
176  for (;it!=itEnd;++it) {
177  Float_t* external=(Float_t*)(*it).GetExternalLink();
178  if (external==0)
179  Log() << kDEBUG << "The link to the external variable is NULL while I am trying to build a dynamic data set. In this case fTmpEvent from MethodBase HAS TO BE USED in the method to get useful values in variables." << Endl;
180  evdyn->push_back (external);
181  }
182 
183  std::vector<VariableInfo>& spectatorinfos = dsi.GetSpectatorInfos();
184  it = spectatorinfos.begin();
185  for (;it!=spectatorinfos.end();it++) evdyn->push_back( (Float_t*)(*it).GetExternalLink() );
186 
187  TMVA::Event * ev = new Event((const std::vector<Float_t*>*&)evdyn, varinfos.size());
188  std::vector<Event*>* newEventVector = new std::vector<Event*>;
189  newEventVector->push_back(ev);
190 
191  ds->SetEventCollection(newEventVector, Types::kTraining);
192  ds->SetCurrentType( Types::kTraining );
193  ds->SetCurrentEvent( 0 );
194 
195  return ds;
196 }
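// Usage sketch (editor's illustration, assuming the variables in dsi carry
// external links set via VariableInfo::SetExternalLink): the single dynamic
// Event reads the user's memory directly, so no copying is needed:
//    Float_t myVar = 0;
//    dsi.GetVariableInfo(0).SetExternalLink( &myVar );
//    DataSet* ds = BuildDynamicDataSet( dsi );
//    myVar = 3.14f;   // immediately visible through ds->GetEvent()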
197 
198 ////////////////////////////////////////////////////////////////////////////////
199 /// if there are no entries, then create a DataSet with one Event which uses
200 /// dynamic variables (pointers to variables)
201 
204  DataInputHandler& dataInput )
205 {
206  if (dataInput.GetEntries()==0) return BuildDynamicDataSet( dsi );
207  // -------------------------------------------------------------------------
208 
209  // register the classes in the datasetinfo-object
210  // information comes from the trees in the dataInputHandler-object
211  std::vector< TString >* classList = dataInput.GetClassList();
212  for (std::vector<TString>::iterator it = classList->begin(); it< classList->end(); it++) {
213  dsi.AddClass( (*it) );
214  }
215  delete classList;
216 
217  EvtStatsPerClass eventCounts(dsi.GetNClasses());
218  TString normMode;
219  TString splitMode;
220  TString mixMode;
221  UInt_t splitSeed;
222  InitOptions( dsi, eventCounts, normMode, splitSeed, splitMode , mixMode );
223 
224  // ======= build event-vector from input, apply preselection ===============
225  EventVectorOfClassesOfTreeType tmpEventVector;
226  BuildEventVector( dsi, dataInput, tmpEventVector, eventCounts );
227 
228  DataSet* ds = MixEvents( dsi, tmpEventVector, eventCounts,
229  splitMode, mixMode, normMode, splitSeed);
230 
231  const Bool_t showCollectedOutput = kFALSE;
232  if (showCollectedOutput) {
233  Int_t maxL = dsi.GetClassNameMaxLength();
234  Log() << kINFO << "Collected:" << Endl;
235  for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
236  Log() << kINFO << " "
237  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
238  << " training entries: " << ds->GetNClassEvents( 0, cl ) << Endl;
239  Log() << kINFO << " "
240  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
241  << " testing entries: " << ds->GetNClassEvents( 1, cl ) << Endl;
242  }
243  Log() << kINFO << " " << Endl;
244  }
245 
246  return ds;
247 }
248 
249 ////////////////////////////////////////////////////////////////////////////////
250 /// checks a TTreeFormula for problems
251 
252 Bool_t TMVA::DataSetFactory::CheckTTreeFormula( TTreeFormula* ttf,
253  const TString& expression,
254  Bool_t& hasDollar )
255 {
256  Bool_t worked = kTRUE;
257 
258  if( ttf->GetNdim() <= 0 )
259  Log() << kFATAL << "Expression " << expression.Data()
260  << " could not be resolved to a valid formula. " << Endl;
261  if( ttf->GetNdata() == 0 ){
262  Log() << kWARNING << "Expression: " << expression.Data()
263  << " does not provide data for this event. "
264  << "This event is not taken into account. --> please check if you use as a variable "
265  << "an entry of an array which is not filled for some events "
266  << "(e.g. arr[4] when arr has only 3 elements)." << Endl;
267  Log() << kWARNING << "If you want to take the event into account you can do something like: "
268  << "\"Alt$(arr[4],0)\" where in cases where arr doesn't have a 4th element, "
269  << " 0 is taken as an alternative." << Endl;
270  worked = kFALSE;
271  }
272  if( expression.Contains("$") )
273  hasDollar = kTRUE;
274  else
275  {
276  for (int i = 0, iEnd = ttf->GetNcodes (); i < iEnd; ++i)
277  {
278  TLeaf* leaf = ttf->GetLeaf (i);
279  if (!leaf->IsOnTerminalBranch())
280  hasDollar = kTRUE;
281  }
282  }
283  return worked;
284 }
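// Example of the Alt$ workaround suggested in the warning above (editor's
// sketch; the factory call and names are illustrative): when booking a
// variable that indexes an array which may be shorter for some events, e.g.
//    factory->AddVariable( "Alt$(arr[4],0)", 'F' );
// 0 is used whenever arr has fewer than five elements, so GetNdata() no
// longer returns 0 for those events.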
285 
286 
287 ////////////////////////////////////////////////////////////////////////////////
288 /// While the data gets copied into the local training and testing
289 /// trees, the input tree can change (for instance when changing from
290 /// signal to background tree, or when using TChains as input). The
291 /// TTreeFormulas that hold the input expressions need to be
292 /// reassociated with the new tree, which is done here
293 
294 void TMVA::DataSetFactory::ChangeToNewTree( TreeInfo& tinfo, const DataSetInfo & dsi )
295 {
296  TTree *tr = tinfo.GetTree()->GetTree();
297 
298  tr->SetBranchStatus("*",1);
299 
300  Bool_t hasDollar = kFALSE;
301 
302  // 1) the input variable formulas
303  Log() << kDEBUG << "transform input variables" << Endl;
304  std::vector<TTreeFormula*>::const_iterator formIt, formItEnd;
305  for (formIt = fInputFormulas.begin(), formItEnd=fInputFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
306  fInputFormulas.clear();
307  TTreeFormula* ttf = 0;
308 
309  for (UInt_t i=0; i<dsi.GetNVariables(); i++) {
310  ttf = new TTreeFormula( Form( "Formula%s", dsi.GetVariableInfo(i).GetInternalName().Data() ),
311  dsi.GetVariableInfo(i).GetExpression().Data(), tr );
312  CheckTTreeFormula( ttf, dsi.GetVariableInfo(i).GetExpression(), hasDollar );
313  fInputFormulas.push_back( ttf );
314  }
315 
316  //
317  // targets
318  //
319  Log() << kDEBUG << "transform regression targets" << Endl;
320  for (formIt = fTargetFormulas.begin(), formItEnd = fTargetFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
321  fTargetFormulas.clear();
322  for (UInt_t i=0; i<dsi.GetNTargets(); i++) {
323  ttf = new TTreeFormula( Form( "Formula%s", dsi.GetTargetInfo(i).GetInternalName().Data() ),
324  dsi.GetTargetInfo(i).GetExpression().Data(), tr );
325  CheckTTreeFormula( ttf, dsi.GetTargetInfo(i).GetExpression(), hasDollar );
326  fTargetFormulas.push_back( ttf );
327  }
328 
329  //
330  // spectators
331  //
332  Log() << kDEBUG << "transform spectator variables" << Endl;
333  for (formIt = fSpectatorFormulas.begin(), formItEnd = fSpectatorFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
334  fSpectatorFormulas.clear();
335  for (UInt_t i=0; i<dsi.GetNSpectators(); i++) {
336  ttf = new TTreeFormula( Form( "Formula%s", dsi.GetSpectatorInfo(i).GetInternalName().Data() ),
337  dsi.GetSpectatorInfo(i).GetExpression().Data(), tr );
338  CheckTTreeFormula( ttf, dsi.GetSpectatorInfo(i).GetExpression(), hasDollar );
339  fSpectatorFormulas.push_back( ttf );
340  }
341 
342  //
343  // the cuts (one per class, if non-existent: formula pointer = 0)
344  //
345  Log() << kDEBUG << "transform cuts" << Endl;
346  for (formIt = fCutFormulas.begin(), formItEnd = fCutFormulas.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
347  fCutFormulas.clear();
348  for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
349  const TCut& tmpCut = dsi.GetClassInfo(clIdx)->GetCut();
350  const TString tmpCutExp(tmpCut.GetTitle());
351  ttf = 0;
352  if (tmpCutExp!="") {
353  ttf = new TTreeFormula( Form("CutClass%i",clIdx), tmpCutExp, tr );
354  Bool_t worked = CheckTTreeFormula( ttf, tmpCutExp, hasDollar );
355  if( !worked ){
356  Log() << kWARNING << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
357  << "\" cut \"" << dsi.GetClassInfo(clIdx)->GetCut() << "\"" << Endl;
358  }
359  }
360  fCutFormulas.push_back( ttf );
361  }
362 
363  //
364  // the weights (one per class, if non-existent: formula pointer = 0)
365  //
366  Log() << kDEBUG << "transform weights" << Endl;
367  for (formIt = fWeightFormula.begin(), formItEnd = fWeightFormula.end(); formIt!=formItEnd; formIt++) if (*formIt) delete *formIt;
368  fWeightFormula.clear();
369  for (UInt_t clIdx=0; clIdx<dsi.GetNClasses(); clIdx++) {
370  const TString tmpWeight = dsi.GetClassInfo(clIdx)->GetWeight();
371 
372  if (dsi.GetClassInfo(clIdx)->GetName() != tinfo.GetClassName() ) { // if the tree is of another class
373  fWeightFormula.push_back( 0 );
374  continue;
375  }
376 
377  ttf = 0;
378  if (tmpWeight!="") {
379  ttf = new TTreeFormula( "FormulaWeight", tmpWeight, tr );
380  Bool_t worked = CheckTTreeFormula( ttf, tmpWeight, hasDollar );
381  if( !worked ){
382  Log() << kWARNING << "Please check class \"" << dsi.GetClassInfo(clIdx)->GetName()
383  << "\" weight \"" << dsi.GetClassInfo(clIdx)->GetWeight() << "\"" << Endl;
384  }
385  }
386  else {
387  ttf = 0;
388  }
389  fWeightFormula.push_back( ttf );
390  }
391  Log() << kDEBUG << "enable branches" << Endl;
392  // now enable only branches that are needed in any input formula, target, cut, weight
393 
394  if (!hasDollar) {
395  tr->SetBranchStatus("*",0);
396  Log() << kDEBUG << "enable branches: input variables" << Endl;
397  // input vars
398  for (formIt = fInputFormulas.begin(); formIt!=fInputFormulas.end(); formIt++) {
399  ttf = *formIt;
400  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++) {
401  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
402  }
403  }
404  // targets
405  Log() << kDEBUG << "enable branches: targets" << Endl;
406  for (formIt = fTargetFormulas.begin(); formIt!=fTargetFormulas.end(); formIt++) {
407  ttf = *formIt;
408  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
409  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
410  }
411  // spectators
412  Log() << kDEBUG << "enable branches: spectators" << Endl;
413  for (formIt = fSpectatorFormulas.begin(); formIt!=fSpectatorFormulas.end(); formIt++) {
414  ttf = *formIt;
415  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
416  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
417  }
418  // cuts
419  Log() << kDEBUG << "enable branches: cuts" << Endl;
420  for (formIt = fCutFormulas.begin(); formIt!=fCutFormulas.end(); formIt++) {
421  ttf = *formIt;
422  if (!ttf) continue;
423  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
424  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
425  }
426  // weights
427  Log() << kDEBUG << "enable branches: weights" << Endl;
428  for (formIt = fWeightFormula.begin(); formIt!=fWeightFormula.end(); formIt++) {
429  ttf = *formIt;
430  if (!ttf) continue;
431  for (Int_t bi = 0; bi<ttf->GetNcodes(); bi++)
432  tr->SetBranchStatus( ttf->GetLeaf(bi)->GetBranch()->GetName(), 1 );
433  }
434  }
435  Log() << kDEBUG << "tree initialized" << Endl;
436  return;
437 }
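// Note on the hasDollar flag used above: expressions containing TTree
// specials such as "Alt$(...)", or leaves on non-terminal branches, cannot be
// mapped to a fixed branch list, so in that case all branches stay enabled;
// otherwise only the branches actually referenced by the input, target,
// spectator, cut and weight formulas are read, which speeds up the event loop.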
438 
439 ////////////////////////////////////////////////////////////////////////////////
440 /// determine the minimum and maximum values of all variables, targets and spectators
441 
442 void TMVA::DataSetFactory::CalcMinMax( DataSet* ds, TMVA::DataSetInfo& dsi )
443 {
444  const UInt_t nvar = ds->GetNVariables();
445  const UInt_t ntgts = ds->GetNTargets();
446  const UInt_t nvis = ds->GetNSpectators();
447 
448  Float_t *min = new Float_t[nvar];
449  Float_t *max = new Float_t[nvar];
450  Float_t *tgmin = new Float_t[ntgts];
451  Float_t *tgmax = new Float_t[ntgts];
452  Float_t *vmin = new Float_t[nvis];
453  Float_t *vmax = new Float_t[nvis];
454 
455  for (UInt_t ivar=0; ivar<nvar ; ivar++) { min[ivar] = FLT_MAX; max[ivar] = -FLT_MAX; }
456  for (UInt_t ivar=0; ivar<ntgts; ivar++) { tgmin[ivar] = FLT_MAX; tgmax[ivar] = -FLT_MAX; }
457  for (UInt_t ivar=0; ivar<nvis; ivar++) { vmin[ivar] = FLT_MAX; vmax[ivar] = -FLT_MAX; }
458 
459  // perform event loop
460 
461  for (Int_t i=0; i<ds->GetNEvents(); i++) {
462  const Event * ev = ds->GetEvent(i);
463  for (UInt_t ivar=0; ivar<nvar; ivar++) {
464  Double_t v = ev->GetValue(ivar);
465  if (v<min[ivar]) min[ivar] = v;
466  if (v>max[ivar]) max[ivar] = v;
467  }
468  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
469  Double_t v = ev->GetTarget(itgt);
470  if (v<tgmin[itgt]) tgmin[itgt] = v;
471  if (v>tgmax[itgt]) tgmax[itgt] = v;
472  }
473  for (UInt_t ivis=0; ivis<nvis; ivis++) {
474  Double_t v = ev->GetSpectator(ivis);
475  if (v<vmin[ivis]) vmin[ivis] = v;
476  if (v>vmax[ivis]) vmax[ivis] = v;
477  }
478  }
479 
480  for (UInt_t ivar=0; ivar<nvar; ivar++) {
481  dsi.GetVariableInfo(ivar).SetMin(min[ivar]);
482  dsi.GetVariableInfo(ivar).SetMax(max[ivar]);
483  if( TMath::Abs(max[ivar]-min[ivar]) <= FLT_MIN )
484  Log() << kFATAL << "Variable " << dsi.GetVariableInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
485  }
486  for (UInt_t ivar=0; ivar<ntgts; ivar++) {
487  dsi.GetTargetInfo(ivar).SetMin(tgmin[ivar]);
488  dsi.GetTargetInfo(ivar).SetMax(tgmax[ivar]);
489  if( TMath::Abs(tgmax[ivar]-tgmin[ivar]) <= FLT_MIN )
490  Log() << kFATAL << "Target " << dsi.GetTargetInfo(ivar).GetExpression().Data() << " is constant. Please remove the variable." << Endl;
491  }
492  for (UInt_t ivar=0; ivar<nvis; ivar++) {
493  dsi.GetSpectatorInfo(ivar).SetMin(vmin[ivar]);
494  dsi.GetSpectatorInfo(ivar).SetMax(vmax[ivar]);
495  // if( TMath::Abs(vmax[ivar]-vmin[ivar]) <= FLT_MIN )
496  // Log() << kWARNING << "Spectator variable " << dsi.GetSpectatorInfo(ivar).GetExpression().Data() << " is constant." << Endl;
497  }
498  delete [] min;
499  delete [] max;
500  delete [] tgmin;
501  delete [] tgmax;
502  delete [] vmin;
503  delete [] vmax;
504 }
505 
506 ////////////////////////////////////////////////////////////////////////////////
507 /// computes the correlation matrix for the input variables in the dataset;
508 /// only events of the class given by "classNumber" enter the
509 /// calculation
510 
511 TMatrixD* TMVA::DataSetFactory::CalcCorrelationMatrix( DataSet* ds, const UInt_t classNumber )
512 {
513  // first compute variance-covariance
514  TMatrixD* mat = CalcCovarianceMatrix( ds, classNumber );
515 
516  // now the correlation
517  UInt_t nvar = ds->GetNVariables(), ivar, jvar;
518 
519  for (ivar=0; ivar<nvar; ivar++) {
520  for (jvar=0; jvar<nvar; jvar++) {
521  if (ivar != jvar) {
522  Double_t d = (*mat)(ivar, ivar)*(*mat)(jvar, jvar);
523  if (d > 0) (*mat)(ivar, jvar) /= sqrt(d);
524  else {
525  Log() << kWARNING << "<GetCorrelationMatrix> Zero variances for variables "
526  << "(" << ivar << ", " << jvar << ") = " << d
527  << Endl;
528  (*mat)(ivar, jvar) = 0;
529  }
530  }
531  }
532  }
533 
534  for (ivar=0; ivar<nvar; ivar++) (*mat)(ivar, ivar) = 1.0;
535 
536  return mat;
537 }
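// The normalisation applied above turns the covariance matrix C into the
// correlation matrix rho:
//    rho(i,j) = C(i,j) / sqrt( C(i,i)*C(j,j) ),   rho(i,i) = 1,
// with rho(i,j) set to 0 whenever one of the variances vanishes.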
538 
539 ////////////////////////////////////////////////////////////////////////////////
540 /// compute covariance matrix
541 
542 TMatrixD* TMVA::DataSetFactory::CalcCovarianceMatrix( DataSet* ds, const UInt_t classNumber )
543 {
544  UInt_t nvar = ds->GetNVariables();
545  UInt_t ivar = 0, jvar = 0;
546 
547  TMatrixD* mat = new TMatrixD( nvar, nvar );
548 
549  // init matrices
550  TVectorD vec(nvar);
551  TMatrixD mat2(nvar, nvar);
552  for (ivar=0; ivar<nvar; ivar++) {
553  vec(ivar) = 0;
554  for (jvar=0; jvar<nvar; jvar++) mat2(ivar, jvar) = 0;
555  }
556 
557  // perform event loop
558  Double_t ic = 0;
559  for (Int_t i=0; i<ds->GetNEvents(); i++) {
560 
561  const Event * ev = ds->GetEvent(i);
562  if (ev->GetClass() != classNumber ) continue;
563 
564  Double_t weight = ev->GetWeight();
565  ic += weight; // count used events
566 
567  for (ivar=0; ivar<nvar; ivar++) {
568 
569  Double_t xi = ev->GetValue(ivar);
570  vec(ivar) += xi*weight;
571  mat2(ivar, ivar) += (xi*xi*weight);
572 
573  for (jvar=ivar+1; jvar<nvar; jvar++) {
574  Double_t xj = ev->GetValue(jvar);
575  mat2(ivar, jvar) += (xi*xj*weight);
576  }
577  }
578  }
579 
580  for (ivar=0; ivar<nvar; ivar++)
581  for (jvar=ivar+1; jvar<nvar; jvar++)
582  mat2(jvar, ivar) = mat2(ivar, jvar); // symmetric matrix
583 
584 
585  // variance-covariance
586  for (ivar=0; ivar<nvar; ivar++) {
587  for (jvar=0; jvar<nvar; jvar++) {
588  (*mat)(ivar, jvar) = mat2(ivar, jvar)/ic - vec(ivar)*vec(jvar)/(ic*ic);
589  }
590  }
591 
592  return mat;
593 }
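// With W = sum of event weights, S(i) = sum of w*x_i and S(i,j) = sum of
// w*x_i*x_j accumulated in the loop above, the returned matrix is the
// weighted covariance
//    C(i,j) = S(i,j)/W - S(i)*S(j)/W^2 ,
// i.e. <x_i*x_j> - <x_i>*<x_j> with weighted means.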
594 
595 // --------------------------------------- new versions
596 
597 ////////////////////////////////////////////////////////////////////////////////
598 /// the dataset splitting
599 
600 void
601 TMVA::DataSetFactory::InitOptions( TMVA::DataSetInfo& dsi,
602  EvtStatsPerClass& nEventRequests,
603  TString& normMode,
604  UInt_t& splitSeed,
605  TString& splitMode,
606  TString& mixMode )
607 {
608  Configurable splitSpecs( dsi.GetSplitOptions() );
609  splitSpecs.SetConfigName("DataSetFactory");
610  splitSpecs.SetConfigDescription( "Configuration options given in the \"PrepareForTrainingAndTesting\" call; these options define the creation of the data sets used for training and expert validation by TMVA" );
611 
612  splitMode = "Random"; // the splitting mode
613  splitSpecs.DeclareOptionRef( splitMode, "SplitMode",
614  "Method of picking training and testing events (default: random)" );
615  splitSpecs.AddPreDefVal(TString("Random"));
616  splitSpecs.AddPreDefVal(TString("Alternate"));
617  splitSpecs.AddPreDefVal(TString("Block"));
618 
619  mixMode = "SameAsSplitMode"; // the mixing mode
620  splitSpecs.DeclareOptionRef( mixMode, "MixMode",
621  "Method of mixing events of different classes into one dataset (default: SameAsSplitMode)" );
622  splitSpecs.AddPreDefVal(TString("SameAsSplitMode"));
623  splitSpecs.AddPreDefVal(TString("Random"));
624  splitSpecs.AddPreDefVal(TString("Alternate"));
625  splitSpecs.AddPreDefVal(TString("Block"));
626 
627  splitSeed = 100;
628  splitSpecs.DeclareOptionRef( splitSeed, "SplitSeed",
629  "Seed for random event shuffling" );
630 
631  normMode = "EqualNumEvents"; // the weight normalisation modes
632  splitSpecs.DeclareOptionRef( normMode, "NormMode",
633  "Overall renormalisation of event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)" );
634  splitSpecs.AddPreDefVal(TString("None"));
635  splitSpecs.AddPreDefVal(TString("NumEvents"));
636  splitSpecs.AddPreDefVal(TString("EqualNumEvents"));
637 
638  splitSpecs.DeclareOptionRef(fScaleWithPreselEff=kFALSE,"ScaleWithPreselEff","Scale the number of requested events by the eff. of the preselection cuts (or not)" );
639 
640  // the number of events
641 
642  // fill in the numbers
643  for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
644  TString clName = dsi.GetClassInfo(cl)->GetName();
645  TString titleTrain = TString().Format("Number of training events of class %s (default: 0 = all)",clName.Data()).Data();
646  TString titleTest = TString().Format("Number of test events of class %s (default: 0 = all)",clName.Data()).Data();
647 
648  splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTrainingEventsRequested, TString("nTrain_")+clName, titleTrain );
649  splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTestingEventsRequested , TString("nTest_")+clName , titleTest );
650  }
651 
652  splitSpecs.DeclareOptionRef( fVerbose, "V", "Verbosity (default: false)" );
653 
654  splitSpecs.DeclareOptionRef( fVerboseLevel=TString("Info"), "VerboseLevel", "VerboseLevel (Debug/Verbose/Info)" );
655  splitSpecs.AddPreDefVal(TString("Debug"));
656  splitSpecs.AddPreDefVal(TString("Verbose"));
657  splitSpecs.AddPreDefVal(TString("Info"));
658 
659  splitSpecs.ParseOptions();
660  splitSpecs.CheckForUnusedOptions();
661 
662  // output logging verbosity
663  if (Verbose()) fLogger->SetMinType( kVERBOSE );
664  if (fVerboseLevel.CompareTo("Debug") ==0) fLogger->SetMinType( kDEBUG );
665  if (fVerboseLevel.CompareTo("Verbose") ==0) fLogger->SetMinType( kVERBOSE );
666  if (fVerboseLevel.CompareTo("Info") ==0) fLogger->SetMinType( kINFO );
667 
668  // put all to upper case
669  splitMode.ToUpper(); mixMode.ToUpper(); normMode.ToUpper();
670  // adjust mixmode if same as splitmode option has been set
671  Log() << kINFO << "Splitmode is: \"" << splitMode << "\" the mixmode is: \"" << mixMode << "\"" << Endl;
672  if (mixMode=="SAMEASSPLITMODE") mixMode = splitMode;
673  else if (mixMode!=splitMode)
674  Log() << kINFO << "DataSet splitmode="<<splitMode
675  <<" differs from mixmode="<<mixMode<<Endl;
676 }
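// The options declared above arrive as the third argument of
// TMVA::Factory::PrepareTrainingAndTestTree; a typical call looks like
// (editor's sketch, cut and count values are illustrative):
//    factory->PrepareTrainingAndTestTree( sigCut, bkgCut,
//       "nTrain_Signal=1000:nTrain_Background=1000:"
//       "SplitMode=Random:SplitSeed=100:NormMode=NumEvents:!V" );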
677 
678 
679 ////////////////////////////////////////////////////////////////////////////////
680 /// build empty event vectors
681 /// distributes events between kTraining/kTesting/kMaxTreeType
682 
683 void
684 TMVA::DataSetFactory::BuildEventVector( TMVA::DataSetInfo& dsi,
685  TMVA::DataInputHandler& dataInput,
686  EventVectorOfClassesOfTreeType& eventsmap,
687  EvtStatsPerClass& eventCounts)
688 {
689  const UInt_t nclasses = dsi.GetNClasses();
690 
691  eventsmap[ Types::kTraining ] = EventVectorOfClasses(nclasses);
692  eventsmap[ Types::kTesting ] = EventVectorOfClasses(nclasses);
693  eventsmap[ Types::kMaxTreeType ] = EventVectorOfClasses(nclasses);
694 
695  // create the type, weight and boostweight branches
696  const UInt_t nvars = dsi.GetNVariables();
697  const UInt_t ntgts = dsi.GetNTargets();
698  const UInt_t nvis = dsi.GetNSpectators();
699 
700  for (size_t i=0; i<nclasses; i++) {
701  eventCounts[i].varAvLength = new Float_t[nvars];
702  for (UInt_t ivar=0; ivar<nvars; ivar++)
703  eventCounts[i].varAvLength[ivar] = 0;
704  }
705 
706  // Bool_t haveArrayVariable = kFALSE;
707  Bool_t *varIsArray = new Bool_t[nvars];
708 
709  // if we work with chains we need to remember the current tree if
710  // the chain jumps to a new tree we have to reset the formulas
711  for (UInt_t cl=0; cl<nclasses; cl++) {
712 
713  Log() << kINFO << "Create training and testing trees -- looping over class \""
714  << dsi.GetClassInfo(cl)->GetName() << "\" ..." << Endl;
715 
716  EventStats& classEventCounts = eventCounts[cl];
717 
718  // info output for weights
719  Log() << kINFO << "Weight expression for class \'" << dsi.GetClassInfo(cl)->GetName() << "\': \""
720  << dsi.GetClassInfo(cl)->GetWeight() << "\"" << Endl;
721 
722  // used for chains only
723  TString currentFileName("");
724 
725  std::vector<TreeInfo>::const_iterator treeIt(dataInput.begin(dsi.GetClassInfo(cl)->GetName()));
726  for (;treeIt!=dataInput.end(dsi.GetClassInfo(cl)->GetName()); treeIt++) {
727 
728  // read first the variables
729  std::vector<Float_t> vars(nvars);
730  std::vector<Float_t> tgts(ntgts);
731  std::vector<Float_t> vis(nvis);
732  TreeInfo currentInfo = *treeIt;
733 
734  Log() << kDEBUG << "Building event vectors " << currentInfo.GetTreeType() << Endl;
735 
736  EventVector& event_v = eventsmap[currentInfo.GetTreeType()].at(cl);
737 
738  Bool_t isChain = (TString("TChain") == currentInfo.GetTree()->ClassName());
739  currentInfo.GetTree()->LoadTree(0);
740  ChangeToNewTree( currentInfo, dsi );
741 
742  // count number of events in tree before cut
743  classEventCounts.nInitialEvents += currentInfo.GetTree()->GetEntries();
744 
745  // loop over events in ntuple
746  const UInt_t nEvts = currentInfo.GetTree()->GetEntries();
747  for (Long64_t evtIdx = 0; evtIdx < nEvts; evtIdx++) {
748  currentInfo.GetTree()->LoadTree(evtIdx);
749 
750  // may need to reload tree in case of chains
751  if (isChain) {
752  if (currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName() != currentFileName) {
753  currentFileName = currentInfo.GetTree()->GetTree()->GetDirectory()->GetFile()->GetName();
754  ChangeToNewTree( currentInfo, dsi );
755  }
756  }
757  currentInfo.GetTree()->GetEntry(evtIdx);
758  Int_t sizeOfArrays = 1;
759  Int_t prevArrExpr = 0;
760 
761  // ======= evaluate all formulas =================
762 
763  // first we check if some of the formulas are arrays
764  for (UInt_t ivar=0; ivar<nvars; ivar++) {
765  Int_t ndata = fInputFormulas[ivar]->GetNdata();
766  classEventCounts.varAvLength[ivar] += ndata;
767  if (ndata == 1) continue;
768  // haveArrayVariable = kTRUE;
769  varIsArray[ivar] = kTRUE;
770  if (sizeOfArrays == 1) {
771  sizeOfArrays = ndata;
772  prevArrExpr = ivar;
773  }
774  else if (sizeOfArrays!=ndata) {
775  Log() << kERROR << "ERROR while preparing training and testing trees:" << Endl;
776  Log() << " multiple array-type expressions of different length were encountered" << Endl;
777  Log() << " location of error: event " << evtIdx
778  << " in tree " << currentInfo.GetTree()->GetName()
779  << " of file " << currentInfo.GetTree()->GetCurrentFile()->GetName() << Endl;
780  Log() << " expression " << fInputFormulas[ivar]->GetTitle() << " has "
781  << ndata << " entries, while" << Endl;
782  Log() << " expression " << fInputFormulas[prevArrExpr]->GetTitle() << " has "
783  << fInputFormulas[prevArrExpr]->GetNdata() << " entries" << Endl;
784  Log() << kFATAL << "Need to abort" << Endl;
785  }
786  }
787 
788  // now we read the information
789  for (Int_t idata = 0; idata<sizeOfArrays; idata++) {
790  Bool_t containsNaN = kFALSE;
791 
792  TTreeFormula* formula = 0;
793 
794  // the cut expression
795  Float_t cutVal = 1;
796  formula = fCutFormulas[cl];
797  if (formula) {
798  Int_t ndata = formula->GetNdata();
799  cutVal = (ndata==1 ?
800  formula->EvalInstance(0) :
801  formula->EvalInstance(idata));
802  if (TMath::IsNaN(cutVal)) {
803  containsNaN = kTRUE;
804  Log() << kWARNING << "Cut expression resolves to an invalid value (NaN): "
805  << formula->GetTitle() << Endl;
806  }
807  }
808 
809  // the input variable
810  for (UInt_t ivar=0; ivar<nvars; ivar++) {
811  formula = fInputFormulas[ivar];
812  Int_t ndata = formula->GetNdata();
813  vars[ivar] = (ndata == 1 ?
814  formula->EvalInstance(0) :
815  formula->EvalInstance(idata));
816  if (TMath::IsNaN(vars[ivar])) {
817  containsNaN = kTRUE;
818  Log() << kWARNING << "Input expression resolves to an invalid value (NaN): "
819  << formula->GetTitle() << Endl;
820  }
821  }
822 
823  // the targets
824  for (UInt_t itrgt=0; itrgt<ntgts; itrgt++) {
825  formula = fTargetFormulas[itrgt];
826  Int_t ndata = formula->GetNdata();
827  tgts[itrgt] = (ndata == 1 ?
828  formula->EvalInstance(0) :
829  formula->EvalInstance(idata));
830  if (TMath::IsNaN(tgts[itrgt])) {
831  containsNaN = kTRUE;
832  Log() << kWARNING << "Target expression resolves to an invalid value (NaN): "
833  << formula->GetTitle() << Endl;
834  }
835  }
836 
837  // the spectators
838  for (UInt_t itVis=0; itVis<nvis; itVis++) {
839  formula = fSpectatorFormulas[itVis];
840  Int_t ndata = formula->GetNdata();
841  vis[itVis] = (ndata == 1 ?
842  formula->EvalInstance(0) :
843  formula->EvalInstance(idata));
844  if (TMath::IsNaN(vis[itVis])) {
845  containsNaN = kTRUE;
846  Log() << kWARNING << "Spectator expression resolves to an invalid value (NaN): "
847  << formula->GetTitle() << Endl;
848  }
849  }
850 
851 
852  // the weight
853  Float_t weight = currentInfo.GetWeight(); // multiply by tree weight
854  formula = fWeightFormula[cl];
855  if (formula!=0) {
856  Int_t ndata = formula->GetNdata();
857  weight *= (ndata == 1 ?
858  formula->EvalInstance() :
859  formula->EvalInstance(idata));
860  if (TMath::IsNaN(weight)) {
861  containsNaN = kTRUE;
862  Log() << kWARNING << "Weight expression resolves to an invalid value (NaN): "
863  << formula->GetTitle() << Endl;
864  }
865  }
866 
867  // Count the events before rejection due to cut or NaN
868  // value (weighted and unweighted)
869  classEventCounts.nEvBeforeCut++;
870  if (!TMath::IsNaN(weight))
871  classEventCounts.nWeEvBeforeCut += weight;
872 
873  // apply the cut, skip rest if cut is not fulfilled
874  if (cutVal<0.5) continue;
875 
876  // global flag if negative weights exist -> can be used
877  // by classifiers who may require special data
878  // treatment (also print warning)
879  if (weight < 0) classEventCounts.nNegWeights++;
880 
881  // now read the event-values (variables and regression targets)
882 
883  if (containsNaN) {
884  Log() << kWARNING << "Event " << evtIdx
885  << " rejected because it contains a NaN value" << Endl;
886  continue;
887  }
888 
889  // Count the events after rejection due to cut or NaN value
890  // (weighted and unweighted)
891  classEventCounts.nEvAfterCut++;
892  classEventCounts.nWeEvAfterCut += weight;
893 
894  // event accepted, fill temporary ntuple
895  event_v.push_back(new Event(vars, tgts , vis, cl , weight));
896  }
897  }
898  currentInfo.GetTree()->ResetBranchAddresses();
899  }
900  }
901 
902  // for output format, get the maximum class name length
903  Int_t maxL = dsi.GetClassNameMaxLength();
904 
905  Log() << kINFO << "Number of events in input trees (after possible flattening of arrays):" << Endl;
906  for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
907  Log() << kINFO << " "
908  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
909  << " -- number of events : "
910  << std::setw(5) << eventCounts[cl].nEvBeforeCut
911  << " / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvBeforeCut << Endl;
912  }
913 
914  for (UInt_t cl = 0; cl < dsi.GetNClasses(); cl++) {
915  Log() << kINFO << " " << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
916  <<" tree -- total number of entries: "
917  << std::setw(5) << dataInput.GetEntries(dsi.GetClassInfo(cl)->GetName()) << Endl;
918  }
919 
920  if (fScaleWithPreselEff)
921  Log() << kINFO << "Preselection: (will affect number of requested training and testing events)" << Endl;
922  else
923  Log() << kINFO << "Preselection: (will NOT affect number of requested training and testing events)" << Endl;
924 
925  if (dsi.HasCuts()) {
926  for (UInt_t cl = 0; cl< dsi.GetNClasses(); cl++) {
927  Log() << kINFO << " " << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
928  << " requirement: \"" << dsi.GetClassInfo(cl)->GetCut() << "\"" << Endl;
929  Log() << kINFO << " "
930  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
931  << " -- number of events passed: "
932  << std::setw(5) << eventCounts[cl].nEvAfterCut
933  << " / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvAfterCut << Endl;
934  Log() << kINFO << " "
935  << setiosflags(ios::left) << std::setw(maxL) << dsi.GetClassInfo(cl)->GetName()
936  << " -- efficiency : "
937  << std::setw(6) << eventCounts[cl].nWeEvAfterCut/eventCounts[cl].nWeEvBeforeCut << Endl;
938  }
939  }
940  else Log() << kINFO << " No preselection cuts applied on event classes" << Endl;
941 
942  delete[] varIsArray;
943 
944 }
945 
946 ////////////////////////////////////////////////////////////////////////////////
947 /// Select and distribute unassigned events to kTraining and kTesting
948 ///Bool_t emptyUndefined = kTRUE;
949 
950 TMVA::DataSet*
951 TMVA::DataSetFactory::MixEvents( DataSetInfo& dsi,
952  EventVectorOfClassesOfTreeType& tmpEventVector,
953  EvtStatsPerClass& eventCounts,
954  const TString& splitMode,
955  const TString& mixMode,
956  const TString& normMode,
957  UInt_t splitSeed)
958 {
959  // check if the vectors of all classes are empty
960  //for( Int_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
961  // emptyUndefined &= tmpEventVector[Types::kMaxTreeType].at(cls).empty();
962  //}
963 
964  TMVA::RandomGenerator rndm( splitSeed );
965 
966  // ==== splitting of undefined events to kTraining and kTesting
967 
968  // if splitMode contains "RANDOM", then shuffle the undefined events
969  if (splitMode.Contains( "RANDOM" ) /*&& !emptyUndefined*/ ) {
970  // random shuffle the undefined events of each class
971  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
972  EventVector& unspecifiedEvents = tmpEventVector[Types::kMaxTreeType].at(cls);
973  if( ! unspecifiedEvents.empty() ) {
974  Log() << kDEBUG << "randomly shuffling "
975  << unspecifiedEvents.size()
976  << " events of class " << cls
977  << " which are not yet associated to testing or training" << Endl;
978  std::random_shuffle( unspecifiedEvents.begin(),
979  unspecifiedEvents.end(),
980  rndm );
981  }
982  }
983  }
984 
985  // check for each class the number of training and testing events, the requested number and the available number
986  Log() << kDEBUG << "SPLITTING ========" << Endl;
987  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
988  Log() << kDEBUG << "---- class " << cls << Endl;
989  Log() << kDEBUG << "check number of training/testing events, requested and available number of events and for class " << cls << Endl;
990 
991  // check if enough or too many events are already in the training/testing eventvectors of the class cls
992  EventVector& eventVectorTraining = tmpEventVector[ Types::kTraining ].at(cls);
993  EventVector& eventVectorTesting = tmpEventVector[ Types::kTesting ].at(cls);
994  EventVector& eventVectorUndefined = tmpEventVector[ Types::kMaxTreeType ].at(cls);
995 
996  Int_t availableTraining = eventVectorTraining.size();
997  Int_t availableTesting = eventVectorTesting.size();
998  Int_t availableUndefined = eventVectorUndefined.size();
999 
1000  Float_t presel_scale;
1001  if (fScaleWithPreselEff) {
1002  presel_scale = eventCounts[cls].cutScaling();
1003  if (presel_scale < 1)
1004  Log() << kINFO << " you have opted for scaling the number of requested training/testing events\n to be scaled by the preselection efficiency"<< Endl;
1005  }else{
1006  presel_scale = 1.; // this scaling was too confusing to most people, including me! Sorry... (Helge)
1007  if (eventCounts[cls].cutScaling() < 1)
1008  Log() << kINFO << " you have opted for interpreting the requested number of training/testing events\n to be the number of events AFTER your preselection cuts" << Endl;
1009 
1010  }
1011  Int_t requestedTraining = Int_t(eventCounts[cls].nTrainingEventsRequested * presel_scale);
1012  Int_t requestedTesting = Int_t(eventCounts[cls].nTestingEventsRequested * presel_scale);
1013 
1014  Log() << kDEBUG << "events in training trees : " << availableTraining << Endl;
1015  Log() << kDEBUG << "events in testing trees : " << availableTesting << Endl;
1016  Log() << kDEBUG << "events in unspecified trees : " << availableUndefined << Endl;
1017  Log() << kDEBUG << "requested for training : " << requestedTraining;
1018 
1019  if(presel_scale<1)
1020  Log() << " ( " << eventCounts[cls].nTrainingEventsRequested
1021  << " * " << presel_scale << " preselection efficiency)" << Endl;
1022  else
1023  Log() << Endl;
1024  Log() << kDEBUG << "requested for testing : " << requestedTesting;
1025  if(presel_scale<1)
1026  Log() << " ( " << eventCounts[cls].nTestingEventsRequested
1027  << " * " << presel_scale << " preselection efficiency)" << Endl;
1028  else
1029  Log() << Endl;
1030 
1031  // nomenclature r = available training
1032  // s = available testing
1033  // u = available undefined
1034  // R = requested training
1035  // S = requested testing
1036  // nR = to be used to select training events
1037  // nS = to be used to select test events
1038  // we have the constraint: nR + nS < r+s+u,
1039  // since we can not use more events than we have
1040  // free events: Nfree = u-Thet(R-r)-Thet(S-s)
1041  // nomenclature: Thet(x) = x, if x>0 else 0
1042  // nR = max(R,r) + 0.5 * Nfree
1043  // nS = max(S,s) + 0.5 * Nfree
1044 // check: nR + nS = R+S + (u - (R-r) - (S-s)) = u+r+s   ok, for R>r and S>s
1045 // check: nR + nS = r+S + (u - (S-s)) = u+r+s            ok, for r>R and S>s
1046 
1047  // three different cases might occur here
1048  //
1049  // Case a
1050  // requestedTraining and requestedTesting >0
1051  // free events: Nfree = u-Thet(R-r)-Thet(S-s)
1052  // nR = Max(R,r) + 0.5 * Nfree
1053  // nS = Max(S,s) + 0.5 * Nfree
1054  //
1055  // Case b
1056  // exactly one of requestedTraining or requestedTesting >0
1057  // assume training R >0
1058  // nR = max(R,r)
1059  // nS = s+u+r-nR
1060  // and s=nS
1061  //
1062  // Case c
1063  // requestedTraining=0, requestedTesting=0
1064  // Nfree = u-|r-s|
1065  // if NFree >=0
1066  // R = Max(r,s) + 0.5 * Nfree = S
1067  // else if r>s
1068  // R = r; S=s+u
1069  // else
1070  // R = r+u; S=s
1071  //
1072  // Next steps:
1073  // Determination of Event numbers R,S, nR, nS
1074  // distribute undefined events according to nR, nS
1075  // finally determine actual sub samples from nR and nS to be used in training / testing
1076  //
1077 
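// Worked example for case a (editor's illustration): r=100, s=50, u=200 and
// requests R=250, S=100. Then Nfree = u - (R-r) - (S-s) = 200 - 150 - 50 = 0,
// so nR = Max(250,100) + 0/2 = 250 and nS = (r+s+u) - nR = 350 - 250 = 100,
// i.e. exactly the requested split.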
1078  Int_t useForTesting(0),useForTraining(0);
1079  Int_t allAvailable(availableUndefined + availableTraining + availableTesting);
1080 
1081  if( (requestedTraining == 0) && (requestedTesting == 0)){
1082 
1083  // Case C: balance the number of training and testing events
1084 
1085  if ( availableUndefined >= TMath::Abs(availableTraining - availableTesting) ) {
1086  // enough unspecified are available to equal training and testing
1087  useForTraining = useForTesting = allAvailable/2;
1088  } else {
1089  // all unspecified are assigned to the smaller of training / testing
1090  useForTraining = availableTraining;
1091  useForTesting = availableTesting;
1092  if (availableTraining < availableTesting)
1093  useForTraining += availableUndefined;
1094  else
1095  useForTesting += availableUndefined;
1096  }
1097  requestedTraining = useForTraining;
1098  requestedTesting = useForTesting;
1099  }
1100 
1101  else if (requestedTesting == 0){
1102  // case B
1103  useForTraining = TMath::Max(requestedTraining,availableTraining);
1104  if (allAvailable < useForTraining) {
1105  Log() << kFATAL << "More events requested for training ("
1106  << requestedTraining << ") than available ("
1107  << allAvailable << ")!" << Endl;
1108  }
1109  useForTesting = allAvailable - useForTraining; // the rest
1110  requestedTesting = useForTesting;
1111  }
1112 
1113  else if (requestedTraining == 0){ // case B)
1114  useForTesting = TMath::Max(requestedTesting,availableTesting);
1115  if (allAvailable < useForTesting) {
1116  Log() << kFATAL << "More events requested for testing ("
1117  << requestedTesting << ") than available ("
1118  << allAvailable << ")!" << Endl;
1119  }
1120  useForTraining= allAvailable - useForTesting; // the rest
1121  requestedTraining = useForTraining;
1122  }
1123 
1124  else {
1125  // Case A
1126  // requestedTraining R and requestedTesting S >0
1127  // free events: Nfree = u-Thet(R-r)-Thet(S-s)
1128  // nR = Max(R,r) + 0.5 * Nfree
1129  // nS = Max(S,s) + 0.5 * Nfree
1130  Int_t stillNeedForTraining = TMath::Max(requestedTraining-availableTraining,0);
1131  Int_t stillNeedForTesting = TMath::Max(requestedTesting-availableTesting,0);
1132 
1133  int NFree = availableUndefined - stillNeedForTraining - stillNeedForTesting;
1134  if (NFree <0) NFree = 0;
1135  useForTraining = TMath::Max(requestedTraining,availableTraining) + NFree/2;
1136  useForTesting= allAvailable - useForTraining; // the rest
1137  }
1138 
1139  Log() << kDEBUG << "determined event sample size to select training sample from="<<useForTraining<<Endl;
1140  Log() << kDEBUG << "determined event sample size to select test sample from="<<useForTesting<<Endl;
1141 
1142 
1143 
1144  // associate undefined events
1145  if( splitMode == "ALTERNATE" ){
1146  Log() << kDEBUG << "split 'ALTERNATE'" << Endl;
1147  Int_t nTraining = availableTraining;
1148  Int_t nTesting = availableTesting;
1149  for( EventVector::iterator it = eventVectorUndefined.begin(), itEnd = eventVectorUndefined.end(); it != itEnd; ){
1150  ++nTraining;
1151  if( nTraining <= requestedTraining ){
1152  eventVectorTraining.insert( eventVectorTraining.end(), (*it) );
1153  ++it;
1154  }
1155  if( it != itEnd ){
1156  ++nTesting;
1157  eventVectorTesting.insert( eventVectorTesting.end(), (*it) );
1158  ++it;
1159  }
1160  }
1161  } else {
1162  Log() << kDEBUG << "split '" << splitMode << "'" << Endl;
1163 
1164  // test if enough events are available
1165  Log() << kDEBUG << "availableundefined : " << availableUndefined << Endl;
1166  Log() << kDEBUG << "useForTraining : " << useForTraining << Endl;
1167  Log() << kDEBUG << "useForTesting : " << useForTesting << Endl;
1168  Log() << kDEBUG << "availableTraining : " << availableTraining << Endl;
1169  Log() << kDEBUG << "availableTesting : " << availableTesting << Endl;
1170 
1171  if( availableUndefined<(useForTraining-availableTraining) ||
1172  availableUndefined<(useForTesting -availableTesting ) ||
1173  availableUndefined<(useForTraining+useForTesting-availableTraining-availableTesting ) ){
1174  Log() << kFATAL << "More events requested than available!" << Endl;
1175  }
1176 
1177  // select the events
1178  if (useForTraining>availableTraining){
1179  eventVectorTraining.insert( eventVectorTraining.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTraining- availableTraining );
1180  eventVectorUndefined.erase( eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining- availableTraining);
1181  }
1182  if (useForTesting>availableTesting){
1183  eventVectorTesting.insert( eventVectorTesting.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTesting- availableTesting );
1184  }
1185  }
1186  eventVectorUndefined.clear();
1187 
1188  // finally shorten the event vectors to the requested size by removing random events
1189  if (splitMode.Contains( "RANDOM" )){
1190  UInt_t sizeTraining = eventVectorTraining.size();
1191  if( sizeTraining > UInt_t(requestedTraining) ){
1192  std::vector<UInt_t> indicesTraining( sizeTraining );
1193  // make indices
1194  std::generate( indicesTraining.begin(), indicesTraining.end(), TMVA::Increment<UInt_t>(0) );
1195  // shuffle indices
1196  std::random_shuffle( indicesTraining.begin(), indicesTraining.end(), rndm );
1197  // erase indices of not needed events
1198  indicesTraining.erase( indicesTraining.begin()+sizeTraining-UInt_t(requestedTraining), indicesTraining.end() );
1199  // delete all events with the given indices
1200  for( std::vector<UInt_t>::iterator it = indicesTraining.begin(), itEnd = indicesTraining.end(); it != itEnd; ++it ){
1201  delete eventVectorTraining.at( (*it) ); // delete event
1202  eventVectorTraining.at( (*it) ) = NULL; // set pointer to NULL
1203  }
1204  // now remove and erase all events with pointer==NULL
1205  eventVectorTraining.erase( std::remove( eventVectorTraining.begin(), eventVectorTraining.end(), (void*)NULL ), eventVectorTraining.end() );
1206  }
1207 
1208  UInt_t sizeTesting = eventVectorTesting.size();
1209  if( sizeTesting > UInt_t(requestedTesting) ){
1210  std::vector<UInt_t> indicesTesting( sizeTesting );
1211  // make indices
1212  std::generate( indicesTesting.begin(), indicesTesting.end(), TMVA::Increment<UInt_t>(0) );
1213  // shuffle indices
1214  std::random_shuffle( indicesTesting.begin(), indicesTesting.end(), rndm );
1215  // erase indices of not needed events
1216  indicesTesting.erase( indicesTesting.begin()+sizeTesting-UInt_t(requestedTesting), indicesTesting.end() );
1217  // delete all events with the given indices
1218  for( std::vector<UInt_t>::iterator it = indicesTesting.begin(), itEnd = indicesTesting.end(); it != itEnd; ++it ){
1219  delete eventVectorTesting.at( (*it) ); // delete event
1220  eventVectorTesting.at( (*it) ) = NULL; // set pointer to NULL
1221  }
1222  // now remove and erase all events with pointer==NULL
1223  eventVectorTesting.erase( std::remove( eventVectorTesting.begin(), eventVectorTesting.end(), (void*)NULL ), eventVectorTesting.end() );
1224  }
1225  }
1226  else { // erase at end
1227  if( eventVectorTraining.size() < UInt_t(requestedTraining) )
1228  Log() << kWARNING << "DataSetFactory/requested number of training samples larger than size of eventVectorTraining.\n"
1229  << "There is probably an issue. Please contact the TMVA developers." << Endl;
1230  std::for_each( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end(), DeleteFunctor<Event>() );
1231  eventVectorTraining.erase(eventVectorTraining.begin()+requestedTraining,eventVectorTraining.end());
1232 
1233  if( eventVectorTesting.size() < UInt_t(requestedTesting) )
1234  Log() << kWARNING << "DataSetFactory/requested number of testing samples larger than size of eventVectorTesting.\n"
1235  << "There is probably an issue. Please contact the TMVA developers." << Endl;
1236  std::for_each( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end(), DeleteFunctor<Event>() );
1237  eventVectorTesting.erase(eventVectorTesting.begin()+requestedTesting,eventVectorTesting.end());
1238  }
1239  }
1240 
1241  TMVA::DataSetFactory::RenormEvents( dsi, tmpEventVector, eventCounts, normMode );
1242 
1243  Int_t trainingSize = 0;
1244  Int_t testingSize = 0;
1245 
1246  // sum up number of training and testing events
1247  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1248  trainingSize += tmpEventVector[Types::kTraining].at(cls).size();
1249  testingSize += tmpEventVector[Types::kTesting].at(cls).size();
1250  }
1251 
1252  // --- collect all training (testing) events into the training (testing) eventvector
1253 
1254  // create event vectors reserve enough space
1255  EventVector* trainingEventVector = new EventVector();
1256  EventVector* testingEventVector = new EventVector();
1257 
1258  trainingEventVector->reserve( trainingSize );
1259  testingEventVector->reserve( testingSize );
1260 
1261 
1262  // collect the events
1263 
1264  // mixing of kTraining and kTesting data sets
1265  Log() << kDEBUG << " MIXING ============= " << Endl;
1266 
1267  if( mixMode == "ALTERNATE" ){
1268 // Inform the user if alternate mixmode is used for
1269 // event classes with different numbers of events; this works, but the alternation stops at the last event of the smaller class
1270  for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
1271  if (tmpEventVector[Types::kTraining].at(cls).size() != tmpEventVector[Types::kTraining].at(0).size()){
1272  Log() << kINFO << "Training sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1273  }
1274  if (tmpEventVector[Types::kTesting].at(cls).size() != tmpEventVector[Types::kTesting].at(0).size()){
1275  Log() << kINFO << "Testing sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1276  }
1277  }
1278  typedef EventVector::iterator EvtVecIt;
1279  EvtVecIt itEvent, itEventEnd;
1280 
1281  // insert first class
1282  Log() << kDEBUG << "insert class 0 into training and test vector" << Endl;
1283  trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(0).begin(), tmpEventVector[Types::kTraining].at(0).end() );
1284  testingEventVector->insert( testingEventVector->end(), tmpEventVector[Types::kTesting].at(0).begin(), tmpEventVector[Types::kTesting].at(0).end() );
1285 
1286  // insert other classes
1287  EvtVecIt itTarget;
1288  for( UInt_t cls = 1; cls < dsi.GetNClasses(); ++cls ){
1289  Log() << kDEBUG << "insert class " << cls << Endl;
1290  // training vector
1291  itTarget = trainingEventVector->begin() - 1; // start one before begin
1292  // loop over source
1293  for( itEvent = tmpEventVector[Types::kTraining].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTraining].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1294 // if( std::distance( itTarget, trainingEventVector->end()) < Int_t(cls+1) ) {
1295  if( (trainingEventVector->end() - itTarget) < Int_t(cls+1) ) {
1296  itTarget = trainingEventVector->end();
1297  trainingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
1298  break;
1299  }else{
1300  itTarget += cls+1;
1301  trainingEventVector->insert( itTarget, (*itEvent) ); // fill event
1302  }
1303  }
1304  // testing vector
1305  itTarget = testingEventVector->begin() - 1;
1306  // loop over source
1307  for( itEvent = tmpEventVector[Types::kTesting].at(cls).begin(), itEventEnd = tmpEventVector[Types::kTesting].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1308 // if( std::distance( itTarget, testingEventVector->end()) < Int_t(cls+1) ) {
1309  if( ( testingEventVector->end() - itTarget ) < Int_t(cls+1) ) {
1310  itTarget = testingEventVector->end();
1311  testingEventVector->insert( itTarget, itEvent, itEventEnd ); // fill in the rest without mixing
1312  break;
1313  }else{
1314  itTarget += cls+1;
1315  testingEventVector->insert( itTarget, (*itEvent) ); // fill event
1316  }
1317  }
1318  }
1319 
1320  // debugging output: classnumbers of all events in training and testing vectors
1321  // std::cout << std::endl;
1322  // std::cout << "TRAINING VECTOR" << std::endl;
1323  // std::transform( trainingEventVector->begin(), trainingEventVector->end(), ostream_iterator<Int_t>(std::cout, "|"), std::mem_fun(&TMVA::Event::GetClass) );
1324 
1325  // std::cout << std::endl;
1326  // std::cout << "TESTING VECTOR" << std::endl;
1327  // std::transform( testingEventVector->begin(), testingEventVector->end(), ostream_iterator<Int_t>(std::cout, "|"), std::mem_fun(&TMVA::Event::GetClass) );
1328  // std::cout << std::endl;
1329 
1330  }else{
1331  for( UInt_t cls = 0; cls < dsi.GetNClasses(); ++cls ){
1332  trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[Types::kTraining].at(cls).begin(), tmpEventVector[Types::kTraining].at(cls).end() );
1333  testingEventVector->insert ( testingEventVector->end(), tmpEventVector[Types::kTesting].at(cls).begin(), tmpEventVector[Types::kTesting].at(cls).end() );
1334  }
1335  }
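// For two classes of equal size, "ALTERNATE" mixing above yields the event
// order 0 1 0 1 0 1 ..., whereas the else-branch simply concatenates the
// per-class vectors (0 0 ... 1 1 ...); with mixMode=="RANDOM" either result
// is shuffled below.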
1336 
1337  // std::cout << "trainingEventVector " << trainingEventVector->size() << std::endl;
1338  // std::cout << "testingEventVector " << testingEventVector->size() << std::endl;
1339 
1340  // std::transform( trainingEventVector->begin(), trainingEventVector->end(), ostream_iterator<Int_t>(std::cout, "> \n"), std::mem_fun(&TMVA::Event::GetNVariables) );
1341  // std::transform( testingEventVector->begin(), testingEventVector->end(), ostream_iterator<Int_t>(std::cout, "> \n"), std::mem_fun(&TMVA::Event::GetNVariables) );
1342 
1343  // delete the tmpEventVector (but not the events therein)
1344  tmpEventVector[Types::kTraining].clear();
1345  tmpEventVector[Types::kTesting].clear();
1346 
1347  tmpEventVector[Types::kMaxTreeType].clear();
1348 
1349  if (mixMode == "RANDOM") {
1350  Log() << kDEBUG << "shuffling events"<<Endl;
1351 
1352  std::random_shuffle( trainingEventVector->begin(), trainingEventVector->end(), rndm );
1353  std::random_shuffle( testingEventVector->begin(), testingEventVector->end(), rndm );
1354  }
1355 
1356  Log() << kDEBUG << "trainingEventVector " << trainingEventVector->size() << Endl;
1357  Log() << kDEBUG << "testingEventVector " << testingEventVector->size() << Endl;
1358 
1359  // create dataset
1360  DataSet* ds = new DataSet(dsi);
1361 
1362  Log() << kINFO << "Create internal training tree" << Endl;
1363  ds->SetEventCollection(trainingEventVector, Types::kTraining );
1364  Log() << kINFO << "Create internal testing tree" << Endl;
1365  ds->SetEventCollection(testingEventVector, Types::kTesting );
1366 
1367 
1368  if (ds->GetNTrainingEvents() < 1){
1369  Log() << kFATAL << "Dataset " << std::string(dsi.GetName()) << " does not have any training events, I better stop here and let you fix that one first " << Endl;
1370  }
1371 
1372  if (ds->GetNTestEvents() < 1) {
1373  Log() << kERROR << "Dataset " << std::string(dsi.GetName()) << " does not have any testing events, guess that will cause problems later..but for now, I continue " << Endl;
1374  }
1375 
1376 
1377  return ds;
1378 
1379 }
1380 
1381 ////////////////////////////////////////////////////////////////////////////////
1382 /// =============================================================================
1383 /// renormalisation of the TRAINING event weights:
1384 /// -none: use the weights as supplied by the user (we store, however,
1385 /// the relative weight for later use)
1386 /// -numEvents: renormalise each class such that its weighted number of
1387 /// training events equals the number requested for that class
1388 /// -equalNumEvents: renormalise all classes to the training size of class 0
1389 /// =============================================================================
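///
/// Example (illustrative numbers, not from the source): if 1000 signal
/// training events are used and their weights sum to 250, "NumEvents"
/// rescales each signal training weight by 1000/250 = 4, so that the
/// weighted sum equals the event count of 1000; testing weights are
/// left untouched.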
1390 
1391 void
1392 TMVA::DataSetFactory::RenormEvents( DataSetInfo& dsi,
1393  EventVectorOfClassesOfTreeType& tmpEventVector,
1394  const EvtStatsPerClass& eventCounts,
1395  const TString& normMode )
1396 {
1397 
1398 
1399  // print rescaling info
1400  // ---------------------------------
1401  // compute sizes and sums of weights
1402  Int_t trainingSize = 0;
1403  Int_t testingSize = 0;
1404 
1405  ValuePerClass trainingSumWeightsPerClass( dsi.GetNClasses() );
1406  ValuePerClass testingSumWeightsPerClass( dsi.GetNClasses() );
1407 
1408  NumberPerClass trainingSizePerClass( dsi.GetNClasses() );
1409  NumberPerClass testingSizePerClass( dsi.GetNClasses() );
1410 
1411  Double_t trainingSumSignalWeights = 0;
1412  Double_t trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1413  Double_t testingSumSignalWeights = 0;
1414  Double_t testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1415 
1416 
1417 
1418  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1419  trainingSizePerClass.at(cls) = tmpEventVector[Types::kTraining].at(cls).size();
1420  testingSizePerClass.at(cls) = tmpEventVector[Types::kTesting].at(cls).size();
1421 
1422  trainingSize += trainingSizePerClass.at(cls); // NOT .back(): the vectors are pre-sized, so .back() would always read the last class
1423  testingSize += testingSizePerClass.at(cls);
1424 
1425  // the functional solution:
1426  // sum the weights in Double_t (the individual weights are Float_t) to prevent rounding issues when adding many floating-point values
1427  //
1428  // accumulate --> does what the name says
1429  // begin() and end() denote the range of the vector to be accumulated
1430  // Double_t(0) tells accumulate the value type and the starting value
1431  // compose_binary builds a binary function from the following three pieces:
1432  // std::plus<Double_t>() knows how to sum up two doubles
1433  // null<Double_t>() leaves the first argument (the running sum) unchanged and returns it
1434  // std::mem_fun knows how to call a member function (type and member function given as arguments) and return the result
1435  //
1436  // all together this sums up the event weights of all events in the vector and returns the total
1437  trainingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(),
1438  tmpEventVector[Types::kTraining].at(cls).end(),
1439  Double_t(0),
1440  compose_binary( std::plus<Double_t>(),
1441  null<Double_t>(),
1442  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1443 
1444  testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),
1445  tmpEventVector[Types::kTesting].at(cls).end(),
1446  Double_t(0),
1447  compose_binary( std::plus<Double_t>(),
1448  null<Double_t>(),
1449  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
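 // Note: with C++11 the same accumulation can be written without
 // compose_binary/null/mem_fun; a minimal sketch (same range and starting
 // value as above) would be:
 //
 //   Double_t sum = std::accumulate(
 //       tmpEventVector[Types::kTraining].at(cls).begin(),
 //       tmpEventVector[Types::kTraining].at(cls).end(), Double_t(0),
 //       []( Double_t s, const TMVA::Event* ev ){ return s + ev->GetOriginalWeight(); } );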
1450 
1451  if ( cls == dsi.GetSignalClassIndex()){
1452  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1453  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1454  }else{
1455  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1456  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1457  }
1458  }
1459 
1460  // ---------------------------------
1461  // compute renormalization factors
1462 
1463  ValuePerClass renormFactor( dsi.GetNClasses() );
1464 
1465 
1466  // for information purposes
1467  dsi.SetNormalization( normMode );
1468  // !! these will be overwritten later by the 'rescaled' ones if
1469  // NormMode != None !!!
1470  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1471  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1472  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1473  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1474 
1475 
1476  if (normMode == "NONE") {
1477  Log() << kINFO << "No weight renormalisation applied: use original global and event weights" << Endl;
1478  return;
1479  }
1480  //changed by Helge 27.5.2013: the previous implementation missed the point of this mode,
1481  //which is to provide the SAME amount of effective TRAINING data for signal and background.
1482  // Testing events are irrelevant for this and would actually skew the normalisation!
1483  else if (normMode == "NUMEVENTS") {
1484  Log() << kINFO << "Weight renormalisation mode: \"NumEvents\": renormalises all event classes " << Endl;
1485  Log() << kINFO << " such that the effective (weighted) number of events in each class equals the respective " << Endl;
1486  Log() << kINFO << " number of events (entries) that you demanded in PrepareTrainingAndTestTree(\"\",\"nTrain_Signal=.. )" << Endl;
1487  Log() << kINFO << " ... i.e. such that Sum[i=1..N_j]{w_i} = N_j, j=0,1,2..." << Endl;
1488  Log() << kINFO << " ... (note that N_j is the number of TRAINING events, nTrain_j, with j=Signal,Background,...)" << Endl;
1489  Log() << kINFO << " ..... Testing events are not renormalised nor included in the renormalisation factor! )"<< Endl;
1490 
1491  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1492  // renormFactor.at(cls) = ( (trainingSizePerClass.at(cls) + testingSizePerClass.at(cls))/
1493  // (trainingSumWeightsPerClass.at(cls) + testingSumWeightsPerClass.at(cls)) );
1494  //changed by Helge 27.5.2013
1495  renormFactor.at(cls) = ((Float_t)trainingSizePerClass.at(cls) )/
1496  (trainingSumWeightsPerClass.at(cls)) ;
1497  }
1498  }
1499  else if (normMode == "EQUALNUMEVENTS") {
1500  //changed by Helge 27.5.2013: the previous implementation missed the point of this mode,
1501  //which is to provide the SAME amount of effective TRAINING data for signal and background.
1502  //(it normalised each data source to its own number of entries, and even for
1503  // training+testing combined, which defeats that purpose)
1504 
1505  Log() << kINFO << "Weight renormalisation mode: \"EqualNumEvents\": renormalises all event classes ..." << Endl;
1506  Log() << kINFO << " such that the effective (weighted) number of events in each class is the same " << Endl;
1507  Log() << kINFO << " (and equals the number of events (entries) given for class=0 )" << Endl;
1508  Log() << kINFO << "... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ..." << Endl;
1509  Log() << kINFO << "... (note that N_j is the number of TRAINING events of class j" << Endl;
1510  Log() << kINFO << " ..... Testing events are not renormalised nor included in the renormalisation factor!)" << Endl;
1511 
1512  // normalize to size of first class
1513  UInt_t referenceClass = 0;
1514  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ) {
1515  renormFactor.at(cls) = Float_t(trainingSizePerClass.at(referenceClass))/
1516  (trainingSumWeightsPerClass.at(cls));
1517  }
1518  }
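 // Note (illustrative numbers only): with 1000 signal (class 0) training
 // events, a background class whose training weights sum to 500 gets
 // renormFactor = 1000/500 = 2, so every class ends up with an effective
 // (weighted) training size of 1000.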
1519  else {
1520  Log() << kFATAL << "<PrepareForTrainingAndTesting> Unknown NormMode: " << normMode << Endl;
1521  }
1522 
1523  // ---------------------------------
1524  // now apply the normalization factors
1525  Int_t maxL = dsi.GetClassNameMaxLength();
1526  for (UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls<clsEnd; ++cls) {
1527  Log() << kINFO << "--> Rescale " << setiosflags(ios::left) << std::setw(maxL)
1528  << dsi.GetClassInfo(cls)->GetName() << " event weights by factor: " << renormFactor.at(cls) << Endl;
1529  for (EventVector::iterator it = tmpEventVector[Types::kTraining].at(cls).begin(),
1530  itEnd = tmpEventVector[Types::kTraining].at(cls).end(); it != itEnd; ++it){
1531  (*it)->SetWeight ((*it)->GetWeight() * renormFactor.at(cls));
1532  }
1533 
1534  }
1535 
1536 
1537  // print out the result
1538  // (same accumulation code as above --> could be factored out into a helper)
1539  //
1540 
1541  Log() << kINFO << "Number of training and testing events after rescaling:" << Endl;
1542  Log() << kINFO << "------------------------------------------------------" << Endl;
1543 
1544  trainingSumSignalWeights = 0;
1545  trainingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1546  testingSumSignalWeights = 0;
1547  testingSumBackgrWeights = 0; // Backgr. includes all classes that are not signal
1548 
1549  for( UInt_t cls = 0, clsEnd = dsi.GetNClasses(); cls < clsEnd; ++cls ){
1550 
1551  trainingSumWeightsPerClass.at(cls) = (std::accumulate( tmpEventVector[Types::kTraining].at(cls).begin(), // accumulate --> start at begin
1552  tmpEventVector[Types::kTraining].at(cls).end(), // until end()
1553  Double_t(0), // values are of type double
1554  compose_binary( std::plus<Double_t>(), // define addition for doubles
1555  null<Double_t>(), // take the argument, don't do anything and return it
1556  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) )); // take the value from GetOriginalWeight
1557 
1558  testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[Types::kTesting].at(cls).begin(),
1559  tmpEventVector[Types::kTesting].at(cls).end(),
1560  Double_t(0),
1561  compose_binary( std::plus<Double_t>(),
1562  null<Double_t>(),
1563  std::mem_fun(&TMVA::Event::GetOriginalWeight) ) );
1564 
1565 
1566  if ( cls == dsi.GetSignalClassIndex()){
1567  trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1568  testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1569  }else{
1570  trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1571  testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1572  }
1573 
1574  // output statistics
1575 
1576  Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
1577  << dsi.GetClassInfo(cls)->GetName() << " -- "
1578  << "training events : " << trainingSizePerClass.at(cls)
1579  << " (sum of weights: " << trainingSumWeightsPerClass.at(cls) << ")"
1580  << " - requested were " << eventCounts[cls].nTrainingEventsRequested << " events" << Endl;
1581  Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
1582  << dsi.GetClassInfo(cls)->GetName() << " -- "
1583  << "testing events : " << testingSizePerClass.at(cls)
1584  << " (sum of weights: " << testingSumWeightsPerClass.at(cls) << ")"
1585  << " - requested were " << eventCounts[cls].nTestingEventsRequested << " events" << Endl;
1586  Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
1587  << dsi.GetClassInfo(cls)->GetName() << " -- "
1588  << "training and testing events: "
1589  << (trainingSizePerClass.at(cls)+testingSizePerClass.at(cls))
1590  << " (sum of weights: "
1591  << (trainingSumWeightsPerClass.at(cls)+testingSumWeightsPerClass.at(cls)) << ")" << Endl;
1592  if(eventCounts[cls].nEvAfterCut<eventCounts[cls].nEvBeforeCut) {
1593  Log() << kINFO << setiosflags(ios::left) << std::setw(maxL)
1594  << dsi.GetClassInfo(cls)->GetName() << " -- "
1595  << "due to the preselection a scaling factor has been applied to the numbers of requested events: "
1596  << eventCounts[cls].cutScaling() << Endl;
1597  }
1598  }
1599 
1600  // for information purposes
1601  dsi.SetTrainingSumSignalWeights(trainingSumSignalWeights);
1602  dsi.SetTrainingSumBackgrWeights(trainingSumBackgrWeights);
1603  dsi.SetTestingSumSignalWeights(testingSumSignalWeights);
1604  dsi.SetTestingSumBackgrWeights(testingSumBackgrWeights);
1605 
1606 
1607 }
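// Note: the NormMode handled above is steered from user code through the
// Factory's split options; a typical call (sketch, standard TMVA usage)
// would be e.g.
//
//   factory->PrepareTrainingAndTestTree( "",
//       "nTrain_Signal=1000:nTrain_Background=1000:SplitMode=Random:NormMode=NumEvents:!V" );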
1608 
1609 
1610 