Logo ROOT   6.07/09
Reference Guide
DataLoader.h
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag, Omar Zapata, Lorenzo Moneta, Sergei Gleyzer
3 //NOTE: Based on TMVA::Factory
4 
5 /**********************************************************************************
6  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
7  * Package: TMVA *
8  * Class : DataLoader *
9  * Web : http://tmva.sourceforge.net *
10  * *
11  * Description: *
12  * This is a class to load datasets into every booked method *
13  * *
14  * Authors (alphabetical): *
15  * Lorenzo Moneta <Lorenzo.Moneta@cern.ch> - CERN, Switzerland *
16  * Omar Zapata <andresete.chaos@gmail.com> - ITM/UdeA, Colombia *
17  * Sergei Gleyzer <sergei.gleyzer@cern.ch> - CERN, Switzerland *
18  * *
19  * Copyright (c) 2005-2011: *
20  * CERN, Switzerland *
21  * ITM/UdeA, Colombia *
22  * *
23  * Redistribution and use in source and binary forms, with or without *
24  * modification, are permitted according to the terms listed in LICENSE *
25  * (http://tmva.sourceforge.net/LICENSE) *
26  **********************************************************************************/
27 
28 #ifndef ROOT_TMVA_DataLoader
29 #define ROOT_TMVA_DataLoader
30 
31 
32 #include <string>
33 #include <vector>
34 #include <map>
35 #ifndef ROOT_TCut
36 #include "TCut.h"
37 #endif
38 
39 #ifndef ROOT_TMVA_Factory
40 #include "TMVA/Factory.h"
41 #endif
42 #ifndef ROOT_TMVA_Types
43 #include "TMVA/Types.h"
44 #endif
45 #ifndef ROOT_TMVA_DataSet
46 #include "TMVA/DataSet.h"
47 #endif
48 
49 class TFile;
50 class TTree;
51 class TDirectory;
52 class TH2;
53 
54 namespace TMVA {
55 
56  class IMethod;
57  class Envelope;
58  class MethodBase;
59  class DataInputHandler;
60  class DataSetInfo;
61  class DataSetManager;
62  class VariableTransformBase;
63  class VarTransformHandler;
64 
65  class DataLoader : public Configurable {
66  friend class Factory;
67  friend class Envelope;
68  public:
69 
70  DataLoader( TString thedlName="default");
71 
72  // default destructor
73  virtual ~DataLoader();
74 
75 
76  // add events to training and testing trees
77  void AddSignalTrainingEvent ( const std::vector<Double_t>& event, Double_t weight = 1.0 );
78  void AddBackgroundTrainingEvent( const std::vector<Double_t>& event, Double_t weight = 1.0 );
79  void AddSignalTestEvent ( const std::vector<Double_t>& event, Double_t weight = 1.0 );
80  void AddBackgroundTestEvent ( const std::vector<Double_t>& event, Double_t weight = 1.0 );
81  void AddTrainingEvent( const TString& className, const std::vector<Double_t>& event, Double_t weight );
82  void AddTestEvent ( const TString& className, const std::vector<Double_t>& event, Double_t weight );
83  void AddEvent ( const TString& className, Types::ETreeType tt, const std::vector<Double_t>& event, Double_t weight );
86 
88  DataSetInfo& AddDataSet( const TString& );
90  DataLoader* VarTransform(TString trafoDefinition);
91 
92  // special case: signal/background
93 
94  // Data input related
95  void SetInputTrees( const TString& signalFileName, const TString& backgroundFileName,
96  Double_t signalWeight=1.0, Double_t backgroundWeight=1.0 );
97  void SetInputTrees( TTree* inputTree, const TCut& SigCut, const TCut& BgCut );
98  // Set input trees at once
99  void SetInputTrees( TTree* signal, TTree* background,
100  Double_t signalWeight=1.0, Double_t backgroundWeight=1.0) ;
101 
102  void AddSignalTree( TTree* signal, Double_t weight=1.0, Types::ETreeType treetype = Types::kMaxTreeType );
103  void AddSignalTree( TString datFileS, Double_t weight=1.0, Types::ETreeType treetype = Types::kMaxTreeType );
104  void AddSignalTree( TTree* signal, Double_t weight, const TString& treetype );
105 
106  // ... deprecated, kept for backwards compatibility
107  void SetSignalTree( TTree* signal, Double_t weight=1.0);
108 
110  void AddBackgroundTree( TString datFileB, Double_t weight=1.0, Types::ETreeType treetype = Types::kMaxTreeType );
111  void AddBackgroundTree( TTree* background, Double_t weight, const TString & treetype );
112 
113  // ... deprecated, kept for backwards compatibility
114  void SetBackgroundTree( TTree* background, Double_t weight=1.0 );
115 
116  void SetSignalWeightExpression( const TString& variable );
117  void SetBackgroundWeightExpression( const TString& variable );
118 
119  // special case: regression
120  void AddRegressionTree( TTree* tree, Double_t weight = 1.0, // inline convenience wrapper around AddTree for the regression case
121  Types::ETreeType treetype = Types::kMaxTreeType ) {
122  AddTree( tree, "Regression", weight, "", treetype ); // forwards tree, weight and treetype; class name is fixed to Regression and the cut is left empty
123  }
124 
125  // general
126 
127  // Data input related
128  void SetTree( TTree* tree, const TString& className, Double_t weight ); // deprecated
129  void AddTree( TTree* tree, const TString& className, Double_t weight=1.0,
130  const TCut& cut = "",
132  void AddTree( TTree* tree, const TString& className, Double_t weight, const TCut& cut, const TString& treeType );
133 
134  // set input variable
135  void SetInputVariables ( std::vector<TString>* theVariables ); // deprecated
136  void AddVariable ( const TString& expression, const TString& title, const TString& unit,
137  char type='F', Double_t min = 0, Double_t max = 0 );
138  void AddVariable ( const TString& expression, char type='F',
139  Double_t min = 0, Double_t max = 0 );
140  void AddTarget ( const TString& expression, const TString& title = "", const TString& unit = "",
141  Double_t min = 0, Double_t max = 0 );
142  void AddRegressionTarget( const TString& expression, const TString& title = "", const TString& unit = "",
143  Double_t min = 0, Double_t max = 0 )
144  { // inline alias: passes every argument unchanged to AddTarget
145  AddTarget( expression, title, unit, min, max );
146  }
147  void AddSpectator ( const TString& expression, const TString& title = "", const TString& unit = "",
148  Double_t min = 0, Double_t max = 0 );
149 
150  // set weight for class
151  void SetWeightExpression( const TString& variable, const TString& className = "" );
152 
153  // set cut for class
154  void SetCut( const TString& cut, const TString& className = "" );
155  void SetCut( const TCut& cut, const TString& className = "" );
156  void AddCut( const TString& cut, const TString& className = "" );
157  void AddCut( const TCut& cut, const TString& className = "" );
158 
159 
160  // prepare input tree for training
161  void PrepareTrainingAndTestTree( const TCut& cut, const TString& splitOpt );
162  void PrepareTrainingAndTestTree( TCut sigcut, TCut bkgcut, const TString& splitOpt );
163 
164  // ... deprecated, kept for backwards compatibility
165  void PrepareTrainingAndTestTree( const TCut& cut, Int_t Ntrain, Int_t Ntest = -1 );
166 
167  void PrepareTrainingAndTestTree( const TCut& cut, Int_t NsigTrain, Int_t NbkgTrain, Int_t NsigTest, Int_t NbkgTest,
168  const TString& otherOpt="SplitMode=Random:!V" );
169 
170  void PrepareTrainingAndTestTree( int foldNumber, Types::ETreeType tt );
171 
172  void PrepareFoldDataSet( UInt_t foldNumber, Types::ETreeType tt);
173  void MakeKFoldDataSet(UInt_t numberFolds, bool validationSet=false);
174  std::vector<std::vector<TMVA::Event*>> SplitSets(std::vector<TMVA::Event*>& oldSet, int seedNum, int numFolds);
175 
177 
178  TH2* GetCorrelationMatrix(const TString& className);
179 
180  // Copy method used in VI and CV. DEPRECATED: you can simply call Clone instead: DataLoader *dl2=(DataLoader *)dl1->Clone("dl2")
182  friend void DataLoaderCopy(TMVA::DataLoader* des, TMVA::DataLoader* src);
184 
185  private:
186 
187 
190 
191 
192  private:
193 
194  // data members
195 
196 
198 
199 
201 
202  std::vector<TMVA::VariableTransformBase*> fDefaultTrfs; // list of transformations on default DataSet
203 
204  // cd to local directory
205  TString fOptions; // option string given by construction (presently only "V")
206  TString fTransformations; // List of transformations to test
207  Bool_t fVerbose; // verbose mode
208 
209  // flag determining the way training and test data are assigned to DataLoader
213  DataAssignType fDataAssignType; // flags for data assigning
214  std::vector<TTree*> fTrainAssignTree; // for each class: tmp tree if user wants to assign the events directly
215  std::vector<TTree*> fTestAssignTree; // for each class: tmp tree if user wants to assign the events directly
216 
217  std::vector<std::vector<TMVA::Event*>> fTrainSigEvents;
218  std::vector<std::vector<TMVA::Event*>> fTrainBkgEvents;
219  std::vector<std::vector<TMVA::Event*>> fValidSigEvents;
220  std::vector<std::vector<TMVA::Event*>> fValidBkgEvents;
221  std::vector<std::vector<TMVA::Event*>> fTestSigEvents;
222  std::vector<std::vector<TMVA::Event*>> fTestBkgEvents;
223 
224  Int_t fATreeType; // type of event (=classIndex)
225  Float_t fATreeWeight; // weight of the event
226  std::vector<Float_t> fATreeEvent; // event variables
227 
228  Types::EAnalysisType fAnalysisType; // the training type
229 
231 
232  protected:
233 
234  ClassDef(DataLoader,3);
235  };
237 } // namespace TMVA
238 
239 #endif
240 
void AddBackgroundTree(TTree *background, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
Definition: DataLoader.cxx:381
DataSetManager * fDataSetManager
Definition: DataLoader.h:197
virtual ~DataLoader()
Definition: DataLoader.cxx:97
void AddTrainingEvent(const TString &className, const std::vector< Double_t > &event, Double_t weight)
Definition: DataLoader.cxx:248
std::vector< TMVA::VariableTransformBase * > fDefaultTrfs
Definition: DataLoader.h:202
DataLoader(TString thedlName="default")
Definition: DataLoader.cxx:81
float Float_t
Definition: RtypesCore.h:53
void AddRegressionTarget(const TString &expression, const TString &title="", const TString &unit="", Double_t min=0, Double_t max=0)
Definition: DataLoader.h:142
std::vector< std::vector< TMVA::Event * > > fTrainBkgEvents
Definition: DataLoader.h:218
DataSetInfo & GetDataSetInfo()
Definition: DataLoader.cxx:135
A ROOT file is a suite of consecutive data records (TKey instances) with a well defined format...
Definition: TFile.h:50
Double_t background(Double_t *x, Double_t *par)
EAnalysisType
Definition: Types.h:128
TTree * CreateEventAssignTrees(const TString &name)
Definition: DataLoader.cxx:188
DataSetInfo & DefaultDataSetInfo()
Definition: DataLoader.cxx:491
DataLoader * VarTransform(TString trafoDefinition)
Transforms the variables and returns a new DataLoader with the transformed variables.
Definition: DataLoader.cxx:144
Basic string class.
Definition: TString.h:137
int Int_t
Definition: RtypesCore.h:41
void MakeKFoldDataSet(UInt_t numberFolds, bool validationSet=false)
Definition: DataLoader.cxx:610
bool Bool_t
Definition: RtypesCore.h:59
void SetBackgroundTree(TTree *background, Double_t weight=1.0)
Definition: DataLoader.cxx:415
DataInputHandler * fDataInputHandler
Definition: DataLoader.h:200
Types::EAnalysisType fAnalysisType
Definition: DataLoader.h:228
void AddBackgroundTestEvent(const std::vector< Double_t > &event, Double_t weight=1.0)
Definition: DataLoader.cxx:241
TH2 * GetCorrelationMatrix(const TString &className)
Definition: DataLoader.cxx:809
std::vector< std::vector< TMVA::Event * > > fTestBkgEvents
Definition: DataLoader.h:222
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
Definition: DataLoader.cxx:455
#define ClassDef(name, id)
Definition: Rtypes.h:254
TText * tt
Definition: textangle.C:16
void AddTestEvent(const TString &className, const std::vector< Double_t > &event, Double_t weight)
Definition: DataLoader.cxx:255
void SetInputTrees(const TString &signalFileName, const TString &backgroundFileName, Double_t signalWeight=1.0, Double_t backgroundWeight=1.0)
Definition: DataLoader.cxx:437
void SetTree(TTree *tree, const TString &className, Double_t weight)
Definition: DataLoader.cxx:421
void PrepareFoldDataSet(UInt_t foldNumber, Types::ETreeType tt)
Definition: DataLoader.cxx:661
Base class for all machine learning algorithms.
Definition: Envelope.h:55
void SetInputVariables(std::vector< TString > *theVariables)
Definition: DataLoader.cxx:498
DataSetInfo & AddDataSet(DataSetInfo &)
Definition: DataLoader.cxx:119
void AddCut(const TString &cut, const TString &className="")
Definition: DataLoader.cxx:540
A specialized string object used for TTree selections.
Definition: TCut.h:27
void SetInputTreesFromEventAssignTrees()
Definition: DataLoader.cxx:303
Float_t fATreeWeight
Definition: DataLoader.h:225
Bool_t fMakeFoldDataSet
Definition: DataLoader.h:230
DataInputHandler & DataInput()
Definition: DataLoader.h:183
Service class for 2-Dim histogram classes.
Definition: TH2.h:36
unsigned int UInt_t
Definition: RtypesCore.h:42
std::vector< TTree * > fTestAssignTree
Definition: DataLoader.h:215
Bool_t UserAssignEvents(UInt_t clIndex)
Definition: DataLoader.cxx:296
std::vector< Float_t > fATreeEvent
Definition: DataLoader.h:226
void AddRegressionTree(TTree *tree, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
Definition: DataLoader.h:120
std::vector< std::vector< TMVA::Event * > > fTestSigEvents
Definition: DataLoader.h:221
DataLoader * MakeCopy(TString name)
Definition: DataLoader.cxx:786
const DataSetInfo & GetDefaultDataSetInfo()
Definition: DataLoader.h:176
void AddTree(TTree *tree, const TString &className, Double_t weight=1.0, const TCut &cut="", Types::ETreeType tt=Types::kMaxTreeType)
Definition: DataLoader.cxx:334
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
Definition: DataLoader.cxx:579
DataAssignType fDataAssignType
Definition: DataLoader.h:213
TString fTransformations
Definition: DataLoader.h:206
double Double_t
Definition: RtypesCore.h:55
void AddEvent(const TString &className, Types::ETreeType tt, const std::vector< Double_t > &event, Double_t weight)
Definition: DataLoader.cxx:262
Describe directory structure in memory.
Definition: TDirectory.h:44
void SetBackgroundWeightExpression(const TString &variable)
Definition: DataLoader.cxx:512
int type
Definition: TGX11.cxx:120
void AddTarget(const TString &expression, const TString &title="", const TString &unit="", Double_t min=0, Double_t max=0)
Definition: DataLoader.cxx:471
void SetWeightExpression(const TString &variable, const TString &className="")
Definition: DataLoader.cxx:518
void AddBackgroundTrainingEvent(const std::vector< Double_t > &event, Double_t weight=1.0)
Definition: DataLoader.cxx:234
void SetSignalWeightExpression(const TString &variable)
Definition: DataLoader.cxx:506
std::vector< std::vector< TMVA::Event * > > fTrainSigEvents
Definition: DataLoader.h:217
Abstract ClassifierFactory template that handles arbitrary types.
std::vector< TTree * > fTrainAssignTree
Definition: DataLoader.h:214
void AddSignalTestEvent(const std::vector< Double_t > &event, Double_t weight=1.0)
Definition: DataLoader.cxx:227
std::vector< std::vector< TMVA::Event * > > fValidBkgEvents
Definition: DataLoader.h:220
void AddSignalTrainingEvent(const std::vector< Double_t > &event, Double_t weight=1.0)
Definition: DataLoader.cxx:220
friend void DataLoaderCopy(TMVA::DataLoader *des, TMVA::DataLoader *src)
TString fOptions
Definition: DataLoader.h:205
void SetSignalTree(TTree *signal, Double_t weight=1.0)
Definition: DataLoader.cxx:409
Definition: tree.py:1
A TTree object has a header with a name and a title.
Definition: TTree.h:98
std::vector< std::vector< TMVA::Event * > > SplitSets(std::vector< TMVA::Event * > &oldSet, int seedNum, int numFolds)
Definition: DataLoader.cxx:751
std::vector< std::vector< TMVA::Event * > > fValidSigEvents
Definition: DataLoader.h:219
void AddSignalTree(TTree *signal, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
Definition: DataLoader.cxx:352
void SetCut(const TString &cut, const TString &className="")
Definition: DataLoader.cxx:529
char name[80]
Definition: TGX11.cxx:109
void AddSpectator(const TString &expression, const TString &title="", const TString &unit="", Double_t min=0, Double_t max=0)
Definition: DataLoader.cxx:483