Logo ROOT   6.10/09
Reference Guide
TMVARegression.C
Go to the documentation of this file.
1 /// \file
2 /// \ingroup tutorial_tmva
3 /// \notebook -nodraw
4 /// This macro provides examples for the training and testing of the
5 /// TMVA regression methods.
6 ///
7 /// The input data is a toy-MC sample consisting of four Gaussian-distributed
8 /// and linearly correlated input variables.
9 ///
10 /// The methods to be used can be switched on and off by means of booleans, or
11 /// via the prompt command, for example:
12 ///
13 /// root -l TMVARegression.C\(\"LD,MLP\"\)
14 ///
15 /// (note that the backslashes are mandatory)
16 /// If no method given, a default set is used.
17 ///
18 /// The output file "TMVAReg.root" can be analysed with the use of dedicated
19 /// macros (simply say: root -l <macro.C>), which can be conveniently
20 /// invoked through a GUI that will appear at the end of the run of this macro.
21 /// - Project : TMVA - a Root-integrated toolkit for multivariate data analysis
22 /// - Package : TMVA
23 /// - Root Macro: TMVARegression
24 ///
25 /// \macro_output
26 /// \macro_code
27 /// \author Andreas Hoecker
28 
29 #include <cstdlib>
30 #include <iostream>
31 #include <map>
32 #include <string>
33 
34 #include "TChain.h"
35 #include "TFile.h"
36 #include "TTree.h"
37 #include "TString.h"
38 #include "TObjString.h"
39 #include "TSystem.h"
40 #include "TROOT.h"
41 
42 #include "TMVA/Tools.h"
43 #include "TMVA/Factory.h"
44 #include "TMVA/DataLoader.h"
45 #include "TMVA/TMVARegGui.h"
46 
47 
48 using namespace TMVA;
49 
50 void TMVARegression( TString myMethodList = "" )
51 {
52  // The explicit loading of the shared libTMVA is done in TMVAlogon.C, defined in .rootrc
53  // if you use your private .rootrc, or run from a different directory, please copy the
54  // corresponding lines from .rootrc
55 
56  // methods to be processed can be given as an argument; use format:
57  //
58  // mylinux~> root -l TMVARegression.C\(\"myMethod1,myMethod2,myMethod3\"\)
59  //
60 
61  //---------------------------------------------------------------
62  // This loads the library
64 
65 
66 
67  // Default MVA methods to be trained + tested
68  std::map<std::string,int> Use;
69 
70  // Mutidimensional likelihood and Nearest-Neighbour methods
71  Use["PDERS"] = 0;
72  Use["PDEFoam"] = 1;
73  Use["KNN"] = 1;
74  //
75  // Linear Discriminant Analysis
76  Use["LD"] = 1;
77  //
78  // Function Discriminant analysis
79  Use["FDA_GA"] = 1;
80  Use["FDA_MC"] = 0;
81  Use["FDA_MT"] = 0;
82  Use["FDA_GAMT"] = 0;
83  //
84  // Neural Network
85  Use["MLP"] = 1;
86  Use["DNN_CPU"] = 0;
87  //
88  // Support Vector Machine
89  Use["SVM"] = 0;
90  //
91  // Boosted Decision Trees
92  Use["BDT"] = 0;
93  Use["BDTG"] = 1;
94  // ---------------------------------------------------------------
95 
96  std::cout << std::endl;
97  std::cout << "==> Start TMVARegression" << std::endl;
98 
99  // Select methods (don't look at this code - not of interest)
100  if (myMethodList != "") {
101  for (std::map<std::string,int>::iterator it = Use.begin(); it != Use.end(); it++) it->second = 0;
102 
103  std::vector<TString> mlist = gTools().SplitString( myMethodList, ',' );
104  for (UInt_t i=0; i<mlist.size(); i++) {
105  std::string regMethod(mlist[i]);
106 
107  if (Use.find(regMethod) == Use.end()) {
108  std::cout << "Method \"" << regMethod << "\" not known in TMVA under this name. Choose among the following:" << std::endl;
109  for (std::map<std::string,int>::iterator it = Use.begin(); it != Use.end(); it++) std::cout << it->first << " ";
110  std::cout << std::endl;
111  return;
112  }
113  Use[regMethod] = 1;
114  }
115  }
116 
117  // --------------------------------------------------------------------------------------------------
118 
119  // Here the preparation phase begins
120 
121  // Create a new root output file
122  TString outfileName( "TMVAReg.root" );
123  TFile* outputFile = TFile::Open( outfileName, "RECREATE" );
124 
125  // Create the factory object. Later you can choose the methods
126  // whose performance you'd like to investigate. The factory will
127  // then run the performance analysis for you.
128  //
129  // The first argument is the base of the name of all the
130  // weightfiles in the directory weight/
131  //
132  // The second argument is the output file for the training results
133  // All TMVA output can be suppressed by removing the "!" (not) in
134  // front of the "Silent" argument in the option string
135  TMVA::Factory *factory = new TMVA::Factory( "TMVARegression", outputFile,
136  "!V:!Silent:Color:DrawProgressBar:AnalysisType=Regression" );
137 
138 
140  // If you wish to modify default settings
141  // (please check "src/Config.h" to see all available global options)
142  //
143  // (TMVA::gConfig().GetVariablePlotting()).fTimesRMS = 8.0;
144  // (TMVA::gConfig().GetIONames()).fWeightFileDir = "myWeightDirectory";
145 
146  // Define the input variables that shall be used for the MVA training
147  // note that you may also use variable expressions, such as: "3*var1/var2*abs(var3)"
148  // [all types of expressions that can also be parsed by TTree::Draw( "expression" )]
149  dataloader->AddVariable( "var1", "Variable 1", "units", 'F' );
150  dataloader->AddVariable( "var2", "Variable 2", "units", 'F' );
151 
152  // You can add so-called "Spectator variables", which are not used in the MVA training,
153  // but will appear in the final "TestTree" produced by TMVA. This TestTree will contain the
154  // input variables, the response values of all trained MVAs, and the spectator variables
155  dataloader->AddSpectator( "spec1:=var1*2", "Spectator 1", "units", 'F' );
156  dataloader->AddSpectator( "spec2:=var1*3", "Spectator 2", "units", 'F' );
157 
158  // Add the variable carrying the regression target
159  dataloader->AddTarget( "fvalue" );
160 
161  // It is also possible to declare additional targets for multi-dimensional regression, ie:
162  // factory->AddTarget( "fvalue2" );
163  // BUT: this is currently ONLY implemented for MLP
164 
165  // Read training and test data (see TMVAClassification for reading ASCII files)
166  // load the signal and background event samples from ROOT trees
167  TFile *input(0);
168  TString fname = "./tmva_reg_example.root";
169  if (!gSystem->AccessPathName( fname )) {
170  input = TFile::Open( fname ); // check if file in local directory exists
171  }
172  else {
174  input = TFile::Open("http://root.cern.ch/files/tmva_reg_example.root", "CACHEREAD"); // if not: download from ROOT server
175  }
176  if (!input) {
177  std::cout << "ERROR: could not open data file" << std::endl;
178  exit(1);
179  }
180  std::cout << "--- TMVARegression : Using input file: " << input->GetName() << std::endl;
181 
182  // Register the regression tree
183 
184  TTree *regTree = (TTree*)input->Get("TreeR");
185 
186  // global event weights per tree (see below for setting event-wise weights)
187  Double_t regWeight = 1.0;
188 
189  // You can add an arbitrary number of regression trees
190  dataloader->AddRegressionTree( regTree, regWeight );
191 
192  // This would set individual event weights (the variables defined in the
193  // expression need to exist in the original TTree)
194  dataloader->SetWeightExpression( "var1", "Regression" );
195 
196  // Apply additional cuts on the signal and background samples (can be different)
197  TCut mycut = ""; // for example: TCut mycut = "abs(var1)<0.5 && abs(var2-0.5)<1";
198 
199  // tell the DataLoader to use all remaining events in the trees after training for testing:
200  dataloader->PrepareTrainingAndTestTree( mycut,
201  "nTrain_Regression=1000:nTest_Regression=0:SplitMode=Random:NormMode=NumEvents:!V" );
202  //
203  // dataloader->PrepareTrainingAndTestTree( mycut,
204  // "nTrain_Regression=0:nTest_Regression=0:SplitMode=Random:NormMode=NumEvents:!V" );
205 
206  // If no numbers of events are given, half of the events in the tree are used
207  // for training, and the other half for testing:
208  //
209  // dataloader->PrepareTrainingAndTestTree( mycut, "SplitMode=random:!V" );
210 
211  // Book MVA methods
212  //
213  // Please lookup the various method configuration options in the corresponding cxx files, eg:
214  // src/MethoCuts.cxx, etc, or here: http://tmva.sourceforge.net/optionRef.html
215  // it is possible to preset ranges in the option string in which the cut optimisation should be done:
216  // "...:CutRangeMin[2]=-1:CutRangeMax[2]=1"...", where [2] is the third input variable
217 
218  // PDE - RS method
219  if (Use["PDERS"])
220  factory->BookMethod( dataloader, TMVA::Types::kPDERS, "PDERS",
221  "!H:!V:NormTree=T:VolumeRangeMode=Adaptive:KernelEstimator=Gauss:GaussSigma=0.3:NEventsMin=40:NEventsMax=60:VarTransform=None" );
222  // And the options strings for the MinMax and RMS methods, respectively:
223  //
224  // "!H:!V:VolumeRangeMode=MinMax:DeltaFrac=0.2:KernelEstimator=Gauss:GaussSigma=0.3" );
225  // "!H:!V:VolumeRangeMode=RMS:DeltaFrac=3:KernelEstimator=Gauss:GaussSigma=0.3" );
226 
227  if (Use["PDEFoam"])
228  factory->BookMethod( dataloader, TMVA::Types::kPDEFoam, "PDEFoam",
229  "!H:!V:MultiTargetRegression=F:TargetSelection=Mpv:TailCut=0.001:VolFrac=0.0666:nActiveCells=500:nSampl=2000:nBin=5:Compress=T:Kernel=None:Nmin=10:VarTransform=None" );
230 
231  // K-Nearest Neighbour classifier (KNN)
232  if (Use["KNN"])
233  factory->BookMethod( dataloader, TMVA::Types::kKNN, "KNN",
234  "nkNN=20:ScaleFrac=0.8:SigmaFact=1.0:Kernel=Gaus:UseKernel=F:UseWeight=T:!Trim" );
235 
236  // Linear discriminant
237  if (Use["LD"])
238  factory->BookMethod( dataloader, TMVA::Types::kLD, "LD",
239  "!H:!V:VarTransform=None" );
240 
241  // Function discrimination analysis (FDA) -- test of various fitters - the recommended one is Minuit (or GA or SA)
242  if (Use["FDA_MC"])
243  factory->BookMethod( dataloader, TMVA::Types::kFDA, "FDA_MC",
244  "!H:!V:Formula=(0)+(1)*x0+(2)*x1:ParRanges=(-100,100);(-100,100);(-100,100):FitMethod=MC:SampleSize=100000:Sigma=0.1:VarTransform=D" );
245 
246  if (Use["FDA_GA"]) // can also use Simulated Annealing (SA) algorithm (see Cuts_SA options) .. the formula of this example is good for parabolas
247  factory->BookMethod( dataloader, TMVA::Types::kFDA, "FDA_GA",
248  "!H:!V:Formula=(0)+(1)*x0+(2)*x1:ParRanges=(-100,100);(-100,100);(-100,100):FitMethod=GA:PopSize=100:Cycles=3:Steps=30:Trim=True:SaveBestGen=1:VarTransform=Norm" );
249 
250  if (Use["FDA_MT"])
251  factory->BookMethod( dataloader, TMVA::Types::kFDA, "FDA_MT",
252  "!H:!V:Formula=(0)+(1)*x0+(2)*x1:ParRanges=(-100,100);(-100,100);(-100,100);(-10,10):FitMethod=MINUIT:ErrorLevel=1:PrintLevel=-1:FitStrategy=2:UseImprove:UseMinos:SetBatch" );
253 
254  if (Use["FDA_GAMT"])
255  factory->BookMethod( dataloader, TMVA::Types::kFDA, "FDA_GAMT",
256  "!H:!V:Formula=(0)+(1)*x0+(2)*x1:ParRanges=(-100,100);(-100,100);(-100,100):FitMethod=GA:Converger=MINUIT:ErrorLevel=1:PrintLevel=-1:FitStrategy=0:!UseImprove:!UseMinos:SetBatch:Cycles=1:PopSize=5:Steps=5:Trim" );
257 
258  // Neural network (MLP)
259  if (Use["MLP"])
260  factory->BookMethod( dataloader, TMVA::Types::kMLP, "MLP", "!H:!V:VarTransform=Norm:NeuronType=tanh:NCycles=20000:HiddenLayers=N+20:TestRate=6:TrainingMethod=BFGS:Sampling=0.3:SamplingEpoch=0.8:ConvergenceImprove=1e-6:ConvergenceTests=15:!UseRegulator" );
261 
262  if (Use["DNN_CPU"]) {
263  /*
264  TString layoutString ("Layout=TANH|(N+100)*2,LINEAR");
265  TString layoutString ("Layout=SOFTSIGN|100,SOFTSIGN|50,SOFTSIGN|20,LINEAR");
266  TString layoutString ("Layout=RELU|300,RELU|100,RELU|30,RELU|10,LINEAR");
267  TString layoutString ("Layout=SOFTSIGN|50,SOFTSIGN|30,SOFTSIGN|20,SOFTSIGN|10,LINEAR");
268  TString layoutString ("Layout=TANH|50,TANH|30,TANH|20,TANH|10,LINEAR");
269  TString layoutString ("Layout=SOFTSIGN|50,SOFTSIGN|20,LINEAR");
270  TString layoutString ("Layout=TANH|100,TANH|30,LINEAR");
271  */
272  TString layoutString("Layout=TANH|100,LINEAR");
273 
274  TString training0("LearningRate=1e-5,Momentum=0.5,Repetitions=1,ConvergenceSteps=500,BatchSize=50,"
275  "TestRepetitions=7,WeightDecay=0.01,Regularization=NONE,DropConfig=0.5+0.5+0.5+0.5,"
276  "DropRepetitions=2");
277  TString training1("LearningRate=1e-5,Momentum=0.9,Repetitions=1,ConvergenceSteps=170,BatchSize=30,"
278  "TestRepetitions=7,WeightDecay=0.01,Regularization=L2,DropConfig=0.1+0.1+0.1,DropRepetitions="
279  "1");
280  TString training2("LearningRate=1e-5,Momentum=0.3,Repetitions=1,ConvergenceSteps=150,BatchSize=40,"
281  "TestRepetitions=7,WeightDecay=0.01,Regularization=NONE");
282  TString training3("LearningRate=1e-6,Momentum=0.1,Repetitions=1,ConvergenceSteps=500,BatchSize=100,"
283  "TestRepetitions=7,WeightDecay=0.0001,Regularization=NONE");
284 
285  TString trainingStrategyString("TrainingStrategy=");
286  trainingStrategyString += training0 + "|" + training1 + "|" + training2 + "|" + training3;
287 
288  // TString trainingStrategyString
289  // ("TrainingStrategy=LearningRate=1e-1,Momentum=0.3,Repetitions=3,ConvergenceSteps=20,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,L1=false,DropFraction=0.0,DropRepetitions=5");
290 
291  TString nnOptions(
292  "!H:V:ErrorStrategy=SUMOFSQUARES:VarTransform=G:WeightInitialization=XAVIERUNIFORM:Architecture=CPU");
293  // TString nnOptions ("!H:V:VarTransform=Normalize:ErrorStrategy=CHECKGRADIENTS");
294  nnOptions.Append(":");
295  nnOptions.Append(layoutString);
296  nnOptions.Append(":");
297  nnOptions.Append(trainingStrategyString);
298 
299  factory->BookMethod(dataloader, TMVA::Types::kDNN, "DNN_CPU", nnOptions); // NN
300  }
301 
302 
303 
304  // Support Vector Machine
305  if (Use["SVM"])
306  factory->BookMethod( dataloader, TMVA::Types::kSVM, "SVM", "Gamma=0.25:Tol=0.001:VarTransform=Norm" );
307 
308  // Boosted Decision Trees
309  if (Use["BDT"])
310  factory->BookMethod( dataloader, TMVA::Types::kBDT, "BDT",
311  "!H:!V:NTrees=100:MinNodeSize=1.0%:BoostType=AdaBoostR2:SeparationType=RegressionVariance:nCuts=20:PruneMethod=CostComplexity:PruneStrength=30" );
312 
313  if (Use["BDTG"])
314  factory->BookMethod( dataloader, TMVA::Types::kBDT, "BDTG",
315  "!H:!V:NTrees=2000::BoostType=Grad:Shrinkage=0.1:UseBaggedBoost:BaggedSampleFraction=0.5:nCuts=20:MaxDepth=3:MaxDepth=4" );
316  // --------------------------------------------------------------------------------------------------
317 
318  // Now you can tell the factory to train, test, and evaluate the MVAs
319 
320  // Train MVAs using the set of training events
321  factory->TrainAllMethods();
322 
323  // Evaluate all MVAs using the set of test events
324  factory->TestAllMethods();
325 
326  // Evaluate and compare performance of all configured MVAs
327  factory->EvaluateAllMethods();
328 
329  // --------------------------------------------------------------
330 
331  // Save the output
332  outputFile->Close();
333 
334  std::cout << "==> Wrote root file: " << outputFile->GetName() << std::endl;
335  std::cout << "==> TMVARegression is done!" << std::endl;
336 
337  delete factory;
338  delete dataloader;
339 
340  // Launch the GUI for the root macros
341  if (!gROOT->IsBatch()) TMVA::TMVARegGui( outfileName );
342 }
343 
344 int main( int argc, char** argv )
345 {
346  // Select methods (don't look at this code - not of interest)
347  TString methodList;
348  for (int i=1; i<argc; i++) {
349  TString regMethod(argv[i]);
350  if(regMethod=="-b" || regMethod=="--batch") continue;
351  if (!methodList.IsNull()) methodList += TString(",");
352  methodList += regMethod;
353  }
354  TMVARegression(methodList);
355  return 0;
356 }
virtual Bool_t AccessPathName(const char *path, EAccessMode mode=kFileExists)
Returns FALSE if one can access a file using the specified access mode.
Definition: TSystem.cxx:1272
static Tools & Instance()
Definition: Tools.cxx:75
MethodBase * BookMethod(DataLoader *loader, TString theMethodName, TString methodTitle, TString theOption="")
Book a classifier or regression method.
Definition: Factory.cxx:343
#define gROOT
Definition: TROOT.h:375
void TrainAllMethods()
Iterates through all booked methods and calls training.
Definition: Factory.cxx:1017
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating variable in data set info
Definition: DataLoader.cxx:491
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=1, Int_t netopt=0)
Create / open a file.
Definition: TFile.cxx:3909
R__EXTERN TSystem * gSystem
Definition: TSystem.h:539
void EvaluateAllMethods(void)
Iterates over all MVAs that have been booked, and calls their evaluation methods. ...
Definition: Factory.cxx:1255
void TestAllMethods()
Definition: Factory.cxx:1153
unsigned int UInt_t
Definition: RtypesCore.h:42
void AddRegressionTree(TTree *tree, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
Definition: DataLoader.h:112
This is the main MVA steering class.
Definition: Factory.h:81
Tools & gTools()
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
Definition: DataLoader.cxx:629
double Double_t
Definition: RtypesCore.h:55
void AddTarget(const TString &expression, const TString &title="", const TString &unit="", Double_t min=0, Double_t max=0)
user inserts target in data set info
Definition: DataLoader.cxx:509
void SetWeightExpression(const TString &variable, const TString &className="")
Definition: DataLoader.cxx:560
Abstract ClassifierFactory template that handles arbitrary types.
static Bool_t SetCacheFileDir(const char *cacheDir, Bool_t operateDisconnected=kTRUE, Bool_t forceCacheread=kFALSE)
Sets the directory where to locally stage/cache remote files.
Definition: TFile.cxx:4429
std::vector< TString > SplitString(const TString &theOpt, const char separator) const
splits the option string at 'separator' and fills the list 'splitV' with the primitive strings ...
Definition: Tools.cxx:1210
void TMVARegGui(const char *fName="TMVAReg.root", TString dataset="")
int main(int argc, char **argv)
void AddSpectator(const TString &expression, const TString &title="", const TString &unit="", Double_t min=0, Double_t max=0)
user inserts target in data set info
Definition: DataLoader.cxx:521