Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
TMVA_Higgs_Classification.C
Go to the documentation of this file.
1/// \file
2/// \ingroup tutorial_tmva
3/// \notebook
4/// Classification example of TMVA based on public Higgs UCI dataset
5///
6/// The UCI data set is a public HIGGS data set , see http://archive.ics.uci.edu/ml/datasets/HIGGS
7/// used in this paper: Baldi, P., P. Sadowski, and D. Whiteson. “Searching for Exotic Particles in High-energy Physics
8/// with Deep Learning.” Nature Communications 5 (July 2, 2014).
9///
10/// \macro_image
11/// \macro_output
12/// \macro_code
13///
14/// \author Lorenzo Moneta
15
16/***
17## Declare Factory
18
19Create the Factory class. Later you can choose the methods
20whose performance you'd like to investigate.
21
22The factory is the major TMVA object you have to interact with. Here is the list of parameters you need to pass
23
24 - The first argument is the base of the name of all the output
25weightfiles in the directory weight/ that will be created with the
26method parameters
27
28 - The second argument is the output file for the training results
29
30 - The third argument is a string option defining some general configuration for the TMVA session. For example all TMVA output can be suppressed by removing the "!" (not) in front of the "Silent" argument in the option string
31
32**/
33
35
36 // options to control used methods
37
38 bool useLikelihood = true; // likelihood based discriminant
39 bool useLikelihoodKDE = false; // likelihood based discriminant
40 bool useFischer = true; // Fischer discriminant
41 bool useMLP = false; // Multi Layer Perceptron (old TMVA NN implementation)
42 bool useBDT = true; // Boosted Decision Tree
43 bool useDL = true; // TMVA Deep learning ( CPU or GPU)
44 bool useKeras = true; // Keras Deep learning
45 bool usePyTorch = true; // PyTorch Deep learning
46
48
49#ifdef R__HAS_PYMVA
50 gSystem->Setenv("KERAS_BACKEND", "tensorflow");
51 // for using Keras
53#else
54 useKeras = false;
55 usePyTorch = false;
56#endif
57
58 auto outputFile = TFile::Open("Higgs_ClassificationOutput.root", "RECREATE");
59
60 TMVA::Factory factory("TMVA_Higgs_Classification", outputFile,
61 "!V:ROC:!Silent:Color:AnalysisType=Classification" );
62
63/**
64
65## Setup Dataset(s)
66
67Define now input data file and signal and background trees
68
69 **/
70
71 TString inputFileName = "Higgs_data.root";
72 TString inputFileLink = "http://root.cern/files/" + inputFileName;
73
74 TFile *inputFile = nullptr;
75
76 if (!gSystem->AccessPathName(inputFileName)) {
77 // file exists
78 inputFile = TFile::Open( inputFileName );
79 }
80
81 if (!inputFile) {
82 // download file from Cernbox location
83 Info("TMVA_Higgs_Classification","Download Higgs_data.root file");
85 inputFile = TFile::Open(inputFileLink, "CACHEREAD");
86 if (!inputFile) {
87 Error("TMVA_Higgs_Classification","Input file cannot be downloaded - exit");
88 return;
89 }
90 }
91
92// --- Register the training and test trees
93
94 TTree *signalTree = (TTree*)inputFile->Get("sig_tree");
95 TTree *backgroundTree = (TTree*)inputFile->Get("bkg_tree");
96
97 signalTree->Print();
98
99/***
100## Declare DataLoader(s)
101
102The next step is to declare the DataLoader class that deals with input variables
103
104Define the input variables that shall be used for the MVA training
105note that you may also use variable expressions, which can be parsed by TTree::Draw( "expression" )]
106
107***/
108
109 TMVA::DataLoader * loader = new TMVA::DataLoader("dataset");
110
111 loader->AddVariable("m_jj");
112 loader->AddVariable("m_jjj");
113 loader->AddVariable("m_lv");
114 loader->AddVariable("m_jlv");
115 loader->AddVariable("m_bb");
116 loader->AddVariable("m_wbb");
117 loader->AddVariable("m_wwbb");
118
119/// We set now the input data trees in the TMVA DataLoader class
120
121// global event weights per tree (see below for setting event-wise weights)
122 Double_t signalWeight = 1.0;
123 Double_t backgroundWeight = 1.0;
124
125// You can add an arbitrary number of signal or background trees
126 loader->AddSignalTree ( signalTree, signalWeight );
127 loader->AddBackgroundTree( backgroundTree, backgroundWeight );
128
129
130// Set individual event weights (the variables must exist in the original TTree)
131// for signal : factory->SetSignalWeightExpression ("weight1*weight2");
132// for background: factory->SetBackgroundWeightExpression("weight1*weight2");
133//loader->SetBackgroundWeightExpression( "weight" );
134
135// Apply additional cuts on the signal and background samples (can be different)
136 TCut mycuts = ""; // for example: TCut mycuts = "abs(var1)<0.5 && abs(var2-0.5)<1";
137 TCut mycutb = ""; // for example: TCut mycutb = "abs(var1)<0.5";
138
139// Tell the factory how to use the training and testing events
140//
141// If no numbers of events are given, half of the events in the tree are used
142// for training, and the other half for testing:
143// loader->PrepareTrainingAndTestTree( mycut, "SplitMode=random:!V" );
144// To also specify the number of testing events, use:
145
146 loader->PrepareTrainingAndTestTree( mycuts, mycutb,
147 "nTrain_Signal=7000:nTrain_Background=7000:SplitMode=Random:NormMode=NumEvents:!V" );
148
149/***
150## Booking Methods
151
152Here we book the TMVA methods. We book first a Likelihood based on KDE (Kernel Density Estimation), a Fischer discriminant, a BDT
153and a shallow neural network
154
155 */
156
157
158// Likelihood ("naive Bayes estimator")
159if (useLikelihood) {
160 factory.BookMethod(loader, TMVA::Types::kLikelihood, "Likelihood",
161 "H:!V:TransformOutput:PDFInterpol=Spline2:NSmoothSig[0]=20:NSmoothBkg[0]=20:NSmoothBkg[1]=10:NSmooth=1:NAvEvtPerBin=50" );
162}
163// Use a kernel density estimator to approximate the PDFs
164if (useLikelihoodKDE) {
165 factory.BookMethod(loader, TMVA::Types::kLikelihood, "LikelihoodKDE",
166 "!H:!V:!TransformOutput:PDFInterpol=KDE:KDEtype=Gauss:KDEiter=Adaptive:KDEFineFactor=0.3:KDEborder=None:NAvEvtPerBin=50" );
167
168}
169
170// Fisher discriminant (same as LD)
171if (useFischer) {
172 factory.BookMethod(loader, TMVA::Types::kFisher, "Fisher", "H:!V:Fisher:VarTransform=None:CreateMVAPdfs:PDFInterpolMVAPdf=Spline2:NbinsMVAPdf=50:NsmoothMVAPdf=10" );
173}
174
175//Boosted Decision Trees
176if (useBDT) {
177 factory.BookMethod(loader,TMVA::Types::kBDT, "BDT",
178 "!V:NTrees=200:MinNodeSize=2.5%:MaxDepth=2:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20" );
179}
180
181//Multi-Layer Perceptron (Neural Network)
182if (useMLP) {
183 factory.BookMethod(loader, TMVA::Types::kMLP, "MLP",
184 "!H:!V:NeuronType=tanh:VarTransform=N:NCycles=100:HiddenLayers=N+5:TestRate=5:!UseRegulator" );
185}
186
187
188/// Here we book the new DNN of TMVA if we have support in ROOT. We will use GPU version if ROOT is enabled with GPU
189
190
191 /***
192
193## Booking Deep Neural Network
194
195Here we define the option string for building the Deep Neural network model.
196
197#### 1. Define DNN layout
198
199The DNN configuration is defined using a string. Note that whitespaces between characters are not allowed.
200
201We define first the DNN layout:
202
203- **input layout** : this defines the input data format for the DNN as ``input depth | height | width``.
204 In case of a dense layer as first layer the input layout should be ``1 | 1 | number of input variables`` (features)
205- **batch layout** : this defines how are the input batch. It is related to input layout but not the same.
206 If the first layer is dense it should be ``1 | batch size ! number of variables`` (features)
207
208 *(note the use of the character `|` as separator of input parameters for DNN layout)*
209
210note that in case of only dense layer the input layout could be omitted but it is required when defining more
211complex architectures
212
213- **layer layout** string defining the layer architecture. The syntax is
214 - layer type (e.g. DENSE, CONV, RNN)
215 - layer parameters (e.g. number of units)
216 - activation function (e.g TANH, RELU,...)
217
218 *the different layers are separated by the ``","`` *
219
220#### 2. Define Training Strategy
221
222We define here the training strategy parameters for the DNN. The parameters are separated by the ``","`` separator.
223One can then concatenate different training strategy with different parameters. The training strategy are separated by
224the ``"|"`` separator.
225
226 - Optimizer
227 - Learning rate
228 - Momentum (valid for SGD and RMSPROP)
229 - Regularization and Weight Decay
230 - Dropout
231 - Max number of epochs
232 - Convergence steps. if the test error will not decrease after that value the training will stop
233 - Batch size (This value must be the same specified in the input layout)
234 - Test Repetitions (the interval when the test error will be computed)
235
236
237#### 3. Define general DNN options
238
239We define the general DNN options concatenating in the final string the previously defined layout and training strategy.
240Note we use the ``":"`` separator to separate the different higher level options, as in the other TMVA methods.
241In addition to input layout, batch layout and training strategy we add now:
242
243- Type of Loss function (e.g. CROSSENTROPY)
244- Weight Initizalization (e.g XAVIER, XAVIERUNIFORM, NORMAL )
245- Variable Transformation
246- Type of Architecture (e.g. CPU, GPU, Standard)
247
248We can then book the DL method using the built option string
249
250 ***/
251
252 if (useDL) {
253
254 bool useDLGPU = false;
255#ifdef R__HAS_TMVAGPU
256 useDLGPU = true;
257#endif
258
259 // Define DNN layout
260 TString inputLayoutString = "InputLayout=1|1|7";
261 TString batchLayoutString= "BatchLayout=1|128|7";
262 TString layoutString ("Layout=DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|1|LINEAR");
263 // Define Training strategies
264 // one can catenate several training strategies
265 TString training1("LearningRate=1e-3,Momentum=0.9,"
266 "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
267 "MaxEpochs=30,WeightDecay=1e-4,Regularization=None,"
268 "Optimizer=ADAM,ADAM_beta1=0.9,ADAM_beta2=0.999,ADAM_eps=1.E-7," // ADAM default parameters
269 "DropConfig=0.0+0.0+0.0+0.");
270 // TString training2("LearningRate=1e-3,Momentum=0.9"
271 // "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
272 // "MaxEpochs=20,WeightDecay=1e-4,Regularization=None,"
273 // "Optimizer=SGD,DropConfig=0.0+0.0+0.0+0.");
274
275 TString trainingStrategyString ("TrainingStrategy=");
276 trainingStrategyString += training1; // + "|" + training2;
277
278 // General Options.
279
280 TString dnnOptions ("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=G:"
281 "WeightInitialization=XAVIER");
282 dnnOptions.Append (":"); dnnOptions.Append (inputLayoutString);
283 dnnOptions.Append (":"); dnnOptions.Append (batchLayoutString);
284 dnnOptions.Append (":"); dnnOptions.Append (layoutString);
285 dnnOptions.Append (":"); dnnOptions.Append (trainingStrategyString);
286
287 TString dnnMethodName = "DNN_CPU";
288 if (useDLGPU) {
289 dnnOptions += ":Architecture=GPU";
290 dnnMethodName = "DNN_GPU";
291 } else {
292 dnnOptions += ":Architecture=CPU";
293 }
294
295 factory.BookMethod(loader, TMVA::Types::kDL, dnnMethodName, dnnOptions);
296 }
297
298 // Keras deep learning
299 if (useKeras) {
300
301 Info("TMVA_Higgs_Classification", "Building deep neural network with keras ");
302 // create python script which can be executed
303 // create 2 conv2d layer + maxpool + dense
304 TMacro m;
305 m.AddLine("import tensorflow");
306 m.AddLine("from tensorflow.keras.models import Sequential");
307 m.AddLine("from tensorflow.keras.optimizers import Adam");
308 m.AddLine("from tensorflow.keras.layers import Input, Dense");
309 m.AddLine("");
310 m.AddLine("model = Sequential() ");
311 m.AddLine("model.add(Dense(64, activation='relu',input_dim=7))");
312 m.AddLine("model.add(Dense(64, activation='relu'))");
313 m.AddLine("model.add(Dense(64, activation='relu'))");
314 m.AddLine("model.add(Dense(64, activation='relu'))");
315 m.AddLine("model.add(Dense(2, activation='sigmoid'))");
316 m.AddLine("model.compile(loss = 'binary_crossentropy', optimizer = Adam(learning_rate = 0.001), weighted_metrics = ['accuracy'])");
317 m.AddLine("model.save('Higgs_model.h5')");
318 m.AddLine("model.summary()");
319
320 m.SaveSource("make_higgs_model.py");
321 // execute
322 auto ret = (TString *)gROOT->ProcessLine("TMVA::Python_Executable()");
323 TString python_exe = (ret) ? *(ret) : "python";
324 gSystem->Exec(python_exe + " make_higgs_model.py");
325
326 if (gSystem->AccessPathName("Higgs_model.h5")) {
327 Warning("TMVA_Higgs_Classification", "Error creating Keras model file - skip using Keras");
328 } else {
329 // book PyKeras method only if Keras model could be created
330 Info("TMVA_Higgs_Classification", "Booking tf.Keras Dense model");
331 factory.BookMethod(
332 loader, TMVA::Types::kPyKeras, "PyKeras",
333 "H:!V:VarTransform=None:FilenameModel=Higgs_model.h5:tf.keras:"
334 "FilenameTrainedModel=Higgs_trained_model.h5:NumEpochs=20:BatchSize=100:"
335 "GpuOptions=allow_growth=True"); // needed for RTX NVidia card and to avoid TF allocates all GPU memory
336 }
337 }
338
339 /**
340## Train Methods
341
342Here we train all the previously booked methods.
343
344 */
345
346 factory.TrainAllMethods();
347
348/**
349 ## Test all methods
350
351 Now we test and evaluate all methods using the test data set
352*/
353
354 factory.TestAllMethods();
355
356 factory.EvaluateAllMethods();
357
358/// after we get the ROC curve and we display
359
360 auto c1 = factory.GetROCCurve(loader);
361 c1->Draw();
362
363/// at the end we close the output file which contains the evaluation result of all methods and it can be used by TMVAGUI
364/// to display additional plots
365
366 outputFile->Close();
367
368
369}
double Double_t
Definition RtypesCore.h:59
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition TError.cxx:218
void Error(const char *location, const char *msgfmt,...)
Use this function in case an error occurred.
Definition TError.cxx:185
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:229
#define gROOT
Definition TROOT.h:406
R__EXTERN TSystem * gSystem
Definition TSystem.h:555
A specialized string object used for TTree selections.
Definition TCut.h:25
TObject * Get(const char *namecycle) override
Return pointer to object identified by namecycle.
A ROOT file is an on-disk file, usually with extension .root, that stores objects in a file-system-li...
Definition TFile.h:53
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:4089
static Bool_t SetCacheFileDir(std::string_view cacheDir, Bool_t operateDisconnected=kTRUE, Bool_t forceCacheread=kFALSE)
Sets the directory where to locally stage/cache remote files.
Definition TFile.cxx:4626
void AddSignalTree(TTree *signal, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
number of signal events (used to compute significance)
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
void AddBackgroundTree(TTree *background, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
number of signal events (used to compute significance)
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating variable in data set info
This is the main MVA steering class.
Definition Factory.h:80
static void PyInitialize()
Initialize Python interpreter.
static Tools & Instance()
Definition Tools.cxx:71
@ kFisher
Definition Types.h:82
@ kLikelihood
Definition Types.h:79
Class supporting a collection of lines with C++ code.
Definition TMacro.h:31
Basic string class.
Definition TString.h:139
virtual Int_t Exec(const char *shellcmd)
Execute a command.
Definition TSystem.cxx:653
virtual Bool_t AccessPathName(const char *path, EAccessMode mode=kFileExists)
Returns FALSE if one can access a file using the specified access mode.
Definition TSystem.cxx:1296
virtual void Setenv(const char *name, const char *value)
Set environment variable.
Definition TSystem.cxx:1649
A TTree represents a columnar dataset.
Definition TTree.h:79
void Print(Option_t *option="") const override
Print a summary of the tree contents.
Definition TTree.cxx:7219
return c1
Definition legend1.C:41
TMarker m
Definition textangle.C:8