Logo ROOT  
Reference Guide
VariableImportance.cxx
Go to the documentation of this file.
1// @(#)root/tmva $Id$
2// Author: Omar Zapata and Sergei Gleyzer
3
4/*! \class TMVA::VariableImportanceResult
5\ingroup TMVA
6*/
7
8/*! \class TMVA::VariableImportance
9\ingroup TMVA
10*/
11
13
14#include "TMVA/Config.h"
15#include "TMVA/DataSetInfo.h"
16#include "TMVA/Envelope.h"
17#include "TMVA/Factory.h"
18#include "TMVA/OptionMap.h"
19#include "TMVA/MethodBase.h"
20#include "TMVA/MethodCategory.h"
21#include "TMVA/MsgLogger.h"
22#include "TMVA/Types.h"
24
25#include "TAxis.h"
26#include "TCanvas.h"
27#include "TH1.h"
28#include "TRandom3.h"
29#include "TStyle.h"
30
31#include <bitset>
32#include <memory>
33#include <utility>
34
35
36//number of bits for bitset
37#define NBITS 32
38
39////////////////////////////////////////////////////////////////////////////////
40
41TMVA::VariableImportanceResult::VariableImportanceResult():fImportanceValues("VariableImportance"),
42 fImportanceHist(nullptr)
43{
44
45}
46
47////////////////////////////////////////////////////////////////////////////////
48
50{
51 fImportanceValues = obj.fImportanceValues;
52 fImportanceHist = obj.fImportanceHist;
53}
54
55////////////////////////////////////////////////////////////////////////////////
56
58{
61
62 MsgLogger fLogger("VariableImportance");
63 if(fType==VIType::kShort)
64 {
65 fLogger<<kINFO<<"Variable Importance Results (Short)"<<Endl;
66 }else if(fType==VIType::kAll)
67 {
68 fLogger<<kINFO<<"Variable Importance Results (All)"<<Endl;
69 }else{
70 fLogger<<kINFO<<"Variable Importance Results (Random)"<<Endl;
71 }
72
73 fImportanceValues.Print();
75}
76
77////////////////////////////////////////////////////////////////////////////////
78
80{
81 TCanvas *c=new TCanvas(name.Data());
82 fImportanceHist->Draw("");
83 fImportanceHist->GetXaxis()->SetTitle(" Variable Names ");
84 fImportanceHist->GetYaxis()->SetTitle(" Importance (%) ");
85 c->Draw();
86 return c;
87}
88
89////////////////////////////////////////////////////////////////////////////////
90
91TMVA::VariableImportance::VariableImportance(TMVA::DataLoader *dataloader):TMVA::Envelope("VariableImportance",dataloader,nullptr),fType(VIType::kShort)
92{
93 fClassifier=std::unique_ptr<Factory>(new TMVA::Factory("VariableImportance","!V:!ROC:!ModelPersistence:Silent:Color:!DrawProgressBar:AnalysisType=Classification"));
94}
95
96////////////////////////////////////////////////////////////////////////////////
97
99{
100 fClassifier=nullptr;
101}
102
103////////////////////////////////////////////////////////////////////////////////
104
106{
107
108 //NOTE: Put the type of VI Algorithm in the results Print
109 if(fType==VIType::kShort)
110 {
111 EvaluateImportanceShort();
112 }else if(fType==VIType::kAll)
113 {
114 EvaluateImportanceAll();
115 }else{
116 UInt_t nbits=fDataLoader->GetDefaultDataSetInfo().GetNVariables();
117 if(nbits<10)
118 Log()<<kERROR<<"Running variable importance with less that 10 varibales in Random mode "<<
119 "can to produce inconsisten results"<<Endl;
120 EvaluateImportanceRandom(pow(nbits,2));
121 }
122 fResults.fType = fType;
125 Log()<<kINFO<<"Evaluation done."<<Endl;
127}
128
129////////////////////////////////////////////////////////////////////////////////
130
132{
133 ULong_t sum=0;
134 for(ULong_t n=0;n<i;n++) sum+=pow(2,n);
135 return sum;
136}
137
138////////////////////////////////////////////////////////////////////////////////
139
140TH1F* TMVA::VariableImportance::GetImportance(const UInt_t nbits,std::vector<Float_t> &importances,std::vector<TString> &varNames)
141{
142 TH1F *vihist = new TH1F("vihist", "", nbits, 0, nbits);
143
144 gStyle->SetOptStat(000000);
145
146 Float_t normalization = 0.0;
147 for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
148
149 Float_t roc = 0.0;
150
153
154
155 for (UInt_t i = 1; i < nbits + 1; i++) {
156 roc = 100.0 * importances[i - 1] / normalization;
157 vihist->GetXaxis()->SetBinLabel(i, varNames[i - 1].Data());
158 vihist->SetBinContent(i, roc);
159 }
160
161 vihist->LabelsOption("v >", "X");
162 vihist->SetBarWidth(0.97);
163 vihist->SetFillColor(TColor::GetColor("#006600"));
164
165 vihist->GetXaxis()->SetTitle(" Variable Names ");
166 vihist->GetXaxis()->SetTitleSize(0.045);
167 vihist->GetXaxis()->CenterTitle();
168 vihist->GetXaxis()->SetTitleOffset(1.24);
169
170 vihist->GetYaxis()->SetTitle(" Importance (%)");
171 vihist->GetYaxis()->SetTitleSize(0.045);
172 vihist->GetYaxis()->CenterTitle();
173 vihist->GetYaxis()->SetTitleOffset(1.24);
174
175 vihist->GetYaxis()->SetRangeUser(-7, 50);
176 vihist->SetDirectory(0);
177
178 return vihist;
179}
180
181////////////////////////////////////////////////////////////////////////////////
182
184{
185 for (auto &meth : fMethods) {
186 TString methodName = meth.GetValue<TString>("MethodName");
187 TString methodTitle = meth.GetValue<TString>("MethodTitle");
188 TString methodOptions = meth.GetValue<TString>("MethodOptions");
189
190 uint32_t x = 0;
191 uint32_t y = 0;
192 // getting number of variables and variable names from loader
193 const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
194 std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
195
196 ULong_t range = Sum(nbits);
197
198 // vector to save importances
199 std::vector<Float_t> importances(nbits);
200 for (UInt_t i = 0; i < nbits; i++)
201 importances[i] = 0;
202
203 Float_t SROC, SSROC; // computed ROC value for every Seed and SubSeed
204
205 x = range;
206
207 std::bitset<NBITS> xbitset(x);
208 if (x == 0)
209 Log() << kFATAL << "Error: need at least one variable."; // dataloader need at least one variable
210
211 // creating loader for seed
212 TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
213
214 // adding variables from seed
215 for (UInt_t index = 0; index < nbits; index++) {
216 if (xbitset[index])
217 seeddl->AddVariable(varNames[index], 'F');
218 }
219
220 // Loading Dataset
221 DataLoaderCopy(seeddl, fDataLoader.get());
222
223 // Booking Seed
224 fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
225
226 // Train/Test/Evaluation
227 fClassifier->TrainAllMethods();
228 fClassifier->TestAllMethods();
229 fClassifier->EvaluateAllMethods();
230
231 // getting ROC
232 SROC = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
233
234 delete seeddl;
235 fClassifier->DeleteAllMethods();
236 fClassifier->fMethodsMap.clear();
237
238 for (uint32_t i = 0; i < NBITS; ++i) {
239 if (x & (1 << i)) {
240 y = x & ~(1 << i);
241 std::bitset<NBITS> ybitset(y);
242 //need at least one variable
243 //NOTE: if subssed is zero then is the special case
244 //that count in xbitset is 1
245 Double_t ny = log(x - y) / 0.693147;
246 if (y == 0) {
247 importances[ny] = SROC - 0.5;
248 continue;
249 }
250
251 //creating loader for subseed
252 TMVA::DataLoader *subseeddl = new TMVA::DataLoader(ybitset.to_string());
253 //adding variables from subseed
254 for (UInt_t index = 0; index < nbits; index++) {
255 if (ybitset[index]) subseeddl->AddVariable(varNames[index], 'F');
256 }
257
258 //Loading Dataset
259 DataLoaderCopy(subseeddl,fDataLoader.get());
260
261 //Booking SubSeed
262 fClassifier->BookMethod(subseeddl, methodName, methodTitle, methodOptions);
263
264 //Train/Test/Evaluation
265 fClassifier->TrainAllMethods();
266 fClassifier->TestAllMethods();
267 fClassifier->EvaluateAllMethods();
268
269 //getting ROC
270 SSROC = fClassifier->GetROCIntegral(ybitset.to_string(), methodTitle);
271 importances[ny] += SROC - SSROC;
272
273 delete subseeddl;
274 fClassifier->DeleteAllMethods();
275 fClassifier->fMethodsMap.clear();
276 }
277 }
278 Float_t normalization = 0.0;
279 for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
280
281 for(UInt_t i=0;i<nbits;i++){
282 //adding values
283 fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
284 //adding sufix
285 fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
286 }
287 fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
288 }
289}
290
291////////////////////////////////////////////////////////////////////////////////
292
294{
295 for (auto &meth : fMethods) {
296
297 TString methodName = meth.GetValue<TString>("MethodName");
298 TString methodTitle = meth.GetValue<TString>("MethodTitle");
299 TString methodOptions = meth.GetValue<TString>("MethodOptions");
300
301 TRandom3 *rangen = new TRandom3(0); // Random Gen.
302
303 uint32_t x = 0;
304 uint32_t y = 0;
305
306 // getting number of variables and variable names from loader
307 const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
308 std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
309
310 ULong_t range = pow(2, nbits);
311
312 // vector to save importances
313 std::vector<Float_t> importances(nbits);
314
315 for (UInt_t i = 0; i < nbits; i++)
316 importances[i] = 0;
317
318 Float_t SROC, SSROC; // computed ROC value for every Seed and SubSeed
319
320 x = range;
321
322 for (UInt_t n = 0; n < seeds; n++) {
323 x = rangen->Integer(range);
324
325 std::bitset<NBITS> xbitset(x);
326 if (x == 0)
327 continue; // dataloader need at least one variable
328
329 // creating loader for seed
330 TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
331
332 // adding variables from seed
333 for (UInt_t index = 0; index < nbits; index++) {
334 if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
335 }
336
337 //Loading Dataset
338 DataLoaderCopy(seeddl,fDataLoader.get());
339
340 //Booking Seed
341 fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
342
343 //Train/Test/Evaluation
344 fClassifier->TrainAllMethods();
345 fClassifier->TestAllMethods();
346 fClassifier->EvaluateAllMethods();
347
348 //getting ROC
349 SROC = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
350
351 delete seeddl;
352 fClassifier->DeleteAllMethods();
353 fClassifier->fMethodsMap.clear();
354
355 for (uint32_t i = 0; i < 32; ++i) {
356 if (x & (1 << i)) {
357 y = x & ~(1 << i);
358 std::bitset<NBITS> ybitset(y);
359 //need at least one variable
360 //NOTE: if subssed is zero then is the special case
361 //that count in xbitset is 1
362 Double_t ny = log(x - y) / 0.693147;
363 if (y == 0) {
364 importances[ny] = SROC - 0.5;
365 continue;
366 }
367
368 //creating loader for subseed
369 TMVA::DataLoader *subseeddl = new TMVA::DataLoader(ybitset.to_string());
370 //adding variables from subseed
371 for (UInt_t index = 0; index < nbits; index++) {
372 if (ybitset[index]) subseeddl->AddVariable(varNames[index], 'F');
373 }
374
375 //Loading Dataset
376 DataLoaderCopy(subseeddl,fDataLoader.get());
377
378 //Booking SubSeed
379 fClassifier->BookMethod(subseeddl, methodName, methodTitle, methodOptions);
380
381 //Train/Test/Evaluation
382 fClassifier->TrainAllMethods();
383 fClassifier->TestAllMethods();
384 fClassifier->EvaluateAllMethods();
385
386 //getting ROC
387 SSROC = fClassifier->GetROCIntegral(ybitset.to_string(), methodTitle);
388 importances[ny] += SROC - SSROC;
389
390 delete subseeddl;
391 fClassifier->DeleteAllMethods();
392 fClassifier->fMethodsMap.clear();
393 }
394 }
395 }
396
397 Float_t normalization = 0.0;
398 for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
399
400 for(UInt_t i=0;i<nbits;i++){
401 //adding values
402 fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
403 //adding sufix
404 fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
405 }
406 fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
407 delete rangen;
408 }
409}
410
411////////////////////////////////////////////////////////////////////////////////
412
414{
415 for (auto &meth : fMethods) {
416 TString methodName = meth.GetValue<TString>("MethodName");
417 TString methodTitle = meth.GetValue<TString>("MethodTitle");
418 TString methodOptions = meth.GetValue<TString>("MethodOptions");
419
420 uint32_t x = 0;
421 uint32_t y = 0;
422
423 // getting number of variables and variable names from loader
424 const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
425 std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
426
427 ULong_t range = pow(2, nbits);
428
429 // vector to save importances
430 std::vector<Float_t> importances(nbits);
431
432 // vector to save ROC-Integral values
433 std::vector<Float_t> ROC(range);
434 ROC[0] = 0.5;
435 for (UInt_t i = 0; i < nbits; i++)
436 importances[i] = 0;
437
438 Float_t SROC, SSROC; // computed ROC value
439 for (x = 1; x < range; x++) {
440
441 std::bitset<NBITS> xbitset(x);
442 if (x == 0)
443 continue; // dataloader need at least one variable
444
445 // creating loader for seed
446 TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
447
448 // adding variables from seed
449 for (UInt_t index = 0; index < nbits; index++) {
450 if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
451 }
452
453 DataLoaderCopy(seeddl,fDataLoader.get());
454
455 seeddl->PrepareTrainingAndTestTree(fDataLoader->GetDefaultDataSetInfo().GetCut("Signal"), fDataLoader->GetDefaultDataSetInfo().GetCut("Background"), fDataLoader->GetDefaultDataSetInfo().GetSplitOptions());
456
457 //Booking Seed
458 fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
459
460 //Train/Test/Evaluation
461 fClassifier->TrainAllMethods();
462 fClassifier->TestAllMethods();
463 fClassifier->EvaluateAllMethods();
464
465 //getting ROC
466 ROC[x] = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
467
468 delete seeddl;
469 fClassifier->DeleteAllMethods();
470 fClassifier->fMethodsMap.clear();
471 }
472
473
474 for ( x = 0; x <range ; x++)
475 {
476 SROC=ROC[x];
477 for (uint32_t i = 0; i < NBITS; ++i) {
478 if (x & (1 << i)) {
479 y = x & ~(1 << i);
480 std::bitset<NBITS> ybitset(y);
481
482 Float_t ny = log(x - y) / 0.693147;
483 if (y == 0) {
484 importances[ny] = SROC - 0.5;
485 continue;
486 }
487
488 //getting ROC
489 SSROC = ROC[y];
490 importances[ny] += SROC - SSROC;
491 }
492
493 }
494 }
495 Float_t normalization = 0.0;
496 for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
497
498 for(UInt_t i=0;i<nbits;i++){
499 //adding values
500 fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
501 //adding sufix
502 fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
503 }
504 fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
505 }
506}
#define c(i)
Definition: RSha256.hxx:101
const Bool_t kFALSE
Definition: RtypesCore.h:101
float Float_t
Definition: RtypesCore.h:57
const Bool_t kTRUE
Definition: RtypesCore.h:100
unsigned long ULong_t
Definition: RtypesCore.h:55
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
char name[80]
Definition: TGX11.cxx:110
R__EXTERN TStyle * gStyle
Definition: TStyle.h:414
#define NBITS
virtual void SetTitleOffset(Float_t offset=1)
Set distance between the axis and the axis title.
Definition: TAttAxis.cxx:301
virtual void SetTitleSize(Float_t size=0.04)
Set size of axis title.
Definition: TAttAxis.cxx:312
virtual void SetFillColor(Color_t fcolor)
Set the fill area color.
Definition: TAttFill.h:37
virtual void SetBinLabel(Int_t bin, const char *label)
Set label for bin.
Definition: TAxis.cxx:852
void CenterTitle(Bool_t center=kTRUE)
Center axis title.
Definition: TAxis.h:185
virtual void SetRangeUser(Double_t ufirst, Double_t ulast)
Set the viewing range for the axis from ufirst to ulast (in user coordinates, that is,...
Definition: TAxis.cxx:979
The Canvas class.
Definition: TCanvas.h:23
static Int_t GetColor(const char *hexcolor)
Static method returning color number for color specified by hex color string of form: "#rrggbb",...
Definition: TColor.cxx:1822
1-D histogram with a float per channel (see TH1 documentation)}
Definition: TH1.h:574
virtual void SetDirectory(TDirectory *dir)
By default, when a histogram is created, it is added to the list of histogram objects in the current ...
Definition: TH1.cxx:8812
virtual void LabelsOption(Option_t *option="h", Option_t *axis="X")
Sort bins with labels or set option(s) to draw axis with labels.
Definition: TH1.cxx:5356
TAxis * GetXaxis()
Get the behaviour adopted by the object about the statoverflows. See EStatOverflows for more informat...
Definition: TH1.h:319
TAxis * GetYaxis()
Definition: TH1.h:320
virtual void SetBinContent(Int_t bin, Double_t content)
Set bin content see convention for numbering bins in TH1::GetBin In case the bin number is greater th...
Definition: TH1.cxx:9097
virtual void SetBarWidth(Float_t width=0.5)
Set the width of bars as fraction of the bin width for drawing mode "B".
Definition: TH1.h:359
void SetSilent(Bool_t s)
Definition: Config.h:63
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
Definition: DataLoader.cxx:632
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating variable in data set info
Definition: DataLoader.cxx:485
Abstract base class for all high level ml algorithms, you can book ml methods like BDT,...
Definition: Envelope.h:44
This is the main MVA steering class.
Definition: Factory.h:80
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:57
static void EnableOutput()
Definition: MsgLogger.cxx:68
std::shared_ptr< TH1F > fImportanceHist
TCanvas * Draw(const TString name="VariableImportance") const
std::unique_ptr< Factory > fClassifier
virtual void Evaluate()
Virtual method to be implemented with your algorithm.
void EvaluateImportanceRandom(UInt_t nseeds)
VariableImportance(DataLoader *loader)
TH1F * GetImportance(const UInt_t nbits, std::vector< Float_t > &importances, std::vector< TString > &varNames)
VIType
Definition: Types.h:69
@ kShort
Definition: Types.h:69
@ kERROR
Definition: Types.h:60
@ kINFO
Definition: Types.h:58
@ kFATAL
Definition: Types.h:61
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition: TNamed.cxx:164
virtual void Print(Option_t *option="") const
This method must be overridden when a class wants to print itself.
Definition: TObject.cxx:552
Random number generator class based on M.
Definition: TRandom3.h:27
Basic string class.
Definition: TString.h:136
void SetOptStat(Int_t stat=1)
The type of information printed in the histogram statistics box can be selected via the parameter mod...
Definition: TStyle.cxx:1590
void SetTitleXOffset(Float_t offset=1)
Definition: TStyle.h:393
R Sum(const RVec< T > &v, const R zero=R(0))
Sum elements of an RVec.
Definition: RVec.hxx:1861
RVec< PromoteTypes< T0, T1 > > pow(const T0 &x, const RVec< T1 > &v)
Definition: RVec.hxx:1753
RVec< PromoteType< T > > log(const RVec< T > &v)
Definition: RVec.hxx:1748
Double_t y[n]
Definition: legend1.C:17
Double_t x[n]
Definition: legend1.C:17
const Int_t n
Definition: legend1.C:16
create variable transformations
void DataLoaderCopy(TMVA::DataLoader *des, TMVA::DataLoader *src)
Config & gConfig()
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:148
Double_t Log(Double_t x)
Returns the natural logarithm of x.
Definition: TMath.h:753