Logo ROOT   6.12/07
Reference Guide
VariableImportance.cxx
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Omar Zapata and Sergei Gleyzer
3 
4 /*! \class TMVA::VariableImportanceResult
5 \ingroup TMVA
6 */
7 
8 /*! \class TMVA::VariableImportance
9 \ingroup TMVA
10 */
11 
13 
14 #include "TMVA/Config.h"
15 #include "TMVA/DataSetInfo.h"
16 #include "TMVA/Envelope.h"
17 #include "TMVA/Factory.h"
18 #include "TMVA/OptionMap.h"
19 #include "TMVA/MethodBase.h"
20 #include "TMVA/MethodCategory.h"
21 #include "TMVA/MsgLogger.h"
22 #include "TMVA/Types.h"
24 
25 #include "TAxis.h"
26 #include "TGraph.h"
27 #include "TCanvas.h"
28 #include "TH1.h"
29 #include "TRandom3.h"
30 #include "TStyle.h"
31 #include "TSystem.h"
32 
33 #include <bitset>
34 #include <iostream>
35 #include <memory>
36 #include <utility>
37 
38 
39 //number of bits for bitset
40 #define NBITS 32
41 
42 ////////////////////////////////////////////////////////////////////////////////
43 
44 TMVA::VariableImportanceResult::VariableImportanceResult():fImportanceValues("VariableImportance"),
45  fImportanceHist(nullptr)
46 {
47 
48 }
49 
50 ////////////////////////////////////////////////////////////////////////////////
51 
53 {
56 }
57 
58 ////////////////////////////////////////////////////////////////////////////////
59 
61 {
64 
65  MsgLogger fLogger("VariableImportance");
66  if(fType==VIType::kShort)
67  {
68  fLogger<<kINFO<<"Variable Importance Results (Short)"<<Endl;
69  }else if(fType==VIType::kAll)
70  {
71  fLogger<<kINFO<<"Variable Importance Results (All)"<<Endl;
72  }else{
73  fLogger<<kINFO<<"Variable Importance Results (Random)"<<Endl;
74  }
75 
78 }
79 
80 ////////////////////////////////////////////////////////////////////////////////
81 
83 {
84  TCanvas *c=new TCanvas(name.Data());
85  fImportanceHist->Draw("");
86  fImportanceHist->GetXaxis()->SetTitle(" Variable Names ");
87  fImportanceHist->GetYaxis()->SetTitle(" Importance (%) ");
88  c->Draw();
89  return c;
90 }
91 
92 ////////////////////////////////////////////////////////////////////////////////
93 
94 TMVA::VariableImportance::VariableImportance(TMVA::DataLoader *dataloader):TMVA::Envelope("VariableImportance",dataloader,nullptr),fType(VIType::kShort)
95 {
96  fClassifier=std::unique_ptr<Factory>(new TMVA::Factory("VariableImportance","!V:!ROC:!ModelPersistence:Silent:Color:!DrawProgressBar:AnalysisType=Classification"));
97 }
98 
99 ////////////////////////////////////////////////////////////////////////////////
100 
102 {
103  fClassifier=nullptr;
104 }
105 
106 ////////////////////////////////////////////////////////////////////////////////
107 
109 {
110 
111  //NOTE: Put the type of VI Algorithm in the results Print
112  if(fType==VIType::kShort)
113  {
115  }else if(fType==VIType::kAll)
116  {
118  }else{
119  UInt_t nbits=fDataLoader->GetDefaultDataSetInfo().GetNVariables();
120  if(nbits<10)
121  Log()<<kERROR<<"Running variable importance with less that 10 varibales in Random mode "<<
122  "can to produce inconsisten results"<<Endl;
123  EvaluateImportanceRandom(pow(nbits,2));
124  }
125  fResults.fType = fType;
128  Log()<<kINFO<<"Evaluation done."<<Endl;
130 }
131 
132 ////////////////////////////////////////////////////////////////////////////////
133 
135 {
136  ULong_t sum=0;
137  for(ULong_t n=0;n<i;n++) sum+=pow(2,n);
138  return sum;
139 }
140 
141 ////////////////////////////////////////////////////////////////////////////////
142 
143 TH1F* TMVA::VariableImportance::GetImportance(const UInt_t nbits,std::vector<Float_t> &importances,std::vector<TString> &varNames)
144 {
145  TH1F *vihist = new TH1F("vihist", "", nbits, 0, nbits);
146 
147  gStyle->SetOptStat(000000);
148 
149  Float_t normalization = 0.0;
150  for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
151 
152  Float_t roc = 0.0;
153 
154  gStyle->SetTitleXOffset(0.4);
155  gStyle->SetTitleXOffset(1.2);
156 
157 
158  for (UInt_t i = 1; i < nbits + 1; i++) {
159  roc = 100.0 * importances[i - 1] / normalization;
160  vihist->GetXaxis()->SetBinLabel(i, varNames[i - 1].Data());
161  vihist->SetBinContent(i, roc);
162  }
163 
164  vihist->LabelsOption("v >", "X");
165  vihist->SetBarWidth(0.97);
166  vihist->SetFillColor(TColor::GetColor("#006600"));
167 
168  vihist->GetXaxis()->SetTitle(" Variable Names ");
169  vihist->GetXaxis()->SetTitleSize(0.045);
170  vihist->GetXaxis()->CenterTitle();
171  vihist->GetXaxis()->SetTitleOffset(1.24);
172 
173  vihist->GetYaxis()->SetTitle(" Importance (%)");
174  vihist->GetYaxis()->SetTitleSize(0.045);
175  vihist->GetYaxis()->CenterTitle();
176  vihist->GetYaxis()->SetTitleOffset(1.24);
177 
178  vihist->GetYaxis()->SetRangeUser(-7, 50);
179  vihist->SetDirectory(0);
180 
181  return vihist;
182 }
183 
184 ////////////////////////////////////////////////////////////////////////////////
185 
187 {
188  for (auto &meth : fMethods) {
189  TString methodName = meth.GetValue<TString>("MethodName");
190  TString methodTitle = meth.GetValue<TString>("MethodTitle");
191  TString methodOptions = meth.GetValue<TString>("MethodOptions");
192 
193  uint32_t x = 0;
194  uint32_t y = 0;
195  // getting number of variables and variable names from loader
196  const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
197  std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
198 
199  ULong_t range = Sum(nbits);
200 
201  // vector to save importances
202  std::vector<Float_t> importances(nbits);
203  for (UInt_t i = 0; i < nbits; i++)
204  importances[i] = 0;
205 
206  Float_t SROC, SSROC; // computed ROC value for every Seed and SubSeed
207 
208  x = range;
209 
210  std::bitset<NBITS> xbitset(x);
211  if (x == 0)
212  Log() << kFATAL << "Error: need at least one variable."; // dataloader need at least one variable
213 
214  // creating loader for seed
215  TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
216 
217  // adding variables from seed
218  for (UInt_t index = 0; index < nbits; index++) {
219  if (xbitset[index])
220  seeddl->AddVariable(varNames[index], 'F');
221  }
222 
223  // Loading Dataset
224  DataLoaderCopy(seeddl, fDataLoader.get());
225 
226  // Booking Seed
227  fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
228 
229  // Train/Test/Evaluation
230  fClassifier->TrainAllMethods();
231  fClassifier->TestAllMethods();
232  fClassifier->EvaluateAllMethods();
233 
234  // getting ROC
235  SROC = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
236 
237  delete seeddl;
238  fClassifier->DeleteAllMethods();
239  fClassifier->fMethodsMap.clear();
240 
241  for (uint32_t i = 0; i < NBITS; ++i) {
242  if (x & (1 << i)) {
243  y = x & ~(1 << i);
244  std::bitset<NBITS> ybitset(y);
245  //need at least one variable
246  //NOTE: if subssed is zero then is the special case
247  //that count in xbitset is 1
248  Double_t ny = log(x - y) / 0.693147;
249  if (y == 0) {
250  importances[ny] = SROC - 0.5;
251  continue;
252  }
253 
254  //creating loader for subseed
255  TMVA::DataLoader *subseeddl = new TMVA::DataLoader(ybitset.to_string());
256  //adding variables from subseed
257  for (UInt_t index = 0; index < nbits; index++) {
258  if (ybitset[index]) subseeddl->AddVariable(varNames[index], 'F');
259  }
260 
261  //Loading Dataset
262  DataLoaderCopy(subseeddl,fDataLoader.get());
263 
264  //Booking SubSeed
265  fClassifier->BookMethod(subseeddl, methodName, methodTitle, methodOptions);
266 
267  //Train/Test/Evaluation
268  fClassifier->TrainAllMethods();
269  fClassifier->TestAllMethods();
270  fClassifier->EvaluateAllMethods();
271 
272  //getting ROC
273  SSROC = fClassifier->GetROCIntegral(ybitset.to_string(), methodTitle);
274  importances[ny] += SROC - SSROC;
275 
276  delete subseeddl;
277  fClassifier->DeleteAllMethods();
278  fClassifier->fMethodsMap.clear();
279  }
280  }
281  Float_t normalization = 0.0;
282  for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
283 
284  for(UInt_t i=0;i<nbits;i++){
285  //adding values
286  fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
287  //adding sufix
288  fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
289  }
290  fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
291  }
292 }
293 
294 ////////////////////////////////////////////////////////////////////////////////
295 
297 {
298  for (auto &meth : fMethods) {
299 
300  TString methodName = meth.GetValue<TString>("MethodName");
301  TString methodTitle = meth.GetValue<TString>("MethodTitle");
302  TString methodOptions = meth.GetValue<TString>("MethodOptions");
303 
304  TRandom3 *rangen = new TRandom3(0); // Random Gen.
305 
306  uint32_t x = 0;
307  uint32_t y = 0;
308 
309  // getting number of variables and variable names from loader
310  const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
311  std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
312 
313  ULong_t range = pow(2, nbits);
314 
315  // vector to save importances
316  std::vector<Float_t> importances(nbits);
317  Float_t importances_norm = 0;
318 
319  for (UInt_t i = 0; i < nbits; i++)
320  importances[i] = 0;
321 
322  Float_t SROC, SSROC; // computed ROC value for every Seed and SubSeed
323 
324  x = range;
325 
326  for (UInt_t n = 0; n < seeds; n++) {
327  x = rangen->Integer(range);
328 
329  std::bitset<NBITS> xbitset(x);
330  if (x == 0)
331  continue; // dataloader need at least one variable
332 
333  // creating loader for seed
334  TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
335 
336  // adding variables from seed
337  for (UInt_t index = 0; index < nbits; index++) {
338  if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
339  }
340 
341  //Loading Dataset
342  DataLoaderCopy(seeddl,fDataLoader.get());
343 
344  //Booking Seed
345  fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
346 
347  //Train/Test/Evaluation
348  fClassifier->TrainAllMethods();
349  fClassifier->TestAllMethods();
350  fClassifier->EvaluateAllMethods();
351 
352  //getting ROC
353  SROC = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
354 
355  delete seeddl;
356  fClassifier->DeleteAllMethods();
357  fClassifier->fMethodsMap.clear();
358 
359  for (uint32_t i = 0; i < 32; ++i) {
360  if (x & (1 << i)) {
361  y = x & ~(1 << i);
362  std::bitset<NBITS> ybitset(y);
363  //need at least one variable
364  //NOTE: if subssed is zero then is the special case
365  //that count in xbitset is 1
366  Double_t ny = log(x - y) / 0.693147;
367  if (y == 0) {
368  importances[ny] = SROC - 0.5;
369  importances_norm += importances[ny];
370  continue;
371  }
372 
373  //creating loader for subseed
374  TMVA::DataLoader *subseeddl = new TMVA::DataLoader(ybitset.to_string());
375  //adding variables from subseed
376  for (UInt_t index = 0; index < nbits; index++) {
377  if (ybitset[index]) subseeddl->AddVariable(varNames[index], 'F');
378  }
379 
380  //Loading Dataset
381  DataLoaderCopy(subseeddl,fDataLoader.get());
382 
383  //Booking SubSeed
384  fClassifier->BookMethod(subseeddl, methodName, methodTitle, methodOptions);
385 
386  //Train/Test/Evaluation
387  fClassifier->TrainAllMethods();
388  fClassifier->TestAllMethods();
389  fClassifier->EvaluateAllMethods();
390 
391  //getting ROC
392  SSROC = fClassifier->GetROCIntegral(ybitset.to_string(), methodTitle);
393  importances[ny] += SROC - SSROC;
394 
395  delete subseeddl;
396  fClassifier->DeleteAllMethods();
397  fClassifier->fMethodsMap.clear();
398  }
399  }
400  }
401 
402  Float_t normalization = 0.0;
403  for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
404 
405  for(UInt_t i=0;i<nbits;i++){
406  //adding values
407  fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
408  //adding sufix
409  fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
410  }
411  fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
412  delete rangen;
413  }
414 }
415 
416 ////////////////////////////////////////////////////////////////////////////////
417 
419 {
420  for (auto &meth : fMethods) {
421  TString methodName = meth.GetValue<TString>("MethodName");
422  TString methodTitle = meth.GetValue<TString>("MethodTitle");
423  TString methodOptions = meth.GetValue<TString>("MethodOptions");
424 
425  uint32_t x = 0;
426  uint32_t y = 0;
427 
428  // getting number of variables and variable names from loader
429  const UInt_t nbits = fDataLoader->GetDefaultDataSetInfo().GetNVariables();
430  std::vector<TString> varNames = fDataLoader->GetDefaultDataSetInfo().GetListOfVariables();
431 
432  ULong_t range = pow(2, nbits);
433 
434  // vector to save importances
435  std::vector<Float_t> importances(nbits);
436 
437  // vector to save ROC-Integral values
438  std::vector<Float_t> ROC(range);
439  ROC[0] = 0.5;
440  for (UInt_t i = 0; i < nbits; i++)
441  importances[i] = 0;
442 
443  Float_t SROC, SSROC; // computed ROC value
444  for (x = 1; x < range; x++) {
445 
446  std::bitset<NBITS> xbitset(x);
447  if (x == 0)
448  continue; // dataloader need at least one variable
449 
450  // creating loader for seed
451  TMVA::DataLoader *seeddl = new TMVA::DataLoader(xbitset.to_string());
452 
453  // adding variables from seed
454  for (UInt_t index = 0; index < nbits; index++) {
455  if (xbitset[index]) seeddl->AddVariable(varNames[index], 'F');
456  }
457 
458  DataLoaderCopy(seeddl,fDataLoader.get());
459 
460  seeddl->PrepareTrainingAndTestTree(fDataLoader->GetDefaultDataSetInfo().GetCut("Signal"), fDataLoader->GetDefaultDataSetInfo().GetCut("Background"), fDataLoader->GetDefaultDataSetInfo().GetSplitOptions());
461 
462  //Booking Seed
463  fClassifier->BookMethod(seeddl, methodName, methodTitle, methodOptions);
464 
465  //Train/Test/Evaluation
466  fClassifier->TrainAllMethods();
467  fClassifier->TestAllMethods();
468  fClassifier->EvaluateAllMethods();
469 
470  //getting ROC
471  ROC[x] = fClassifier->GetROCIntegral(xbitset.to_string(), methodTitle);
472 
473  delete seeddl;
474  fClassifier->DeleteAllMethods();
475  fClassifier->fMethodsMap.clear();
476  }
477 
478 
479  for ( x = 0; x <range ; x++)
480  {
481  SROC=ROC[x];
482  for (uint32_t i = 0; i < NBITS; ++i) {
483  if (x & (1 << i)) {
484  y = x & ~(1 << i);
485  std::bitset<NBITS> ybitset(y);
486 
487  Float_t ny = log(x - y) / 0.693147;
488  if (y == 0) {
489  importances[ny] = SROC - 0.5;
490  continue;
491  }
492 
493  //getting ROC
494  SSROC = ROC[y];
495  importances[ny] += SROC - SSROC;
496  }
497 
498  }
499  }
500  Float_t normalization = 0.0;
501  for (UInt_t i = 0; i < nbits; i++) normalization += importances[i];
502 
503  for(UInt_t i=0;i<nbits;i++){
504  //adding values
505  fResults.fImportanceValues[varNames[i]]=(100.0 * importances[i] / normalization);
506  //adding sufix
507  fResults.fImportanceValues[varNames[i]]=fResults.fImportanceValues.GetValue<TString>(varNames[i])+" % ";
508  }
509  fResults.fImportanceHist = std::shared_ptr<TH1F>(GetImportance(nbits,importances,varNames));
510  }
511 }
virtual void SetTitleOffset(Float_t offset=1)
Set distance between the axis and the axis title Offset is a correction factor with respect to the "s...
Definition: TAttAxis.cxx:294
static long int sum(long int i)
Definition: Factory.cxx:2173
Random number generator class based on the Mersenne Twister algorithm of M. Matsumoto and T. Nishimura.
Definition: TRandom3.h:27
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
virtual void LabelsOption(Option_t *option="h", Option_t *axis="X")
Set option(s) to draw axis with labels.
Definition: TH1.cxx:5059
float Float_t
Definition: RtypesCore.h:53
virtual void SetDirectory(TDirectory *dir)
By default when an histogram is created, it is added to the list of histogram objects in the current ...
Definition: TH1.cxx:8194
virtual void Evaluate()
Virtual method to be implemented with your algorithm.
T GetValue(const TString &key)
Definition: OptionMap.h:145
R__EXTERN TStyle * gStyle
Definition: TStyle.h:402
THist< 1, float, THistStatContent, THistStatUncertainty > TH1F
Definition: THist.hxx:285
Config & gConfig()
MsgLogger & Log() const
Definition: Configurable.h:122
Basic string class.
Definition: TString.h:125
1-D histogram with a float per channel (see TH1 documentation)
Definition: TH1.h:567
void DataLoaderCopy(TMVA::DataLoader *des, TMVA::DataLoader *src)
void CenterTitle(Bool_t center=kTRUE)
Center axis title.
Definition: TAxis.h:184
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating variable in data set info
Definition: DataLoader.cxx:491
std::shared_ptr< TH1F > fImportanceHist
virtual void SetBarWidth(Float_t width=0.5)
Definition: TH1.h:353
TCanvas * Draw(const TString name="VariableImportance") const
virtual void SetRangeUser(Double_t ufirst, Double_t ulast)
Set the viewing range for the axis from ufirst to ulast (in user coordinates).
Definition: TAxis.cxx:928
Double_t x[n]
Definition: legend1.C:17
std::unique_ptr< Factory > fClassifier
double pow(double, double)
Abstract base class for all high level ml algorithms, you can book ml methods like BDT...
Definition: Envelope.h:43
VariableImportance(DataLoader *loader)
void Print() const
Definition: OptionMap.h:136
virtual void SetFillColor(Color_t fcolor)
Set the fill area color.
Definition: TAttFill.h:37
virtual void SetBinContent(Int_t bin, Double_t content)
Set bin content see convention for numbering bins in TH1::GetBin In case the bin number is greater th...
Definition: TH1.cxx:8477
unsigned int UInt_t
Definition: RtypesCore.h:42
static Int_t GetColor(const char *hexcolor)
Static method returning color number for color specified by hex color string of form: "#rrggbb"...
Definition: TColor.cxx:1751
TAxis * GetYaxis()
Definition: TH1.h:316
This is the main MVA steering class.
Definition: Factory.h:81
virtual void SetTitleSize(Float_t size=0.04)
Set size of axis title The size is expressed in per cent of the pad width.
Definition: TAttAxis.cxx:304
const Bool_t kFALSE
Definition: RtypesCore.h:88
The Canvas class.
Definition: TCanvas.h:31
#define NBITS
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
Definition: DataLoader.cxx:629
double Double_t
Definition: RtypesCore.h:55
std::shared_ptr< DataLoader > fDataLoader
Booked method information.
Definition: Envelope.h:47
unsigned long ULong_t
Definition: RtypesCore.h:51
Double_t y[n]
Definition: legend1.C:17
virtual void SetBinLabel(Int_t bin, const char *label)
Set label for bin.
Definition: TAxis.cxx:809
TH1F * GetImportance(const UInt_t nbits, std::vector< Float_t > &importances, std::vector< TString > &varNames)
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
virtual void Draw(Option_t *option="")
Draw a canvas.
Definition: TCanvas.cxx:826
void SetTitleXOffset(Float_t offset=1)
Definition: TStyle.h:382
Abstract ClassifierFactory template that handles arbitrary types.
void SetSilent(Bool_t s)
Definition: Config.h:68
void SetOptStat(Int_t stat=1)
The type of information printed in the histogram statistics box can be selected via the parameter mod...
Definition: TStyle.cxx:1266
static void EnableOutput()
Definition: MsgLogger.cxx:75
VariableImportanceResult fResults
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
Definition: TNamed.cxx:164
const Bool_t kTRUE
Definition: RtypesCore.h:87
void EvaluateImportanceRandom(UInt_t nseeds)
const Int_t n
Definition: legend1.C:16
std::vector< OptionMap > fMethods
Definition: Envelope.h:46
char name[80]
Definition: TGX11.cxx:109
double log(double)
TAxis * GetXaxis()
Get the behaviour adopted by the object about the statoverflows. See EStatOverflows for more informat...
Definition: TH1.h:315
const char * Data() const
Definition: TString.h:345