Logo ROOT   6.12/07
Reference Guide
VarTransformHandler.cxx
Go to the documentation of this file.
1 /**********************************************************************************
2  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
3  * Package: TMVA *
4  * Class : VarTransformHandler *
5  * Web : http://tmva.sourceforge.net *
6  * *
7  * Description: *
8  * Implementation of unsupervised variable transformation methods *
9  * *
10  * Authors (alphabetical): *
11  * Abhinav Moudgil <abhinav.moudgil@research.iiit.ac.in> - IIIT-H, India *
12  * *
13  * Copyright (c) 2005: *
14  * CERN, Switzerland *
15  * *
16  * Redistribution and use in source and binary forms, with or without *
17  * modification, are permitted according to the terms listed in LICENSE *
18  * (http://tmva.sourceforge.net/LICENSE) *
19  **********************************************************************************/
20 
22 
23 #include "TMVA/ClassifierFactory.h"
24 #include "TMVA/DataLoader.h"
25 #include "TMVA/Event.h"
26 #include "TMVA/DataInputHandler.h"
27 #include "TMVA/DataSet.h"
28 #include "TMVA/DataSetInfo.h"
29 #include "TMVA/MethodBase.h"
30 #include "TMVA/MethodDNN.h"
31 #include "TMVA/MsgLogger.h"
32 #include "TMVA/Tools.h"
33 #include "TMVA/Types.h"
34 #include "TMVA/VariableInfo.h"
35 
36 #include "TMath.h"
37 #include "TVectorD.h"
38 #include "TFile.h"
39 #include "TTree.h"
40 #include "TMatrix.h"
41 #include "TMatrixTSparse.h"
42 #include "TMatrixDSparsefwd.h"
43 #include "TCanvas.h"
44 #include "TGraph.h"
45 #include "TStyle.h"
46 #include "TLegend.h"
47 #include "TH2.h"
48 
49 #include <algorithm>
50 #include <iomanip>
51 #include <vector>
52 
53 ////////////////////////////////////////////////////////////////////////////////
54 /// constructor
55 
57  : fLogger ( new MsgLogger(TString("VarTransformHandler").Data(), kINFO) ),
58  fDataSetInfo(dl->GetDataSetInfo()),
59  fDataLoader (dl),
60  fEvents (fDataSetInfo.GetDataSet()->GetEventCollection())
61 {
62  Log() << kINFO << "Number of events - " << fEvents.size() << Endl;
63 }
64 
65 ////////////////////////////////////////////////////////////////////////////////
66 /// destructor
67 
69 {
70  // do something
71  delete fLogger;
72 }
73 
74 ////////////////////////////////////////////////////////////////////////////////
75 /// Computes variance of all the variables and
76 /// returns a new DataLoader with the selected variables whose variance is above a specific threshold.
77 /// Threshold can be provided by user otherwise default value is 0 i.e. remove the variables which have same value in all
78 /// the events.
79 ///
80 /// \param[in] threshold value (Double)
81 ///
82 /// Transformation Definition String Format: "VT(optional float value)"
83 ///
84 /// Usage examples:
85 ///
86 /// String | Description
87 /// ------- |----------------------------------------
88 /// "VT" | Select variables whose variance is above threshold value = 0 (Default)
89 /// "VT(1.5)" | Select variables whose variance is above threshold value = 1.5
90 
92 {
93  CalcNorm();
94  const UInt_t nvars = fDataSetInfo.GetNVariables();
95  Log() << kINFO << "Number of variables before transformation: " << nvars << Endl;
96  std::vector<VariableInfo>& vars = fDataSetInfo.GetVariableInfos();
97 
98  // return a new dataloader
99  // iterate over all variables, ignore the ones whose variance is below specific threshold
100  // DataLoader *transformedLoader=(DataLoader *)fDataLoader->Clone("vt_transformed_dataset");
101  // TMVA::DataLoader *transformedLoader = new TMVA::DataLoader(fDataSetInfo.GetName());
102  TMVA::DataLoader *transformedLoader = new TMVA::DataLoader("vt_transformed_dataset");
103  Log() << kINFO << "Selecting variables whose variance is above threshold value = " << threshold << Endl;
105  maxL = maxL + 16;
106  Log() << kINFO << "----------------------------------------------------------------" << Endl;
107  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << "Selected Variables";
108  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(10) << "Variance" << Endl;
109  Log() << kINFO << "----------------------------------------------------------------" << Endl;
110  for (UInt_t ivar=0; ivar<nvars; ivar++) {
111  Double_t variance = vars[ivar].GetVariance();
112  if (variance > threshold)
113  {
114  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << vars[ivar].GetExpression();
115  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << variance << Endl;
116  transformedLoader->AddVariable(vars[ivar].GetExpression(), vars[ivar].GetVarType());
117  }
118  }
119  CopyDataLoader(transformedLoader,fDataLoader);
120  Log() << kINFO << "----------------------------------------------------------------" << Endl;
121  // CopyDataLoader(transformedLoader, fDataLoader);
122  // DataLoader *transformedLoader=(DataLoader *)fDataLoader->Clone(fDataSetInfo.GetName());
124  Log() << kINFO << "Number of variables after transformation: " << transformedLoader->GetDataSetInfo().GetNVariables() << Endl;
125 
126  return transformedLoader;
127 }
128 
129 ///////////////////////////////////////////////////////////////////////////////
130 ////////////////////////////// Utility methods ////////////////////////////////
131 ///////////////////////////////////////////////////////////////////////////////
132 
133 ////////////////////////////////////////////////////////////////////////////////
134 /// Updates maximum and minimum value of a variable or target
135 
137 {
138  Int_t nvars = fDataSetInfo.GetNVariables();
139  std::vector<VariableInfo>& vars = fDataSetInfo.GetVariableInfos();
140  std::vector<VariableInfo>& tars = fDataSetInfo.GetTargetInfos();
141  if( ivar < nvars ){
142  if (x < vars[ivar].GetMin()) vars[ivar].SetMin(x);
143  if (x > vars[ivar].GetMax()) vars[ivar].SetMax(x);
144  }
145  else{
146  if (x < tars[ivar-nvars].GetMin()) tars[ivar-nvars].SetMin(x);
147  if (x > tars[ivar-nvars].GetMax()) tars[ivar-nvars].SetMax(x);
148  }
149 }
150 
151 ////////////////////////////////////////////////////////////////////////////////
152 /// Computes maximum, minimum, mean, RMS and variance for all
153 /// variables and targets
154 
156 {
157  const std::vector<TMVA::Event*>& events = fDataSetInfo.GetDataSet()->GetEventCollection();
158 
159  const UInt_t nvars = fDataSetInfo.GetNVariables();
160  const UInt_t ntgts = fDataSetInfo.GetNTargets();
161  std::vector<VariableInfo>& vars = fDataSetInfo.GetVariableInfos();
162  std::vector<VariableInfo>& tars = fDataSetInfo.GetTargetInfos();
163 
164  UInt_t nevts = events.size();
165 
166  TVectorD x2( nvars+ntgts ); x2 *= 0;
167  TVectorD x0( nvars+ntgts ); x0 *= 0;
168  TVectorD v0( nvars+ntgts ); v0 *= 0;
169 
170  Double_t sumOfWeights = 0;
171  for (UInt_t ievt=0; ievt<nevts; ievt++) {
172  const Event* ev = events[ievt];
173 
174  Double_t weight = ev->GetWeight();
175  sumOfWeights += weight;
176  for (UInt_t ivar=0; ivar<nvars; ivar++) {
177  Double_t x = ev->GetValue(ivar);
178  if (ievt==0) {
179  vars[ivar].SetMin(x);
180  vars[ivar].SetMax(x);
181  }
182  else {
183  UpdateNorm(ivar, x );
184  }
185  x0(ivar) += x*weight;
186  x2(ivar) += x*x*weight;
187  }
188  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
189  Double_t x = ev->GetTarget(itgt);
190  if (ievt==0) {
191  tars[itgt].SetMin(x);
192  tars[itgt].SetMax(x);
193  }
194  else {
195  UpdateNorm( nvars+itgt, x );
196  }
197  x0(nvars+itgt) += x*weight;
198  x2(nvars+itgt) += x*x*weight;
199  }
200  }
201 
202  if (sumOfWeights <= 0) {
203  Log() << kFATAL << " the sum of event weights calculated for your input is == 0"
204  << " or exactly: " << sumOfWeights << " there is obviously some problem..."<< Endl;
205  }
206 
207  // set Mean and RMS
208  for (UInt_t ivar=0; ivar<nvars; ivar++) {
209  Double_t mean = x0(ivar)/sumOfWeights;
210 
211  vars[ivar].SetMean( mean );
212  if (x2(ivar)/sumOfWeights - mean*mean < 0) {
213  Log() << kFATAL << " the RMS of your input variable " << ivar
214  << " evaluates to an imaginary number: sqrt("<< x2(ivar)/sumOfWeights - mean*mean
215  <<") .. sometimes related to a problem with outliers and negative event weights"
216  << Endl;
217  }
218  vars[ivar].SetRMS( TMath::Sqrt( x2(ivar)/sumOfWeights - mean*mean) );
219  }
220  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
221  Double_t mean = x0(nvars+itgt)/sumOfWeights;
222  tars[itgt].SetMean( mean );
223  if (x2(nvars+itgt)/sumOfWeights - mean*mean < 0) {
224  Log() << kFATAL << " the RMS of your target variable " << itgt
225  << " evaluates to an imaginary number: sqrt(" << x2(nvars+itgt)/sumOfWeights - mean*mean
226  <<") .. sometimes related to a problem with outliers and negative event weights"
227  << Endl;
228  }
229  tars[itgt].SetRMS( TMath::Sqrt( x2(nvars+itgt)/sumOfWeights - mean*mean) );
230  }
231 
232  // calculate variance
233  for (UInt_t ievt=0; ievt<nevts; ievt++) {
234  const Event* ev = events[ievt];
235  Double_t weight = ev->GetWeight();
236 
237  for (UInt_t ivar=0; ivar<nvars; ivar++) {
238  Double_t x = ev->GetValue(ivar);
239  Double_t mean = vars[ivar].GetMean();
240  v0(ivar) += weight*(x-mean)*(x-mean);
241  }
242 
243  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
244  Double_t x = ev->GetTarget(itgt);
245  Double_t mean = tars[itgt].GetMean();
246  v0(nvars+itgt) += weight*(x-mean)*(x-mean);
247  }
248  }
249 
251  maxL = maxL + 8;
252  Log() << kINFO << "----------------------------------------------------------------" << Endl;
253  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << "Variables";
254  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(10) << "Variance" << Endl;
255  Log() << kINFO << "----------------------------------------------------------------" << Endl;
256 
257  // set variance
258  Log() << std::setprecision(5);
259  for (UInt_t ivar=0; ivar<nvars; ivar++) {
260  Double_t variance = v0(ivar)/sumOfWeights;
261  vars[ivar].SetVariance( variance );
262  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << vars[ivar].GetExpression();
263  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << variance << Endl;
264  }
265 
267  maxL = maxL + 8;
268  Log() << kINFO << "----------------------------------------------------------------" << Endl;
269  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << "Targets";
270  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(10) << "Variance" << Endl;
271  Log() << kINFO << "----------------------------------------------------------------" << Endl;
272 
273  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
274  Double_t variance = v0(nvars+itgt)/sumOfWeights;
275  tars[itgt].SetVariance( variance );
276  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << tars[itgt].GetExpression();
277  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << variance << Endl;
278  }
279 
280  Log() << kINFO << "Set minNorm/maxNorm for variables to: " << Endl;
281  Log() << std::setprecision(3);
282  for (UInt_t ivar=0; ivar<nvars; ivar++)
283  Log() << " " << vars[ivar].GetExpression()
284  << "\t: [" << vars[ivar].GetMin() << "\t, " << vars[ivar].GetMax() << "\t] " << Endl;
285  Log() << kINFO << "Set minNorm/maxNorm for targets to: " << Endl;
286  Log() << std::setprecision(3);
287  for (UInt_t itgt=0; itgt<ntgts; itgt++)
288  Log() << " " << tars[itgt].GetExpression()
289  << "\t: [" << tars[itgt].GetMin() << "\t, " << tars[itgt].GetMax() << "\t] " << Endl;
290  Log() << std::setprecision(5); // reset to better value
291 }
292 
293 ////////////////////////////////////////////////////////////////////////////////
295 {
296  for( std::vector<TreeInfo>::const_iterator treeinfo=src->DataInput().Sbegin();treeinfo!=src->DataInput().Send();treeinfo++)
297  {
298  des->AddSignalTree( (*treeinfo).GetTree(), (*treeinfo).GetWeight(),(*treeinfo).GetTreeType());
299  }
300 
301  for( std::vector<TreeInfo>::const_iterator treeinfo=src->DataInput().Bbegin();treeinfo!=src->DataInput().Bend();treeinfo++)
302  {
303  des->AddBackgroundTree( (*treeinfo).GetTree(), (*treeinfo).GetWeight(),(*treeinfo).GetTreeType());
304  }
305 }
void AddBackgroundTree(TTree *background, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
number of signal events (used to compute significance)
Definition: DataLoader.cxx:408
UInt_t GetNVariables() const
Definition: DataSetInfo.h:110
void CalcNorm()
Computes maximum, minimum, mean, RMS and variance for all variables and targets.
std::vector< TreeInfo >::const_iterator Bend() const
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
std::vector< TreeInfo >::const_iterator Bbegin() const
TMVA::DataLoader * VarianceThreshold(Double_t threshold)
Computes variance of all the variables and returns a new DataLoader with the selected variables whose...
TFileCollection * GetDataSet(const char *ds, const char *server)
GetDataSet wrapper.
Definition: pq2wrappers.cxx:87
DataSetInfo & GetDataSetInfo()
Definition: DataLoader.cxx:144
Int_t GetTargetNameMaxLength() const
Basic string class.
Definition: TString.h:125
int Int_t
Definition: RtypesCore.h:41
std::vector< TreeInfo >::const_iterator Send() const
const std::vector< Event * > & GetEventCollection(Types::ETreeType type=Types::kMaxTreeType) const
Definition: DataSet.h:225
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating variable in data set info
Definition: DataLoader.cxx:491
std::vector< TreeInfo >::const_iterator Sbegin() const
static const double x2[5]
Double_t x[n]
Definition: legend1.C:17
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:382
std::vector< VariableInfo > & GetTargetInfos()
Definition: DataSetInfo.h:99
void UpdateNorm(Int_t ivar, Double_t x)
Updates maximum and minimum value of a variable or target.
VarTransformHandler(DataLoader *)
constructor
Float_t GetTarget(UInt_t itgt) const
Definition: Event.h:97
UInt_t GetNTargets() const
Definition: DataSetInfo.h:111
DataInputHandler & DataInput()
Definition: DataLoader.h:176
const std::vector< Event * > & fEvents
unsigned int UInt_t
Definition: RtypesCore.h:42
Float_t GetValue(UInt_t ivar) const
return value of i&#39;th variable
Definition: Event.cxx:237
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
Definition: DataLoader.cxx:629
double Double_t
Definition: RtypesCore.h:55
Int_t GetVariableNameMaxLength() const
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
const TString & GetSplitOptions() const
Definition: DataSetInfo.h:167
const TCut & GetCut(Int_t i) const
Definition: DataSetInfo.h:149
Double_t Sqrt(Double_t x)
Definition: TMath.h:590
void CopyDataLoader(TMVA::DataLoader *des, TMVA::DataLoader *src)
void AddSignalTree(TTree *signal, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
number of signal events (used to compute significance)
Definition: DataLoader.cxx:377
DataSet * GetDataSet() const
returns data set
std::vector< VariableInfo > & GetVariableInfos()
Definition: DataSetInfo.h:94
MsgLogger & Log() const
message logger