Logo ROOT   6.08/07
Reference Guide
VarTransformHandler.cxx
Go to the documentation of this file.
2 
4 #include "TMVA/DataLoader.h"
5 #include "TMVA/Event.h"
7 #include "TMVA/DataSet.h"
8 #include "TMVA/DataSetInfo.h"
9 #include "TMVA/MethodBase.h"
10 #include "TMVA/MethodDNN.h"
11 #include "TMVA/MsgLogger.h"
12 #include "TMVA/Tools.h"
13 #include "TMVA/Types.h"
14 #include "TMVA/VariableInfo.h"
15 
16 #include "TMath.h"
17 #include "TVectorD.h"
18 #include "TFile.h"
19 #include "TTree.h"
20 #include "TMatrix.h"
21 #include "TMatrixTSparse.h"
22 #include "TMatrixDSparsefwd.h"
23 #include "TCanvas.h"
24 #include "TGraph.h"
25 #include "TStyle.h"
26 #include "TLegend.h"
27 #include "TH2.h"
28 
29 #include <algorithm>
30 #include <iomanip>
31 #include <vector>
32 
33 ////////////////////////////////////////////////////////////////////////////////
34 /// constructor
35 
37  : fLogger ( new MsgLogger(TString("VarTransformHandler").Data(), kINFO) ),
38  fDataSetInfo(dl->GetDataSetInfo()),
39  fDataLoader (dl),
40  fEvents (fDataSetInfo.GetDataSet()->GetEventCollection())
41 {
42  Log() << kINFO << "Number of events - " << fEvents.size() << Endl;
43 }
44 
45 ////////////////////////////////////////////////////////////////////////////////
46 /// destructor
47 
49 {
50  // do something
51  delete fLogger;
52 }
53 
54 ////////////////////////////////////////////////////////////////////////////////
55 /// Computes variance of all the variables and
56 /// returns a new DataLoader with the selected variables whose variance is above a specific threshold.
57 /// Threshold can be provided by user otherwise default value is 0 i.e. remove the variables which have same value in all
58 /// the events.
59 ///
60 /// \param[in] threshold value (Double)
61 ///
62 /// Transformation Definition String Format: "VT(optional float value)"
63 ///
64 /// Usage examples:
65 ///
66 /// String | Description
67 /// ------- |----------------------------------------
68 /// "VT" | Select variables whose variance is above threshold value = 0 (Default)
69 /// "VT(1.5)" | Select variables whose variance is above threshold value = 1.5
70 
72 {
73  CalcNorm();
74  const UInt_t nvars = fDataSetInfo.GetNVariables();
75  Log() << kINFO << "Number of variables before transformation: " << nvars << Endl;
76  std::vector<VariableInfo>& vars = fDataSetInfo.GetVariableInfos();
77 
78  // return a new dataloader
79  // iterate over all variables, ignore the ones whose variance is below specific threshold
80  // DataLoader *transformedLoader=(DataLoader *)fDataLoader->Clone("vt_transformed_dataset");
81  // TMVA::DataLoader *transformedLoader = new TMVA::DataLoader(fDataSetInfo.GetName());
82  TMVA::DataLoader *transformedLoader = new TMVA::DataLoader("vt_transformed_dataset");
83  Log() << kINFO << "Selecting variables whose variance is above threshold value = " << threshold << Endl;
85  maxL = maxL + 16;
86  Log() << kINFO << "----------------------------------------------------------------" << Endl;
87  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << "Selected Variables";
88  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(10) << "Variance" << Endl;
89  Log() << kINFO << "----------------------------------------------------------------" << Endl;
90  for (UInt_t ivar=0; ivar<nvars; ivar++) {
91  Double_t variance = vars[ivar].GetVariance();
92  if (variance > threshold)
93  {
94  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << vars[ivar].GetExpression();
95  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << variance << Endl;
96  transformedLoader->AddVariable(vars[ivar].GetExpression(), vars[ivar].GetVarType());
97  }
98  }
99  CopyDataLoader(transformedLoader,fDataLoader);
100  Log() << kINFO << "----------------------------------------------------------------" << Endl;
101  // CopyDataLoader(transformedLoader, fDataLoader);
102  // DataLoader *transformedLoader=(DataLoader *)fDataLoader->Clone(fDataSetInfo.GetName());
104  Log() << kINFO << "Number of variables after transformation: " << transformedLoader->GetDataSetInfo().GetNVariables() << Endl;
105 
106  return transformedLoader;
107 }
108 
109 ///////////////////////////////////////////////////////////////////////////////
110 ////////////////////////////// Utility methods ////////////////////////////////
111 ///////////////////////////////////////////////////////////////////////////////
112 
113 ////////////////////////////////////////////////////////////////////////////////
114 /// Updates maximum and minimum value of a variable or target
115 
117 {
118  Int_t nvars = fDataSetInfo.GetNVariables();
119  std::vector<VariableInfo>& vars = fDataSetInfo.GetVariableInfos();
120  std::vector<VariableInfo>& tars = fDataSetInfo.GetTargetInfos();
121  if( ivar < nvars ){
122  if (x < vars[ivar].GetMin()) vars[ivar].SetMin(x);
123  if (x > vars[ivar].GetMax()) vars[ivar].SetMax(x);
124  }
125  else{
126  if (x < tars[ivar-nvars].GetMin()) tars[ivar-nvars].SetMin(x);
127  if (x > tars[ivar-nvars].GetMax()) tars[ivar-nvars].SetMax(x);
128  }
129 }
130 
131 ////////////////////////////////////////////////////////////////////////////////
132 /// Computes maximum, minimum, mean, RMS and variance for all
133 /// variables and targets
134 
136 {
137  const std::vector<TMVA::Event*>& events = fDataSetInfo.GetDataSet()->GetEventCollection();
138 
139  const UInt_t nvars = fDataSetInfo.GetNVariables();
140  const UInt_t ntgts = fDataSetInfo.GetNTargets();
141  std::vector<VariableInfo>& vars = fDataSetInfo.GetVariableInfos();
142  std::vector<VariableInfo>& tars = fDataSetInfo.GetTargetInfos();
143 
144  UInt_t nevts = events.size();
145 
146  TVectorD x2( nvars+ntgts ); x2 *= 0;
147  TVectorD x0( nvars+ntgts ); x0 *= 0;
148  TVectorD v0( nvars+ntgts ); v0 *= 0;
149 
150  Double_t sumOfWeights = 0;
151  for (UInt_t ievt=0; ievt<nevts; ievt++) {
152  const Event* ev = events[ievt];
153 
154  Double_t weight = ev->GetWeight();
155  sumOfWeights += weight;
156  for (UInt_t ivar=0; ivar<nvars; ivar++) {
157  Double_t x = ev->GetValue(ivar);
158  if (ievt==0) {
159  vars[ivar].SetMin(x);
160  vars[ivar].SetMax(x);
161  }
162  else {
163  UpdateNorm(ivar, x );
164  }
165  x0(ivar) += x*weight;
166  x2(ivar) += x*x*weight;
167  }
168  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
169  Double_t x = ev->GetTarget(itgt);
170  if (ievt==0) {
171  tars[itgt].SetMin(x);
172  tars[itgt].SetMax(x);
173  }
174  else {
175  UpdateNorm( nvars+itgt, x );
176  }
177  x0(nvars+itgt) += x*weight;
178  x2(nvars+itgt) += x*x*weight;
179  }
180  }
181 
182  if (sumOfWeights <= 0) {
183  Log() << kFATAL << " the sum of event weights calcualted for your input is == 0"
184  << " or exactly: " << sumOfWeights << " there is obviously some problem..."<< Endl;
185  }
186 
187  // set Mean and RMS
188  for (UInt_t ivar=0; ivar<nvars; ivar++) {
189  Double_t mean = x0(ivar)/sumOfWeights;
190 
191  vars[ivar].SetMean( mean );
192  if (x2(ivar)/sumOfWeights - mean*mean < 0) {
193  Log() << kFATAL << " the RMS of your input variable " << ivar
194  << " evaluates to an imaginary number: sqrt("<< x2(ivar)/sumOfWeights - mean*mean
195  <<") .. sometimes related to a problem with outliers and negative event weights"
196  << Endl;
197  }
198  vars[ivar].SetRMS( TMath::Sqrt( x2(ivar)/sumOfWeights - mean*mean) );
199  }
200  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
201  Double_t mean = x0(nvars+itgt)/sumOfWeights;
202  tars[itgt].SetMean( mean );
203  if (x2(nvars+itgt)/sumOfWeights - mean*mean < 0) {
204  Log() << kFATAL << " the RMS of your target variable " << itgt
205  << " evaluates to an imaginary number: sqrt(" << x2(nvars+itgt)/sumOfWeights - mean*mean
206  <<") .. sometimes related to a problem with outliers and negative event weights"
207  << Endl;
208  }
209  tars[itgt].SetRMS( TMath::Sqrt( x2(nvars+itgt)/sumOfWeights - mean*mean) );
210  }
211 
212  // calculate variance
213  for (UInt_t ievt=0; ievt<nevts; ievt++) {
214  const Event* ev = events[ievt];
215  Double_t weight = ev->GetWeight();
216 
217  for (UInt_t ivar=0; ivar<nvars; ivar++) {
218  Double_t x = ev->GetValue(ivar);
219  Double_t mean = vars[ivar].GetMean();
220  v0(ivar) += weight*(x-mean)*(x-mean);
221  }
222 
223  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
224  Double_t x = ev->GetTarget(itgt);
225  Double_t mean = tars[itgt].GetMean();
226  v0(nvars+itgt) += weight*(x-mean)*(x-mean);
227  }
228  }
229 
231  maxL = maxL + 8;
232  Log() << kINFO << "----------------------------------------------------------------" << Endl;
233  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << "Variables";
234  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(10) << "Variance" << Endl;
235  Log() << kINFO << "----------------------------------------------------------------" << Endl;
236 
237  // set variance
238  Log() << std::setprecision(5);
239  for (UInt_t ivar=0; ivar<nvars; ivar++) {
240  Double_t variance = v0(ivar)/sumOfWeights;
241  vars[ivar].SetVariance( variance );
242  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << vars[ivar].GetExpression();
243  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << variance << Endl;
244  }
245 
247  maxL = maxL + 8;
248  Log() << kINFO << "----------------------------------------------------------------" << Endl;
249  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << "Targets";
250  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(10) << "Variance" << Endl;
251  Log() << kINFO << "----------------------------------------------------------------" << Endl;
252 
253  for (UInt_t itgt=0; itgt<ntgts; itgt++) {
254  Double_t variance = v0(nvars+itgt)/sumOfWeights;
255  tars[itgt].SetVariance( variance );
256  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << tars[itgt].GetExpression();
257  Log() << kINFO << std::setiosflags(std::ios::left) << std::setw(maxL) << variance << Endl;
258  }
259 
260  Log() << kINFO << "Set minNorm/maxNorm for variables to: " << Endl;
261  Log() << std::setprecision(3);
262  for (UInt_t ivar=0; ivar<nvars; ivar++)
263  Log() << " " << vars[ivar].GetExpression()
264  << "\t: [" << vars[ivar].GetMin() << "\t, " << vars[ivar].GetMax() << "\t] " << Endl;
265  Log() << kINFO << "Set minNorm/maxNorm for targets to: " << Endl;
266  Log() << std::setprecision(3);
267  for (UInt_t itgt=0; itgt<ntgts; itgt++)
268  Log() << " " << tars[itgt].GetExpression()
269  << "\t: [" << tars[itgt].GetMin() << "\t, " << tars[itgt].GetMax() << "\t] " << Endl;
270  Log() << std::setprecision(5); // reset to better value
271 }
272 
273 //_______________________________________________________________________
275 {
276  for( std::vector<TreeInfo>::const_iterator treeinfo=src->DataInput().Sbegin();treeinfo!=src->DataInput().Send();treeinfo++)
277  {
278  des->AddSignalTree( (*treeinfo).GetTree(), (*treeinfo).GetWeight(),(*treeinfo).GetTreeType());
279  }
280 
281  for( std::vector<TreeInfo>::const_iterator treeinfo=src->DataInput().Bbegin();treeinfo!=src->DataInput().Bend();treeinfo++)
282  {
283  des->AddBackgroundTree( (*treeinfo).GetTree(), (*treeinfo).GetWeight(),(*treeinfo).GetTreeType());
284  }
285 }
void AddBackgroundTree(TTree *background, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
Definition: DataLoader.cxx:382
UInt_t GetNVariables() const
Definition: DataSetInfo.h:128
void CalcNorm()
Computes maximum, minimum, mean, RMS and variance for all variables and targets.
std::vector< TreeInfo >::const_iterator Bend() const
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
std::vector< TreeInfo >::const_iterator Bbegin() const
TMVA::DataLoader * VarianceThreshold(Double_t threshold)
Computes variance of all the variables and returns a new DataLoader with the selected variables whose...
DataSetInfo & GetDataSetInfo()
Definition: DataLoader.cxx:136
Int_t GetTargetNameMaxLength() const
Basic string class.
Definition: TString.h:137
int Int_t
Definition: RtypesCore.h:41
std::vector< TreeInfo >::const_iterator Send() const
const std::vector< Event * > & GetEventCollection(Types::ETreeType type=Types::kMaxTreeType) const
Definition: DataSet.h:239
void AddVariable(const TString &expression, const TString &title, const TString &unit, char type='F', Double_t min=0, Double_t max=0)
Definition: DataLoader.cxx:456
std::vector< TreeInfo >::const_iterator Sbegin() const
static const double x2[5]
Double_t x[n]
Definition: legend1.C:17
std::vector< std::vector< double > > Data
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:378
std::vector< VariableInfo > & GetTargetInfos()
Definition: DataSetInfo.h:117
void UpdateNorm(Int_t ivar, Double_t x)
Updates maximum and minimum value of a variable or target.
VarTransformHandler(DataLoader *)
constructor
TFileCollection * GetDataSet(const char *ds, const char *server="")
GetDataSet wrapper.
Definition: pq2wrappers.cxx:87
Float_t GetTarget(UInt_t itgt) const
Definition: Event.h:104
UInt_t GetNTargets() const
Definition: DataSetInfo.h:129
DataInputHandler & DataInput()
Definition: DataLoader.h:183
const std::vector< Event * > & fEvents
unsigned int UInt_t
Definition: RtypesCore.h:42
Float_t GetValue(UInt_t ivar) const
return value of i&#39;th variable
Definition: Event.cxx:233
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
Definition: DataLoader.cxx:580
double Double_t
Definition: RtypesCore.h:55
Int_t GetVariableNameMaxLength() const
const TString & GetSplitOptions() const
Definition: DataSetInfo.h:185
const TCut & GetCut(Int_t i) const
Definition: DataSetInfo.h:167
Double_t Sqrt(Double_t x)
Definition: TMath.h:464
void CopyDataLoader(TMVA::DataLoader *des, TMVA::DataLoader *src)
void AddSignalTree(TTree *signal, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
Definition: DataLoader.cxx:353
DataSet * GetDataSet() const
returns data set
std::vector< VariableInfo > & GetVariableInfos()
Definition: DataSetInfo.h:112
MsgLogger & Log() const
message logger