Logo ROOT   6.10/09
Reference Guide
ResultsMulticlass.cxx
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Helge Voss, Jan Therhaag
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : ResultsMulticlass *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Implementation (see header for description) *
12  * *
13  * Authors (alphabetical): *
14  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15  * Peter Speckmayer <Peter.Speckmayer@cern.ch> - CERN, Switzerland *
16  * Joerg Stelzer <Joerg.Stelzer@cern.ch> - CERN, Switzerland *
17  * Jan Therhaag <Jan.Therhaag@cern.ch> - U of Bonn, Germany *
18  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
19  * *
20  * Copyright (c) 2006: *
21  * CERN, Switzerland *
22  * MPI-K Heidelberg, Germany *
23  * U. of Bonn, Germany *
24  * *
25  * Redistribution and use in source and binary forms, with or without *
26  * modification, are permitted according to the terms listed in LICENSE *
27  * (http://tmva.sourceforge.net/LICENSE) *
28  **********************************************************************************/
29 
30 /*! \class TMVA::ResultsMulticlass
31 \ingroup TMVA
32 Class which takes the results of a multiclass classification
33 */
34 
35 #include "TMVA/ResultsMulticlass.h"
36 
37 #include "TMVA/DataSet.h"
38 #include "TMVA/DataSetInfo.h"
39 #include "TMVA/GeneticAlgorithm.h"
40 #include "TMVA/GeneticFitter.h"
41 #include "TMVA/MsgLogger.h"
42 #include "TMVA/Results.h"
43 #include "TMVA/ROCCurve.h"
44 #include "TMVA/Tools.h"
45 #include "TMVA/Types.h"
46 
47 #include "TGraph.h"
48 #include "TH1F.h"
49 #include "TMatrixD.h"
50 
51 #include <limits>
52 #include <vector>
53 
54 
55 ////////////////////////////////////////////////////////////////////////////////
56 /// constructor
57 
59  : Results( dsi, resultsName ),
60  IFitterTarget(),
61  fLogger( new MsgLogger(Form("ResultsMultiClass%s",resultsName.Data()) , kINFO) ),
62  fClassToOptimize(0),
63  fAchievableEff(dsi->GetNClasses()),
64  fAchievablePur(dsi->GetNClasses()),
65  fBestCuts(dsi->GetNClasses(),std::vector<Double_t>(dsi->GetNClasses()))
66 {
67 }
68 
69 ////////////////////////////////////////////////////////////////////////////////
70 /// destructor
71 
73 {
74  delete fLogger;
75 }
76 
77 ////////////////////////////////////////////////////////////////////////////////
78 
79 void TMVA::ResultsMulticlass::SetValue( std::vector<Float_t>& value, Int_t ievt )
80 {
81  if (ievt >= (Int_t)fMultiClassValues.size()) fMultiClassValues.resize( ievt+1 );
82  fMultiClassValues[ievt] = value;
83 }
84 
85 ////////////////////////////////////////////////////////////////////////////////
86 /// Returns a confusion matrix where each class is pitted against each other.
87 /// Results are
88 
90 {
91  const DataSet *ds = GetDataSet();
92  const DataSetInfo *dsi = GetDataSetInfo();
94 
95  UInt_t numClasses = dsi->GetNClasses();
96  TMatrixD mat(numClasses, numClasses);
97 
98  // class == iRow is considered signal class
99  for (UInt_t iRow = 0; iRow < numClasses; ++iRow) {
100  for (UInt_t iCol = 0; iCol < numClasses; ++iCol) {
101 
102  // Number is meaningless with only one class
103  if (iRow == iCol) {
104  mat(iRow, iCol) = std::numeric_limits<double>::quiet_NaN();
105  }
106 
107  std::vector<Float_t> valueVector;
108  std::vector<Bool_t> classVector;
109  std::vector<Float_t> weightVector;
110 
111  for (UInt_t iEvt = 0; iEvt < ds->GetNEvents(); ++iEvt) {
112  const Event *ev = ds->GetEvent(iEvt);
113  const UInt_t cls = ev->GetClass();
114  const Float_t weight = ev->GetWeight();
115  const Float_t mvaValue = fMultiClassValues[iEvt][iRow];
116 
117  if (cls != iRow and cls != iCol) {
118  continue;
119  }
120 
121  classVector.push_back(cls == iRow);
122  weightVector.push_back(weight);
123  valueVector.push_back(mvaValue);
124  }
125 
126  ROCCurve roc(valueVector, classVector, weightVector);
127  mat(iRow, iCol) = roc.GetEffSForEffB(effB);
128  }
129  }
130 
131  return mat;
132 }
133 
134 ////////////////////////////////////////////////////////////////////////////////
135 
136 Double_t TMVA::ResultsMulticlass::EstimatorFunction( std::vector<Double_t> & cutvalues ){
137 
138  DataSet* ds = GetDataSet();
139  ds->SetCurrentType( GetTreeType() );
140 
141  // Cache optimisation, count true and false positives with memory access
142  // instead of code branch.
143  Float_t positives[2] = {0, 0};
144 
145  for (Int_t ievt = 0; ievt < ds->GetNEvents(); ievt++) {
146  UInt_t evClass = fEventClasses[ievt];
147  Float_t w = fEventWeights[ievt];
148 
149  Bool_t break_outer_loop = false;
150  for (UInt_t icls = 0; icls < cutvalues.size(); ++icls) {
151  auto value = fMultiClassValues[ievt][icls];
152  auto cutvalue = cutvalues.at(icls);
153  if (cutvalue < 0. ? (-value < cutvalue) : (+value <= cutvalue)) {
154  break_outer_loop = true;
155  break;
156  }
157  }
158 
159  if (break_outer_loop) {
160  continue;
161  }
162 
163  Bool_t isEvCurrClass = (evClass == fClassToOptimize);
164  positives[isEvCurrClass] += w;
165  }
166 
167  const Float_t truePositive = positives[1];
168  const Float_t falsePositive = positives[0];
169 
170  Float_t eff = truePositive / fClassSumWeights[fClassToOptimize];
171  Float_t pur = truePositive / (truePositive + falsePositive);
172  Float_t effTimesPur = eff*pur;
173 
174  Float_t toMinimize = std::numeric_limits<float>::max();
175  if (effTimesPur > std::numeric_limits<float>::min())
176  toMinimize = 1./(effTimesPur); // we want to minimize 1/efficiency*purity
177 
180 
181  return toMinimize;
182 }
183 
184 ////////////////////////////////////////////////////////////////////////////////
185 ///calculate the best working point (optimal cut values)
186 ///for the multiclass classifier
187 
188 std::vector<Double_t> TMVA::ResultsMulticlass::GetBestMultiClassCuts(UInt_t targetClass){
189 
190  const DataSetInfo* dsi = GetDataSetInfo();
191  Log() << kINFO << "Calculating best set of cuts for class "
192  << dsi->GetClassInfo( targetClass )->GetName() << Endl;
193 
194  fClassToOptimize = targetClass;
195  std::vector<Interval*> ranges(dsi->GetNClasses(), new Interval(-1,1));
196 
197  fClassSumWeights.clear();
198  fEventWeights.clear();
199  fEventClasses.clear();
200 
201  for (UInt_t icls = 0; icls < dsi->GetNClasses(); ++icls) {
202  fClassSumWeights.push_back(0);
203  }
204 
205  DataSet *ds = GetDataSet();
206  for (Int_t ievt = 0; ievt < ds->GetNEvents(); ievt++) {
207  const Event *ev = ds->GetEvent(ievt);
208  fClassSumWeights[ev->GetClass()] += ev->GetWeight();
209  fEventWeights.push_back(ev->GetWeight());
210  fEventClasses.push_back(ev->GetClass());
211  }
212 
213  const TString name( "MulticlassGA" );
214  const TString opts( "PopSize=100:Steps=30" );
215  GeneticFitter mg( *this, name, ranges, opts);
216 
217  std::vector<Double_t> result;
218  mg.Run(result);
219 
220  fBestCuts.at(targetClass) = result;
221 
222  UInt_t n = 0;
223  for( std::vector<Double_t>::iterator it = result.begin(); it<result.end(); it++ ){
224  Log() << kINFO << " cutValue[" <<dsi->GetClassInfo( n )->GetName() << "] = " << (*it) << ";"<< Endl;
225  n++;
226  }
227 
228  return result;
229 }
230 
231 ////////////////////////////////////////////////////////////////////////////////
232 /// Create performance graphs for this classifier in a multiclass setting.
233 /// Requires that the method has already been evaluated (that a resultset
234 /// already exists.)
235 ///
236 /// Currently uses the new way of calculating ROC Curves. If anything looks
237 /// fishy, please contact the ROOT TMVA team.
238 ///
239 
241 {
242  DataSet *ds = GetDataSet();
244  const DataSetInfo *dsi = GetDataSetInfo();
245 
246  UInt_t numClasses = dsi->GetNClasses();
247 
248  std::vector<std::vector<Float_t>> *rawMvaRes = GetValueVector();
249 
250  for (size_t iClass = 0; iClass < numClasses; ++iClass) {
251  // Format data
252  // TODO: Replace with calls to GetMvaValuesPerClass
253  std::vector<Float_t> mvaRes;
254  std::vector<Bool_t> mvaResTypes;
255  std::vector<Float_t> mvaResWeights;
256 
257  // Vector transpose due to values being stored as
258  // [ [0, 1, 2], [0, 1, 2], ... ]
259  // in ResultsMulticlass::GetValueVector.
260  mvaRes.reserve(rawMvaRes->size());
261  for (auto item : *rawMvaRes) {
262  mvaRes.push_back(item[iClass]);
263  }
264 
265  auto eventCollection = ds->GetEventCollection();
266  mvaResTypes.reserve(eventCollection.size());
267  mvaResWeights.reserve(eventCollection.size());
268  for (auto ev : eventCollection) {
269  mvaResTypes.push_back(ev->GetClass() == iClass);
270  mvaResWeights.push_back(ev->GetWeight());
271  }
272 
273  // Get ROC Curve
274  ROCCurve *roc = new ROCCurve(mvaRes, mvaResTypes, mvaResWeights);
275  TGraph *rocGraph = new TGraph(*(roc->GetROCCurve()));
276  delete roc;
277 
278  // Style ROC Curve
279  TString className = dsi->GetClassInfo(iClass)->GetName();
280  TString name = Form("%s_rejBvsS_%s", prefix.Data(), className.Data());
281  TString title = Form("%s_%s", prefix.Data(), className.Data());
282  rocGraph->SetName(name);
283  rocGraph->SetTitle(title);
284 
285  // Store ROC Curve
286  Store(rocGraph);
287  }
288 }
289 
290 ////////////////////////////////////////////////////////////////////////////////
291 /// this function fills the mva response histos for multiclass classification
292 
294 {
295  Log() << kINFO << "Creating multiclass response histograms..." << Endl;
296 
297  DataSet* ds = GetDataSet();
298  ds->SetCurrentType( GetTreeType() );
299  const DataSetInfo* dsi = GetDataSetInfo();
300 
301  std::vector<std::vector<TH1F*> > histos;
302  Float_t xmin = 0.-0.0002;
303  Float_t xmax = 1.+0.0002;
304  for (UInt_t iCls = 0; iCls < dsi->GetNClasses(); iCls++) {
305  histos.push_back(std::vector<TH1F*>(0));
306  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
307  TString name(Form("%s_%s_prob_for_%s",prefix.Data(),
308  dsi->GetClassInfo( jCls )->GetName(),
309  dsi->GetClassInfo( iCls )->GetName()));
310  histos.at(iCls).push_back(new TH1F(name,name,nbins,xmin,xmax));
311  }
312  }
313 
314  for (Int_t ievt=0; ievt<ds->GetNEvents(); ievt++) {
315  const Event* ev = ds->GetEvent(ievt);
316  Int_t cls = ev->GetClass();
317  Float_t w = ev->GetWeight();
318  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
319  histos.at(cls).at(jCls)->Fill(fMultiClassValues[ievt][jCls],w);
320  }
321  }
322  for (UInt_t iCls = 0; iCls < dsi->GetNClasses(); iCls++) {
323  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
324  gTools().NormHist( histos.at(iCls).at(jCls) );
325  Store(histos.at(iCls).at(jCls));
326  }
327  }
328 
329  /*
330  //fill fine binned histos for testing
331  if(prefix.Contains("Test")){
332  std::vector<std::vector<TH1F*> > histos_highbin;
333  for (UInt_t iCls = 0; iCls < dsi->GetNClasses(); iCls++) {
334  histos_highbin.push_back(std::vector<TH1F*>(0));
335  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
336  TString name(Form("%s_%s_prob_for_%s_HIGHBIN",prefix.Data(),
337  dsi->GetClassInfo( jCls )->GetName().Data(),
338  dsi->GetClassInfo( iCls )->GetName().Data()));
339  histos_highbin.at(iCls).push_back(new TH1F(name,name,nbins_high,xmin,xmax));
340  }
341  }
342 
343  for (Int_t ievt=0; ievt<ds->GetNEvents(); ievt++) {
344  const Event* ev = ds->GetEvent(ievt);
345  Int_t cls = ev->GetClass();
346  Float_t w = ev->GetWeight();
347  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
348  histos_highbin.at(cls).at(jCls)->Fill(fMultiClassValues[ievt][jCls],w);
349  }
350  }
351  for (UInt_t iCls = 0; iCls < dsi->GetNClasses(); iCls++) {
352  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
353  gTools().NormHist( histos_highbin.at(iCls).at(jCls) );
354  Store(histos_highbin.at(iCls).at(jCls));
355  }
356  }
357  }
358  */
359 }
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47
std::vector< std::vector< Float_t > > * GetValueVector()
float xmin
Definition: THbookFile.cxx:93
Double_t GetEffSForEffB(Double_t effB, const UInt_t num_points=41)
Calculate the signal efficiency (sensitivity) for a given background efficiency (sensitivity).
Definition: ROCCurve.cxx:199
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
float Float_t
Definition: RtypesCore.h:53
std::vector< Float_t > fAchievablePur
Double_t Run(std::vector< Double_t > &pars)
Execute fitting.
virtual void SetName(const char *name)
Set the name of the TNamed.
Definition: TNamed.cxx:131
THist< 1, float, THistStatContent, THistStatUncertainty > TH1F
Definition: THist.hxx:311
const DataSetInfo * GetDataSetInfo() const
Definition: Results.h:71
DataSet * GetDataSet() const
Definition: Results.h:72
Basic string class.
Definition: TString.h:129
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
virtual void SetTitle(const char *title="")
Set graph title.
Definition: TGraph.cxx:2180
UInt_t GetNClasses() const
Definition: DataSetInfo.h:136
int nbins[3]
TMatrixD GetConfusionMatrix(Double_t effB)
Returns a confusion matrix where each class is pitted against each other.
std::vector< std::vector< Double_t > > fBestCuts
const std::vector< Event * > & GetEventCollection(Types::ETreeType type=Types::kMaxTreeType) const
Definition: DataSet.h:225
STL namespace.
UInt_t GetClass() const
Definition: Event.h:81
std::vector< std::vector< double > > Data
Double_t NormHist(TH1 *theHist, Double_t norm=1.0)
normalises histogram
Definition: Tools.cxx:394
Class that contains all the data information.
Definition: DataSetInfo.h:60
MsgLogger & Log() const
message logger
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:382
std::vector< Float_t > fEventWeights
ResultsMulticlass(const DataSetInfo *dsi, TString resultsName)
constructor
std::vector< Float_t > fClassSumWeights
Class that contains all the data information.
Definition: DataSet.h:69
ClassInfo * GetClassInfo(Int_t clNum) const
std::vector< std::vector< Float_t > > fMultiClassValues
void SetValue(std::vector< Float_t > &value, Int_t ievt)
The TMVA::Interval Class.
Definition: Interval.h:61
unsigned int UInt_t
Definition: RtypesCore.h:42
char * Form(const char *fmt,...)
Double_t EstimatorFunction(std::vector< Double_t > &)
void CreateMulticlassPerformanceHistos(TString prefix)
Create performance graphs for this classifier in a multiclass setting.
float xmax
Definition: THbookFile.cxx:93
Tools & gTools()
void CreateMulticlassHistos(TString prefix, Int_t nbins, Int_t nbins_high)
this function fills the mva response histos for multiclass classification
std::vector< Float_t > fAchievableEff
double Double_t
Definition: RtypesCore.h:55
std::vector< Double_t > GetBestMultiClassCuts(UInt_t targetClass)
calculate the best working point (optimal cut values) for the multiclass classifier ...
void SetCurrentType(Types::ETreeType type) const
Definition: DataSet.h:100
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
TGraph * GetROCCurve(const UInt_t points=100)
Returns a new TGraph containing the ROC curve.
Definition: ROCCurve.cxx:256
Class that is the base-class for a vector of result.
Definition: Results.h:57
A Graph is a graphics object made of two arrays X and Y with npoints each.
Definition: TGraph.h:41
Types::ETreeType GetTreeType() const
Definition: Results.h:70
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Definition: DataSet.h:215
std::vector< UInt_t > fEventClasses
double result[121]
void Store(TObject *obj, const char *alias=0)
Definition: Results.cxx:86
Interface for a fitter 'target'.
Definition: IFitterTarget.h:44
Fitter using a Genetic Algorithm.
Definition: GeneticFitter.h:43
const Int_t n
Definition: legend1.C:16
const Event * GetEvent() const
Definition: DataSet.cxx:202
const char * Data() const
Definition: TString.h:347