Logo ROOT   6.12/07
Reference Guide
ResultsMulticlass.cxx
Go to the documentation of this file.
1 // @(#)root/tmva $Id$
2 // Author: Andreas Hoecker, Peter Speckmayer, Joerg Stelzer, Helge Voss, Jan Therhaag
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : ResultsMulticlass *
8  * Web : http://tmva.sourceforge.net *
9  * *
10  * Description: *
11  * Implementation (see header for description) *
12  * *
13  * Authors (alphabetical): *
14  * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15  * Peter Speckmayer <Peter.Speckmayer@cern.ch> - CERN, Switzerland *
16  * Joerg Stelzer <Joerg.Stelzer@cern.ch> - CERN, Switzerland *
17  * Jan Therhaag <Jan.Therhaag@cern.ch> - U of Bonn, Germany *
18  * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
19  * *
20  * Copyright (c) 2006: *
21  * CERN, Switzerland *
22  * MPI-K Heidelberg, Germany *
23  * U. of Bonn, Germany *
24  * *
25  * Redistribution and use in source and binary forms, with or without *
26  * modification, are permitted according to the terms listed in LICENSE *
27  * (http://tmva.sourceforge.net/LICENSE) *
28  **********************************************************************************/
29 
30 /*! \class TMVA::ResultsMulticlass
31 \ingroup TMVA
32 Class which takes the results of a multiclass classification
33 */
34 
35 #include "TMVA/ResultsMulticlass.h"
36 
37 #include "TMVA/DataSet.h"
38 #include "TMVA/DataSetInfo.h"
39 #include "TMVA/GeneticAlgorithm.h"
40 #include "TMVA/GeneticFitter.h"
41 #include "TMVA/MsgLogger.h"
42 #include "TMVA/Results.h"
43 #include "TMVA/ROCCurve.h"
44 #include "TMVA/Tools.h"
45 #include "TMVA/Types.h"
46 
47 #include "TGraph.h"
48 #include "TH1F.h"
49 #include "TMatrixD.h"
50 
51 #include <limits>
52 #include <vector>
53 
54 
55 ////////////////////////////////////////////////////////////////////////////////
56 /// constructor
57 
59  : Results( dsi, resultsName ),
60  IFitterTarget(),
61  fLogger( new MsgLogger(Form("ResultsMultiClass%s",resultsName.Data()) , kINFO) ),
62  fClassToOptimize(0),
63  fAchievableEff(dsi->GetNClasses()),
64  fAchievablePur(dsi->GetNClasses()),
65  fBestCuts(dsi->GetNClasses(),std::vector<Double_t>(dsi->GetNClasses()))
66 {
67 }
68 
69 ////////////////////////////////////////////////////////////////////////////////
70 /// destructor
71 
73 {
74  delete fLogger;
75 }
76 
77 ////////////////////////////////////////////////////////////////////////////////
78 
79 void TMVA::ResultsMulticlass::SetValue( std::vector<Float_t>& value, Int_t ievt )
80 {
81  if (ievt >= (Int_t)fMultiClassValues.size()) fMultiClassValues.resize( ievt+1 );
82  fMultiClassValues[ievt] = value;
83 }
84 
85 ////////////////////////////////////////////////////////////////////////////////
86 /// Returns a confusion matrix where each class is pitted against each other.
87 /// Results are
88 
90 {
91  const DataSet *ds = GetDataSet();
92  const DataSetInfo *dsi = GetDataSetInfo();
94 
95  UInt_t numClasses = dsi->GetNClasses();
96  TMatrixD mat(numClasses, numClasses);
97 
98  // class == iRow is considered signal class
99  for (UInt_t iRow = 0; iRow < numClasses; ++iRow) {
100  for (UInt_t iCol = 0; iCol < numClasses; ++iCol) {
101 
102  // Number is meaningless with only one class
103  if (iRow == iCol) {
104  mat(iRow, iCol) = std::numeric_limits<double>::quiet_NaN();
105  }
106 
107  std::vector<Float_t> valueVector;
108  std::vector<Bool_t> classVector;
109  std::vector<Float_t> weightVector;
110 
111  for (UInt_t iEvt = 0; iEvt < ds->GetNEvents(); ++iEvt) {
112  const Event *ev = ds->GetEvent(iEvt);
113  const UInt_t cls = ev->GetClass();
114  const Float_t weight = ev->GetWeight();
115  const Float_t mvaValue = fMultiClassValues[iEvt][iRow];
116 
117  if (cls != iRow and cls != iCol) {
118  continue;
119  }
120 
121  classVector.push_back(cls == iRow);
122  weightVector.push_back(weight);
123  valueVector.push_back(mvaValue);
124  }
125 
126  ROCCurve roc(valueVector, classVector, weightVector);
127  mat(iRow, iCol) = roc.GetEffSForEffB(effB);
128  }
129  }
130 
131  return mat;
132 }
133 
134 ////////////////////////////////////////////////////////////////////////////////
135 
136 Double_t TMVA::ResultsMulticlass::EstimatorFunction( std::vector<Double_t> & cutvalues ){
137 
138  DataSet* ds = GetDataSet();
139  ds->SetCurrentType( GetTreeType() );
140 
141  // Cache optimisation, count true and false positives with memory access
142  // instead of code branch.
143  Float_t positives[2] = {0, 0};
144 
145  for (Int_t ievt = 0; ievt < ds->GetNEvents(); ievt++) {
146  UInt_t evClass = fEventClasses[ievt];
147  Float_t w = fEventWeights[ievt];
148 
149  Bool_t break_outer_loop = false;
150  for (UInt_t icls = 0; icls < cutvalues.size(); ++icls) {
151  auto value = fMultiClassValues[ievt][icls];
152  auto cutvalue = cutvalues.at(icls);
153  if (cutvalue < 0. ? (-value < cutvalue) : (+value <= cutvalue)) {
154  break_outer_loop = true;
155  break;
156  }
157  }
158 
159  if (break_outer_loop) {
160  continue;
161  }
162 
163  Bool_t isEvCurrClass = (evClass == fClassToOptimize);
164  positives[isEvCurrClass] += w;
165  }
166 
167  const Float_t truePositive = positives[1];
168  const Float_t falsePositive = positives[0];
169 
170  Float_t eff = truePositive / fClassSumWeights[fClassToOptimize];
171  Float_t pur = truePositive / (truePositive + falsePositive);
172  Float_t effTimesPur = eff*pur;
173 
174  Float_t toMinimize = std::numeric_limits<float>::max();
175  if (effTimesPur > std::numeric_limits<float>::min())
176  toMinimize = 1./(effTimesPur); // we want to minimize 1/efficiency*purity
177 
180 
181  return toMinimize;
182 }
183 
184 ////////////////////////////////////////////////////////////////////////////////
185 ///calculate the best working point (optimal cut values)
186 ///for the multiclass classifier
187 
188 std::vector<Double_t> TMVA::ResultsMulticlass::GetBestMultiClassCuts(UInt_t targetClass){
189 
190  const DataSetInfo* dsi = GetDataSetInfo();
191  Log() << kINFO << "Calculating best set of cuts for class "
192  << dsi->GetClassInfo( targetClass )->GetName() << Endl;
193 
194  fClassToOptimize = targetClass;
195  std::vector<Interval*> ranges(dsi->GetNClasses(), new Interval(-1,1));
196 
197  fClassSumWeights.clear();
198  fEventWeights.clear();
199  fEventClasses.clear();
200 
201  for (UInt_t icls = 0; icls < dsi->GetNClasses(); ++icls) {
202  fClassSumWeights.push_back(0);
203  }
204 
205  DataSet *ds = GetDataSet();
206  for (Int_t ievt = 0; ievt < ds->GetNEvents(); ievt++) {
207  const Event *ev = ds->GetEvent(ievt);
208  fClassSumWeights[ev->GetClass()] += ev->GetWeight();
209  fEventWeights.push_back(ev->GetWeight());
210  fEventClasses.push_back(ev->GetClass());
211  }
212 
213  const TString name( "MulticlassGA" );
214  const TString opts( "PopSize=100:Steps=30" );
215  GeneticFitter mg( *this, name, ranges, opts);
216 
217  std::vector<Double_t> result;
218  mg.Run(result);
219 
220  fBestCuts.at(targetClass) = result;
221 
222  UInt_t n = 0;
223  for( std::vector<Double_t>::iterator it = result.begin(); it<result.end(); it++ ){
224  Log() << kINFO << " cutValue[" <<dsi->GetClassInfo( n )->GetName() << "] = " << (*it) << ";"<< Endl;
225  n++;
226  }
227 
228  return result;
229 }
230 
231 ////////////////////////////////////////////////////////////////////////////////
232 /// Create performance graphs for this classifier a multiclass setting.
233 /// Requires that the method has already been evaluated (that a resultset
234 /// already exists.)
235 ///
236 /// Currently uses the new way of calculating ROC Curves. If anything looks
237 /// fishy, please contact the ROOT TMVA team.
238 ///
239 
241 {
242  DataSet *ds = GetDataSet();
244  const DataSetInfo *dsi = GetDataSetInfo();
245 
246  UInt_t numClasses = dsi->GetNClasses();
247 
248  std::vector<std::vector<Float_t>> *rawMvaRes = GetValueVector();
249 
250  //
251  // 1-vs-rest ROC curves
252  //
253  for (size_t iClass = 0; iClass < numClasses; ++iClass) {
254  // Format data
255  std::vector<Float_t> mvaRes;
256  std::vector<Bool_t> mvaResTypes;
257  std::vector<Float_t> mvaResWeights;
258 
259  // Vector transpose due to values being stored as
260  // [ [0, 1, 2], [0, 1, 2], ... ]
261  // in ResultsMulticlass::GetValueVector.
262  mvaRes.reserve(rawMvaRes->size());
263  for (auto item : *rawMvaRes) {
264  mvaRes.push_back(item[iClass]);
265  }
266 
267  auto eventCollection = ds->GetEventCollection();
268  mvaResTypes.reserve(eventCollection.size());
269  mvaResWeights.reserve(eventCollection.size());
270  for (auto ev : eventCollection) {
271  mvaResTypes.push_back(ev->GetClass() == iClass);
272  mvaResWeights.push_back(ev->GetWeight());
273  }
274 
275  // Get ROC Curve
276  ROCCurve *roc = new ROCCurve(mvaRes, mvaResTypes, mvaResWeights);
277  TGraph *rocGraph = new TGraph(*(roc->GetROCCurve()));
278  delete roc;
279 
280  // Style ROC Curve
281  TString className = dsi->GetClassInfo(iClass)->GetName();
282  TString name = Form("%s_rejBvsS_%s", prefix.Data(), className.Data());
283  TString title = Form("%s_%s", prefix.Data(), className.Data());
284  rocGraph->SetName(name);
285  rocGraph->SetTitle(title);
286 
287  // Store ROC Curve
288  Store(rocGraph);
289  }
290 
291  //
292  // 1-vs-1 ROC curves
293  //
294  for (size_t iClass = 0; iClass < numClasses; ++iClass) {
295  for (size_t jClass = 0; jClass < numClasses; ++jClass) {
296  if (iClass == jClass) {
297  continue;
298  }
299 
300  auto eventCollection = ds->GetEventCollection();
301 
302  // Format data
303  std::vector<Float_t> mvaRes;
304  std::vector<Bool_t> mvaResTypes;
305  std::vector<Float_t> mvaResWeights;
306 
307  mvaRes.reserve(rawMvaRes->size());
308  mvaResTypes.reserve(eventCollection.size());
309  mvaResWeights.reserve(eventCollection.size());
310 
311  for (size_t iEvent = 0; iEvent < eventCollection.size(); ++iEvent) {
312  Event *ev = eventCollection[iEvent];
313 
314  if (ev->GetClass() == iClass or ev->GetClass() == jClass) {
315  Float_t output_value = (*rawMvaRes)[iEvent][iClass];
316  mvaRes.push_back(output_value);
317  mvaResTypes.push_back(ev->GetClass() == iClass);
318  mvaResWeights.push_back(ev->GetWeight());
319  }
320  }
321 
322  // Get ROC Curve
323  ROCCurve *roc = new ROCCurve(mvaRes, mvaResTypes, mvaResWeights);
324  TGraph *rocGraph = new TGraph(*(roc->GetROCCurve()));
325  delete roc;
326 
327  // Style ROC Curve
328  TString iClassName = dsi->GetClassInfo(iClass)->GetName();
329  TString jClassName = dsi->GetClassInfo(jClass)->GetName();
330  TString name = Form("%s_1v1rejBvsS_%s_vs_%s", prefix.Data(), iClassName.Data(), jClassName.Data());
331  TString title = Form("%s_%s_vs_%s", prefix.Data(), iClassName.Data(), jClassName.Data());
332  rocGraph->SetName(name);
333  rocGraph->SetTitle(title);
334 
335  // Store ROC Curve
336  Store(rocGraph);
337  }
338  }
339 }
340 
341 ////////////////////////////////////////////////////////////////////////////////
342 /// this function fills the mva response histos for multiclass classification
343 
345 {
346  Log() << kINFO << "Creating multiclass response histograms..." << Endl;
347 
348  DataSet* ds = GetDataSet();
349  ds->SetCurrentType( GetTreeType() );
350  const DataSetInfo* dsi = GetDataSetInfo();
351 
352  std::vector<std::vector<TH1F*> > histos;
353  Float_t xmin = 0.-0.0002;
354  Float_t xmax = 1.+0.0002;
355  for (UInt_t iCls = 0; iCls < dsi->GetNClasses(); iCls++) {
356  histos.push_back(std::vector<TH1F*>(0));
357  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
358  TString name(Form("%s_%s_prob_for_%s",prefix.Data(),
359  dsi->GetClassInfo( jCls )->GetName(),
360  dsi->GetClassInfo( iCls )->GetName()));
361  histos.at(iCls).push_back(new TH1F(name,name,nbins,xmin,xmax));
362  }
363  }
364 
365  for (Int_t ievt=0; ievt<ds->GetNEvents(); ievt++) {
366  const Event* ev = ds->GetEvent(ievt);
367  Int_t cls = ev->GetClass();
368  Float_t w = ev->GetWeight();
369  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
370  histos.at(cls).at(jCls)->Fill(fMultiClassValues[ievt][jCls],w);
371  }
372  }
373  for (UInt_t iCls = 0; iCls < dsi->GetNClasses(); iCls++) {
374  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
375  gTools().NormHist( histos.at(iCls).at(jCls) );
376  Store(histos.at(iCls).at(jCls));
377  }
378  }
379 
380  /*
381  //fill fine binned histos for testing
382  if(prefix.Contains("Test")){
383  std::vector<std::vector<TH1F*> > histos_highbin;
384  for (UInt_t iCls = 0; iCls < dsi->GetNClasses(); iCls++) {
385  histos_highbin.push_back(std::vector<TH1F*>(0));
386  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
387  TString name(Form("%s_%s_prob_for_%s_HIGHBIN",prefix.Data(),
388  dsi->GetClassInfo( jCls )->GetName().Data(),
389  dsi->GetClassInfo( iCls )->GetName().Data()));
390  histos_highbin.at(iCls).push_back(new TH1F(name,name,nbins_high,xmin,xmax));
391  }
392  }
393 
394  for (Int_t ievt=0; ievt<ds->GetNEvents(); ievt++) {
395  const Event* ev = ds->GetEvent(ievt);
396  Int_t cls = ev->GetClass();
397  Float_t w = ev->GetWeight();
398  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
399  histos_highbin.at(cls).at(jCls)->Fill(fMultiClassValues[ievt][jCls],w);
400  }
401  }
402  for (UInt_t iCls = 0; iCls < dsi->GetNClasses(); iCls++) {
403  for (UInt_t jCls = 0; jCls < dsi->GetNClasses(); jCls++) {
404  gTools().NormHist( histos_highbin.at(iCls).at(jCls) );
405  Store(histos_highbin.at(iCls).at(jCls));
406  }
407  }
408  }
409  */
410 }
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:47
std::vector< std::vector< Float_t > > * GetValueVector()
float xmin
Definition: THbookFile.cxx:93
Double_t GetEffSForEffB(Double_t effB, const UInt_t num_points=41)
Calculate the signal efficiency (sensitivity) for a given background efficiency (sensitivity).
Definition: ROCCurve.cxx:220
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
float Float_t
Definition: RtypesCore.h:53
std::vector< Float_t > fAchievablePur
Double_t Run(std::vector< Double_t > &pars)
Execute fitting.
virtual void SetName(const char *name)
Set the name of the TNamed.
Definition: TNamed.cxx:140
THist< 1, float, THistStatContent, THistStatUncertainty > TH1F
Definition: THist.hxx:285
const DataSetInfo * GetDataSetInfo() const
Definition: Results.h:71
DataSet * GetDataSet() const
Definition: Results.h:72
Basic string class.
Definition: TString.h:125
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
virtual void SetTitle(const char *title="")
Set graph title.
Definition: TGraph.cxx:2208
UInt_t GetNClasses() const
Definition: DataSetInfo.h:136
TMatrixD GetConfusionMatrix(Double_t effB)
Returns a confusion matrix where each class is pitted against each other.
std::vector< std::vector< Double_t > > fBestCuts
const std::vector< Event * > & GetEventCollection(Types::ETreeType type=Types::kMaxTreeType) const
Definition: DataSet.h:225
STL namespace.
static constexpr double mg
UInt_t GetClass() const
Definition: Event.h:81
Double_t NormHist(TH1 *theHist, Double_t norm=1.0)
normalises histogram
Definition: Tools.cxx:394
Class that contains all the data information.
Definition: DataSetInfo.h:60
MsgLogger & Log() const
message logger
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:382
std::vector< Float_t > fEventWeights
ResultsMulticlass(const DataSetInfo *dsi, TString resultsName)
constructor
std::vector< Float_t > fClassSumWeights
Class that contains all the data information.
Definition: DataSet.h:69
ClassInfo * GetClassInfo(Int_t clNum) const
std::vector< std::vector< Float_t > > fMultiClassValues
void SetValue(std::vector< Float_t > &value, Int_t ievt)
The TMVA::Interval Class.
Definition: Interval.h:61
unsigned int UInt_t
Definition: RtypesCore.h:42
char * Form(const char *fmt,...)
Double_t EstimatorFunction(std::vector< Double_t > &)
void CreateMulticlassPerformanceHistos(TString prefix)
Create performance graphs for this classifier a multiclass setting.
float xmax
Definition: THbookFile.cxx:93
Tools & gTools()
void CreateMulticlassHistos(TString prefix, Int_t nbins, Int_t nbins_high)
this function fills the mva response histos for multiclass classification
std::vector< Float_t > fAchievableEff
double Double_t
Definition: RtypesCore.h:55
std::vector< Double_t > GetBestMultiClassCuts(UInt_t targetClass)
calculate the best working point (optimal cut values) for the multiclass classifier ...
void SetCurrentType(Types::ETreeType type) const
Definition: DataSet.h:100
ostringstream derivative to redirect and format output
Definition: MsgLogger.h:59
TGraph * GetROCCurve(const UInt_t points=100)
Returns a new TGraph containing the ROC curve.
Definition: ROCCurve.cxx:277
Class that is the base-class for a vector of result.
Definition: Results.h:57
A Graph is a graphics object made of two arrays X and Y with npoints each.
Definition: TGraph.h:41
Types::ETreeType GetTreeType() const
Definition: Results.h:70
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Definition: DataSet.h:215
std::vector< UInt_t > fEventClasses
void Store(TObject *obj, const char *alias=0)
Definition: Results.cxx:86
Interface for a fitter 'target'.
Definition: IFitterTarget.h:44
Fitter using a Genetic Algorithm.
Definition: GeneticFitter.h:43
const Int_t n
Definition: legend1.C:16
const Event * GetEvent() const
Definition: DataSet.cxx:202
char name[80]
Definition: TGX11.cxx:109
const char * Data() const
Definition: TString.h:345