Logo ROOT   6.12/07
Reference Guide
CCPruner.h
Go to the documentation of this file.
1 #ifndef ROOT_TMVA_CCPruner
2 #define ROOT_TMVA_CCPruner
3 /**********************************************************************************
4  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
5  * Package: TMVA *
6  * Class : CCPruner *
7  * Web : http://tmva.sourceforge.net *
8  * *
9  * Description: Cost Complexity Pruning *
10  *
11  * Author: Doug Schouten (dschoute@sfu.ca)
12  *
13  * *
14  * Copyright (c) 2007: *
15  * CERN, Switzerland *
16  * MPI-K Heidelberg, Germany *
17  * U. of Texas at Austin, USA *
18  * *
19  * Redistribution and use in source and binary forms, with or without *
20  * modification, are permitted according to the terms listed in LICENSE *
21  * (http://tmva.sourceforge.net/LICENSE) *
22  **********************************************************************************/
23 
24 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
25 // CCPruner - a helper class to prune a decision tree using the Cost Complexity method //
26 // (see Classification and Regression Trees by Leo Breiman et al) //
27 // //
28 // Some definitions: //
29 // //
30 // T_max - the initial, usually highly overtrained tree, that is to be pruned back //
31 // R(T) - quality index (Gini, misclassification rate, or other) of a tree T //
32 // ~T - set of terminal nodes in T //
33 // T' - the pruned subtree of T_max that has the best quality index R(T') //
34 // alpha - the prune strength parameter in Cost Complexity pruning (R_alpha(T) = R(T) + alpha// |~T|) //
35 // //
36 // There are two running modes in CCPruner: (i) one may select a prune strength and prune back //
37 // the tree T_max until the criterion //
38 // R(T) - R(t) //
39 // alpha < ---------- //
40 // |~T_t| - 1 //
41 // //
42 // is true for all nodes t in T, or (ii) the algorithm finds the sequence of critical points //
43 // alpha_k < alpha_k+1 ... < alpha_K such that T_K = root(T_max) and then selects the optimally-pruned //
44 // subtree, defined to be the subtree with the best quality index for the validation sample. //
45 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
46 
47 
48 #include "TMVA/DecisionTree.h"
49 
50 /* #ifndef ROOT_TMVA_DecisionTreeNode */
51 /* #include "TMVA/DecisionTreeNode.h" */
52 /* #endif */
53 
54 #include "TMVA/Event.h"
55 
56 namespace TMVA {
57  class DataSet;
58  class DecisionTreeNode;
59  class SeparationBase;
60 
61  class CCPruner {
62  public:
63  typedef std::vector<Event*> EventList;
64 
65  CCPruner( DecisionTree* t_max,
66  const EventList* validationSample,
67  SeparationBase* qualityIndex = NULL );
68 
69  CCPruner( DecisionTree* t_max,
70  const DataSet* validationSample,
71  SeparationBase* qualityIndex = NULL );
72 
73  ~CCPruner( );
74 
75  // set the pruning strength parameter alpha (if alpha < 0, the optimal alpha is calculated)
76  void SetPruneStrength( Float_t alpha = -1.0 );
77 
78  void Optimize( );
79 
80  // return the list of pruning locations to define the optimal subtree T' of T_max
81  std::vector<TMVA::DecisionTreeNode*> GetOptimalPruneSequence( ) const;
82 
83  // return the quality index from the validation sample for the optimal subtree T'
84  inline Float_t GetOptimalQualityIndex( ) const { return (fOptimalK >= 0 && fQualityIndexList.size() > 0 ?
85  fQualityIndexList[fOptimalK] : -1.0); }
86 
87  // return the prune strength (=alpha) corresponding to the prune sequence
88  inline Float_t GetOptimalPruneStrength( ) const { return (fOptimalK >= 0 && fPruneStrengthList.size() > 0 ?
89  fPruneStrengthList[fOptimalK] : -1.0); }
90 
91  private:
92  Float_t fAlpha; //! regularization parameter in CC pruning
93  const EventList* fValidationSample; //! the event sample to select the optimally-pruned tree
94  const DataSet* fValidationDataSet; //! the event sample to select the optimally-pruned tree
95  SeparationBase* fQualityIndex; //! the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
96  Bool_t fOwnQIndex; //! flag indicates if fQualityIndex is owned by this
97 
98  DecisionTree* fTree; //! (pruned) decision tree
99 
100  std::vector<TMVA::DecisionTreeNode*> fPruneSequence; //! map of weakest links (i.e., branches to prune) -> pruning index
101  std::vector<Float_t> fPruneStrengthList; //! map of alpha -> pruning index
102  std::vector<Float_t> fQualityIndexList; //! map of R(T) -> pruning index
103 
104  Int_t fOptimalK; //! index of the optimal tree in the pruned tree sequence
105  Bool_t fDebug; //! debug flag
106  };
107 }
108 
110  fAlpha = (alpha > 0 ? alpha : 0.0);
111 }
112 
113 
114 #endif
115 
116 
void Optimize()
determine the pruning sequence
Definition: CCPruner.cxx:124
std::vector< Float_t > fQualityIndexList
map of alpha -> pruning index
Definition: CCPruner.h:102
Float_t fAlpha
Definition: CCPruner.h:92
float Float_t
Definition: RtypesCore.h:53
const DataSet * fValidationDataSet
the event sample to select the optimally-pruned tree
Definition: CCPruner.h:94
Float_t GetOptimalPruneStrength() const
Definition: CCPruner.h:88
Float_t GetOptimalQualityIndex() const
Definition: CCPruner.h:84
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
Int_t fOptimalK
map of R(T) -> pruning index
Definition: CCPruner.h:104
DecisionTree * fTree
flag indicates if fQualityIndex is owned by this
Definition: CCPruner.h:98
Bool_t fDebug
index of the optimal tree in the pruned tree sequence
Definition: CCPruner.h:105
SeparationBase * fQualityIndex
the event sample to select the optimally-pruned tree
Definition: CCPruner.h:95
Class that contains all the data information.
Definition: DataSet.h:69
void SetPruneStrength(Float_t alpha=-1.0)
Definition: CCPruner.h:109
Bool_t fOwnQIndex
the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
Definition: CCPruner.h:96
std::vector< TMVA::DecisionTreeNode * > fPruneSequence
(pruned) decision tree
Definition: CCPruner.h:100
std::vector< Event * > EventList
Definition: CCPruner.h:63
Implementation of a Decision Tree.
Definition: DecisionTree.h:59
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
const EventList * fValidationSample
regularization parameter in CC pruning
Definition: CCPruner.h:93
std::vector< TMVA::DecisionTreeNode * > GetOptimalPruneSequence() const
return the prune strength (=alpha) corresponding to the prune sequence
Definition: CCPruner.cxx:240
Abstract ClassifierFactory template that handles arbitrary types.
CCPruner(DecisionTree *t_max, const EventList *validationSample, SeparationBase *qualityIndex=NULL)
constructor
Definition: CCPruner.cxx:69
std::vector< Float_t > fPruneStrengthList
map of weakest links (i.e., branches to prune) -> pruning index
Definition: CCPruner.h:101
A helper class to prune a decision tree using the Cost Complexity method (see Classification and Regr...
Definition: CCPruner.h:61