Logo ROOT   6.16/01
Reference Guide
CCPruner.h
Go to the documentation of this file.
1#ifndef ROOT_TMVA_CCPruner
2#define ROOT_TMVA_CCPruner
3/**********************************************************************************
4 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
5 * Package: TMVA *
6 * Class : CCPruner *
7 * Web : http://tmva.sourceforge.net *
8 * *
9 * Description: Cost Complexity Pruning *
10 *
11 * Author: Doug Schouten (dschoute@sfu.ca)
12 *
13 * *
14 * Copyright (c) 2007: *
15 * CERN, Switzerland *
16 * MPI-K Heidelberg, Germany *
17 * U. of Texas at Austin, USA *
18 * *
19 * Redistribution and use in source and binary forms, with or without *
20 * modification, are permitted according to the terms listed in LICENSE *
21 * (http://tmva.sourceforge.net/LICENSE) *
22 **********************************************************************************/
23
24////////////////////////////////////////////////////////////////////////////////////////////////////////////
25// CCPruner - a helper class to prune a decision tree using the Cost Complexity method //
26// (see Classification and Regression Trees by Leo Breiman et al) //
27// //
28// Some definitions: //
29// //
30// T_max - the initial, usually highly overtrained tree, that is to be pruned back //
31// R(T) - quality index (Gini, misclassification rate, or other) of a tree T //
32// ~T - set of terminal nodes in T //
33// T' - the pruned subtree of T_max that has the best quality index R(T') //
34// alpha - the prune strength parameter in Cost Complexity pruning (R_alpha(T) = R(T) + alpha// |~T|) //
35// //
36// There are two running modes in CCPruner: (i) one may select a prune strength and prune back //
37// the tree T_max until the criterion //
38// R(T) - R(t) //
39// alpha < ---------- //
40// |~T_t| - 1 //
41// //
42// is true for all nodes t in T, or (ii) the algorithm finds the sequence of critical points //
43// alpha_k < alpha_k+1 ... < alpha_K such that T_K = root(T_max) and then selects the optimally-pruned //
44// subtree, defined to be the subtree with the best quality index for the validation sample. //
45////////////////////////////////////////////////////////////////////////////////////////////////////////////
46
47
48#include "TMVA/DecisionTree.h"
49
50/* #ifndef ROOT_TMVA_DecisionTreeNode */
51/* #include "TMVA/DecisionTreeNode.h" */
52/* #endif */
53
54#include "TMVA/Event.h"
55
56namespace TMVA {
57 class DataSet;
58 class DecisionTreeNode;
59 class SeparationBase;
60
61 class CCPruner {
62 public:
63 typedef std::vector<Event*> EventList;
64
65 CCPruner( DecisionTree* t_max,
66 const EventList* validationSample,
67 SeparationBase* qualityIndex = NULL );
68
69 CCPruner( DecisionTree* t_max,
70 const DataSet* validationSample,
71 SeparationBase* qualityIndex = NULL );
72
73 ~CCPruner( );
74
75 // set the pruning strength parameter alpha (if alpha < 0, the optimal alpha is calculated)
76 void SetPruneStrength( Float_t alpha = -1.0 );
77
78 void Optimize( );
79
80 // return the list of pruning locations to define the optimal subtree T' of T_max
81 std::vector<TMVA::DecisionTreeNode*> GetOptimalPruneSequence( ) const;
82
83 // return the quality index from the validation sample for the optimal subtree T'
84 inline Float_t GetOptimalQualityIndex( ) const { return (fOptimalK >= 0 && fQualityIndexList.size() > 0 ?
86
87 // return the prune strength (=alpha) corresponding to the prune sequence
88 inline Float_t GetOptimalPruneStrength( ) const { return (fOptimalK >= 0 && fPruneStrengthList.size() > 0 ?
90
91 private:
92 Float_t fAlpha; //! regularization parameter in CC pruning
93 const EventList* fValidationSample; //! the event sample to select the optimally-pruned tree
94 const DataSet* fValidationDataSet; //! the event sample to select the optimally-pruned tree
95 SeparationBase* fQualityIndex; //! the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
96 Bool_t fOwnQIndex; //! flag indicates if fQualityIndex is owned by this
97
98 DecisionTree* fTree; //! (pruned) decision tree
99
100 std::vector<TMVA::DecisionTreeNode*> fPruneSequence; //! map of weakest links (i.e., branches to prune) -> pruning index
101 std::vector<Float_t> fPruneStrengthList; //! map of alpha -> pruning index
102 std::vector<Float_t> fQualityIndexList; //! map of R(T) -> pruning index
103
104 Int_t fOptimalK; //! index of the optimal tree in the pruned tree sequence
105 Bool_t fDebug; //! debug flag
106 };
107}
108
110 fAlpha = (alpha > 0 ? alpha : 0.0);
111}
112
113
114#endif
115
116
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
float Float_t
Definition: RtypesCore.h:53
A helper class to prune a decision tree using the Cost Complexity method (see Classification and Regr...
Definition: CCPruner.h:61
Float_t GetOptimalQualityIndex() const
Definition: CCPruner.h:84
void SetPruneStrength(Float_t alpha=-1.0)
Definition: CCPruner.h:109
Float_t fAlpha
Definition: CCPruner.h:92
CCPruner(DecisionTree *t_max, const EventList *validationSample, SeparationBase *qualityIndex=NULL)
constructor
Definition: CCPruner.cxx:69
std::vector< Float_t > fQualityIndexList
map of alpha -> pruning index
Definition: CCPruner.h:102
void Optimize()
determine the pruning sequence
Definition: CCPruner.cxx:124
Bool_t fDebug
index of the optimal tree in the pruned tree sequence
Definition: CCPruner.h:105
Bool_t fOwnQIndex
the quality index used to calculate R(t), R(T) = sum[t in ~T]{ R(t) }
Definition: CCPruner.h:96
std::vector< Event * > EventList
Definition: CCPruner.h:63
std::vector< TMVA::DecisionTreeNode * > fPruneSequence
(pruned) decision tree
Definition: CCPruner.h:100
const EventList * fValidationSample
regularization parameter in CC pruning
Definition: CCPruner.h:93
std::vector< TMVA::DecisionTreeNode * > GetOptimalPruneSequence() const
return the prune strength (=alpha) corresponding to the prune sequence
Definition: CCPruner.cxx:240
Int_t fOptimalK
map of R(T) -> pruning index
Definition: CCPruner.h:104
const DataSet * fValidationDataSet
the event sample to select the optimally-pruned tree
Definition: CCPruner.h:94
std::vector< Float_t > fPruneStrengthList
map of weakest links (i.e., branches to prune) -> pruning index
Definition: CCPruner.h:101
SeparationBase * fQualityIndex
the event sample to select the optimally-pruned tree
Definition: CCPruner.h:95
DecisionTree * fTree
flag indicates if fQualityIndex is owned by this
Definition: CCPruner.h:98
Float_t GetOptimalPruneStrength() const
Definition: CCPruner.h:88
Class that contains all the data information.
Definition: DataSet.h:69
Implementation of a Decision Tree.
Definition: DecisionTree.h:64
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
Abstract ClassifierFactory template that handles arbitrary types.