ROOT 6.07/09 Reference Guide: DecisionTree.cxx
// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TMVA::DecisionTree                                                    *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Implementation of a Decision Tree                                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker   <Andreas.Hocker@cern.ch>  - CERN, Switzerland           *
 *      Helge Voss        <Helge.Voss@cern.ch>      - MPI-K Heidelberg, Germany   *
 *      Kai Voss          <Kai.Voss@cern.ch>        - U. of Victoria, Canada      *
 *      Eckhard v. Toerne <evt@uni-bonn.de>         - U of Bonn, Germany          *
 *      Jan Therhaag      <Jan.Therhaag@cern.ch>    - U of Bonn, Germany          *
 *                                                                                *
 * Copyright (c) 2005-2011:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/license.txt)                                      *
 *                                                                                *
 **********************************************************************************/

//_______________________________________________________________________
//
// Implementation of a Decision Tree
//
// In a decision tree successive decision nodes are used to categorize the
// events out of the sample as either signal or background. Each node
// uses only a single discriminating variable to decide if the event is
// signal-like ("goes right") or background-like ("goes left"). This
// forms a tree-like structure with "baskets" at the end (leaf nodes),
// and an event is classified as either signal or background according to
// whether the basket where it ends up has been classified as signal or
// background during the training. Training of a decision tree is the
// process of defining the "cut criteria" for each node. The training
// starts with the root node. Here one takes the full training event
// sample and selects the variable and corresponding cut value that gives
// the best separation between signal and background at this stage. Using
// this cut criterion, the sample is then divided into two subsamples, a
// signal-like (right) and a background-like (left) sample. Two new nodes
// are then created for each of the two sub-samples and they are
// constructed using the same mechanism as described for the root
// node. The division is stopped once a certain node has reached either a
// minimum number of events, or a minimum or maximum signal purity. These
// leaf nodes are then called "signal" or "background" if they contain
// more signal or background events, respectively, from the training sample.
//_______________________________________________________________________
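//
// A minimal usage sketch (illustrative only, not part of the original file;
// it assumes an already prepared TMVA::DataSetInfo "dataSetInfo" and a vector
// of training events "trainingEvents", and uses the constructor and methods
// defined below, whose exact defaults may differ between ROOT versions):
//
//    TMVA::GiniIndex gini;                                  // separation criterion
//    TMVA::DecisionTree tree( &gini, 5. /* MinNodeSize in % */,
//                             20 /* NCuts */, &dataSetInfo, 0 /* signal class */ );
//    tree.BuildTree( trainingEvents );                      // grow the tree
//    Double_t response = tree.CheckEvent( event );          // classify one Event*
//_______________________________________________________________________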

#include <iostream>
#include <algorithm>
#include <vector>
#include <limits>
#include <fstream>
#include <cassert>

#include "TRandom3.h"
#include "TMath.h"
#include "TMatrix.h"

#include "TMVA/MsgLogger.h"
#include "TMVA/DecisionTree.h"
#include "TMVA/DecisionTreeNode.h"
#include "TMVA/BinarySearchTree.h"

#include "TMVA/Tools.h"

#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/Event.h"
#include "TMVA/BDTEventWrapper.h"

#include "TMVA/IPruneTool.h"
#include "TMVA/CostComplexityPruneTool.h"
#include "TMVA/ExpectedErrorPruneTool.h"

const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds

using std::vector;

bool almost_equal_float(float x, float y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<float>::min();
}

bool almost_equal_double(double x, double y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<double>::min();
}
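
// Example (illustrative, not part of the original file): with doubles,
// 0.1 + 0.2 == 0.3 is false because of binary rounding (the two values differ
// by one unit in the last place), whereas the scaled-epsilon comparison above
// accepts them as equal:
//
//    bool naive = (0.1 + 0.2 == 0.3);                  // false
//    bool close = almost_equal_double(0.1 + 0.2, 0.3); // true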

////////////////////////////////////////////////////////////////////////////////
/// default constructor using the GiniIndex as separation criterion,
/// no restrictions on minimum number of events in a leaf node or the
/// separation gain in the node splitting

TMVA::DecisionTree::DecisionTree():
   BinaryTree(),
   fNvars          (0),
   fNCuts          (-1),
   fUseFisherCuts  (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType        (NULL),
   fRegType        (NULL),
   fMinSize        (0),
   fMinNodeSize    (1),
   fMinSepGain     (0),
   fUseSearchTree  (kFALSE),
   fPruneStrength  (0),
   fPruneMethod    (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(0.5),
   fRandomisedTree (kFALSE),
   fUseNvars       (0),
   fUsePoissonNvars(kFALSE),
   fMyTrandom      (NULL),
   fMaxDepth       (999999),
   fSigClass       (0),
   fTreeID         (0),
   fAnalysisType   (Types::kClassification),
   fDataSetInfo    (NULL)
{
}

////////////////////////////////////////////////////////////////////////////////
/// constructor specifying the separation type, the min number of
/// events in a node that is still subjected to further splitting, the
/// number of bins in the grid used in applying the cut for the node
/// splitting.

TMVA::DecisionTree::DecisionTree( TMVA::SeparationBase *sepType, Float_t minSize, Int_t nCuts,
                                  TMVA::DataSetInfo* dataInfo, UInt_t cls,
                                  Bool_t randomisedTree, Int_t useNvars, Bool_t usePoissonNvars,
                                  UInt_t nMaxDepth, Int_t iSeed, Float_t purityLimit, Int_t treeID):
   BinaryTree(),
   fNvars          (0),
   fNCuts          (nCuts),
   fUseFisherCuts  (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType        (sepType),
   fRegType        (NULL),
   fMinSize        (0),
   fMinNodeSize    (minSize),
   fMinSepGain     (0),
   fUseSearchTree  (kFALSE),
   fPruneStrength  (0),
   fPruneMethod    (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(purityLimit),
   fRandomisedTree (randomisedTree),
   fUseNvars       (useNvars),
   fUsePoissonNvars(usePoissonNvars),
   fMyTrandom      (new TRandom3(iSeed)),
   fMaxDepth       (nMaxDepth),
   fSigClass       (cls),
   fTreeID         (treeID),
   fAnalysisType   (Types::kClassification),
   fDataSetInfo    (dataInfo)
{
   if (sepType == NULL) { // it is interpreted as a regression tree, where
      // currently the separation type (simple least square)
      // cannot be chosen freely)
      fAnalysisType = Types::kRegression;
      fRegType = new RegressionVariance();
      if ( nCuts <=0 ) {
         fNCuts = 200;
         Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
               << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
               << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
               << Endl;
      }
   }else{
      fAnalysisType = Types::kClassification;
   }
}

////////////////////////////////////////////////////////////////////////////////
/// copy constructor that creates a true copy, i.e. a completely independent tree
/// the node copy will recursively copy all the nodes

TMVA::DecisionTree::DecisionTree( const DecisionTree &d ):
   BinaryTree(),
   fNvars          (d.fNvars),
   fNCuts          (d.fNCuts),
   fUseFisherCuts  (d.fUseFisherCuts),
   fMinLinCorrForFisher (d.fMinLinCorrForFisher),
   fUseExclusiveVars (d.fUseExclusiveVars),
   fSepType        (d.fSepType),
   fRegType        (d.fRegType),
   fMinSize        (d.fMinSize),
   fMinNodeSize    (d.fMinNodeSize),
   fMinSepGain     (d.fMinSepGain),
   fUseSearchTree  (d.fUseSearchTree),
   fPruneStrength  (d.fPruneStrength),
   fPruneMethod    (d.fPruneMethod),
   fNodePurityLimit(d.fNodePurityLimit),
   fRandomisedTree (d.fRandomisedTree),
   fUseNvars       (d.fUseNvars),
   fUsePoissonNvars(d.fUsePoissonNvars),
   fMyTrandom      (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
   fMaxDepth       (d.fMaxDepth),
   fSigClass       (d.fSigClass),
   fTreeID         (d.fTreeID),
   fAnalysisType   (d.fAnalysisType),
   fDataSetInfo    (d.fDataSetInfo)
{
   this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
   this->SetParentTreeInNodes();
   fNNodes = d.fNNodes;
}

////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DecisionTree::~DecisionTree()
{
   // destruction of the tree nodes done in the "base class" BinaryTree

   if (fMyTrandom) delete fMyTrandom;
   if (fRegType) delete fRegType;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes, fill max depth reached in the
/// tree at the same time.

void TMVA::DecisionTree::SetParentTreeInNodes( Node *n )
{
   if (n == NULL) { //default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" <<Endl;
         return ;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   } else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetRightDaughter(n) );
      }
   }
   n->SetParentTree(this);
   if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
   return;
}

////////////////////////////////////////////////////////////////////////////////
/// re-create a new tree (decision tree or search tree) from XML

TMVA::DecisionTree* TMVA::DecisionTree::CreateFromXML( void* node, UInt_t tmva_Version_Code ) {
   std::string type("");
   gTools().ReadAttr(node,"type", type);
   DecisionTree* dt = new DecisionTree();

   dt->ReadXML( node, tmva_Version_Code );
   return dt;
}


////////////////////////////////////////////////////////////////////////////////
/// building the decision tree by recursively calling the splitting of
/// one (root-) node into two daughter nodes (returns the number of nodes)

UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
                                      TMVA::DecisionTreeNode *node)
{
   if (node==NULL) {
      //start with the root node
      node = new TMVA::DecisionTreeNode();
      fNNodes = 1;
      this->SetRoot(node);
      // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
      this->GetRoot()->SetPos('s');
      this->GetRoot()->SetDepth(0);
      this->GetRoot()->SetParentTree(this);
      fMinSize = fMinNodeSize/100. * eventSample.size();
      if (GetTreeID()==0){
         Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << "% is translated to an actual number of events = "<< fMinSize << " for the training sample size of " << eventSample.size() << Endl;
         Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
         Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
      }
   }

   UInt_t nevents = eventSample.size();

   if (nevents > 0 ) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;

   Double_t s=0, b=0;
   Double_t suw=0, buw=0;
   Double_t sub=0, bub=0; // unboosted!
   Double_t target=0, target2=0;
   Float_t *xmin = new Float_t[fNvars];
   Float_t *xmax = new Float_t[fNvars];
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      xmin[ivar]=xmax[ivar]=0;
   }
   for (UInt_t iev=0; iev<eventSample.size(); iev++) {
      const TMVA::Event* evt = eventSample[iev];
      const Double_t weight = evt->GetWeight();
      const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
      if (evt->GetClass() == fSigClass) {
         s += weight;
         suw += 1;
         sub += orgWeight;
      }
      else {
         b += weight;
         buw += 1;
         bub += orgWeight;
      }
      if ( DoRegression() ) {
         const Double_t tgt = evt->GetTarget(0);
         target +=weight*tgt;
         target2+=weight*tgt*tgt;
      }

      for (UInt_t ivar=0; ivar<fNvars; ivar++) {
         const Double_t val = evt->GetValue(ivar);
         if (iev==0) xmin[ivar]=xmax[ivar]=val;
         if (val < xmin[ivar]) xmin[ivar]=val;
         if (val > xmax[ivar]) xmax[ivar]=val;
      }
   }


   if (s+b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig="<<s<<" Nbkg="<<b<<" Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimum number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      double nBkg=0.;
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
   }

   node->SetNSigEvents(s);
   node->SetNBkgEvents(b);
   node->SetNSigEvents_unweighted(suw);
   node->SetNBkgEvents_unweighted(buw);
   node->SetNSigEvents_unboosted(sub);
   node->SetNBkgEvents_unboosted(bub);
   node->SetPurity();
   if (node == this->GetRoot()) {
      node->SetNEvents(s+b);
      node->SetNEvents_unweighted(suw+buw);
      node->SetNEvents_unboosted(sub+bub);
   }
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      node->SetSampleMin(ivar,xmin[ivar]);
      node->SetSampleMax(ivar,xmax[ivar]);
   }
   delete[] xmin;
   delete[] xmax;

   // I now demand the minimum number of events for both daughter nodes. Hence if the number
   // of events in the parent node is not at least two times as big, I don't even need to try
   // splitting

   // ask here for actual "events" independent of their weight.. OR the weighted events
   // to exceed the min requested number of events per daughter node
   // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !
   // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
   // std::cout << "------------------------------------------------------------------"<<std::endl;
   // std::cout << "------------------------------------------------------------------"<<std::endl;
   // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
   if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
      Double_t separationGain;
      if (fNCuts > 0){
         separationGain = this->TrainNodeFast(eventSample, node);
      } else {
         separationGain = this->TrainNodeFull(eventSample, node);
      }
      if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
         // if (separationGain < 0.00000001) { // we could not gain anything, e.g. all events are in one bin,
         // no cut can actually do anything to improve the node
         // hence, naturally, the current node is a leaf node
         if (DoRegression()) {
            node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
            node->SetResponse(target/(s+b));
            if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
               node->SetRMS(0);
            }else{
               node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
            }
         }
         else {
            node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));

            if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
            else node->SetNodeType(-1);
         }
         if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());

      } else {

         std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
         std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);

         Double_t nRight=0, nLeft=0;
         Double_t nRightUnBoosted=0, nLeftUnBoosted=0;

         for (UInt_t ie=0; ie< nevents ; ie++) {
            if (node->GoesRight(*eventSample[ie])) {
               rightSample.push_back(eventSample[ie]);
               nRight += eventSample[ie]->GetWeight();
               nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
            else {
               leftSample.push_back(eventSample[ie]);
               nLeft += eventSample[ie]->GetWeight();
               nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
         }

         // sanity check
         if (leftSample.empty() || rightSample.empty()) {

            Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
                  << "--- Hence new node == old node ... check" << Endl
                  << "--- left:" << leftSample.size()
                  << " right:" << rightSample.size() << Endl
                  << " while the separation is thought to be " << separationGain
                  << "\n when cutting on variable " << node->GetSelector()
                  << " at value " << node->GetCutValue()
                  << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;


         }

         // continue building daughter nodes for the left and the right eventsample
         TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
         fNNodes++;
         rightNode->SetNEvents(nRight);
         rightNode->SetNEvents_unboosted(nRightUnBoosted);
         rightNode->SetNEvents_unweighted(rightSample.size());

         TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');

         fNNodes++;
         leftNode->SetNEvents(nLeft);
         leftNode->SetNEvents_unboosted(nLeftUnBoosted);
         leftNode->SetNEvents_unweighted(leftSample.size());

         node->SetNodeType(0);
         node->SetLeft(leftNode);
         node->SetRight(rightNode);

         this->BuildTree(rightSample, rightNode);
         this->BuildTree(leftSample, leftNode );

      }
   }
   else{ // it is a leaf node
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
         node->SetResponse(target/(s+b));
         if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
            node->SetRMS(0);
         }else{
            node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
         if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
         else node->SetNodeType(-1);
         // loop through the event sample ending up in this node and check for events with negative weight
         // those "cannot" be boosted normally. Hence, if one of those is
         // misclassified, find randomly as many events with positive weights in this
         // node as needed to get the same absolute number of weight, and mark them as
         // "not to be boosted" in order to make up for not boosting the negative weight event
      }


      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   }

   // if (IsRootNode) this->CleanTree();
   return fNNodes;
}

////////////////////////////////////////////////////////////////////////////////

void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )

{
   // fill the existing decision tree structure by filling events
   // in from the top node and see where they happen to end up
   for (UInt_t i=0; i<eventSample.size(); i++) {
      this->FillEvent(*(eventSample[i]),NULL);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by filling events
/// in from the top node and see where they happen to end up

void TMVA::DecisionTree::FillEvent( const TMVA::Event & event,
                                    TMVA::DecisionTreeNode *node )
{
   if (node == NULL) { // that's the start, take the Root node
      node = this->GetRoot();
   }

   node->IncrementNEvents( event.GetWeight() );
   node->IncrementNEvents_unweighted( );

   if (event.GetClass() == fSigClass) {
      node->IncrementNSigEvents( event.GetWeight() );
      node->IncrementNSigEvents_unweighted( );
   }
   else {
      node->IncrementNBkgEvents( event.GetWeight() );
      node->IncrementNBkgEvents_unweighted( );
   }
   node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
                                                         node->GetNBkgEvents()));

   if (node->GetNodeType() == 0) { //intermediate node --> go down
      if (node->GoesRight(event))
         this->FillEvent(event,dynamic_cast<TMVA::DecisionTreeNode*>(node->GetRight())) ;
      else
         this->FillEvent(event,dynamic_cast<TMVA::DecisionTreeNode*>(node->GetLeft())) ;
   }


}

////////////////////////////////////////////////////////////////////////////////
/// clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree

void TMVA::DecisionTree::ClearTree()
{
   if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();

}

////////////////////////////////////////////////////////////////////////////////
/// remove those last splits that result in two leaf nodes that
/// are both of the same type (i.e. both signal or both background)
/// this of course is only a reasonable thing to do when you use
/// "YesOrNo" leafs, while it might lose something if you use the
/// purity information in the nodes.
/// --> hence I don't call it automatically in the tree building

UInt_t TMVA::DecisionTree::CleanTree( DecisionTreeNode *node )
{
   if (node==NULL) {
      node = this->GetRoot();
   }

   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   if (node->GetNodeType() == 0) {
      this->CleanTree(l);
      this->CleanTree(r);
      if (l->GetNodeType() * r->GetNodeType() > 0) {

         this->PruneNode(node);
      }
   }
   // update the number of nodes after the cleaning
   return this->CountNodes();

}

////////////////////////////////////////////////////////////////////////////////
/// prune (get rid of internal nodes) the Decision tree to avoid overtraining
/// several different pruning methods can be applied as selected by the
/// variable "fPruneMethod".

Double_t TMVA::DecisionTree::PruneTree( const EventConstList* validationSample )
{
   // std::ofstream logfile("dt_pruning.log");



   IPruneTool* tool(NULL);
   PruningInfo* info(NULL);

   if( fPruneMethod == kNoPruning ) return 0.0;

   if      (fPruneMethod == kExpectedErrorPruning)
      // tool = new ExpectedErrorPruneTool(logfile);
      tool = new ExpectedErrorPruneTool();
   else if (fPruneMethod == kCostComplexityPruning)
      {
         tool = new CostComplexityPruneTool();
      }
   else {
      Log() << kFATAL << "Selected pruning method not yet implemented "
            << Endl;
   }

   if(!tool) return 0.0;

   tool->SetPruneStrength(GetPruneStrength());
   if(tool->IsAutomatic()) {
      if(validationSample == NULL){
         Log() << kFATAL << "Cannot automate the pruning algorithm without an "
               << "independent validation sample!" << Endl;
      }else if(validationSample->size() == 0) {
         Log() << kFATAL << "Cannot automate the pruning algorithm with "
               << "independent validation sample of ZERO events!" << Endl;
      }
   }

   info = tool->CalculatePruningInfo(this,validationSample);
   Double_t pruneStrength=0;
   if(!info) {
      Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
            << Endl;
   } else {
      pruneStrength = info->PruneStrength;

      // Log() << kDEBUG << "Optimal prune strength (alpha): " << pruneStrength
      //       << " has quality index " << info->QualityIndex << Endl;


      for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {

         PruneNode(info->PruneSequence[i]);
      }
      // update the number of nodes after the pruning
      this->CountNodes();
   }

   delete tool;
   delete info;

   return pruneStrength;
};
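
// Illustrative call sequence (not part of the original file; it assumes a
// fully grown tree and a separate validation sample, and that a non-positive
// prune strength makes the pruning tool determine the strength automatically):
//
//    tree.SetPruneMethod( TMVA::DecisionTree::kCostComplexityPruning );
//    tree.SetPruneStrength( -1. );                       // automatic strength
//    Double_t alpha = tree.PruneTree( &validationEvents );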


////////////////////////////////////////////////////////////////////////////////
/// run the validation sample through the (pruned) tree and fill in the nodes
/// the variables NSValidation and NBValidation (i.e. how many of the signal
/// and background events from the validation sample end up there). This is
/// then later used when asking for the "tree quality" ..

void TMVA::DecisionTree::ApplyValidationSample( const EventConstList* validationSample ) const
{
   for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
      CheckEventWithPrunedTree((*validationSample)[ievt]);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// return the misclassification rate of a pruned tree
/// a "pruned tree" may have set the variable "IsTerminal" to "arbitrary" at
/// any node, hence this tree quality testing will stop there, hence test
/// the pruned tree (while the full tree is still in place for normal/later use)

Double_t TMVA::DecisionTree::TestPrunedTreeQuality( const DecisionTreeNode* n, Int_t mode ) const
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" <<Endl;
         return 0;
      }
   }

   if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
      return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
              TestPrunedTreeQuality( n->GetRight(), mode ));
   }
   else { // terminal leaf (in a pruned subtree of T_max at least)
      if (DoRegression()) {
         Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
         return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
      }
      else {
         if (mode == 0) {
            if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
               return n->GetNBValidation();
            else
               return n->GetNSValidation();
         }
         else if ( mode == 1 ) {
            // calculate the weighted error using the pruning validation sample
            return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
         }
         else {
            throw std::string("Unknown ValidationQualityMode");
         }
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// pass a single validation event through a pruned decision tree
/// on the way down the tree, fill in all the "intermediate" information
/// that would normally be there from training.

void TMVA::DecisionTree::CheckEventWithPrunedTree( const Event* e ) const
{
   DecisionTreeNode* current = this->GetRoot();
   if (current == NULL) {
      Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" <<Endl;
   }

   while(current != NULL) {
      if(e->GetClass() == fSigClass)
         current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
      else
         current->SetNBValidation(current->GetNBValidation() + e->GetWeight());

      if (e->GetNTargets() > 0) {
         current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
         current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
      }

      if (current->GetRight() == NULL || current->GetLeft() == NULL) {
         current = NULL;
      }
      else {
         if (current->GoesRight(*e))
            current = (TMVA::DecisionTreeNode*)current->GetRight();
         else
            current = (TMVA::DecisionTreeNode*)current->GetLeft();
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// calculate the normalization factor for a pruning validation sample

Double_t TMVA::DecisionTree::GetSumWeights( const EventConstList* validationSample ) const
{
   Double_t sumWeights = 0.0;
   for( EventConstList::const_iterator it = validationSample->begin();
        it != validationSample->end(); ++it ) {
      sumWeights += (*it)->GetWeight();
   }
   return sumWeights;
}



////////////////////////////////////////////////////////////////////////////////
/// return the number of terminal nodes in the sub-tree below Node n

UInt_t TMVA::DecisionTree::CountLeafNodes( TMVA::Node *n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" <<Endl;
         return 0;
      }
   }

   UInt_t countLeafs=0;

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      countLeafs += 1;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
      }
   }
   return countLeafs;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes

void TMVA::DecisionTree::DescendTree( Node* n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "DescendTree: started with undefined ROOT node" <<Endl;
         return ;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // do nothing
   }
   else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->DescendTree( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->DescendTree( this->GetRightDaughter(n) );
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// prune away the subtree below the node

void TMVA::DecisionTree::PruneNode( DecisionTreeNode *node )
{
   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   node->SetRight(NULL);
   node->SetLeft(NULL);
   node->SetSelector(-1);
   node->SetSeparationGain(-1);
   if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
   else node->SetNodeType(-1);
   this->DeleteNode(l);
   this->DeleteNode(r);
   // update the stored number of nodes in the Tree
   this->CountNodes();

}

////////////////////////////////////////////////////////////////////////////////
/// prune a node temporarily (without actually deleting its descendants),
/// which allows testing the pruned tree quality for many different
/// pruning stages without "touching" the tree.

void TMVA::DecisionTree::PruneNodeInPlace( DecisionTreeNode* node ) {
   if(node == NULL) return;
   node->SetNTerminal(1);
   node->SetSubTreeR( node->GetNodeR() );
   node->SetAlpha( std::numeric_limits<double>::infinity( ) );
   node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
   node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants FIXME not needed
}

////////////////////////////////////////////////////////////////////////////////
/// retrieve node from the tree. Its position (up to a maximal tree depth of 64)
/// is coded as a sequence of left-right moves starting from the root, coded as
/// 0-1 bit patterns stored in the "long-integer" (i.e. 0:left ; 1:right)

TMVA::Node* TMVA::DecisionTree::GetNode( ULong_t sequence, UInt_t depth )
{
   Node* current = this->GetRoot();

   for (UInt_t i =0; i < depth; i++) {
      ULong_t tmp = 1 << i;
      if ( tmp & sequence) current = this->GetRightDaughter(current);
      else current = this->GetLeftDaughter(current);
   }

   return current;
}
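
// Example (illustrative, not part of the original file): bit i of "sequence"
// selects the branch taken at depth i, so GetNode(5, 3) with 5 = 0b101 walks
// right (bit 0 set), left (bit 1 unset), right (bit 2 set) from the root:
//
//    TMVA::Node* n = tree.GetNode( 5, 3 );   // root -> right -> left -> right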


////////////////////////////////////////////////////////////////////////////////
/// select a random subset of the variables to be considered for the node split

void TMVA::DecisionTree::GetRandomisedVariables(Bool_t *useVariable, UInt_t *mapVariable, UInt_t &useNvars){
   for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
   if (fUseNvars==0) { // no number specified ... choose s.th. which hopefully works well
      // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
      fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
   }
   if (fUsePoissonNvars) useNvars = TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t)fMyTrandom->Poisson(fUseNvars)));
   else useNvars = fUseNvars;

   UInt_t nSelectedVars = 0;
   while (nSelectedVars < useNvars) {
      Double_t bla = fMyTrandom->Rndm()*fNvars;
      useVariable[Int_t (bla)] = kTRUE;
      nSelectedVars = 0;
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         if (useVariable[ivar] == kTRUE) {
            mapVariable[nSelectedVars] = ivar;
            nSelectedVars++;
         }
      }
   }
   if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandomisedVariables()... sorry" << std::endl; std::exit(1);}
}


////////////////////////////////////////////////////////////////////////////////
/// Decide how to split a node using one of the variables that gives
/// the best separation of signal/background. In order to do this, for each
/// variable a scan of the different cut values in a grid (grid = fNCuts) is
/// performed and the resulting separation gains are compared.
/// In addition to the individual variables, one can also ask for a Fisher
/// discriminant being built out of (some of) the variables and used as a
/// possible multivariate split.

Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
                                            TMVA::DecisionTreeNode *node )
{
   Double_t separationGainTotal = -1, sepTmp;
   Double_t *separationGain = new Double_t[fNvars+1];
   Int_t *cutIndex = new Int_t[fNvars+1];  //-1;

   for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
      separationGain[ivar]=-1;
      cutIndex[ivar]=-1;
   }
   Int_t mxVar = -1;
   Bool_t cutType = kTRUE;
   Double_t nTotS, nTotB;
   Int_t nTotS_unWeighted, nTotB_unWeighted;
   UInt_t nevents = eventSample.size();


   // the +1 comes from the fact that I treat later on the Fisher output as an
   // additional possible variable.
   Bool_t *useVariable = new Bool_t[fNvars+1];   // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
   UInt_t *mapVariable = new UInt_t[fNvars+1];   // map the subset of variables used in randomised trees to the original variable number (used in the Event() )

   std::vector<Double_t> fisherCoeff;

   if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
      UInt_t tmp=fUseNvars;
      GetRandomisedVariables(useVariable,mapVariable,tmp);
   }
   else {
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVariable[ivar] = kTRUE;
         mapVariable[ivar] = ivar;
      }
   }
   useVariable[fNvars] = kFALSE; //by default fisher is not used..

   Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
   if (fUseFisherCuts) {
      useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"

      //use for the Fisher discriminant ONLY those variables that show
      //some reasonable linear correlation in either Signal or Background
      Bool_t *useVarInFisher = new Bool_t[fNvars];   // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
      UInt_t *mapVarInFisher = new UInt_t[fNvars];   // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVarInFisher[ivar] = kFALSE;
         mapVarInFisher[ivar] = ivar;
      }

      std::vector<TMatrixDSym*>* covMatrices;
      covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
      if (!covMatrices){
         Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
         fisherOK = kFALSE;
      }else{
         TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
         TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
         const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
         const TMatrixD *b = gTools().GetCorrelationMatrix(bb);

         for (UInt_t ivar=0; ivar < fNvars; ivar++) {
            for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
               if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
                    ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
                  useVarInFisher[ivar] = kTRUE;
                  useVarInFisher[jvar] = kTRUE;
               }
            }
         }
         // now as you know which variables you want to use, count and map them:
         // such that you can use an array/matrix filled only with THOSE variables
         // that you used
         UInt_t nFisherVars = 0;
         for (UInt_t ivar=0; ivar < fNvars; ivar++) {
            //now .. pick those variables that are used in the Fisher and are also
            // part of the "allowed" variables in case of Randomized Trees
            if (useVarInFisher[ivar] && useVariable[ivar]) {
               mapVarInFisher[nFisherVars++]=ivar;
               // now exclude the variables used in the Fisher cuts, and don't
               // use them anymore in the individual variable scan
               if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
            }
         }


         fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
         fisherOK = kTRUE;
      }
      delete [] useVarInFisher;
      delete [] mapVarInFisher;

   }


   UInt_t cNvars = fNvars;
   if (fUseFisherCuts && fisherOK) cNvars++;  // use the Fisher output simply as an additional variable

   UInt_t* nBins = new UInt_t [cNvars];
   Double_t* binWidth = new Double_t [cNvars];
   Double_t* invBinWidth = new Double_t [cNvars];

   Double_t** nSelS = new Double_t* [cNvars];
   Double_t** nSelB = new Double_t* [cNvars];
   Double_t** nSelS_unWeighted = new Double_t* [cNvars];
   Double_t** nSelB_unWeighted = new Double_t* [cNvars];
   Double_t** target = new Double_t* [cNvars];
   Double_t** target2 = new Double_t* [cNvars];
   Double_t** cutValues = new Double_t* [cNvars];

   for (UInt_t ivar=0; ivar<cNvars; ivar++) {
      nBins[ivar] = fNCuts+1;
      if (ivar < fNvars) {
         if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
            nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
         }
      }

      nSelS[ivar] = new Double_t [nBins[ivar]];
      nSelB[ivar] = new Double_t [nBins[ivar]];
      nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
      nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
      target[ivar] = new Double_t [nBins[ivar]];
      target2[ivar] = new Double_t [nBins[ivar]];
      cutValues[ivar] = new Double_t [nBins[ivar]];

   }

   Double_t *xmin = new Double_t[cNvars];
   Double_t *xmax = new Double_t[cNvars];

   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (ivar < fNvars){
         xmin[ivar]=node->GetSampleMin(ivar);
         xmax[ivar]=node->GetSampleMax(ivar);
         if (almost_equal_float(xmax[ivar], xmin[ivar])) {
            // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
            // std::cout << " will set useVariable[ivar]=false"<<std::endl;
            useVariable[ivar]=kFALSE;
         }

      } else { // the fisher variable
         xmin[ivar]=999;
         xmax[ivar]=-999;
         // too bad, for the moment I don't know how to do this without looping
         // once to get the "min max" and then AGAIN to fill the histogram
         for (UInt_t iev=0; iev<nevents; iev++) {
            // returns the Fisher value (no fixed range)
            Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
            for (UInt_t jvar=0; jvar<fNvars; jvar++)
               result += fisherCoeff[jvar]*(eventSample[iev])->GetValue(jvar);
            if (result > xmax[ivar]) xmax[ivar]=result;
            if (result < xmin[ivar]) xmin[ivar]=result;
         }
      }
      for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
         nSelS[ivar][ibin]=0;
         nSelB[ivar][ibin]=0;
         nSelS_unWeighted[ivar][ibin]=0;
         nSelB_unWeighted[ivar][ibin]=0;
         target[ivar][ibin]=0;
         target2[ivar][ibin]=0;
         cutValues[ivar][ibin]=0;
      }
   }

   // fill the cut values for the scan:
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {

      if ( useVariable[ivar] ) {

         //set the grid for the cut scan on the variables like this:
         //
         //        |       |       |       |       |  ...   |       |
         //      xmin                                               xmax
         //
         //  cut       0       1       2       3      ...  fNCuts-1   (counting from zero)
         //  bin   0       1       2       3       4  ...          nBins-1=fNCuts  (counting from zero)
         //  --> nBins = fNCuts+1
         //  (NOTE, the cuts at xmin or xmax would just give the whole sample and
         //   hence can be safely omitted

         binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
         invBinWidth[ivar] = 1./binWidth[ivar];
         if (ivar < fNvars) {
            if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
         }

         // std::cout << "ivar="<<ivar
         //           <<" min="<<xmin[ivar]
         //           << " max="<<xmax[ivar]
         //           << " width=" << istepSize
         //           << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
         for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
            cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
            // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
         }
      }
   }

   nTotS=0; nTotB=0;
   nTotS_unWeighted=0; nTotB_unWeighted=0;
   for (UInt_t iev=0; iev<nevents; iev++) {

      Double_t eventWeight = eventSample[iev]->GetWeight();
      if (eventSample[iev]->GetClass() == fSigClass) {
         nTotS+=eventWeight;
         nTotS_unWeighted++; }
      else {
         nTotB+=eventWeight;
         nTotB_unWeighted++;
      }

      Int_t iBin=-1;
      for (UInt_t ivar=0; ivar < cNvars; ivar++) {
         // now scan through the cuts for each variable and find which one gives
         // the best separationGain at the current stage.
         if ( useVariable[ivar] ) {
            Double_t eventData;
            if (ivar < fNvars) eventData = eventSample[iev]->GetValue(ivar);
            else { // the fisher variable
               eventData = fisherCoeff[fNvars];
               for (UInt_t jvar=0; jvar<fNvars; jvar++)
                  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValue(jvar);

            }
            // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
            iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
            if (eventSample[iev]->GetClass() == fSigClass) {
               nSelS[ivar][iBin]+=eventWeight;
               nSelS_unWeighted[ivar][iBin]++;
            }
            else {
               nSelB[ivar][iBin]+=eventWeight;
               nSelB_unWeighted[ivar][iBin]++;
            }
            if (DoRegression()) {
               target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
               target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
            }
         }
      }
   }
   // now turn the "histogram" into a cumulative distribution
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar]) {
         for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
            nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
            nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
            nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
            nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
            if (DoRegression()) {
               target[ivar][ibin] +=target[ivar][ibin-1] ;
               target2[ivar][ibin]+=target2[ivar][ibin-1];
            }
         }
         if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
            Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
                  << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
                  << " while eventsample size = " << eventSample.size()
                  << Endl;
         }
         double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
         double totalSum=nTotS+nTotB;
         if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
            Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
                  << lastBins
                  << " while total number of events = " << totalSum
                  << Endl;
         }
      }
   }
   // now select the optimal cuts for each variable and find which one gives
   // the best separationGain at the current stage
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar]) {
         for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
            // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
            // calculated by the "SamplePurities" from the branches that would go to the
            // left or the right from this node if "these" cuts were used in the Node:
            // hereby: nSelS and nSelB would go to the right branch
            //         (nTotS - nSelS) + (nTotB - nSelB) would go to the left branch;

            // only allow splits where both daughter nodes match the specified minimum number
            // for this use the "unweighted" events, as you are interested in statistically
            // significant splits, which is determined by the actual number of entries
            // for a node, rather than the sum of event weights.

            Double_t sl = nSelS_unWeighted[ivar][iBin];
            Double_t bl = nSelB_unWeighted[ivar][iBin];
            Double_t s  = nTotS_unWeighted;
            Double_t b  = nTotB_unWeighted;
            Double_t slW = nSelS[ivar][iBin];
            Double_t blW = nSelB[ivar][iBin];
            Double_t sW  = nTotS;
            Double_t bW  = nTotB;
            Double_t sr  = s-sl;
            Double_t br  = b-bl;
            Double_t srW = sW-slW;
            Double_t brW = bW-blW;
            // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
            if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
                 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
                 ) {

               if (DoRegression()) {
                  sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
                                                       target[ivar][iBin],target2[ivar][iBin],
                                                       nTotS+nTotB,
                                                       target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
               } else {
                  sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
               }
               if (separationGain[ivar] < sepTmp) {
                  separationGain[ivar] = sepTmp;
                  cutIndex[ivar] = iBin;
               }
            }
         }
      }
   }

   //now you have found the best separation cut for each variable, now compare the variables
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar] ) {
         if (separationGainTotal < separationGain[ivar]) {
            separationGainTotal = separationGain[ivar];
            mxVar = ivar;
         }
      }
   }

   if (mxVar >= 0) {
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
         node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
         if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
            node->SetRMS(0);
         }else{
            node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
         if (mxVar >=0){
            if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
            else cutType=kFALSE;
         }
      }
      node->SetSelector((UInt_t)mxVar);
      node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
      node->SetCutType(cutType);
      node->SetSeparationGain(separationGainTotal);
      if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
         node->SetNFisherCoeff(0);
         fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
         //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
      }else{
         // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero. Might
         // be even less storage space on average than storing also the mapping used otherwise
         // can always be changed relatively easy
         node->SetNFisherCoeff(fNvars+1);
         for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
            node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
            // take 'fisher coeff. weighted estimate as variable importance, "Don't fill the offset coefficient though :)
            if (ivar<fNvars){
               fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
            }
         }
      }
   }
   else {
      separationGainTotal = 0;
   }

   // if (mxVar > -1) {
   //    std::cout << "------------------------------------------------------------------"<<std::endl;
   //    std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
   //    std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
   //    std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
   //    std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
   //    std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
   //              << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
   //    std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
   //    std::cout << " separationGainTotal " << separationGainTotal << std::endl;
   // } else {
   //    std::cout << "------------------------------------------------------------------"<<std::endl;
   //    std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
   // }
   for (UInt_t i=0; i<cNvars; i++) {
      delete [] nSelS[i];
      delete [] nSelB[i];
      delete [] nSelS_unWeighted[i];
      delete [] nSelB_unWeighted[i];
      delete [] target[i];
      delete [] target2[i];
      delete [] cutValues[i];
   }
   delete [] nSelS;
   delete [] nSelB;
   delete [] nSelS_unWeighted;
   delete [] nSelB_unWeighted;
   delete [] target;
   delete [] target2;
   delete [] cutValues;

   delete [] xmin;
   delete [] xmax;

   delete [] useVariable;
   delete [] mapVariable;

   delete [] separationGain;
   delete [] cutIndex;

   delete [] nBins;
   delete [] binWidth;
   delete [] invBinWidth;

   return separationGainTotal;

}
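
// Illustrative numbers for the cut grid scanned above (not from the source):
// with fNCuts = 2 and a variable spanning [0, 3] in this node, nBins = fNCuts+1
// = 3 and binWidth = (3-0)/3 = 1, so the candidate cuts are placed at 1 and 2;
// cuts at the boundaries 0 and 3 would leave the whole sample on one side and
// are therefore omitted.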
1363 
1364 
1365 
1366 ////////////////////////////////////////////////////////////////////////////////
1367 /// calculate the fisher coefficients for the event sample and the variables used
1368 
1369 std::vector<Double_t> TMVA::DecisionTree::GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher){
1370  std::vector<Double_t> fisherCoeff(fNvars+1);
1371 
1372  // initializaton of global matrices and vectors
1373  // average value of each variables for S, B, S+B
1374  TMatrixD* meanMatx = new TMatrixD( nFisherVars, 3 );
1375 
1376  // the covariance 'within class' and 'between class' matrices
1377  TMatrixD* betw = new TMatrixD( nFisherVars, nFisherVars );
1378  TMatrixD* with = new TMatrixD( nFisherVars, nFisherVars );
1379  TMatrixD* cov = new TMatrixD( nFisherVars, nFisherVars );
1380 
1381  //
1382  // compute mean values of variables in each sample, and the overall means
1383  //
1384 
1385  // initialize internal sum-of-weights variables
1386  Double_t sumOfWeightsS = 0;
1387  Double_t sumOfWeightsB = 0;
1388 
1389 
1390  // init vectors
1391  Double_t* sumS = new Double_t[nFisherVars];
1392  Double_t* sumB = new Double_t[nFisherVars];
1393  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
1394 
1395  UInt_t nevents = eventSample.size();
1396  // compute sample means
1397  for (UInt_t ievt=0; ievt<nevents; ievt++) {
1398 
1399  // read the Training Event into "event"
1400  const Event * ev = eventSample[ievt];
1401 
1402  // sum of weights
1403  Double_t weight = ev->GetWeight();
1404  if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
1405  else sumOfWeightsB += weight;
1406 
1407  Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
1408  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
1409  sum[ivar] += ev->GetValue( mapVarInFisher[ivar] )*weight;
1410  }
1411  }
1412  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
1413  (*meanMatx)( ivar, 2 ) = sumS[ivar];
1414  (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
1415 
1416  (*meanMatx)( ivar, 2 ) += sumB[ivar];
1417  (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
1418 
1419  // signal + background
1420  (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
1421  }
1422 
1423  delete [] sumS;
1424 
1425  delete [] sumB;
1426 
1427  // the matrix of covariance 'within class' reflects the dispersion of the
1428  // events relative to the center of gravity of their own class
1429 
1430  // assert required
1431 
1432  assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
1433 
1434  // product matrices (x-<x>)(y-<y>) where x;y are variables
1435 
1436  const Int_t nFisherVars2 = nFisherVars*nFisherVars;
1437  Double_t *sum2Sig = new Double_t[nFisherVars2];
1438  Double_t *sum2Bgd = new Double_t[nFisherVars2];
1439  Double_t *xval = new Double_t[nFisherVars2];
1440  memset(sum2Sig,0,nFisherVars2*sizeof(Double_t));
1441  memset(sum2Bgd,0,nFisherVars2*sizeof(Double_t));
1442 
1443  // 'within class' covariance
1444  for (UInt_t ievt=0; ievt<nevents; ievt++) {
1445 
1446  // read the Training Event into "event"
1447  // const Event* ev = eventSample[ievt];
1448  const Event* ev = eventSample.at(ievt);
1449 
1450  Double_t weight = ev->GetWeight(); // may ignore events with negative weights
1451 
1452  for (UInt_t x=0; x<nFisherVars; x++) {
1453  xval[x] = ev->GetValue( mapVarInFisher[x] );
1454  }
1455  Int_t k=0;
1456  for (UInt_t x=0; x<nFisherVars; x++) {
1457  for (UInt_t y=0; y<nFisherVars; y++) {
1458  if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
1459  else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
1460  k++;
1461  }
1462  }
1463  }
1464  Int_t k=0;
1465  for (UInt_t x=0; x<nFisherVars; x++) {
1466  for (UInt_t y=0; y<nFisherVars; y++) {
1467  (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
1468  k++;
1469  }
1470  }
1471 
1472  delete [] sum2Sig;
1473  delete [] sum2Bgd;
1474  delete [] xval;
1475 
1476 
1477  // the matrix of covariance 'between class' reflects the dispersion of the
1478  // events of a class relative to the global center of gravity of all the class
1479  // hence the separation between classes
1480 
1481 
1482  Double_t prodSig, prodBgd;
1483 
1484  for (UInt_t x=0; x<nFisherVars; x++) {
1485  for (UInt_t y=0; y<nFisherVars; y++) {
1486 
1487  prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
1488  ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
1489  prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
1490  ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
1491 
1492  (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
1493  }
1494  }
1495 
1496 
1497 
1498  // compute full covariance matrix from sum of within and between matrices
1499  for (UInt_t x=0; x<nFisherVars; x++)
1500  for (UInt_t y=0; y<nFisherVars; y++)
1501  (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
1502 
1503  // Fisher = Sum { [coeff]*[variables] }
1504  //
1505  // let Xs be the array of the mean values of variables for signal evts
1506  // let Xb be the array of the mean values of variables for backgd evts
1507  // let InvWith be the inverse matrix of the 'within class' covariance matrix
1508  //
1509  // then the array of Fisher coefficients is
1510  // [coeff] = TMath::Sqrt(fNsig*fNbgd)/fNevt * transpose{Xs-Xb} * InvWith
1511  TMatrixD* theMat = with; // Fishers original
1512  // TMatrixD* theMat = cov; // Mahalanobis
1513 
1514  TMatrixD invCov( *theMat );
1515  if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
1516  Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
1517  << TMath::Abs(invCov.Determinant())
1518  << " did you use variables that are linear combinations or highly correlated?"
1519  << Endl;
1520  }
1521  if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
1522  Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
1523  << TMath::Abs(invCov.Determinant())
1524  << " did you use variables that are linear combinations?"
1525  << Endl;
1526  }
1527 
1528  invCov.Invert();
1529 
1530  // apply rescaling factor
1531  Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
1532 
1533  // compute difference of mean values
1534  std::vector<Double_t> diffMeans( nFisherVars );
1535 
1536  for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
1537  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
1538  for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
1539  Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
1540  fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
1541  }
1542 
1543  // rescale
1544  fisherCoeff[mapVarInFisher[ivar]] *= xfact;
1545  }
1546 
1547  // offset correction
1548  Double_t f0 = 0.0;
1549  for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
1550  f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
1551  }
1552  f0 /= -2.0;
1553 
1554  fisherCoeff[fNvars] = f0; //as we start counting variables from "zero", I store the fisher offset at the END
1555 
1556  return fisherCoeff;
1557 }
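Summarizing the function: with W^{-1} the inverted 'within class' matrix, the returned coefficients and offset are

   c_i = \frac{\sqrt{W_S W_B}}{W_S + W_B}\sum_j (W^{-1})_{ij}\,(\bar{x}^{S}_j - \bar{x}^{B}_j), \qquad
   f_0 = -\frac{1}{2}\sum_i c_i\,(\bar{x}^{S}_i + \bar{x}^{B}_i)

Below is a minimal sketch of how such a coefficient vector would be applied to an event; this helper is illustrative only (it is not part of TMVA) and assumes, as documented above, that the offset sits at the last index:

   // Hypothetical helper: evaluate the Fisher discriminant for one event.
   // coeff has fNvars+1 entries (offset last); vars holds the event's variable values.
   double FisherValue(const std::vector<double>& coeff, const std::vector<double>& vars)
   {
      double f = coeff.back();                 // offset f0, stored at index fNvars
      for (size_t i = 0; i + 1 < coeff.size(); ++i)
         f += coeff[i] * vars[i];              // linear combination; unused variables have coeff 0
      return f;                                // larger values are more signal-like
   }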
1558 
1559 ////////////////////////////////////////////////////////////////////////////////
1560 
1561 Double_t TMVA::DecisionTree::TrainNodeFull( const EventConstList & eventSample,
1562  TMVA::DecisionTreeNode *node )
1563 {
1564  // train a node by finding the single optimal cut for a single variable
1565  // that best separates signal and background (maximizes the separation gain)
1566 
1567  Double_t nTotS = 0.0, nTotB = 0.0;
1568  Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
1569 
1570  std::vector<TMVA::BDTEventWrapper> bdtEventSample;
1571 
1572  // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
1573  std::vector<Double_t> lCutValue( fNvars, 0.0 );
1574  std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
1575  std::vector<Char_t> lCutType( fNvars ); // bools stored as Char_t (std::vector<bool> is avoided for performance reasons)
1576  lCutType.assign( fNvars, Char_t(kFALSE) );
1577 
1578  // Initialize (un)weighted counters for signal & background
1579  // Construct a list of event wrappers that point to the original data
1580  for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
1581  if((*it)->GetClass() == fSigClass) { // signal or background event
1582  nTotS += (*it)->GetWeight();
1583  ++nTotS_unWeighted;
1584  }
1585  else {
1586  nTotB += (*it)->GetWeight();
1587  ++nTotB_unWeighted;
1588  }
1589  bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
1590  }
1591 
1592  std::vector<Char_t> useVariable(fNvars); // bools stored as Char_t (std::vector<bool> is avoided for performance reasons)
1593  useVariable.assign( fNvars, Char_t(kTRUE) );
1594 
1595  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
1596  if (fRandomisedTree) { // for each node split, choose a random subset of variables to consider
1597  if (fUseNvars ==0 ) { // no number specified ... choose something that hopefully works well
1598  // watch out: this should never happen, as it is initialised automatically in MethodBDT already
1599  fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
1600  }
1601  Int_t nSelectedVars = 0;
1602  while (nSelectedVars < fUseNvars) {
1603  Double_t bla = fMyTrandom->Rndm()*fNvars;
1604  useVariable[Int_t (bla)] = Char_t(kTRUE);
1605  nSelectedVars = 0;
1606  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1607  if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
1608  }
1609  }
1610  }
1611  else {
1612  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
1613  }
1614 
1615  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
1616  if(!useVariable[ivar]) continue; // only optimize with the selected variables
1617  TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
1618  std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
1619 
1620  Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
1621  std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
1622  for( ; it != it_end; ++it ) {
1623  if((**it)->GetClass() == fSigClass ) // specify signal or background event
1624  sigWeightCtr += (**it)->GetWeight();
1625  else
1626  bkgWeightCtr += (**it)->GetWeight();
1627  // Store the accumulated signal (background) weights
1628  it->SetCumulativeWeight(false,bkgWeightCtr);
1629  it->SetCumulativeWeight(true,sigWeightCtr);
1630  }
1631 
1632  const Double_t fPMin = 1.0e-6;
1633  Bool_t cutType = kFALSE;
1634  Long64_t index = 0;
1635  Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
1636  // Locate the optimal cut for this (ivar-th) variable
1637  for( it = bdtEventSample.begin(); it != it_end; ++it ) {
1638  if( index == 0 ) { ++index; continue; }
1639  if( *(*it) == NULL ) {
1640  Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
1641  << index << ", and parent node=" << node->GetParent() << Endl;
1642  break;
1643  }
1644  dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
1645  norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
1646  // Only allow splits where both daughter nodes have the specified minimum number of events
1647  // Splits are only sensible when the data are ordered (e.g. don't split inside a sequence of 0's)
1648  if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
1649  sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
1650  if( sepTmp > separationGain ) {
1651  separationGain = sepTmp;
1652  cutValue = it->GetVal() - 0.5*dVal;
1653  Double_t nSelS = it->GetCumulativeWeight(true);
1654  Double_t nSelB = it->GetCumulativeWeight(false);
1655  // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
1656  // or by removing signal (enhancing background)
1657  if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
1658  else cutType = kFALSE;
1659  }
1660  }
1661  ++index;
1662  }
1663  lCutType[ivar] = Char_t(cutType);
1664  lCutValue[ivar] = cutValue;
1665  lSepGain[ivar] = separationGain;
1666  }
1667 
1668  Double_t separationGain = -1.0;
1669  Int_t iVarIndex = -1;
1670  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
1671  if( lSepGain[ivar] > separationGain ) {
1672  iVarIndex = ivar;
1673  separationGain = lSepGain[ivar];
1674  }
1675  }
1676 
1677  if(iVarIndex >= 0) {
1678  node->SetSelector(iVarIndex);
1679  node->SetCutValue(lCutValue[iVarIndex]);
1680  node->SetSeparationGain(lSepGain[iVarIndex]);
1681  node->SetCutType(lCutType[iVarIndex]);
1682  fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
1683  }
1684  else {
1685  separationGain = 0.0;
1686  }
1687 
1688  return separationGain;
1689 }
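The per-variable scan above is easier to see in isolation. The following is a minimal standalone sketch of the same idea, under stated simplifications: plain doubles instead of event wrappers, an unnormalized Gini-style impurity s*b/(s+b) in place of fSepType, and no fMinSize or fPMin checks. It sorts by the variable, accumulates class weights, and tests each boundary between distinct adjacent values as a candidate cut.

   #include <algorithm>
   #include <vector>

   struct Ev { double val, weight; bool isSig; };

   // Unnormalized Gini-style impurity; only gain comparisons matter here.
   static double impurity(double s, double b) { return (s + b) > 0 ? s * b / (s + b) : 0; }

   // Returns the best separation gain for one variable; bestCut receives the midpoint cut.
   double BestCut(std::vector<Ev> evts, double& bestCut)
   {
      std::sort(evts.begin(), evts.end(),
                [](const Ev& a, const Ev& b) { return a.val < b.val; });
      double totS = 0, totB = 0;
      for (const Ev& e : evts) (e.isSig ? totS : totB) += e.weight;
      const double parent = impurity(totS, totB);
      double s = 0, b = 0, bestGain = -1;
      for (size_t i = 0; i + 1 < evts.size(); ++i) {
         (evts[i].isSig ? s : b) += evts[i].weight;       // cumulative weights, as in the loop above
         if (evts[i].val == evts[i + 1].val) continue;    // no boundary between equal values
         const double gain = parent - impurity(s, b) - impurity(totS - s, totB - b);
         if (gain > bestGain) {
            bestGain = gain;
            bestCut  = 0.5 * (evts[i].val + evts[i + 1].val); // midpoint, as in GetVal() - 0.5*dVal
         }
      }
      return bestGain;
   }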
1690 
1691 ////////////////////////////////////////////////////////////////////////////////
1692 /// get the pointer to the leaf node where a particular event ends up
1693 /// (used in gradient boosting)
1694 
1695 TMVA::DecisionTreeNode* TMVA::DecisionTree::GetEventNode(const TMVA::Event & e) const
1696 {
1697  TMVA::DecisionTreeNode *current = this->GetRoot();
1698  while(current->GetNodeType() == 0) { // intermediate node in a tree
1699  current = (current->GoesRight(e)) ?
1700  (TMVA::DecisionTreeNode*)current->GetRight() :
1701  (TMVA::DecisionTreeNode*)current->GetLeft();
1702  }
1703  return current;
1704 }
1705 
1706 ////////////////////////////////////////////////////////////////////////////////
1707 /// the event e is put into the decision tree (starting at the root node)
1708 /// and the output is NodeType (signal) or (background) of the final node (basket)
1709 /// in which the given event ends up, i.e. the result of the classification of
1710 /// the event by this decision tree.
1711 
1712 Double_t TMVA::DecisionTree::CheckEvent( const TMVA::Event * e, Bool_t UseYesNoLeaf ) const
1713 {
1714  TMVA::DecisionTreeNode *current = this->GetRoot();
1715  if (!current){
1716  Log() << kFATAL << "CheckEvent: started with an undefined root node" << Endl;
1717  return 0; // keeps Coverity happy, as it doesn't know that kFATAL causes an exit
1718  }
1719 
1720  while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
1721  current = (current->GoesRight(*e)) ?
1722  current->GetRight() :
1723  current->GetLeft();
1724  if (!current) {
1725  Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
1726  }
1727 
1728  }
1729 
1730  if ( DoRegression() ){
1731  return current->GetResponse();
1732  }
1733  else {
1734  if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
1735  else return current->GetPurity();
1736  }
1737 }
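CheckEvent is the per-tree evaluation that boosting methods call. As a usage illustration only (the forest container here is an assumption for this sketch, not TMVA's MethodBDT bookkeeping), averaging the responses of several trees would look like:

   // Hypothetical usage: average per-tree responses over a small forest.
   double ForestResponse(const std::vector<TMVA::DecisionTree*>& forest,
                         const TMVA::Event* ev)
   {
      double sum = 0;
      for (const TMVA::DecisionTree* tree : forest)
         sum += tree->CheckEvent(ev, kFALSE);  // kFALSE: return the leaf purity rather than +/-1
      return forest.empty() ? 0 : sum / forest.size();
   }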
1738 
1739 ////////////////////////////////////////////////////////////////////////////////
1740 /// calculates the purity S/(S+B) of a given event sample
1741 
1742 Double_t TMVA::DecisionTree::SamplePurity( std::vector<TMVA::Event*> eventSample )
1743 {
1744  Double_t sumsig=0, sumbkg=0, sumtot=0;
1745  for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
1746  if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
1747  else sumsig+=eventSample[ievt]->GetWeight();
1748  sumtot+=eventSample[ievt]->GetWeight();
1749  }
1750  // sanity check
1751  if (sumtot!= (sumsig+sumbkg)){
1752  Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
1753  << sumtot << " " << sumsig << " " << sumbkg << Endl;
1754  }
1755  if (sumtot>0) return sumsig/(sumsig + sumbkg);
1756  else return -1;
1757 }
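In formula form, the purity returned here is weight-based,

   p = \frac{\sum_{k\in S} w_k}{\sum_{k\in S} w_k + \sum_{k\in B} w_k}

with -1 returned for a sample of vanishing total weight.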
1758 
1759 ////////////////////////////////////////////////////////////////////////////////
1760 /// Return the relative variable importance, normalized to all
1761 /// variables together having the importance 1. The importance is
1762 /// evaluated as the total separation-gain that this variable had in
1763 /// the decision tree (weighted by the number of events).
1764 
1765 std::vector< Double_t > TMVA::DecisionTree::GetVariableImportance()
1766 {
1767  std::vector<Double_t> relativeImportance(fNvars);
1768  Double_t sum=0;
1769  for (UInt_t i=0; i< fNvars; i++) {
1770  sum += fVariableImportance[i];
1771  relativeImportance[i] = fVariableImportance[i];
1772  }
1773 
1774  for (UInt_t i=0; i< fNvars; i++) {
1775  if (sum > std::numeric_limits<double>::epsilon())
1776  relativeImportance[i] /= sum;
1777  else
1778  relativeImportance[i] = 0;
1779  }
1780  return relativeImportance;
1781 }
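Tying this to the training code: each chosen split on variable i contributes the squared separation gain times the squared total node weight (see the increment of fVariableImportance in TrainNodeFull above),

   \Delta I_i = G^2\,(N_S + N_B)^2

and this function returns the normalized values I_i / \sum_j I_j (or 0 when the sum vanishes).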
1782 
1783 ////////////////////////////////////////////////////////////////////////////////
1784 /// returns the relative importance of variable ivar
1785 
1786 Double_t TMVA::DecisionTree::GetVariableImportance( UInt_t ivar )
1787 {
1788  std::vector<Double_t> relativeImportance = this->GetVariableImportance();
1789  if (ivar < fNvars) return relativeImportance[ivar];
1790  else {
1791  Log() << kFATAL << "<GetVariableImportance>" << Endl
1792  << "--- ivar = " << ivar << " is out of range " << Endl;
1793  }
1794 
1795  return -1;
1796 }
1797 