// @(#)root/tmva $Id$
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag

/**********************************************************************************
 * Project: TMVA - a ROOT-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TMVA::DecisionTree                                                    *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Implementation of a Decision Tree                                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker   <Andreas.Hocker@cern.ch> - CERN, Switzerland            *
 *      Helge Voss        <Helge.Voss@cern.ch>     - MPI-K Heidelberg, Germany    *
 *      Kai Voss          <Kai.Voss@cern.ch>       - U. of Victoria, Canada       *
 *      Eckhard v. Toerne <evt@uni-bonn.de>        - U. of Bonn, Germany          *
 *      Jan Therhaag      <Jan.Therhaag@cern.ch>   - U. of Bonn, Germany          *
 *                                                                                *
 * Copyright (c) 2005-2011:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/license.txt)                                      *
 *                                                                                *
 **********************************************************************************/

/*! \class TMVA::DecisionTree
\ingroup TMVA

Implementation of a Decision Tree

In a decision tree successive decision nodes are used to categorize the
events out of the sample as either signal or background. Each node
uses only a single discriminating variable to decide if the event is
signal-like ("goes right") or background-like ("goes left"). This
forms a tree-like structure with "baskets" at the end (leaf nodes),
and an event is classified as either signal or background according to
whether the basket where it ends up has been classified as signal or
background during the training. Training of a decision tree is the
process of defining the "cut criteria" for each node. The training
starts with the root node. Here one takes the full training event
sample and selects the variable and corresponding cut value that give
the best separation between signal and background at this stage. Using
this cut criterion, the sample is then divided into two subsamples, a
signal-like (right) and a background-like (left) sample. Two new nodes
are then created, one for each of the two subsamples, and they are
constructed using the same mechanism as described for the root
node. The division is stopped once a certain node has reached either a
minimum number of events, or a minimum or maximum signal purity. These
leaf nodes are then called "signal" or "background" depending on
whether they contain more signal or more background events from the
training sample.

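A minimal usage sketch (illustrative only: the option values and the
`dataSetInfo` and `trainingEvents` objects are assumptions of this example,
not prescriptions from this file):

~~~ {.cpp}
// Build a single classification tree with a Gini-index split criterion.
TMVA::DecisionTree tree( new TMVA::GiniIndex(), // separation criterion
                         2.5,           // minimum node size, in % of the training sample
                         20,            // NCuts: grid points scanned per variable
                         &dataSetInfo,  // TMVA::DataSetInfo describing the variables
                         0 );           // index of the class treated as "signal"
tree.BuildTree( trainingEvents );       // std::vector<const TMVA::Event*>
~~~
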
*/

#include <iostream>
#include <algorithm>
#include <vector>
#include <limits>
#include <fstream>
#include <cassert>
#include <cstdlib>   // std::exit
#include <cstring>   // memset

#include "TRandom3.h"
#include "TMath.h"
#include "TMatrix.h"

#include "TMVA/MsgLogger.h"
#include "TMVA/DecisionTree.h"
#include "TMVA/DecisionTreeNode.h"
#include "TMVA/BinarySearchTree.h"

#include "TMVA/Tools.h"

#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/Event.h"
#include "TMVA/BDTEventWrapper.h"
#include "TMVA/IPruneTool.h"
#include "TMVA/CostComplexityPruneTool.h"  // pruning tools instantiated in PruneTree()
#include "TMVA/ExpectedErrorPruneTool.h"

const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds

using std::vector;


bool almost_equal_float(float x, float y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<float>::min();
}

bool almost_equal_double(double x, double y, int ulp=4){
   // the machine epsilon has to be scaled to the magnitude of the values used
   // and multiplied by the desired precision in ULPs (units in the last place)
   return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
      // unless the result is subnormal
      || std::abs(x-y) < std::numeric_limits<double>::min();
}
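// Example (not from the original file): with the default ulp=4,
// almost_equal_double(0.1 + 0.2, 0.3) returns true even though
// (0.1 + 0.2) == 0.3 is false, since the two values differ by only a few
// ULPs around the magnitude of 0.3.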

////////////////////////////////////////////////////////////////////////////////
/// default constructor using the GiniIndex as separation criterion,
/// no restrictions on minimum number of events in a leaf node or the
/// separation gain in the node splitting

TMVA::DecisionTree::DecisionTree():
   BinaryTree(),
   fNvars          (0),
   fNCuts          (-1),
   fUseFisherCuts  (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType        (NULL),
   fRegType        (NULL),
   fMinSize        (0),
   fMinNodeSize    (1),
   fMinSepGain     (0),
   fUseSearchTree  (kFALSE),
   fPruneStrength  (0),
   fPruneMethod    (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(0.5),
   fRandomisedTree (kFALSE),
   fUseNvars       (0),
   fUsePoissonNvars(kFALSE),
   fMyTrandom      (NULL),
   fMaxDepth       (999999),
   fSigClass       (0),
   fTreeID         (0),
   fAnalysisType   (Types::kClassification),
   fDataSetInfo    (NULL)
{
}

////////////////////////////////////////////////////////////////////////////////
/// constructor specifying the separation type, the min number of
/// events in a node that is still subjected to further splitting, and the
/// number of bins in the grid used in applying the cut for the node
/// splitting.

TMVA::DecisionTree::DecisionTree( TMVA::SeparationBase *sepType, Float_t minSize, Int_t nCuts,
                                  TMVA::DataSetInfo* dataInfo, UInt_t cls,
                                  Bool_t randomisedTree, Int_t useNvars, Bool_t usePoissonNvars,
                                  UInt_t nMaxDepth, Int_t iSeed, Float_t purityLimit, Int_t treeID):
   BinaryTree(),
   fNvars          (0),
   fNCuts          (nCuts),
   fUseFisherCuts  (kFALSE),
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   fSepType        (sepType),
   fRegType        (NULL),
   fMinSize        (0),
   fMinNodeSize    (minSize),
   fMinSepGain     (0),
   fUseSearchTree  (kFALSE),
   fPruneStrength  (0),
   fPruneMethod    (kNoPruning),
   fNNodesBeforePruning(0),
   fNodePurityLimit(purityLimit),
   fRandomisedTree (randomisedTree),
   fUseNvars       (useNvars),
   fUsePoissonNvars(usePoissonNvars),
   fMyTrandom      (new TRandom3(iSeed)),
   fMaxDepth       (nMaxDepth),
   fSigClass       (cls),
   fTreeID         (treeID),
   fAnalysisType   (Types::kClassification),
   fDataSetInfo    (dataInfo)
{
   if (sepType == NULL) { // it is interpreted as a regression tree, where
                          // currently the separation type (simple least square)
                          // cannot be chosen freely
      fAnalysisType = Types::kRegression;
      fRegType = new RegressionVariance();
      if ( nCuts <=0 ) {
         fNCuts = 200;
         Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
               << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
               << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
               << Endl;
      }
   }else{
      fAnalysisType = Types::kClassification;
   }
}

////////////////////////////////////////////////////////////////////////////////
/// copy constructor that creates a true copy, i.e. a completely independent tree
/// the node copy will recursively copy all the nodes

TMVA::DecisionTree::DecisionTree( const DecisionTree &d ):
   BinaryTree(),
   fNvars          (d.fNvars),
   fNCuts          (d.fNCuts),
   fUseFisherCuts  (d.fUseFisherCuts),
   fMinLinCorrForFisher (d.fMinLinCorrForFisher),
   fUseExclusiveVars (d.fUseExclusiveVars),
   fSepType        (d.fSepType),
   fRegType        (d.fRegType),
   fMinSize        (d.fMinSize),
   fMinNodeSize    (d.fMinNodeSize),
   fMinSepGain     (d.fMinSepGain),
   fUseSearchTree  (d.fUseSearchTree),
   fPruneStrength  (d.fPruneStrength),
   fPruneMethod    (d.fPruneMethod),
   fNodePurityLimit(d.fNodePurityLimit),
   fRandomisedTree (d.fRandomisedTree),
   fUseNvars       (d.fUseNvars),
   fUsePoissonNvars(d.fUsePoissonNvars),
   fMyTrandom      (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
   fMaxDepth       (d.fMaxDepth),
   fSigClass       (d.fSigClass),
   fTreeID         (d.fTreeID),
   fAnalysisType   (d.fAnalysisType),
   fDataSetInfo    (d.fDataSetInfo)
{
   this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
   this->SetParentTreeInNodes();
   fNNodes = d.fNNodes;
}


////////////////////////////////////////////////////////////////////////////////
/// destructor

TMVA::DecisionTree::~DecisionTree()
{
   // destruction of the tree nodes done in the "base class" BinaryTree

   if (fMyTrandom) delete fMyTrandom;
   if (fRegType) delete fRegType;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes, filling the maximum depth
/// reached in the tree at the same time.

void TMVA::DecisionTree::SetParentTreeInNodes( Node *n )
{
   if (n == NULL) { //default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" <<Endl;
         return ;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   } else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->SetParentTreeInNodes( this->GetRightDaughter(n) );
      }
   }
   n->SetParentTree(this);
   if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
   return;
}

////////////////////////////////////////////////////////////////////////////////
/// re-create a new tree (decision tree or search tree) from XML

TMVA::DecisionTree* TMVA::DecisionTree::CreateFromXML(void* node, UInt_t tmva_Version_Code ) {
   std::string type("");
   gTools().ReadAttr(node,"type", type);
   DecisionTree* dt = new DecisionTree();

   dt->ReadXML( node, tmva_Version_Code );
   return dt;
}

////////////////////////////////////////////////////////////////////////////////
/// building the decision tree by recursively calling the splitting of
/// one (root-) node into two daughter nodes (returns the number of nodes)

UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
                                      TMVA::DecisionTreeNode *node)
{
   if (node==NULL) {
      //start with the root node
      node = new TMVA::DecisionTreeNode();
      fNNodes = 1;
      this->SetRoot(node);
      // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
      this->GetRoot()->SetPos('s');
      this->GetRoot()->SetDepth(0);
      this->GetRoot()->SetParentTree(this);
      fMinSize = fMinNodeSize/100. * eventSample.size();
      if (GetTreeID()==0){
         Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << "% is translated to an actual number of events = "<< fMinSize << " for the training sample size of " << eventSample.size() << Endl;
         Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
         Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
      }
   }

   UInt_t nevents = eventSample.size();

   if (nevents > 0 ) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;

   Double_t s=0, b=0;
   Double_t suw=0, buw=0;
   Double_t sub=0, bub=0; // unboosted!
   Double_t target=0, target2=0;
   Float_t *xmin = new Float_t[fNvars];
   Float_t *xmax = new Float_t[fNvars];
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      xmin[ivar]=xmax[ivar]=0;
   }
   for (UInt_t iev=0; iev<eventSample.size(); iev++) {
      const TMVA::Event* evt = eventSample[iev];
      const Double_t weight = evt->GetWeight();
      const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
      if (evt->GetClass() == fSigClass) {
         s += weight;
         suw += 1;
         sub += orgWeight;
      }
      else {
         b += weight;
         buw += 1;
         bub += orgWeight;
      }
      if ( DoRegression() ) {
         const Double_t tgt = evt->GetTarget(0);
         target +=weight*tgt;
         target2+=weight*tgt*tgt;
      }

      for (UInt_t ivar=0; ivar<fNvars; ivar++) {
         const Double_t val = evt->GetValueFast(ivar);
         if (iev==0) xmin[ivar]=xmax[ivar]=val;
         if (val < xmin[ivar]) xmin[ivar]=val;
         if (val > xmax[ivar]) xmax[ivar]=val;
      }
   }


   if (s+b < 0) {
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig="<<s<<" Nbkg="<<b<<" Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      double nBkg=0.;
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
      Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
   }

   node->SetNSigEvents(s);
   node->SetNBkgEvents(b);
   node->SetNSigEvents_unweighted(suw);
   node->SetNBkgEvents_unweighted(buw);
   node->SetNSigEvents_unboosted(sub);
   node->SetNBkgEvents_unboosted(bub);
   node->SetPurity();
   if (node == this->GetRoot()) {
      node->SetNEvents(s+b);
      node->SetNEvents_unweighted(suw+buw);
      node->SetNEvents_unboosted(sub+bub);
   }
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      node->SetSampleMin(ivar,xmin[ivar]);
      node->SetSampleMax(ivar,xmax[ivar]);
   }
   delete[] xmin;
   delete[] xmax;

   // I now demand the minimum number of events for both daughter nodes. Hence if the number
   // of events in the parent node is not at least two times as big, I don't even need to try
   // splitting

   // ask here for actual "events" independent of their weight.. OR the weighted events
   // to exceed the min requested number of events per daughter node
   // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
   // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
   // std::cout << "------------------------------------------------------------------"<<std::endl;
   // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
   if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
      Double_t separationGain;
      if (fNCuts > 0){
         separationGain = this->TrainNodeFast(eventSample, node);
      } else {
         separationGain = this->TrainNodeFull(eventSample, node);
      }
      if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
         // if (separationGain < 0.00000001) { // we could not gain anything, e.g. all events are in one bin,
         // no cut can actually do anything to improve the node
         // hence, naturally, the current node is a leaf node
         if (DoRegression()) {
            node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
            node->SetResponse(target/(s+b));
            if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
               node->SetRMS(0);
            }else{
               node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
            }
         }
         else {
            node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));

            if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
            else node->SetNodeType(-1);
         }
         if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());

      } else {

         std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
         std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);

         Double_t nRight=0, nLeft=0;
         Double_t nRightUnBoosted=0, nLeftUnBoosted=0;

         for (UInt_t ie=0; ie< nevents ; ie++) {
            if (node->GoesRight(*eventSample[ie])) {
               rightSample.push_back(eventSample[ie]);
               nRight += eventSample[ie]->GetWeight();
               nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
            else {
               leftSample.push_back(eventSample[ie]);
               nLeft += eventSample[ie]->GetWeight();
               nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
            }
         }

         // sanity check
         if (leftSample.empty() || rightSample.empty()) {

            Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
                  << "--- Hence new node == old node ... check" << Endl
                  << "--- left:" << leftSample.size()
                  << " right:" << rightSample.size() << Endl
                  << " while the separation is thought to be " << separationGain
                  << "\n when cutting on variable " << node->GetSelector()
                  << " at value " << node->GetCutValue()
                  << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;

         }

         // continue building daughter nodes for the left and the right eventsample
         TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
         fNNodes++;
         rightNode->SetNEvents(nRight);
         rightNode->SetNEvents_unboosted(nRightUnBoosted);
         rightNode->SetNEvents_unweighted(rightSample.size());

         TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');

         fNNodes++;
         leftNode->SetNEvents(nLeft);
         leftNode->SetNEvents_unboosted(nLeftUnBoosted);
         leftNode->SetNEvents_unweighted(leftSample.size());

         node->SetNodeType(0);
         node->SetLeft(leftNode);
         node->SetRight(rightNode);

         this->BuildTree(rightSample, rightNode);
         this->BuildTree(leftSample, leftNode );

      }
   }
   else{ // it is a leaf node
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
         node->SetResponse(target/(s+b));
         if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
            node->SetRMS(0);
         }else{
            node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
         if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
         else node->SetNodeType(-1);
         // loop through the event sample ending up in this node and check for events with negative weight;
         // those "cannot" be boosted normally. Hence, if one of those
         // is misclassified, find randomly as many events with positive weights in this
         // node as needed to get the same absolute amount of weight, and mark them as
         // "not to be boosted" in order to make up for not boosting the negative weight event
      }


      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
   }

   // if (IsRootNode) this->CleanTree();
   return fNNodes;
}

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by filling events
/// in from the top node and see where they happen to end up

void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )
{
   for (UInt_t i=0; i<eventSample.size(); i++) {
      this->FillEvent(*(eventSample[i]),NULL);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// fill the existing decision tree structure by filling a single event
/// in from the top node and see where it happens to end up

void TMVA::DecisionTree::FillEvent( const TMVA::Event & event,
                                    TMVA::DecisionTreeNode *node )
{
   if (node == NULL) { // that's the start, take the Root node
      node = this->GetRoot();
   }

   node->IncrementNEvents( event.GetWeight() );

   if (event.GetClass() == fSigClass) {
      node->IncrementNSigEvents( event.GetWeight() );
   }
   else {
      node->IncrementNBkgEvents( event.GetWeight() );
   }
   node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
                                                         node->GetNBkgEvents()));

   if (node->GetNodeType() == 0) { //intermediate node --> go down
      if (node->GoesRight(event))
         this->FillEvent(event,static_cast<TMVA::DecisionTreeNode*>(node->GetRight())) ;
      else
         this->FillEvent(event,static_cast<TMVA::DecisionTreeNode*>(node->GetLeft())) ;
   }
}

////////////////////////////////////////////////////////////////////////////////
/// clear the tree nodes (their S/N, Nevents etc.), just keep the structure of the tree

void TMVA::DecisionTree::ClearTree()
{
   if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();

}

////////////////////////////////////////////////////////////////////////////////
/// remove those last splits that result in two leaf nodes that
/// are both of the same type (i.e. both signal or both background);
/// this of course is only a reasonable thing to do when you use
/// "YesOrNo" leaves, while it might lose something if you use the
/// purity information in the nodes.
/// --> hence I don't call it automatically in the tree building

UInt_t TMVA::DecisionTree::CleanTree( DecisionTreeNode *node )
{
   if (node==NULL) {
      node = this->GetRoot();
   }

   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   if (node->GetNodeType() == 0) {
      this->CleanTree(l);
      this->CleanTree(r);
      if (l->GetNodeType() * r->GetNodeType() > 0) {

         this->PruneNode(node);
      }
   }
   // update the number of nodes after the cleaning
   return this->CountNodes();

}

////////////////////////////////////////////////////////////////////////////////
/// prune (get rid of internal nodes) the Decision tree to avoid overtraining;
/// several different pruning methods can be applied as selected by the
/// variable "fPruneMethod".
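///
/// A sketch of how this is typically driven (the setters are on this class;
/// treating a negative prune strength as "determine the strength
/// automatically" is an assumption of this example, not a statement from
/// this file):
/// ~~~ {.cpp}
/// tree.SetPruneMethod( TMVA::DecisionTree::kCostComplexityPruning );
/// tree.SetPruneStrength( -1.0 );        // automatic mode needs a validation sample
/// tree.PruneTree( &validationEvents );  // TMVA::DecisionTree::EventConstList*
/// ~~~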

Double_t TMVA::DecisionTree::PruneTree( const EventConstList* validationSample )
{
   IPruneTool* tool(NULL);
   PruningInfo* info(NULL);

   if( fPruneMethod == kNoPruning ) return 0.0;

   if      (fPruneMethod == kExpectedErrorPruning)
      // tool = new ExpectedErrorPruneTool(logfile);
      tool = new ExpectedErrorPruneTool();
   else if (fPruneMethod == kCostComplexityPruning)
      {
         tool = new CostComplexityPruneTool();
      }
   else {
      Log() << kFATAL << "Selected pruning method not yet implemented "
            << Endl;
   }

   if(!tool) return 0.0;

   tool->SetPruneStrength(GetPruneStrength());
   if(tool->IsAutomatic()) {
      if(validationSample == NULL){
         Log() << kFATAL << "Cannot automate the pruning algorithm without an "
               << "independent validation sample!" << Endl;
      }else if(validationSample->size() == 0) {
         Log() << kFATAL << "Cannot automate the pruning algorithm with "
               << "independent validation sample of ZERO events!" << Endl;
      }
   }

   info = tool->CalculatePruningInfo(this,validationSample);
   Double_t pruneStrength=0;
   if(!info) {
      Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
            << Endl;
   } else {
      pruneStrength = info->PruneStrength;

      // Log() << kDEBUG << "Optimal prune strength (alpha): " << pruneStrength
      //       << " has quality index " << info->QualityIndex << Endl;


      for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {

         PruneNode(info->PruneSequence[i]);
      }
      // update the number of nodes after the pruning
      this->CountNodes();
   }

   delete tool;
   delete info;

   return pruneStrength;
}


////////////////////////////////////////////////////////////////////////////////
/// run the validation sample through the (pruned) tree and fill in the nodes
/// the variables NSValidation and NBValidation (i.e. how many of the signal
/// and background events from the validation sample end up in each node).
/// This is then later used when asking for the "tree quality" ..

void TMVA::DecisionTree::ApplyValidationSample( const EventConstList* validationSample ) const
{
   GetRoot()->ResetValidationData();
   for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
      CheckEventWithPrunedTree((*validationSample)[ievt]);
   }
}

////////////////////////////////////////////////////////////////////////////////
/// return the misclassification rate of a pruned tree.
/// A "pruned tree" may have set the variable "IsTerminal" to "arbitrary" at
/// any node, hence this tree quality testing will stop there, hence it tests
/// the pruned tree (while the full tree is still in place for normal/later use)

Double_t TMVA::DecisionTree::TestPrunedTreeQuality( const DecisionTreeNode* n, Int_t mode ) const
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" <<Endl;
         return 0;
      }
   }

   if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
      return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
              TestPrunedTreeQuality( n->GetRight(), mode ));
   }
   else { // terminal leaf (in a pruned subtree of T_max at least)
      if (DoRegression()) {
         Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
         return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
      }
      else {
         if (mode == 0) {
            if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
               return n->GetNBValidation();
            else
               return n->GetNSValidation();
         }
         else if ( mode == 1 ) {
            // calculate the weighted error using the pruning validation sample
            return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
         }
         else {
            throw std::string("Unknown ValidationQualityMode");
         }
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// pass a single validation event through a pruned decision tree
/// on the way down the tree, fill in all the "intermediate" information
/// that would normally be there from training.

void TMVA::DecisionTree::CheckEventWithPrunedTree( const TMVA::Event* e ) const
{
   DecisionTreeNode* current = this->GetRoot();
   if (current == NULL) {
      Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" <<Endl;
   }

   while(current != NULL) {
      if(e->GetClass() == fSigClass)
         current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
      else
         current->SetNBValidation(current->GetNBValidation() + e->GetWeight());

      if (e->GetNTargets() > 0) {
         current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
         current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
      }

      if (current->GetRight() == NULL || current->GetLeft() == NULL) {
         current = NULL;
      }
      else {
         if (current->GoesRight(*e))
            current = (TMVA::DecisionTreeNode*)current->GetRight();
         else
            current = (TMVA::DecisionTreeNode*)current->GetLeft();
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// calculate the normalization factor for a pruning validation sample

Double_t TMVA::DecisionTree::GetSumWeights( const EventConstList* validationSample ) const
{
   Double_t sumWeights = 0.0;
   for( EventConstList::const_iterator it = validationSample->begin();
        it != validationSample->end(); ++it ) {
      sumWeights += (*it)->GetWeight();
   }
   return sumWeights;
}

////////////////////////////////////////////////////////////////////////////////
/// return the number of terminal nodes in the sub-tree below Node n

UInt_t TMVA::DecisionTree::CountLeafNodes( TMVA::Node *n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" <<Endl;
         return 0;
      }
   }

   UInt_t countLeafs=0;

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      countLeafs += 1;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
      }
   }
   return countLeafs;
}

////////////////////////////////////////////////////////////////////////////////
/// descend a tree to find all its leaf nodes

void TMVA::DecisionTree::DescendTree( Node* n )
{
   if (n == NULL) { // default, start at the tree top, then descend recursively
      n = this->GetRoot();
      if (n == NULL) {
         Log() << kFATAL << "DescendTree: started with undefined ROOT node" <<Endl;
         return ;
      }
   }

   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // do nothing
   }
   else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->DescendTree( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->DescendTree( this->GetRightDaughter(n) );
      }
   }
}

////////////////////////////////////////////////////////////////////////////////
/// prune away the subtree below the node

void TMVA::DecisionTree::PruneNode( DecisionTreeNode *node )
{
   DecisionTreeNode *l = node->GetLeft();
   DecisionTreeNode *r = node->GetRight();

   node->SetRight(NULL);
   node->SetLeft(NULL);
   node->SetSelector(-1);
   node->SetSeparationGain(-1);
   if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
   else node->SetNodeType(-1);
   this->DeleteNode(l);
   this->DeleteNode(r);
   // update the stored number of nodes in the Tree
   this->CountNodes();

}

////////////////////////////////////////////////////////////////////////////////
/// prune a node temporarily (without actually deleting its descendants),
/// which allows testing the pruned tree quality for many different
/// pruning stages without "touching" the tree.

void TMVA::DecisionTree::PruneNodeInPlace( DecisionTreeNode* node ) {
   if(node == NULL) return;
   node->SetNTerminal(1);
   node->SetSubTreeR( node->GetNodeR() );
   node->SetAlpha( std::numeric_limits<double>::infinity( ) );
   node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
   node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants FIXME not needed
}

////////////////////////////////////////////////////////////////////////////////
/// retrieve node from the tree. Its position (up to a maximal tree depth of 64)
/// is coded as a sequence of left-right moves starting from the root, stored as
/// 0-1 bit patterns in the "long-integer" sequence (i.e. 0:left ; 1:right).
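///
/// For example (illustration, not from the original file): sequence = 0b101
/// with depth = 3 walks root -> right -> left -> right, since bit i of the
/// sequence steers the move taken at tree level i.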

TMVA::Node* TMVA::DecisionTree::GetNode( ULong_t sequence, UInt_t depth )
{
   Node* current = this->GetRoot();

   for (UInt_t i =0; i < depth; i++) {
      ULong_t tmp = ULong_t(1) << i; // 64-bit shift, so depths beyond 31 work as documented
      if ( tmp & sequence) current = this->GetRightDaughter(current);
      else current = this->GetLeftDaughter(current);
   }

   return current;
}

////////////////////////////////////////////////////////////////////////////////
/// select a random subset of the variables for the node split (randomised trees)

void TMVA::DecisionTree::GetRandomisedVariables(Bool_t *useVariable, UInt_t *mapVariable, UInt_t &useNvars){
   for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
   if (fUseNvars==0) { // no number specified ... choose s.th. which hopefully works well
      // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
      fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
   }
   if (fUsePoissonNvars) useNvars=TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t)fMyTrandom->Poisson(fUseNvars)));
   else useNvars = fUseNvars;

   UInt_t nSelectedVars = 0;
   while (nSelectedVars < useNvars) {
      Double_t bla = fMyTrandom->Rndm()*fNvars;
      useVariable[Int_t (bla)] = kTRUE;
      nSelectedVars = 0;
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         if (useVariable[ivar] == kTRUE) {
            mapVariable[nSelectedVars] = ivar;
            nSelectedVars++;
         }
      }
   }
   if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandomisedVariables()... sorry" << std::endl; std::exit(1);}
}

////////////////////////////////////////////////////////////////////////////////
/// Decide how to split a node using one of the variables that gives
/// the best separation of signal/background. In order to do this, for each
/// variable a scan of the different cut values in a grid (grid = fNCuts) is
/// performed and the resulting separation gains are compared.
/// In addition to the individual variables, one can also ask for a Fisher
/// discriminant being built out of (some of) the variables and used as a
/// possible multivariate split.
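///
/// For example (numbers only for illustration): with fNCuts = 20, each
/// variable's range [xmin, xmax] within the node is divided into 21 bins and
/// the 20 interior bin edges are tested as cut values; cuts at xmin or xmax
/// would keep the whole sample on one side and are therefore omitted (see the
/// grid sketch inside the function body below).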

Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
                                            TMVA::DecisionTreeNode *node )
{
   Double_t separationGainTotal = -1, sepTmp;
   Double_t *separationGain = new Double_t[fNvars+1];
   Int_t *cutIndex = new Int_t[fNvars+1];  //-1;

   for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
      separationGain[ivar]=-1;
      cutIndex[ivar]=-1;
   }
   Int_t mxVar = -1;
   Bool_t cutType = kTRUE;
   Double_t nTotS, nTotB;
   Int_t nTotS_unWeighted, nTotB_unWeighted;
   UInt_t nevents = eventSample.size();


   // the +1 comes from the fact that I treat later on the Fisher output as an
   // additional possible variable.
   Bool_t *useVariable = new Bool_t[fNvars+1];  // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
   UInt_t *mapVariable = new UInt_t[fNvars+1];  // map the subset of variables used in randomised trees to the original variable number (used in the Event() )

   std::vector<Double_t> fisherCoeff;

   if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
      UInt_t tmp=fUseNvars;
      GetRandomisedVariables(useVariable,mapVariable,tmp);
   }
   else {
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVariable[ivar] = kTRUE;
         mapVariable[ivar] = ivar;
      }
   }
   useVariable[fNvars] = kFALSE; //by default fisher is not used..

   Bool_t fisherOK = kFALSE; // flag to show whether the fisher discriminant could be calculated correctly or not;
   if (fUseFisherCuts) {
      useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"

      //use for the Fisher discriminant ONLY those variables that show
      //some reasonable linear correlation in either Signal or Background
      Bool_t *useVarInFisher = new Bool_t[fNvars];
      UInt_t *mapVarInFisher = new UInt_t[fNvars];
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVarInFisher[ivar] = kFALSE;
         mapVarInFisher[ivar] = ivar;
      }

      std::vector<TMatrixDSym*>* covMatrices;
      covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
      if (!covMatrices){
         Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
         fisherOK = kFALSE;
      }else{
         TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
         TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
         const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
         const TMatrixD *b = gTools().GetCorrelationMatrix(bb);

         for (UInt_t ivar=0; ivar < fNvars; ivar++) {
            for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
               if (  ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
                     ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
                  useVarInFisher[ivar] = kTRUE;
                  useVarInFisher[jvar] = kTRUE;
               }
            }
         }
         // now as you know which variables you want to use, count and map them:
         // such that you can use an array/matrix filled only with THOSE variables
         // that you used
         UInt_t nFisherVars = 0;
         for (UInt_t ivar=0; ivar < fNvars; ivar++) {
            //now .. pick those variables that are used in the Fisher and are also
            // part of the "allowed" variables in case of Randomized Trees)
            if (useVarInFisher[ivar] && useVariable[ivar]) {
               mapVarInFisher[nFisherVars++]=ivar;
               // now exclude the variables used in the Fisher cuts, and don't
               // use them anymore in the individual variable scan
               if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
            }
         }


         fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
         fisherOK = kTRUE;
      }
      delete [] useVarInFisher;
      delete [] mapVarInFisher;

   }


   UInt_t cNvars = fNvars;
   if (fUseFisherCuts && fisherOK) cNvars++;  // use the Fisher output simply as an additional variable

   UInt_t* nBins = new UInt_t [cNvars];
   Double_t* binWidth = new Double_t [cNvars];
   Double_t* invBinWidth = new Double_t [cNvars];

   Double_t** nSelS = new Double_t* [cNvars];
   Double_t** nSelB = new Double_t* [cNvars];
   Double_t** nSelS_unWeighted = new Double_t* [cNvars];
   Double_t** nSelB_unWeighted = new Double_t* [cNvars];
   Double_t** target = new Double_t* [cNvars];
   Double_t** target2 = new Double_t* [cNvars];
   Double_t** cutValues = new Double_t* [cNvars];

   for (UInt_t ivar=0; ivar<cNvars; ivar++) {
      nBins[ivar] = fNCuts+1;
      if (ivar < fNvars) {
         if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
            nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
         }
      }

      nSelS[ivar] = new Double_t [nBins[ivar]];
      nSelB[ivar] = new Double_t [nBins[ivar]];
      nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
      nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
      target[ivar] = new Double_t [nBins[ivar]];
      target2[ivar] = new Double_t [nBins[ivar]];
      cutValues[ivar] = new Double_t [nBins[ivar]];

   }

   Double_t *xmin = new Double_t[cNvars];
   Double_t *xmax = new Double_t[cNvars];

   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (ivar < fNvars){
         xmin[ivar]=node->GetSampleMin(ivar);
         xmax[ivar]=node->GetSampleMax(ivar);
         if (almost_equal_float(xmax[ivar], xmin[ivar])) {
            // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
            // std::cout << " will set useVariable[ivar]=false"<<std::endl;
            useVariable[ivar]=kFALSE;
         }

      } else { // the fisher variable
         xmin[ivar]=999;
         xmax[ivar]=-999;
         // too bad, for the moment I don't know how to do this without looping
         // once to get the "min max" and then AGAIN to fill the histogram
         for (UInt_t iev=0; iev<nevents; iev++) {
            // returns the Fisher value (no fixed range)
            Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
            for (UInt_t jvar=0; jvar<fNvars; jvar++)
               result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
            if (result > xmax[ivar]) xmax[ivar]=result;
            if (result < xmin[ivar]) xmin[ivar]=result;
         }
      }
      for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
         nSelS[ivar][ibin]=0;
         nSelB[ivar][ibin]=0;
         nSelS_unWeighted[ivar][ibin]=0;
         nSelB_unWeighted[ivar][ibin]=0;
         target[ivar][ibin]=0;
         target2[ivar][ibin]=0;
         cutValues[ivar][ibin]=0;
      }
   }

   // fill the cut values for the scan:
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {

      if ( useVariable[ivar] ) {

         //set the grid for the cut scan on the variables like this:
         //
         //  |       |        |         |   ...   |        |
         // xmin                                          xmax
         //
         // cut       0        1         2   ...   fNCuts-1 (counting from zero)
         // bin   0       1         2        .....  nBins-1=fNCuts (counting from zero)
         // --> nBins = fNCuts+1
         // (NOTE, the cuts at xmin or xmax would just give the whole sample and
         //  hence can be safely omitted)

         binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
         invBinWidth[ivar] = 1./binWidth[ivar];
         if (ivar < fNvars) {
            if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
         }

         // std::cout << "ivar="<<ivar
         //           <<" min="<<xmin[ivar]
         //           << " max="<<xmax[ivar]
         //           << " width=" << istepSize
         //           << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
         for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
            cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
            // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
         }
      }
   }

   nTotS=0; nTotB=0;
   nTotS_unWeighted=0; nTotB_unWeighted=0;
   for (UInt_t iev=0; iev<nevents; iev++) {

      Double_t eventWeight =  eventSample[iev]->GetWeight();
      if (eventSample[iev]->GetClass() == fSigClass) {
         nTotS+=eventWeight;
         nTotS_unWeighted++; }
      else {
         nTotB+=eventWeight;
         nTotB_unWeighted++;
      }

      Int_t iBin=-1;
      for (UInt_t ivar=0; ivar < cNvars; ivar++) {
         // now scan through the cuts for each variable and find which one gives
         // the best separationGain at the current stage.
         if ( useVariable[ivar] ) {
            Double_t eventData;
            if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
            else { // the fisher variable
               eventData = fisherCoeff[fNvars];
               for (UInt_t jvar=0; jvar<fNvars; jvar++)
                  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);

            }
            // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
            iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
            if (eventSample[iev]->GetClass() == fSigClass) {
               nSelS[ivar][iBin]+=eventWeight;
               nSelS_unWeighted[ivar][iBin]++;
            }
            else {
               nSelB[ivar][iBin]+=eventWeight;
               nSelB_unWeighted[ivar][iBin]++;
            }
            if (DoRegression()) {
               target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
               target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
            }
         }
      }
   }
   // now turn the "histogram" into a cumulative distribution
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar]) {
         for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
            nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
            nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
            nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
            nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
            if (DoRegression()) {
               target[ivar][ibin] +=target[ivar][ibin-1] ;
               target2[ivar][ibin]+=target2[ivar][ibin-1];
            }
         }
         if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
            Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
                  << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
                  << " while eventsample size = " << eventSample.size()
                  << Endl;
         }
         double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
         double totalSum=nTotS+nTotB;
         if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
            Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
                  << lastBins
                  << " while total number of events = " << totalSum
                  << Endl;
         }
      }
   }
   // now select the optimal cuts for each variable and find which one gives
   // the best separationGain at the current stage
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar]) {
         for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
            // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
            // calculated by the "SamplePurities" from the branches that would go to the
            // left or the right from this node if "these" cuts were used in the Node:
            // hereby: nSelS and nSelB would go to the right branch
            //         (nTotS - nSelS) + (nTotB - nSelB)  would go to the left branch;

            // only allow splits where both daughter nodes match the specified minimum number
            // for this use the "unweighted" events, as you are interested in statistically
            // significant splits, which is determined by the actual number of entries
            // for a node, rather than the sum of event weights.

            Double_t sl = nSelS_unWeighted[ivar][iBin];
            Double_t bl = nSelB_unWeighted[ivar][iBin];
            Double_t s  = nTotS_unWeighted;
            Double_t b  = nTotB_unWeighted;
            Double_t slW = nSelS[ivar][iBin];
            Double_t blW = nSelB[ivar][iBin];
            Double_t sW  = nTotS;
            Double_t bW  = nTotB;
            Double_t sr  = s-sl;
            Double_t br  = b-bl;
            Double_t srW = sW-slW;
            Double_t brW = bW-blW;
            // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
            if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
                 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
                 ) {

               if (DoRegression()) {
                  sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
                                                       target[ivar][iBin],target2[ivar][iBin],
                                                       nTotS+nTotB,
                                                       target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
               } else {
                  sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
               }
               if (separationGain[ivar] < sepTmp) {
                  separationGain[ivar] = sepTmp;
                  cutIndex[ivar] = iBin;
               }
            }
         }
      }
   }


   //now you have found the best separation cut for each variable, now compare the variables
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar] ) {
         if (separationGainTotal < separationGain[ivar]) {
            separationGainTotal = separationGain[ivar];
            mxVar = ivar;
         }
      }
   }

   if (mxVar >= 0) {
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
         node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
         if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
            node->SetRMS(0);
         }else{
            node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
         }
      }
      else {
         node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
         if (mxVar >=0){
            if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
            else cutType=kFALSE;
         }
      }
      node->SetSelector((UInt_t)mxVar);
      node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
      node->SetCutType(cutType);
      node->SetSeparationGain(separationGainTotal);
      if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
         node->SetNFisherCoeff(0);
         fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
         //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
      }else{
         // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero. Might
         // be even less storage space on average than storing also the mapping used otherwise;
         // can always be changed relatively easily
         node->SetNFisherCoeff(fNvars+1);
         for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
            node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
            // take the 'fisher coeff. weighted' estimate as variable importance, don't fill the offset coefficient though :)
            if (ivar<fNvars){
               fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
            }
         }
      }
   }
   else {
      separationGainTotal = 0;
   }

   // if (mxVar > -1) {
   //    std::cout << "------------------------------------------------------------------"<<std::endl;
   //    std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
   //    std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
   //    std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
   //    std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
   //    std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
   //              << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
   //    std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
   //    std::cout << " separationGainTotal " << separationGainTotal << std::endl;
   // } else {
   //    std::cout << "------------------------------------------------------------------"<<std::endl;
   //    std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
   // }
   for (UInt_t i=0; i<cNvars; i++) {
      delete [] nSelS[i];
      delete [] nSelB[i];
      delete [] nSelS_unWeighted[i];
      delete [] nSelB_unWeighted[i];
      delete [] target[i];
      delete [] target2[i];
      delete [] cutValues[i];
   }
   delete [] nSelS;
   delete [] nSelB;
   delete [] nSelS_unWeighted;
   delete [] nSelB_unWeighted;
   delete [] target;
   delete [] target2;
   delete [] cutValues;

   delete [] xmin;
   delete [] xmax;

   delete [] useVariable;
   delete [] mapVariable;

   delete [] separationGain;
   delete [] cutIndex;

   delete [] nBins;
   delete [] binWidth;
   delete [] invBinWidth;

   return separationGainTotal;

}
1353 
1354 ////////////////////////////////////////////////////////////////////////////////
1355 /// calculate the fisher coefficients for the event sample and the variables used
1356 
1357 std::vector<Double_t> TMVA::DecisionTree::GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher){
1358  std::vector<Double_t> fisherCoeff(fNvars+1);
1359 
1360  // initialization of global matrices and vectors
1361  // average value of each variables for S, B, S+B
1362  TMatrixD* meanMatx = new TMatrixD( nFisherVars, 3 );
1363 
1364  // the covariance 'within class' and 'between class' matrices
1365  TMatrixD* betw = new TMatrixD( nFisherVars, nFisherVars );
1366  TMatrixD* with = new TMatrixD( nFisherVars, nFisherVars );
1367  TMatrixD* cov = new TMatrixD( nFisherVars, nFisherVars );
1368 
1369  //
1370  // compute mean values of variables in each sample, and the overall means
1371  //
1372 
1373  // initialize internal sum-of-weights variables
1374  Double_t sumOfWeightsS = 0;
1375  Double_t sumOfWeightsB = 0;
1376 
1377 
1378  // init vectors
1379  Double_t* sumS = new Double_t[nFisherVars];
1380  Double_t* sumB = new Double_t[nFisherVars];
1381  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
1382 
1383  UInt_t nevents = eventSample.size();
1384  // compute sample means
1385  for (UInt_t ievt=0; ievt<nevents; ievt++) {
1386 
1387  // read the Training Event into "event"
1388  const Event * ev = eventSample[ievt];
1389 
1390  // sum of weights
1391  Double_t weight = ev->GetWeight();
1392  if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
1393  else sumOfWeightsB += weight;
1394 
1395  Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
1396  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
1397  sum[ivar] += ev->GetValueFast( mapVarInFisher[ivar] )*weight;
1398  }
1399  }
1400  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
1401  (*meanMatx)( ivar, 2 ) = sumS[ivar];
1402  (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
1403 
1404  (*meanMatx)( ivar, 2 ) += sumB[ivar];
1405  (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
1406 
1407  // signal + background
1408  (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
1409  }
1410 
1411  delete [] sumS;
1412 
1413  delete [] sumB;
1414 
1415  // the matrix of covariance 'within class' reflects the dispersion of the
1416  // events relative to the center of gravity of their own class
1417 
1418  // assert required
1419 
1420  assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
1421 
1422  // product matrices (x-<x>)(y-<y>) where x;y are variables
1423 
1424  const Int_t nFisherVars2 = nFisherVars*nFisherVars;
1425  Double_t *sum2Sig = new Double_t[nFisherVars2];
1426  Double_t *sum2Bgd = new Double_t[nFisherVars2];
1427  Double_t *xval = new Double_t[nFisherVars2];
1428  memset(sum2Sig,0,nFisherVars2*sizeof(Double_t));
1429  memset(sum2Bgd,0,nFisherVars2*sizeof(Double_t));
1430 
1431  // 'within class' covariance
1432  for (UInt_t ievt=0; ievt<nevents; ievt++) {
1433 
1434  // read the Training Event into "event"
1435  // const Event* ev = eventSample[ievt];
1436  const Event* ev = eventSample.at(ievt);
1437 
1438  Double_t weight = ev->GetWeight(); // may ignore events with negative weights
1439 
1440  for (UInt_t x=0; x<nFisherVars; x++) {
1441  xval[x] = ev->GetValueFast( mapVarInFisher[x] );
1442  }
1443  Int_t k=0;
1444  for (UInt_t x=0; x<nFisherVars; x++) {
1445  for (UInt_t y=0; y<nFisherVars; y++) {
1446  if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
1447  else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
1448  k++;
1449  }
1450  }
1451  }
1452  Int_t k=0;
1453  for (UInt_t x=0; x<nFisherVars; x++) {
1454  for (UInt_t y=0; y<nFisherVars; y++) {
1455  (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
1456  k++;
1457  }
1458  }
1459 
1460  delete [] sum2Sig;
1461  delete [] sum2Bgd;
1462  delete [] xval;
1463 
1464 
1465  // the matrix of covariance 'between class' reflects the dispersion of the
1466  // events of a class relative to the global center of gravity of all the class
1467  // hence the separation between classes
1468 
1469 
1470  Double_t prodSig, prodBgd;
1471 
1472  for (UInt_t x=0; x<nFisherVars; x++) {
1473  for (UInt_t y=0; y<nFisherVars; y++) {
1474 
1475  prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
1476  ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
1477  prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
1478  ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
1479 
1480  (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
1481  }
1482  }
1483 
1484 
1485 
1486  // compute full covariance matrix from sum of within and between matrices
1487  for (UInt_t x=0; x<nFisherVars; x++)
1488  for (UInt_t y=0; y<nFisherVars; y++)
1489  (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
1490 
1491  // Fisher = Sum { [coeff]*[variables] }
1492  //
1493  // let Xs be the array of the mean values of variables for signal evts
1494  // let Xb be the array of the mean values of variables for backgd evts
1495  // let InvWith be the inverse matrix of the 'within class' correlation matrix
1496  //
1497  // then the array of Fisher coefficients is
1498  // [coeff] = TMath::Sqrt(fNsig*fNbgd)/fNevt * transpose{Xs-Xb} * InvWith
1499  TMatrixD* theMat = with; // Fishers original
1500  // TMatrixD* theMat = cov; // Mahalanobis
1501 
1502  TMatrixD invCov( *theMat );
1503  if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
1504  Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
1505  << TMath::Abs(invCov.Determinant())
1506  << " did you use variables that are linear combinations of each other or highly correlated?"
1507  << Endl;
1508  }
1509  if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
1510  Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
1511  << TMath::Abs(invCov.Determinant())
1512  << " did you use variables that are linear combinations of each other?"
1513  << Endl;
1514  }
1515 
1516  invCov.Invert();
1517 
1518  // apply rescaling factor
1519  Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
1520 
1521  // compute difference of mean values
1522  std::vector<Double_t> diffMeans( nFisherVars );
1523 
1524  for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
1525  for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
1526  for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
1527  Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
1528  fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
1529  }
1530 
1531  // rescale
1532  fisherCoeff[mapVarInFisher[ivar]] *= xfact;
1533  }
1534 
1535  // offset correction
1536  Double_t f0 = 0.0;
1537  for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
1538  f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
1539  }
1540  f0 /= -2.0;
1541 
1542  fisherCoeff[fNvars] = f0; // variables are counted from zero, so the Fisher offset is stored at the END of the array
1543 
1544  return fisherCoeff;
1545 }
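
// Added illustration (hypothetical helper, not part of the original file):
// given the vector returned above, the Fisher discriminant of an event is the
// linear form F(x) = coeff[fNvars] + sum_i coeff[i]*x_i, with the offset f0
// stored in the last entry as noted above. A minimal sketch:
namespace {
   Double_t SketchEvalFisher(const std::vector<Double_t>& coeff,
                             const std::vector<Double_t>& x) // x: all fNvars inputs
   {
      Double_t f = coeff.back();            // offset f0, stored at the end
      for (size_t i = 0; i < x.size(); ++i)
         f += coeff[i]*x[i];                // linear combination of the inputs
      return f;                             // larger value: more signal-like
   }
}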
1546 
1547 ////////////////////////////////////////////////////////////////////////////////
1548 /// train a node by finding the single optimal cut for a single variable
1549 /// that best separates signal and background (maximizes the separation gain)
1550 
1551 Double_t TMVA::DecisionTree::TrainNodeFull( const EventConstList & eventSample,
1552  TMVA::DecisionTreeNode *node )
1553 {
1554  Double_t nTotS = 0.0, nTotB = 0.0;
1555  Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
1556 
1557  std::vector<TMVA::BDTEventWrapper> bdtEventSample;
1558 
1559  // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
1560  std::vector<Double_t> lCutValue( fNvars, 0.0 );
1561  std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
1562  std::vector<Char_t> lCutType( fNvars ); // <----- bools stored as Char_t (std::vector<bool> is avoided for performance reasons)
1563  lCutType.assign( fNvars, Char_t(kFALSE) );
1564 
1565  // Initialize (un)weighted counters for signal & background
1566  // Construct a list of event wrappers that point to the original data
1567  for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
1568  if((*it)->GetClass() == fSigClass) { // signal or background event
1569  nTotS += (*it)->GetWeight();
1570  ++nTotS_unWeighted;
1571  }
1572  else {
1573  nTotB += (*it)->GetWeight();
1574  ++nTotB_unWeighted;
1575  }
1576  bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
1577  }
1578 
1579  std::vector<Char_t> useVariable(fNvars); // <----- bools stored as Char_t (std::vector<bool> is avoided for performance reasons)
1580  useVariable.assign( fNvars, Char_t(kTRUE) );
1581 
1582  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
1583  if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1584  if (fUseNvars ==0 ) { // no number specified ... choose something that hopefully works well
1585  // watch out: this should never happen, as it is already initialised automatically in MethodBDT
1586  fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
1587  }
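 // Worked example (added comment): for fNvars = 16 this default gives
 // UInt_t(TMath::Sqrt(16.)+0.6) = UInt_t(4.6) = 4 candidate variables per split.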
1588  Int_t nSelectedVars = 0;
1589  while (nSelectedVars < fUseNvars) {
1590  Double_t bla = fMyTrandom->Rndm()*fNvars;
1591  useVariable[Int_t (bla)] = Char_t(kTRUE);
1592  nSelectedVars = 0;
1593  for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1594  if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
1595  }
1596  }
1597  }
1598  else {
1599  for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
1600  }
1601 
1602  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
1603  if(!useVariable[ivar]) continue; // only optimize with selected variables
1604  TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
1605  std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
1606 
1607  Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
1608  std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
1609  for( ; it != it_end; ++it ) {
1610  if((**it)->GetClass() == fSigClass ) // signal or background event
1611  sigWeightCtr += (**it)->GetWeight();
1612  else
1613  bkgWeightCtr += (**it)->GetWeight();
1614  // Store the accumulated signal (background) weights
1615  it->SetCumulativeWeight(false,bkgWeightCtr);
1616  it->SetCumulativeWeight(true,sigWeightCtr);
1617  }
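 // Note (added comment): with these cumulative sums the signal/background
 // weights on either side of any candidate cut are available in O(1):
 // left = cumulative weight at the boundary, right = total minus cumulative.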
1618 
1619  const Double_t fPMin = 1.0e-6;
1620  Bool_t cutType = kFALSE;
1621  Long64_t index = 0;
1622  Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
1623  // Locate the optimal cut for this (ivar-th) variable
1624  for( it = bdtEventSample.begin(); it != it_end; ++it ) {
1625  if( index == 0 ) { ++index; continue; }
1626  if( *(*it) == NULL ) {
1627  Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
1628  << index << ", and parent node=" << node->GetParent() << Endl;
1629  break;
1630  }
1631  dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
1632  norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
1633  // Only allow splits where both daughter nodes have the specified minimum number of events
1634  // Splits are only sensible when the data are ordered (e.g. don't split inside a sequence of identical values)
1635  if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
1636  sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
1637  if( sepTmp > separationGain ) {
1638  separationGain = sepTmp;
1639  cutValue = it->GetVal() - 0.5*dVal;
1640  Double_t nSelS = it->GetCumulativeWeight(true);
1641  Double_t nSelB = it->GetCumulativeWeight(false);
1642  // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
1643  // or by removing signal (enhancing background)
1644  if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
1645  else cutType = kFALSE;
1646  }
1647  }
1648  ++index;
1649  }
1650  lCutType[ivar] = Char_t(cutType);
1651  lCutValue[ivar] = cutValue;
1652  lSepGain[ivar] = separationGain;
1653  }
1654 
1655  Double_t separationGain = -1.0;
1656  Int_t iVarIndex = -1;
1657  for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
1658  if( lSepGain[ivar] > separationGain ) {
1659  iVarIndex = ivar;
1660  separationGain = lSepGain[ivar];
1661  }
1662  }
1663 
1664  if(iVarIndex >= 0) {
1665  node->SetSelector(iVarIndex);
1666  node->SetCutValue(lCutValue[iVarIndex]);
1667  node->SetSeparationGain(lSepGain[iVarIndex]);
1668  node->SetCutType(lCutType[iVarIndex]);
1669  fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
1670  }
1671  else {
1672  separationGain = 0.0;
1673  }
1674 
1675  return separationGain;
1676 }
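
// Added illustration (not part of the original TMVA source): a condensed,
// self-contained sketch of the scan implemented above. Events are sorted in
// one variable, cumulative signal/background weights are built, and every
// boundary between neighbouring values is tested as a cut candidate. The
// unnormalised Gini s*b/(s+b) used here is an assumption chosen for brevity;
// TMVA uses the configured SeparationBase instead.
namespace {
   struct SketchEvent { double val, weight; bool isSignal; };
   struct SketchCut   { double gain, cut; };

   double SketchGini(double s, double b) { double n = s + b; return n > 0 ? s*b/n : 0; }

   SketchCut SketchBestCut(std::vector<SketchEvent> evts)
   {
      std::sort(evts.begin(), evts.end(),
                [](const SketchEvent& a, const SketchEvent& b) { return a.val < b.val; });
      double totS = 0, totB = 0;
      for (const SketchEvent& e : evts) (e.isSignal ? totS : totB) += e.weight;
      const double parent = SketchGini(totS, totB);
      double cumS = 0, cumB = 0;
      SketchCut best = { -1.0, 0.0 };
      for (size_t i = 0; i + 1 < evts.size(); ++i) {
         (evts[i].isSignal ? cumS : cumB) += evts[i].weight;          // left side grows
         if (evts[i].val == evts[i+1].val) continue;                  // never cut inside ties
         double gain = parent - SketchGini(cumS, cumB)                // left daughter
                              - SketchGini(totS - cumS, totB - cumB); // right daughter
         if (gain > best.gain) best = { gain, 0.5*(evts[i].val + evts[i+1].val) };
      }
      return best;                                                    // midpoint cut, best gain
   }
}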
1677 
1678 ////////////////////////////////////////////////////////////////////////////////
1679 /// get the pointer to the leaf node where a particular event ends up
1680 /// (used in gradient boosting)
1681 
1682 TMVA::DecisionTreeNode* TMVA::DecisionTree::GetEventNode(const TMVA::Event & e) const
1683 {
1684  TMVA::DecisionTreeNode *current = this->GetRoot();
1685  while(current->GetNodeType() == 0) { // intermediate node in a tree
1686  current = (current->GoesRight(e)) ?
1687  (TMVA::DecisionTreeNode*)current->GetRight() :
1688  (TMVA::DecisionTreeNode*)current->GetLeft();
1689  }
1690  return current;
1691 }
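
// Added usage sketch (hypothetical variable names, not part of the original
// file): in a gradient-boosting update the leaf reached by an event is
// modified in place, e.g.
//    TMVA::DecisionTreeNode* leaf = tree.GetEventNode(*ev);
//    leaf->AddToSumTarget(residual); // accumulate the gradient in that leaf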
1692 
1693 ////////////////////////////////////////////////////////////////////////////////
1694 /// the event e is put into the decision tree (starting at the root node)
1695 /// and the output is the NodeType (signal or background) of the final node (basket)
1696 /// in which the given event ends up, i.e. the result of the classification of
1697 /// the event for this decision tree.
1698 
1699 Double_t TMVA::DecisionTree::CheckEvent( const TMVA::Event * e, Bool_t UseYesNoLeaf ) const
1700 {
1701  TMVA::DecisionTreeNode *current = this->GetRoot();
1702  if (!current){
1703  Log() << kFATAL << "CheckEvent: started with undefined ROOT node" <<Endl;
1704  return 0; // keeps Coverity happy, since it doesn't know that kFATAL causes an exit
1705  }
1706 
1707  while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
1708  current = (current->GoesRight(*e)) ?
1709  current->GetRight() :
1710  current->GetLeft();
1711  if (!current) {
1712  Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
1713  }
1714 
1715  }
1716 
1717  if ( DoRegression() ){
1718  return current->GetResponse();
1719  }
1720  else {
1721  if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
1722  else return current->GetPurity();
1723  }
1724 }
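
// Added usage sketch (hypothetical variable names): classify one event,
//    Double_t score = tree.CheckEvent(ev, /*UseYesNoLeaf=*/ kFALSE);
// with UseYesNoLeaf=kFALSE the score is the leaf purity in [0,1]; with kTRUE
// it is the hard node type, +1 (signal) or -1 (background).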
1725 
1726 ////////////////////////////////////////////////////////////////////////////////
1727 /// calculates the purity S/(S+B) of a given event sample
1728 
1729 Double_t TMVA::DecisionTree::SamplePurity( std::vector<TMVA::Event*> eventSample )
1730 {
1731  Double_t sumsig=0, sumbkg=0, sumtot=0;
1732  for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
1733  if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
1734  else sumsig+=eventSample[ievt]->GetWeight();
1735  sumtot+=eventSample[ievt]->GetWeight();
1736  }
1737  // sanity check
1738  if (sumtot!= (sumsig+sumbkg)){
1739  Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
1740  << sumtot << " " << sumsig << " " << sumbkg << Endl;
1741  }
1742  if (sumtot>0) return sumsig/(sumsig + sumbkg);
1743  else return -1;
1744 }
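
// Worked example (added comment): for a sample with summed signal weight 30
// and summed background weight 10 the purity is 30/(30+10) = 0.75.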
1745 
1746 ////////////////////////////////////////////////////////////////////////////////
1747 /// Return the relative variable importance, normalized to all
1748 /// variables together having the importance 1. The importance is
1749 /// evaluated as the total separation-gain that this variable had in
1750 /// the decision trees (weighted by the number of events)
1751 
1752 std::vector< Double_t > TMVA::DecisionTree::GetVariableImportance()
1753 {
1754  std::vector<Double_t> relativeImportance(fNvars);
1755  Double_t sum=0;
1756  for (UInt_t i=0; i< fNvars; i++) {
1757  sum += fVariableImportance[i];
1758  relativeImportance[i] = fVariableImportance[i];
1759  }
1760 
1761  for (UInt_t i=0; i< fNvars; i++) {
1762  if (sum > std::numeric_limits<double>::epsilon())
1763  relativeImportance[i] /= sum;
1764  else
1765  relativeImportance[i] = 0;
1766  }
1767  return relativeImportance;
1768 }
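
// Added illustration (hypothetical helper, not part of the original file):
// rank the variables by the relative importance returned above.
namespace {
   std::vector<size_t> SketchRankVariables(const std::vector<Double_t>& imp)
   {
      std::vector<size_t> idx(imp.size());
      for (size_t i = 0; i < idx.size(); ++i) idx[i] = i;       // 0,1,2,...
      std::sort(idx.begin(), idx.end(),
                [&imp](size_t a, size_t b) { return imp[a] > imp[b]; }); // descending
      return idx;                                               // most important first
   }
}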
1769 
1770 ////////////////////////////////////////////////////////////////////////////////
1771 /// returns the relative importance of variable ivar
1772 
1773 Double_t TMVA::DecisionTree::GetVariableImportance( UInt_t ivar )
1774 {
1775  std::vector<Double_t> relativeImportance = this->GetVariableImportance();
1776  if (ivar < fNvars) return relativeImportance[ivar];
1777  else {
1778  Log() << kFATAL << "<GetVariableImportance>" << Endl
1779  << "--- ivar = " << ivar << " is out of range " << Endl;
1780  }
1781 
1782  return -1;
1783 }
1784 