Logo ROOT  
Reference Guide
DecisionTree.cxx
Go to the documentation of this file.
1// @(#)root/tmva $Id$
2// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TMVA::DecisionTree *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Implementation of a Decision Tree *
12 * *
13 * Authors (alphabetical): *
14 * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15 * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
16 * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
17 * Eckhard v. Toerne <evt@uni-bonn.de> - U of Bonn, Germany *
18 * Jan Therhaag <Jan.Therhaag@cern.ch> - U of Bonn, Germany *
19 * *
20 * Copyright (c) 2005-2011: *
21 * CERN, Switzerland *
22 * U. of Victoria, Canada *
23 * MPI-K Heidelberg, Germany *
24 * U. of Bonn, Germany *
25 * *
26 * Redistribution and use in source and binary forms, with or without *
27 * modification, are permitted according to the terms listed in LICENSE *
28 * (http://mva.sourceforge.net/license.txt) *
29 * *
30 **********************************************************************************/
31
32/*! \class TMVA::DecisionTree
33\ingroup TMVA
34
35Implementation of a Decision Tree
36
37In a decision tree successive decision nodes are used to categorize the
38events out of the sample as either signal or background. Each node
39uses only a single discriminating variable to decide if the event is
40signal-like ("goes right") or background-like ("goes left"). This
41forms a tree like structure with "baskets" at the end (leave nodes),
42and an event is classified as either signal or background according to
43whether the basket where it ends up has been classified signal or
44background during the training. Training of a decision tree is the
45process to define the "cut criteria" for each node. The training
46starts with the root node. Here one takes the full training event
47sample and selects the variable and corresponding cut value that gives
48the best separation between signal and background at this stage. Using
49this cut criterion, the sample is then divided into two subsamples, a
50signal-like (right) and a background-like (left) sample. Two new nodes
51are then created for each of the two sub-samples and they are
52constructed using the same mechanism as described for the root
53node. The devision is stopped once a certain node has reached either a
54minimum number of events, or a minimum or maximum signal purity. These
55leave nodes are then called "signal" or "background" if they contain
56more signal respective background events from the training sample.
57
58*/
59
60#include <iostream>
61#include <algorithm>
62#include <vector>
63#include <limits>
64#include <cassert>
65
66#include "TRandom3.h"
67#include "TMath.h"
68#include "TMatrix.h"
69
70#include "TMVA/MsgLogger.h"
71#include "TMVA/DecisionTree.h"
74
75#include "TMVA/Tools.h"
76#include "TMVA/Config.h"
77
78#include "TMVA/GiniIndex.h"
79#include "TMVA/CrossEntropy.h"
81#include "TMVA/SdivSqrtSplusB.h"
82#include "TMVA/Event.h"
84#include "TMVA/IPruneTool.h"
87
88const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds
89
90using std::vector;
91
93
94bool almost_equal_float(float x, float y, int ulp=4){
95 // the machine epsilon has to be scaled to the magnitude of the values used
96 // and multiplied by the desired precision in ULPs (units in the last place)
97 return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
98 // unless the result is subnormal
99 || std::abs(x-y) < std::numeric_limits<float>::min();
100}
101
102bool almost_equal_double(double x, double y, int ulp=4){
103 // the machine epsilon has to be scaled to the magnitude of the values used
104 // and multiplied by the desired precision in ULPs (units in the last place)
105 return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
106 // unless the result is subnormal
107 || std::abs(x-y) < std::numeric_limits<double>::min();
108}
109
110////////////////////////////////////////////////////////////////////////////////
111/// default constructor using the GiniIndex as separation criterion,
112/// no restrictions on minium number of events in a leave note or the
113/// separation gain in the node splitting
114
116 BinaryTree(),
117 fNvars (0),
118 fNCuts (-1),
119 fUseFisherCuts (kFALSE),
120 fMinLinCorrForFisher (1),
121 fUseExclusiveVars (kTRUE),
122 fSepType (NULL),
123 fRegType (NULL),
124 fMinSize (0),
125 fMinNodeSize (1),
126 fMinSepGain (0),
127 fUseSearchTree(kFALSE),
128 fPruneStrength(0),
129 fPruneMethod (kNoPruning),
130 fNNodesBeforePruning(0),
131 fNodePurityLimit(0.5),
132 fRandomisedTree (kFALSE),
133 fUseNvars (0),
134 fUsePoissonNvars(kFALSE),
135 fMyTrandom (NULL),
136 fMaxDepth (999999),
137 fSigClass (0),
138 fTreeID (0),
139 fAnalysisType (Types::kClassification),
140 fDataSetInfo (NULL)
141
142{}
143
144////////////////////////////////////////////////////////////////////////////////
145/// constructor specifying the separation type, the min number of
146/// events in a no that is still subjected to further splitting, the
147/// number of bins in the grid used in applying the cut for the node
148/// splitting.
149
151 Bool_t randomisedTree, Int_t useNvars, Bool_t usePoissonNvars,
152 UInt_t nMaxDepth, Int_t iSeed, Float_t purityLimit, Int_t treeID):
153 BinaryTree(),
154 fNvars (0),
155 fNCuts (nCuts),
156 fUseFisherCuts (kFALSE),
157 fMinLinCorrForFisher (1),
158 fUseExclusiveVars (kTRUE),
159 fSepType (sepType),
160 fRegType (NULL),
161 fMinSize (0),
162 fMinNodeSize (minSize),
163 fMinSepGain (0),
164 fUseSearchTree (kFALSE),
165 fPruneStrength (0),
166 fPruneMethod (kNoPruning),
167 fNNodesBeforePruning(0),
168 fNodePurityLimit(purityLimit),
169 fRandomisedTree (randomisedTree),
170 fUseNvars (useNvars),
171 fUsePoissonNvars(usePoissonNvars),
172 fMyTrandom (new TRandom3(iSeed)),
173 fMaxDepth (nMaxDepth),
174 fSigClass (cls),
175 fTreeID (treeID),
176 fAnalysisType (Types::kClassification),
177 fDataSetInfo (dataInfo)
178{
179 if (sepType == NULL) { // it is interpreted as a regression tree, where
180 // currently the separation type (simple least square)
181 // cannot be chosen freely)
184 if ( nCuts <=0 ) {
185 fNCuts = 200;
186 Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
187 << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
188 << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
189 << Endl;
190 }
191 }else{
193 }
194}
195
196////////////////////////////////////////////////////////////////////////////////
197/// copy constructor that creates a true copy, i.e. a completely independent tree
198/// the node copy will recursively copy all the nodes
199
201 BinaryTree(),
202 fNvars (d.fNvars),
203 fNCuts (d.fNCuts),
204 fUseFisherCuts (d.fUseFisherCuts),
205 fMinLinCorrForFisher (d.fMinLinCorrForFisher),
206 fUseExclusiveVars (d.fUseExclusiveVars),
207 fSepType (d.fSepType),
208 fRegType (d.fRegType),
209 fMinSize (d.fMinSize),
210 fMinNodeSize(d.fMinNodeSize),
211 fMinSepGain (d.fMinSepGain),
212 fUseSearchTree (d.fUseSearchTree),
213 fPruneStrength (d.fPruneStrength),
214 fPruneMethod (d.fPruneMethod),
215 fNodePurityLimit(d.fNodePurityLimit),
216 fRandomisedTree (d.fRandomisedTree),
217 fUseNvars (d.fUseNvars),
218 fUsePoissonNvars(d.fUsePoissonNvars),
219 fMyTrandom (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
220 fMaxDepth (d.fMaxDepth),
221 fSigClass (d.fSigClass),
222 fTreeID (d.fTreeID),
223 fAnalysisType(d.fAnalysisType),
224 fDataSetInfo (d.fDataSetInfo)
225{
226 this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
227 this->SetParentTreeInNodes();
228 fNNodes = d.fNNodes;
229
230}
231
232
233////////////////////////////////////////////////////////////////////////////////
234/// destructor
235
237{
238 // destruction of the tree nodes done in the "base class" BinaryTree
239
240 if (fMyTrandom) delete fMyTrandom;
241 if (fRegType) delete fRegType;
242}
243
244////////////////////////////////////////////////////////////////////////////////
245/// descend a tree to find all its leaf nodes, fill max depth reached in the
246/// tree at the same time.
247
249{
250 if (n == NULL) { //default, start at the tree top, then descend recursively
251 n = this->GetRoot();
252 if (n == NULL) {
253 Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" <<Endl;
254 return ;
255 }
256 }
257
258 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
259 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
260 return;
261 } else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
262 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
263 return;
264 }
265 else {
266 if (this->GetLeftDaughter(n) != NULL) {
267 this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
268 }
269 if (this->GetRightDaughter(n) != NULL) {
270 this->SetParentTreeInNodes( this->GetRightDaughter(n) );
271 }
272 }
273 n->SetParentTree(this);
274 if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
275 return;
276}
277
278////////////////////////////////////////////////////////////////////////////////
279/// re-create a new tree (decision tree or search tree) from XML
280
282 std::string type("");
283 gTools().ReadAttr(node,"type", type);
284 DecisionTree* dt = new DecisionTree();
285
286 dt->ReadXML( node, tmva_Version_Code );
287 return dt;
288}
289
290// #### Multithreaded DecisionTree::BuildTree
291#ifdef R__USE_IMT
292//====================================================================================
293// Had to define this struct to enable parallelization.
294// Using BuildNodeInfo, each thread can return the information needed.
295// After each thread returns we can then merge the results, hence the + operator.
296//====================================================================================
298
299 BuildNodeInfo(Int_t fNvars, const TMVA::Event* evt){
300
301 nvars = fNvars;
302 xmin = std::vector<Float_t>(nvars);
303 xmax = std::vector<Float_t>(nvars);
304
305 // #### the initial min and max for each feature
306 for (Int_t ivar=0; ivar<fNvars; ivar++) {
307 const Double_t val = evt->GetValueFast(ivar);
308 xmin[ivar]=val;
309 xmax[ivar]=val;
310 }
311 };
312
313 BuildNodeInfo(Int_t fNvars, std::vector<Float_t>& inxmin, std::vector<Float_t>& inxmax){
314
315 nvars = fNvars;
316 xmin = std::vector<Float_t>(nvars);
317 xmax = std::vector<Float_t>(nvars);
318
319 // #### the initial min and max for each feature
320 for (Int_t ivar=0; ivar<fNvars; ivar++) {
321 xmin[ivar]=inxmin[ivar];
322 xmax[ivar]=inxmax[ivar];
323 }
324 };
326
327 Int_t nvars = 0;
329 Double_t suw = 0;
330 Double_t sub = 0;
332 Double_t buw = 0;
333 Double_t bub = 0;
334 Double_t target = 0;
335 Double_t target2 = 0;
336 std::vector<Float_t> xmin;
337 std::vector<Float_t> xmax;
338
339 // #### Define the addition operator for BuildNodeInfo
340 // #### Make sure both BuildNodeInfos have the same nvars if we add them
342 {
343 BuildNodeInfo ret(nvars, xmin, xmax);
344 if(nvars != other.nvars)
345 {
346 std::cout << "!!! ERROR BuildNodeInfo1+BuildNodeInfo2 failure. Nvars1 != Nvars2." << std::endl;
347 return ret;
348 }
349 ret.s = s + other.s;
350 ret.suw = suw + other.suw;
351 ret.sub = sub + other.sub;
352 ret.b = b + other.b;
353 ret.buw = buw + other.buw;
354 ret.bub = bub + other.bub;
355 ret.target = target + other.target;
356 ret.target2 = target2 + other.target2;
357
358 // xmin is the min of both, xmax is the max of both
359 for(Int_t i=0; i<nvars; i++)
360 {
361 ret.xmin[i]=xmin[i]<other.xmin[i]?xmin[i]:other.xmin[i];
362 ret.xmax[i]=xmax[i]>other.xmax[i]?xmax[i]:other.xmax[i];
363 }
364 return ret;
365 };
366
367};
368//===========================================================================
369// Done with BuildNodeInfo declaration
370//===========================================================================
371
372
373////////////////////////////////////////////////////////////////////////////////
374/// building the decision tree by recursively calling the splitting of
375/// one (root-) node into two daughter nodes (returns the number of nodes)
376
377UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
379{
380 if (node==NULL) {
381 //start with the root node
382 node = new TMVA::DecisionTreeNode();
383 fNNodes = 1;
384 this->SetRoot(node);
385 // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
386 this->GetRoot()->SetPos('s');
387 this->GetRoot()->SetDepth(0);
388 this->GetRoot()->SetParentTree(this);
389 fMinSize = fMinNodeSize/100. * eventSample.size();
390 if (GetTreeID()==0){
391 Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
392 Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
393 Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
394 }
395 }
396
397 UInt_t nevents = eventSample.size();
398
399 if (nevents > 0 ) {
400 if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
401 fVariableImportance.resize(fNvars);
402 }
403 else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
404
405 // sum up the totals
406 // sig and bkg for classification
407 // err and err2 for regression
408
409 // #### Set up prerequisite info for multithreading
411 auto seeds = ROOT::TSeqU(nPartitions);
412
413 // #### need a lambda function to pass to TThreadExecutor::MapReduce (multi-threading)
414 auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0){
415
416 Int_t start = 1.0*partition/nPartitions*eventSample.size();
417 Int_t end = (partition+1.0)/nPartitions*eventSample.size();
418
419 BuildNodeInfo nodeInfof(fNvars, eventSample[0]);
420
421 for(Int_t iev=start; iev<end; iev++){
422 const TMVA::Event* evt = eventSample[iev];
423 const Double_t weight = evt->GetWeight();
424 const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
425 if (evt->GetClass() == fSigClass) {
426 nodeInfof.s += weight;
427 nodeInfof.suw += 1;
428 nodeInfof.sub += orgWeight;
429 }
430 else {
431 nodeInfof.b += weight;
432 nodeInfof.buw += 1;
433 nodeInfof.bub += orgWeight;
434 }
435 if ( DoRegression() ) {
436 const Double_t tgt = evt->GetTarget(0);
437 nodeInfof.target +=weight*tgt;
438 nodeInfof.target2+=weight*tgt*tgt;
439 }
440
441 // save the min and max for each feature
442 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
443 const Double_t val = evt->GetValueFast(ivar);
444 if (iev==start){
445 nodeInfof.xmin[ivar]=val;
446 nodeInfof.xmax[ivar]=val;
447 }
448 if (val < nodeInfof.xmin[ivar]) nodeInfof.xmin[ivar]=val;
449 if (val > nodeInfof.xmax[ivar]) nodeInfof.xmax[ivar]=val;
450 }
451 }
452 return nodeInfof;
453 };
454
455 // #### Need an initial struct to pass to std::accumulate
456 BuildNodeInfo nodeInfoInit(fNvars, eventSample[0]);
457
458 // #### Run the threads in parallel then merge the results
459 auto redfunc = [nodeInfoInit](std::vector<BuildNodeInfo> v) -> BuildNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
460 BuildNodeInfo nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
461 //NodeInfo nodeInfo(fNvars);
462
463 if (nodeInfo.s+nodeInfo.b < 0) {
464 Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
465 << "(Nsig="<<nodeInfo.s<<" Nbkg="<<nodeInfo.b<<" Probaby you use a Monte Carlo with negative weights. That should in principle "
466 << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
467 << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
468 << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
469 << "to allow for reasonable averaging!!!" << Endl
470 << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
471 << "with negative weight in the training." << Endl;
472 double nBkg=0.;
473 for (UInt_t i=0; i<eventSample.size(); i++) {
474 if (eventSample[i]->GetClass() != fSigClass) {
475 nBkg += eventSample[i]->GetWeight();
476 Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
477 << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
478 }
479 }
480 Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
481 }
482
483 node->SetNSigEvents(nodeInfo.s);
484 node->SetNBkgEvents(nodeInfo.b);
485 node->SetNSigEvents_unweighted(nodeInfo.suw);
486 node->SetNBkgEvents_unweighted(nodeInfo.buw);
487 node->SetNSigEvents_unboosted(nodeInfo.sub);
488 node->SetNBkgEvents_unboosted(nodeInfo.bub);
489 node->SetPurity();
490 if (node == this->GetRoot()) {
491 node->SetNEvents(nodeInfo.s+nodeInfo.b);
492 node->SetNEvents_unweighted(nodeInfo.suw+nodeInfo.buw);
493 node->SetNEvents_unboosted(nodeInfo.sub+nodeInfo.bub);
494 }
495
496 // save the min and max for each feature
497 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
498 node->SetSampleMin(ivar,nodeInfo.xmin[ivar]);
499 node->SetSampleMax(ivar,nodeInfo.xmax[ivar]);
500 }
501
502 // I now demand the minimum number of events for both daughter nodes. Hence if the number
503 // of events in the parent node is not at least two times as big, I don't even need to try
504 // splitting
505
506 // ask here for actual "events" independent of their weight.. OR the weighted events
507 // to exceed the min requested number of events per daughter node
508 // (NOTE: make sure that at the eventSample at the ROOT node has sum_of_weights == sample.size() !
509 // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
510 // std::cout << "------------------------------------------------------------------"<<std::endl;
511 // std::cout << "------------------------------------------------------------------"<<std::endl;
512 // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
513 if ((eventSample.size() >= 2*fMinSize && nodeInfo.s+nodeInfo.b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
514 && ( ( nodeInfo.s!=0 && nodeInfo.b !=0 && !DoRegression()) || ( (nodeInfo.s+nodeInfo.b)!=0 && DoRegression()) ) ) {
515
516 // Train the node and figure out the separation gain and split points
517 Double_t separationGain;
518 if (fNCuts > 0){
519 separationGain = this->TrainNodeFast(eventSample, node);
520 }
521 else {
522 separationGain = this->TrainNodeFull(eventSample, node);
523 }
524
525 // The node has been trained and there is nothing to be gained by splitting
526 if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
527 // no cut can actually do anything to improve the node
528 // hence, naturally, the current node is a leaf node
529 if (DoRegression()) {
530 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
531 node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
532 if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ){
533 node->SetRMS(0);
534 }else{
535 node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
536 }
537 }
538 else {
539 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
540 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
541 else node->SetNodeType(-1);
542 }
543 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
544 }
545 else {
546 // #### Couldn't parallelize this part (filtering events from mother node to daughter nodes)
547 // #### ... would need to avoid the push_back or use some threadsafe mutex locked version...
548 std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
549 std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
550
551 Double_t nRight=0, nLeft=0;
552 Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
553
554 for (UInt_t ie=0; ie< nevents ; ie++) {
555 if (node->GoesRight(*eventSample[ie])) {
556 rightSample.push_back(eventSample[ie]);
557 nRight += eventSample[ie]->GetWeight();
558 nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
559 }
560 else {
561 leftSample.push_back(eventSample[ie]);
562 nLeft += eventSample[ie]->GetWeight();
563 nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
564 }
565 }
566 // sanity check
567 if (leftSample.empty() || rightSample.empty()) {
568
569 Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
570 << "--- Hence new node == old node ... check" << Endl
571 << "--- left:" << leftSample.size()
572 << " right:" << rightSample.size() << Endl
573 << " while the separation is thought to be " << separationGain
574 << "\n when cutting on variable " << node->GetSelector()
575 << " at value " << node->GetCutValue()
576 << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
577 }
578
579 // continue building daughter nodes for the left and the right eventsample
580 TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
581 fNNodes++;
582 rightNode->SetNEvents(nRight);
583 rightNode->SetNEvents_unboosted(nRightUnBoosted);
584 rightNode->SetNEvents_unweighted(rightSample.size());
585
586 TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');
587
588 fNNodes++;
589 leftNode->SetNEvents(nLeft);
590 leftNode->SetNEvents_unboosted(nLeftUnBoosted);
591 leftNode->SetNEvents_unweighted(leftSample.size());
592
593 node->SetNodeType(0);
594 node->SetLeft(leftNode);
595 node->SetRight(rightNode);
596
597 this->BuildTree(rightSample, rightNode);
598 this->BuildTree(leftSample, leftNode );
599
600 }
601 }
602 else{ // it is a leaf node
603 if (DoRegression()) {
604 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
605 node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
606 if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b), nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ) {
607 node->SetRMS(0);
608 }else{
609 node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
610 }
611 }
612 else {
613 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
614 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
615 else node->SetNodeType(-1);
616 // loop through the event sample ending up in this node and check for events with negative weight
617 // those "cannot" be boosted normally. Hence, if there is one of those
618 // is misclassified, find randomly as many events with positive weights in this
619 // node as needed to get the same absolute number of weight, and mark them as
620 // "not to be boosted" in order to make up for not boosting the negative weight event
621 }
622
623
624 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
625 }
626
627 // if (IsRootNode) this->CleanTree();
628 return fNNodes;
629}
630
631// Standard DecisionTree::BuildTree (multithreading is not enabled)
632#else
633
634UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
636{
637 if (node==NULL) {
638 //start with the root node
639 node = new TMVA::DecisionTreeNode();
640 fNNodes = 1;
641 this->SetRoot(node);
642 // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
643 this->GetRoot()->SetPos('s');
644 this->GetRoot()->SetDepth(0);
645 this->GetRoot()->SetParentTree(this);
646 fMinSize = fMinNodeSize/100. * eventSample.size();
647 if (GetTreeID()==0){
648 Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
649 Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
650 Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
651 }
652 }
653
654 UInt_t nevents = eventSample.size();
655
656 if (nevents > 0 ) {
657 if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
658 fVariableImportance.resize(fNvars);
659 }
660 else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
661
662 Double_t s=0, b=0;
663 Double_t suw=0, buw=0;
664 Double_t sub=0, bub=0; // unboosted!
665 Double_t target=0, target2=0;
666 Float_t *xmin = new Float_t[fNvars];
667 Float_t *xmax = new Float_t[fNvars];
668
669 // initializing xmin and xmax for each variable
670 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
671 xmin[ivar]=xmax[ivar]=0;
672 }
673 // sum up the totals
674 // sig and bkg for classification
675 // err and err2 for regression
676 for (UInt_t iev=0; iev<eventSample.size(); iev++) {
677 const TMVA::Event* evt = eventSample[iev];
678 const Double_t weight = evt->GetWeight();
679 const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
680 if (evt->GetClass() == fSigClass) {
681 s += weight;
682 suw += 1;
683 sub += orgWeight;
684 }
685 else {
686 b += weight;
687 buw += 1;
688 bub += orgWeight;
689 }
690 if ( DoRegression() ) {
691 const Double_t tgt = evt->GetTarget(0);
692 target +=weight*tgt;
693 target2+=weight*tgt*tgt;
694 }
695
696 // save the min and max for each feature
697 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
698 const Double_t val = evt->GetValueFast(ivar);
699 if (iev==0) xmin[ivar]=xmax[ivar]=val;
700 if (val < xmin[ivar]) xmin[ivar]=val;
701 if (val > xmax[ivar]) xmax[ivar]=val;
702 }
703 }
704
705
706 if (s+b < 0) {
707 Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
708 << "(Nsig="<<s<<" Nbkg="<<b<<" Probaby you use a Monte Carlo with negative weights. That should in principle "
709 << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
710 << "minimul number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
711 << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
712 << "to allow for reasonable averaging!!!" << Endl
713 << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
714 << "with negative weight in the training." << Endl;
715 double nBkg=0.;
716 for (UInt_t i=0; i<eventSample.size(); i++) {
717 if (eventSample[i]->GetClass() != fSigClass) {
718 nBkg += eventSample[i]->GetWeight();
719 Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
720 << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
721 }
722 }
723 Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
724 }
725
726 node->SetNSigEvents(s);
727 node->SetNBkgEvents(b);
728 node->SetNSigEvents_unweighted(suw);
729 node->SetNBkgEvents_unweighted(buw);
730 node->SetNSigEvents_unboosted(sub);
731 node->SetNBkgEvents_unboosted(bub);
732 node->SetPurity();
733 if (node == this->GetRoot()) {
734 node->SetNEvents(s+b);
735 node->SetNEvents_unweighted(suw+buw);
736 node->SetNEvents_unboosted(sub+bub);
737 }
738
739 // save the min and max for each feature
740 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
741 node->SetSampleMin(ivar,xmin[ivar]);
742 node->SetSampleMax(ivar,xmax[ivar]);
743
744 }
745 delete[] xmin;
746 delete[] xmax;
747
748 // I now demand the minimum number of events for both daughter nodes. Hence if the number
749 // of events in the parent node is not at least two times as big, I don't even need to try
750 // splitting
751
752 // ask here for actuall "events" independent of their weight.. OR the weighted events
753 // to execeed the min requested number of events per dauther node
754 // (NOTE: make sure that at the eventSample at the ROOT node has sum_of_weights == sample.size() !
755 // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
756 // std::cout << "------------------------------------------------------------------"<<std::endl;
757 // std::cout << "------------------------------------------------------------------"<<std::endl;
758 // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
759 if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
760 && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
761 Double_t separationGain;
762 if (fNCuts > 0){
763 separationGain = this->TrainNodeFast(eventSample, node);
764 } else {
765 separationGain = this->TrainNodeFull(eventSample, node);
766 }
767 if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
768 /// if (separationGain < 0.00000001) { // we could not gain anything, e.g. all events are in one bin,
769 // no cut can actually do anything to improve the node
770 // hence, naturally, the current node is a leaf node
771 if (DoRegression()) {
772 node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
773 node->SetResponse(target/(s+b));
774 if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
775 node->SetRMS(0);
776 }else{
777 node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
778 }
779 }
780 else {
781 node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
782
783 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
784 else node->SetNodeType(-1);
785 }
786 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
787
788 } else {
789
790 std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
791 std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
792
793 Double_t nRight=0, nLeft=0;
794 Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
795
796 for (UInt_t ie=0; ie< nevents ; ie++) {
797 if (node->GoesRight(*eventSample[ie])) {
798 rightSample.push_back(eventSample[ie]);
799 nRight += eventSample[ie]->GetWeight();
800 nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
801 }
802 else {
803 leftSample.push_back(eventSample[ie]);
804 nLeft += eventSample[ie]->GetWeight();
805 nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
806 }
807 }
808
809 // sanity check
810 if (leftSample.empty() || rightSample.empty()) {
811
812 Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
813 << "--- Hence new node == old node ... check" << Endl
814 << "--- left:" << leftSample.size()
815 << " right:" << rightSample.size() << Endl
816 << " while the separation is thought to be " << separationGain
817 << "\n when cutting on variable " << node->GetSelector()
818 << " at value " << node->GetCutValue()
819 << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
820 }
821
822 // continue building daughter nodes for the left and the right eventsample
823 TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
824 fNNodes++;
825 rightNode->SetNEvents(nRight);
826 rightNode->SetNEvents_unboosted(nRightUnBoosted);
827 rightNode->SetNEvents_unweighted(rightSample.size());
828
829 TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');
830
831 fNNodes++;
832 leftNode->SetNEvents(nLeft);
833 leftNode->SetNEvents_unboosted(nLeftUnBoosted);
834 leftNode->SetNEvents_unweighted(leftSample.size());
835
836 node->SetNodeType(0);
837 node->SetLeft(leftNode);
838 node->SetRight(rightNode);
839
840 this->BuildTree(rightSample, rightNode);
841 this->BuildTree(leftSample, leftNode );
842
843 }
844 }
845 else{ // it is a leaf node
846 if (DoRegression()) {
847 node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
848 node->SetResponse(target/(s+b));
849 if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
850 node->SetRMS(0);
851 }else{
852 node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
853 }
854 }
855 else {
856 node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
857 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
858 else node->SetNodeType(-1);
859 // loop through the event sample ending up in this node and check for events with negative weight
860 // those "cannot" be boosted normally. Hence, if there is one of those
861 // is misclassified, find randomly as many events with positive weights in this
862 // node as needed to get the same absolute number of weight, and mark them as
863 // "not to be boosted" in order to make up for not boosting the negative weight event
864 }
865
866
867 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
868 }
869
870 // if (IsRootNode) this->CleanTree();
871 return fNNodes;
872}
873
874#endif
875
876////////////////////////////////////////////////////////////////////////////////
877/// fill the existing the decision tree structure by filling event
878/// in from the top node and see where they happen to end up
879
880void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )
881{
882 for (UInt_t i=0; i<eventSample.size(); i++) {
883 this->FillEvent(*(eventSample[i]),NULL);
884 }
885}
886
887////////////////////////////////////////////////////////////////////////////////
888/// fill the existing the decision tree structure by filling event
889/// in from the top node and see where they happen to end up
890
893{
894 if (node == NULL) { // that's the start, take the Root node
895 node = this->GetRoot();
896 }
897
898 node->IncrementNEvents( event.GetWeight() );
900
901 if (event.GetClass() == fSigClass) {
902 node->IncrementNSigEvents( event.GetWeight() );
904 }
905 else {
906 node->IncrementNBkgEvents( event.GetWeight() );
908 }
909 node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
910 node->GetNBkgEvents()));
911
912 if (node->GetNodeType() == 0) { //intermediate node --> go down
913 if (node->GoesRight(event))
914 this->FillEvent(event, node->GetRight());
915 else
916 this->FillEvent(event, node->GetLeft());
917 }
918}
919
920////////////////////////////////////////////////////////////////////////////////
921/// clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree
922
924{
925 if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();
926
927}
928
929////////////////////////////////////////////////////////////////////////////////
930/// remove those last splits that result in two leaf nodes that
931/// are both of the type (i.e. both signal or both background)
932/// this of course is only a reasonable thing to do when you use
933/// "YesOrNo" leafs, while it might loose s.th. if you use the
934/// purity information in the nodes.
935/// --> hence I don't call it automatically in the tree building
936
938{
939 if (node==NULL) {
940 node = this->GetRoot();
941 }
942
943 DecisionTreeNode *l = node->GetLeft();
944 DecisionTreeNode *r = node->GetRight();
945
946 if (node->GetNodeType() == 0) {
947 this->CleanTree(l);
948 this->CleanTree(r);
949 if (l->GetNodeType() * r->GetNodeType() > 0) {
950
951 this->PruneNode(node);
952 }
953 }
954 // update the number of nodes after the cleaning
955 return this->CountNodes();
956
957}
958
959////////////////////////////////////////////////////////////////////////////////
960/// prune (get rid of internal nodes) the Decision tree to avoid overtraining
961/// several different pruning methods can be applied as selected by the
962/// variable "fPruneMethod".
963
965{
966 IPruneTool* tool(NULL);
967 PruningInfo* info(NULL);
968
969 if( fPruneMethod == kNoPruning ) return 0.0;
970
971 if (fPruneMethod == kExpectedErrorPruning)
972 // tool = new ExpectedErrorPruneTool(logfile);
973 tool = new ExpectedErrorPruneTool();
974 else if (fPruneMethod == kCostComplexityPruning)
975 {
976 tool = new CostComplexityPruneTool();
977 }
978 else {
979 Log() << kFATAL << "Selected pruning method not yet implemented "
980 << Endl;
981 }
982
983 if(!tool) return 0.0;
984
985 tool->SetPruneStrength(GetPruneStrength());
986 if(tool->IsAutomatic()) {
987 if(validationSample == NULL){
988 Log() << kFATAL << "Cannot automate the pruning algorithm without an "
989 << "independent validation sample!" << Endl;
990 }else if(validationSample->size() == 0) {
991 Log() << kFATAL << "Cannot automate the pruning algorithm with "
992 << "independent validation sample of ZERO events!" << Endl;
993 }
994 }
995
996 info = tool->CalculatePruningInfo(this,validationSample);
997 Double_t pruneStrength=0;
998 if(!info) {
999 Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
1000 << Endl;
1001 } else {
1002 pruneStrength = info->PruneStrength;
1003
1004 // Log() << kDEBUG << "Optimal prune strength (alpha): " << pruneStrength
1005 // << " has quality index " << info->QualityIndex << Endl;
1006
1007
1008 for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {
1009
1010 PruneNode(info->PruneSequence[i]);
1011 }
1012 // update the number of nodes after the pruning
1013 this->CountNodes();
1014 }
1015
1016 delete tool;
1017 delete info;
1018
1019 return pruneStrength;
1020};
1021
1022
1023////////////////////////////////////////////////////////////////////////////////
1024/// run the validation sample through the (pruned) tree and fill in the nodes
1025/// the variables NSValidation and NBValidadtion (i.e. how many of the Signal
1026/// and Background events from the validation sample. This is then later used
1027/// when asking for the "tree quality" ..
1028
1030{
1031 GetRoot()->ResetValidationData();
1032 for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
1033 CheckEventWithPrunedTree((*validationSample)[ievt]);
1034 }
1035}
1036
1037////////////////////////////////////////////////////////////////////////////////
1038/// return the misclassification rate of a pruned tree
1039/// a "pruned tree" may have set the variable "IsTerminal" to "arbitrary" at
1040/// any node, hence this tree quality testing will stop there, hence test
1041/// the pruned tree (while the full tree is still in place for normal/later use)
1042
1044{
1045 if (n == NULL) { // default, start at the tree top, then descend recursively
1046 n = this->GetRoot();
1047 if (n == NULL) {
1048 Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" <<Endl;
1049 return 0;
1050 }
1051 }
1052
1053 if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
1054 return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
1055 TestPrunedTreeQuality( n->GetRight(), mode ));
1056 }
1057 else { // terminal leaf (in a pruned subtree of T_max at least)
1058 if (DoRegression()) {
1059 Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
1060 return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
1061 }
1062 else {
1063 if (mode == 0) {
1064 if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
1065 return n->GetNBValidation();
1066 else
1067 return n->GetNSValidation();
1068 }
1069 else if ( mode == 1 ) {
1070 // calculate the weighted error using the pruning validation sample
1071 return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
1072 }
1073 else {
1074 throw std::string("Unknown ValidationQualityMode");
1075 }
1076 }
1077 }
1078}
1079
1080////////////////////////////////////////////////////////////////////////////////
1081/// pass a single validation event through a pruned decision tree
1082/// on the way down the tree, fill in all the "intermediate" information
1083/// that would normally be there from training.
1084
1086{
1087 DecisionTreeNode* current = this->GetRoot();
1088 if (current == NULL) {
1089 Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" <<Endl;
1090 }
1091
1092 while(current != NULL) {
1093 if(e->GetClass() == fSigClass)
1094 current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
1095 else
1096 current->SetNBValidation(current->GetNBValidation() + e->GetWeight());
1097
1098 if (e->GetNTargets() > 0) {
1099 current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
1100 current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
1101 }
1102
1103 if (current->GetRight() == NULL || current->GetLeft() == NULL) {
1104 current = NULL;
1105 }
1106 else {
1107 if (current->GoesRight(*e))
1108 current = (TMVA::DecisionTreeNode*)current->GetRight();
1109 else
1110 current = (TMVA::DecisionTreeNode*)current->GetLeft();
1111 }
1112 }
1113}
1114
1115////////////////////////////////////////////////////////////////////////////////
1116/// calculate the normalization factor for a pruning validation sample
1117
1119{
1120 Double_t sumWeights = 0.0;
1121 for( EventConstList::const_iterator it = validationSample->begin();
1122 it != validationSample->end(); ++it ) {
1123 sumWeights += (*it)->GetWeight();
1124 }
1125 return sumWeights;
1126}
1127
1128////////////////////////////////////////////////////////////////////////////////
1129/// return the number of terminal nodes in the sub-tree below Node n
1130
1132{
1133 if (n == NULL) { // default, start at the tree top, then descend recursively
1134 n = this->GetRoot();
1135 if (n == NULL) {
1136 Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" <<Endl;
1137 return 0;
1138 }
1139 }
1140
1141 UInt_t countLeafs=0;
1142
1143 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
1144 countLeafs += 1;
1145 }
1146 else {
1147 if (this->GetLeftDaughter(n) != NULL) {
1148 countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
1149 }
1150 if (this->GetRightDaughter(n) != NULL) {
1151 countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
1152 }
1153 }
1154 return countLeafs;
1155}
1156
1157////////////////////////////////////////////////////////////////////////////////
1158/// descend a tree to find all its leaf nodes
1159
1161{
1162 if (n == NULL) { // default, start at the tree top, then descend recursively
1163 n = this->GetRoot();
1164 if (n == NULL) {
1165 Log() << kFATAL << "DescendTree: started with undefined ROOT node" <<Endl;
1166 return ;
1167 }
1168 }
1169
1170 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
1171 // do nothing
1172 }
1173 else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
1174 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
1175 return;
1176 }
1177 else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
1178 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
1179 return;
1180 }
1181 else {
1182 if (this->GetLeftDaughter(n) != NULL) {
1183 this->DescendTree( this->GetLeftDaughter(n) );
1184 }
1185 if (this->GetRightDaughter(n) != NULL) {
1186 this->DescendTree( this->GetRightDaughter(n) );
1187 }
1188 }
1189}
1190
1191////////////////////////////////////////////////////////////////////////////////
1192/// prune away the subtree below the node
1193
1195{
1196 DecisionTreeNode *l = node->GetLeft();
1197 DecisionTreeNode *r = node->GetRight();
1198
1199 node->SetRight(NULL);
1200 node->SetLeft(NULL);
1201 node->SetSelector(-1);
1202 node->SetSeparationGain(-1);
1203 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
1204 else node->SetNodeType(-1);
1205 this->DeleteNode(l);
1206 this->DeleteNode(r);
1207 // update the stored number of nodes in the Tree
1208 this->CountNodes();
1209
1210}
1211
1212////////////////////////////////////////////////////////////////////////////////
1213/// prune a node temporarily (without actually deleting its descendants
1214/// which allows testing the pruned tree quality for many different
1215/// pruning stages without "touching" the tree.
1216
1218 if(node == NULL) return;
1219 node->SetNTerminal(1);
1220 node->SetSubTreeR( node->GetNodeR() );
1221 node->SetAlpha( std::numeric_limits<double>::infinity( ) );
1222 node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
1223 node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants FIXME not needed
1224}
1225
1226////////////////////////////////////////////////////////////////////////////////
1227/// retrieve node from the tree. Its position (up to a maximal tree depth of 64)
1228/// is coded as a sequence of left-right moves starting from the root, coded as
1229/// 0-1 bit patterns stored in the "long-integer" (i.e. 0:left ; 1:right
1230
1232{
1233 Node* current = this->GetRoot();
1234
1235 for (UInt_t i =0; i < depth; i++) {
1236 ULong_t tmp = 1 << i;
1237 if ( tmp & sequence) current = this->GetRightDaughter(current);
1238 else current = this->GetLeftDaughter(current);
1239 }
1240
1241 return current;
1242}
1243
1244////////////////////////////////////////////////////////////////////////////////
1245///
1246
1247void TMVA::DecisionTree::GetRandomisedVariables(Bool_t *useVariable, UInt_t *mapVariable, UInt_t &useNvars){
1248 for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
1249 if (fUseNvars==0) { // no number specified ... choose s.th. which hopefully works well
1250 // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
1251 fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
1252 }
1253 if (fUsePoissonNvars) useNvars=TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t) fMyTrandom->Poisson(fUseNvars)));
1254 else useNvars = fUseNvars;
1255
1256 UInt_t nSelectedVars = 0;
1257 while (nSelectedVars < useNvars) {
1258 Double_t bla = fMyTrandom->Rndm()*fNvars;
1259 useVariable[Int_t (bla)] = kTRUE;
1260 nSelectedVars = 0;
1261 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1262 if (useVariable[ivar] == kTRUE) {
1263 mapVariable[nSelectedVars] = ivar;
1264 nSelectedVars++;
1265 }
1266 }
1267 }
1268 if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandisedVariables()... sorry" << std::endl; std::exit(1);}
1269}
1270
1271// Multithreaded version of DecisionTree::TrainNodeFast
1272#ifdef R__USE_IMT
1273//====================================================================================
1274// Had to define this struct to enable parallelization in TrainNodeFast.
1275// Using TrainNodeInfo, each thread can return the information needed.
1276// After each thread returns we can then merge the results, hence the + operator.
1277//====================================================================================
1279
1280 TrainNodeInfo(Int_t cNvars_, UInt_t* nBins_){
1281
1282 cNvars = cNvars_;
1283 nBins = nBins_;
1284
1285 nSelS = std::vector< std::vector<Double_t> >(cNvars);
1286 nSelB = std::vector< std::vector<Double_t> >(cNvars);
1287 nSelS_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
1288 nSelB_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
1289 target = std::vector< std::vector<Double_t> >(cNvars);
1290 target2 = std::vector< std::vector<Double_t> >(cNvars);
1291
1292 for(Int_t ivar=0; ivar<cNvars; ivar++){
1293 nSelS[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1294 nSelB[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1295 nSelS_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1296 nSelB_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1297 target[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1298 target2[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1299 }
1300 };
1301
1303
1304 // #### malloc problem if I define this and try to destruct xmin and xmax...
1305 // #### Maybe someone better at C++ can figure out why and fix this if it's a
1306 // ### serious memory problem
1307 //~TrainNodeInfo(){
1308 // delete [] xmin;
1309 // delete [] xmax;
1310 //};
1311
1312 Int_t cNvars = 0;
1314
1315 Double_t nTotS = 0;
1316 Double_t nTotS_unWeighted = 0;
1317 Double_t nTotB = 0;
1318 Double_t nTotB_unWeighted = 0;
1319
1320 std::vector< std::vector<Double_t> > nSelS;
1321 std::vector< std::vector<Double_t> > nSelB;
1322 std::vector< std::vector<Double_t> > nSelS_unWeighted;
1323 std::vector< std::vector<Double_t> > nSelB_unWeighted;
1324 std::vector< std::vector<Double_t> > target;
1325 std::vector< std::vector<Double_t> > target2;
1326
1327 // Define the addition operator for TrainNodeInfo
1328 // Make sure both TrainNodeInfos have the same nvars if we add them
1330 {
1331 TrainNodeInfo ret(cNvars, nBins);
1332
1333 // check that the two are compatible to add
1334 if(cNvars != other.cNvars)
1335 {
1336 std::cout << "!!! ERROR TrainNodeInfo1+TrainNodeInfo2 failure. cNvars1 != cNvars2." << std::endl;
1337 return ret;
1338 }
1339
1340 // add the signal, background, and target sums
1341 for (Int_t ivar=0; ivar<cNvars; ivar++) {
1342 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1343 ret.nSelS[ivar][ibin] = nSelS[ivar][ibin] + other.nSelS[ivar][ibin];
1344 ret.nSelB[ivar][ibin] = nSelB[ivar][ibin] + other.nSelB[ivar][ibin];
1345 ret.nSelS_unWeighted[ivar][ibin] = nSelS_unWeighted[ivar][ibin] + other.nSelS_unWeighted[ivar][ibin];
1346 ret.nSelB_unWeighted[ivar][ibin] = nSelB_unWeighted[ivar][ibin] + other.nSelB_unWeighted[ivar][ibin];
1347 ret.target[ivar][ibin] = target[ivar][ibin] + other.target[ivar][ibin];
1348 ret.target2[ivar][ibin] = target2[ivar][ibin] + other.target2[ivar][ibin];
1349 }
1350 }
1351
1352 ret.nTotS = nTotS + other.nTotS;
1353 ret.nTotS_unWeighted = nTotS_unWeighted + other.nTotS_unWeighted;
1354 ret.nTotB = nTotB + other.nTotB;
1355 ret.nTotB_unWeighted = nTotB_unWeighted + other.nTotB_unWeighted;
1356
1357 return ret;
1358 };
1359
1360};
1361//===========================================================================
1362// Done with TrainNodeInfo declaration
1363//===========================================================================
1364
1365////////////////////////////////////////////////////////////////////////////////
1366/// Decide how to split a node using one of the variables that gives
1367/// the best separation of signal/background. In order to do this, for each
1368/// variable a scan of the different cut values in a grid (grid = fNCuts) is
1369/// performed and the resulting separation gains are compared.
1370/// in addition to the individual variables, one can also ask for a fisher
1371/// discriminant being built out of (some) of the variables and used as a
1372/// possible multivariate split.
1373
1376{
1377 // #### OK let's comment this one to see how to parallelize it
1378 Double_t separationGainTotal = -1;
1379 Double_t *separationGain = new Double_t[fNvars+1];
1380 Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1381
1382 // #### initialize the sep gain and cut index values
1383 for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1384 separationGain[ivar]=-1;
1385 cutIndex[ivar]=-1;
1386 }
1387 // ### set up some other variables
1388 Int_t mxVar = -1;
1389 Bool_t cutType = kTRUE;
1390 UInt_t nevents = eventSample.size();
1391
1392
1393 // the +1 comes from the fact that I treat later on the Fisher output as an
1394 // additional possible variable.
1395 Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1396 UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1397
1398 std::vector<Double_t> fisherCoeff;
1399
1400 // #### set up a map to the subset of variables using two arrays
1401 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1402 UInt_t tmp=fUseNvars;
1403 GetRandomisedVariables(useVariable,mapVariable,tmp);
1404 }
1405 else {
1406 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1407 useVariable[ivar] = kTRUE;
1408 mapVariable[ivar] = ivar;
1409 }
1410 }
1411 // #### last variable entry is the fisher variable
1412 useVariable[fNvars] = kFALSE; //by default fisher is not used..
1413
1414 // #### Begin Fisher calculation
1415 Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1416 if (fUseFisherCuts) {
1417 useVariable[fNvars] = kTRUE; // that's were I store the "fisher MVA"
1418
1419 //use for the Fisher discriminant ONLY those variables that show
1420 //some reasonable linear correlation in either Signal or Background
1421 Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1422 UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1423 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1424 useVarInFisher[ivar] = kFALSE;
1425 mapVarInFisher[ivar] = ivar;
1426 }
1427
1428 std::vector<TMatrixDSym*>* covMatrices;
1429 covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1430 if (!covMatrices){
1431 Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
1432 fisherOK = kFALSE;
1433 }else{
1434 TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1435 TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1436 const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1437 const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1438
1439 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1440 for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1441 if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1442 ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1443 useVarInFisher[ivar] = kTRUE;
1444 useVarInFisher[jvar] = kTRUE;
1445 }
1446 }
1447 }
1448 // now as you know which variables you want to use, count and map them:
1449 // such that you can use an array/matrix filled only with THOSE variables
1450 // that you used
1451 UInt_t nFisherVars = 0;
1452 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1453 //now .. pick those variables that are used in the FIsher and are also
1454 // part of the "allowed" variables in case of Randomized Trees)
1455 if (useVarInFisher[ivar] && useVariable[ivar]) {
1456 mapVarInFisher[nFisherVars++]=ivar;
1457 // now exclude the variables used in the Fisher cuts, and don't
1458 // use them anymore in the individual variable scan
1459 if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1460 }
1461 }
1462
1463
1464 fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1465 fisherOK = kTRUE;
1466 }
1467 delete [] useVarInFisher;
1468 delete [] mapVarInFisher;
1469
1470 }
1471 // #### End Fisher calculation
1472
1473
1474 UInt_t cNvars = fNvars;
1475 if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simple as additional variable
1476
1477 // #### set up the binning info arrays
1478 // #### each var has its own binning since some may be integers
1479 UInt_t* nBins = new UInt_t [cNvars];
1480 Double_t* binWidth = new Double_t [cNvars];
1481 Double_t* invBinWidth = new Double_t [cNvars];
1482 Double_t** cutValues = new Double_t* [cNvars];
1483
1484 // #### set up the xmin and xmax arrays
1485 // #### each var has its own range
1486 Double_t *xmin = new Double_t[cNvars];
1487 Double_t *xmax = new Double_t[cNvars];
1488
1489 // construct and intialize binning/cuts
1490 for (UInt_t ivar=0; ivar<cNvars; ivar++) {
1491 // ncuts means that we need n+1 bins for each variable
1492 nBins[ivar] = fNCuts+1;
1493 if (ivar < fNvars) {
1494 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
1495 nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
1496 }
1497 }
1498
1499 cutValues[ivar] = new Double_t [nBins[ivar]];
1500 }
1501
1502 // init the range and cutvalues for each var now that we know the binning
1503 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1504 if (ivar < fNvars){
1505 xmin[ivar]=node->GetSampleMin(ivar);
1506 xmax[ivar]=node->GetSampleMax(ivar);
1507 if (almost_equal_float(xmax[ivar], xmin[ivar])) {
1508 // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
1509 // std::cout << " will set useVariable[ivar]=false"<<std::endl;
1510 useVariable[ivar]=kFALSE;
1511 }
1512
1513 } else { // the fisher variable
1514 xmin[ivar]=999;
1515 xmax[ivar]=-999;
1516 // too bad, for the moment I don't know how to do this without looping
1517 // once to get the "min max" and then AGAIN to fill the histogram
1518 for (UInt_t iev=0; iev<nevents; iev++) {
1519 // returns the Fisher value (no fixed range)
1520 Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
1521 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1522 result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1523 if (result > xmax[ivar]) xmax[ivar]=result;
1524 if (result < xmin[ivar]) xmin[ivar]=result;
1525 }
1526 }
1527 // this loop could take a long time if nbins is large
1528 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1529 cutValues[ivar][ibin]=0;
1530 }
1531 }
1532
1533 // ====================================================================
1534 // ====================================================================
1535 // Parallelized Version
1536 // ====================================================================
1537 // ====================================================================
1538
1539 // #### Figure out the cut values, loops through vars then through cuts
1540 // #### if ncuts is on the order of the amount of training data/10 - ish then we get some gains from parallelizing this
1541 // fill the cut values for the scan:
1542 auto varSeeds = ROOT::TSeqU(cNvars);
1543 auto fvarInitCuts = [this, &useVariable, &cutValues, &invBinWidth, &binWidth, &nBins, &xmin, &xmax](UInt_t ivar = 0){
1544
1545 if ( useVariable[ivar] ) {
1546
1547 //set the grid for the cut scan on the variables like this:
1548 //
1549 // | | | | | ... | |
1550 // xmin xmax
1551 //
1552 // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
1553 // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
1554 // --> nBins = fNCuts+1
1555 // (NOTE, the cuts at xmin or xmax would just give the whole sample and
1556 // hence can be safely omitted
1557
1558 binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
1559 invBinWidth[ivar] = 1./binWidth[ivar];
1560 if (ivar < fNvars) {
1561 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
1562 }
1563
1564 // std::cout << "ivar="<<ivar
1565 // <<" min="<<xmin[ivar]
1566 // << " max="<<xmax[ivar]
1567 // << " width=" << istepSize
1568 // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
1569 for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
1570 cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
1571 // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
1572 }
1573 }
1574 return 0;
1575 };
1576 TMVA::Config::Instance().GetThreadExecutor().Map(fvarInitCuts, varSeeds);
1577
1578 // #### Loop through the events to get the total sig and background
1579 // #### Then loop through the vars to get the counts in each bin in each var
1580 // #### So we have a loop through the events and a loop through the vars, but no loop through the cuts this is a calculation
1581
1582 TrainNodeInfo nodeInfo(cNvars, nBins);
1584
1585 // #### When nbins is low compared to ndata this version of parallelization is faster, so use it
1586 // #### Parallelize by chunking the data into the same number of sections as we have processors
1587 if(eventSample.size() >= cNvars*fNCuts*nPartitions*2)
1588 {
1589 auto seeds = ROOT::TSeqU(nPartitions);
1590
1591 // need a lambda function to pass to TThreadExecutor::MapReduce
1592 auto f = [this, &eventSample, &fisherCoeff, &useVariable, &invBinWidth,
1593 &nBins, &xmin, &cNvars, &nPartitions](UInt_t partition = 0){
1594
1595 UInt_t start = 1.0*partition/nPartitions*eventSample.size();
1596 UInt_t end = (partition+1.0)/nPartitions*eventSample.size();
1597
1598 TrainNodeInfo nodeInfof(cNvars, nBins);
1599
1600 for(UInt_t iev=start; iev<end; iev++) {
1601
1602 Double_t eventWeight = eventSample[iev]->GetWeight();
1603 if (eventSample[iev]->GetClass() == fSigClass) {
1604 nodeInfof.nTotS+=eventWeight;
1605 nodeInfof.nTotS_unWeighted++; }
1606 else {
1607 nodeInfof.nTotB+=eventWeight;
1608 nodeInfof.nTotB_unWeighted++;
1609 }
1610
1611 // #### Count the number in each bin
1612 Int_t iBin=-1;
1613 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1614 // now scan trough the cuts for each varable and find which one gives
1615 // the best separationGain at the current stage.
1616 if ( useVariable[ivar] ) {
1617 Double_t eventData;
1618 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1619 else { // the fisher variable
1620 eventData = fisherCoeff[fNvars];
1621 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1622 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1623
1624 }
1625 // #### figure out which bin it belongs in ...
1626 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1627 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1628 if (eventSample[iev]->GetClass() == fSigClass) {
1629 nodeInfof.nSelS[ivar][iBin]+=eventWeight;
1630 nodeInfof.nSelS_unWeighted[ivar][iBin]++;
1631 }
1632 else {
1633 nodeInfof.nSelB[ivar][iBin]+=eventWeight;
1634 nodeInfof.nSelB_unWeighted[ivar][iBin]++;
1635 }
1636 if (DoRegression()) {
1637 nodeInfof.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1638 nodeInfof.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1639 }
1640 }
1641 }
1642 }
1643 return nodeInfof;
1644 };
1645
1646 // #### Need an intial struct to pass to std::accumulate
1647 TrainNodeInfo nodeInfoInit(cNvars, nBins);
1648
1649 // #### Run the threads in parallel then merge the results
1650 auto redfunc = [nodeInfoInit](std::vector<TrainNodeInfo> v) -> TrainNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
1651 nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
1652 }
1653
1654
1655 // #### When nbins is close to the order of the data this version of parallelization is faster
1656 // #### Parallelize by vectorizing the variable loop
1657 else {
1658
1659 auto fvarFillNodeInfo = [this, &nodeInfo, &eventSample, &fisherCoeff, &useVariable, &invBinWidth, &nBins, &xmin](UInt_t ivar = 0){
1660
1661 for(UInt_t iev=0; iev<eventSample.size(); iev++) {
1662
1663 Int_t iBin=-1;
1664 Double_t eventWeight = eventSample[iev]->GetWeight();
1665
1666 // Only count the net signal and background once
1667 if(ivar==0){
1668 if (eventSample[iev]->GetClass() == fSigClass) {
1669 nodeInfo.nTotS+=eventWeight;
1670 nodeInfo.nTotS_unWeighted++; }
1671 else {
1672 nodeInfo.nTotB+=eventWeight;
1673 nodeInfo.nTotB_unWeighted++;
1674 }
1675 }
1676
1677 // Figure out which bin the event belongs in and increment the bin in each histogram vector appropriately
1678 if ( useVariable[ivar] ) {
1679 Double_t eventData;
1680 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1681 else { // the fisher variable
1682 eventData = fisherCoeff[fNvars];
1683 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1684 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1685
1686 }
1687 // #### figure out which bin it belongs in ...
1688 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1689 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1690 if (eventSample[iev]->GetClass() == fSigClass) {
1691 nodeInfo.nSelS[ivar][iBin]+=eventWeight;
1692 nodeInfo.nSelS_unWeighted[ivar][iBin]++;
1693 }
1694 else {
1695 nodeInfo.nSelB[ivar][iBin]+=eventWeight;
1696 nodeInfo.nSelB_unWeighted[ivar][iBin]++;
1697 }
1698 if (DoRegression()) {
1699 nodeInfo.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1700 nodeInfo.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1701 }
1702 }
1703 }
1704 return 0;
1705 };
1706
1707 TMVA::Config::Instance().GetThreadExecutor().Map(fvarFillNodeInfo, varSeeds);
1708 }
1709
1710 // now turn each "histogram" into a cumulative distribution
1711 // #### loops through the vars and then the bins, if the bins are on the order of the training data this is worth parallelizing
1712 // #### doesn't hurt otherwise, pretty unnoticeable
1713 auto fvarCumulative = [&nodeInfo, &useVariable, &nBins, this, &eventSample](UInt_t ivar = 0){
1714 if (useVariable[ivar]) {
1715 for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
1716 nodeInfo.nSelS[ivar][ibin]+=nodeInfo.nSelS[ivar][ibin-1];
1717 nodeInfo.nSelS_unWeighted[ivar][ibin]+=nodeInfo.nSelS_unWeighted[ivar][ibin-1];
1718 nodeInfo.nSelB[ivar][ibin]+=nodeInfo.nSelB[ivar][ibin-1];
1719 nodeInfo.nSelB_unWeighted[ivar][ibin]+=nodeInfo.nSelB_unWeighted[ivar][ibin-1];
1720 if (DoRegression()) {
1721 nodeInfo.target[ivar][ibin] +=nodeInfo.target[ivar][ibin-1] ;
1722 nodeInfo.target2[ivar][ibin]+=nodeInfo.target2[ivar][ibin-1];
1723 }
1724 }
1725 if (nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
1726 Log() << kFATAL << "Helge, you have a bug ....nodeInfo.nSelS_unw..+nodeInfo.nSelB_unw..= "
1727 << nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1]
1728 << " while eventsample size = " << eventSample.size()
1729 << Endl;
1730 }
1731 double lastBins=nodeInfo.nSelS[ivar][nBins[ivar]-1] +nodeInfo.nSelB[ivar][nBins[ivar]-1];
1732 double totalSum=nodeInfo.nTotS+nodeInfo.nTotB;
1733 if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
1734 Log() << kFATAL << "Helge, you have another bug ....nodeInfo.nSelS+nodeInfo.nSelB= "
1735 << lastBins
1736 << " while total number of events = " << totalSum
1737 << Endl;
1738 }
1739 }
1740 return 0;
1741 };
1742 TMVA::Config::Instance().GetThreadExecutor().Map(fvarCumulative, varSeeds);
1743
1744 // #### Again, if bins is on the order of the training data or with an order or so, then this is worth parallelizing
1745 // now select the optimal cuts for each varable and find which one gives
1746 // the best separationGain at the current stage
1747 auto fvarMaxSep = [&nodeInfo, &useVariable, this, &separationGain, &cutIndex, &nBins] (UInt_t ivar = 0){
1748 if (useVariable[ivar]) {
1749 Double_t sepTmp;
1750 for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
1751 // the separationGain is defined as the various indices (Gini, CorssEntropy, e.t.c)
1752 // calculated by the "SamplePurities" from the branches that would go to the
1753 // left or the right from this node if "these" cuts were used in the Node:
1754 // hereby: nodeInfo.nSelS and nodeInfo.nSelB would go to the right branch
1755 // (nodeInfo.nTotS - nodeInfo.nSelS) + (nodeInfo.nTotB - nodeInfo.nSelB) would go to the left branch;
1756
1757 // only allow splits where both daughter nodes match the specified minimum number
1758 // for this use the "unweighted" events, as you are interested in statistically
1759 // significant splits, which is determined by the actual number of entries
1760 // for a node, rather than the sum of event weights.
1761
1762 Double_t sl = nodeInfo.nSelS_unWeighted[ivar][iBin];
1763 Double_t bl = nodeInfo.nSelB_unWeighted[ivar][iBin];
1764 Double_t s = nodeInfo.nTotS_unWeighted;
1765 Double_t b = nodeInfo.nTotB_unWeighted;
1766 Double_t slW = nodeInfo.nSelS[ivar][iBin];
1767 Double_t blW = nodeInfo.nSelB[ivar][iBin];
1768 Double_t sW = nodeInfo.nTotS;
1769 Double_t bW = nodeInfo.nTotB;
1770 Double_t sr = s-sl;
1771 Double_t br = b-bl;
1772 Double_t srW = sW-slW;
1773 Double_t brW = bW-blW;
1774 // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
1775 if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
1776 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
1777 ) {
1778
1779 if (DoRegression()) {
1780 sepTmp = fRegType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin]+nodeInfo.nSelB[ivar][iBin],
1781 nodeInfo.target[ivar][iBin],nodeInfo.target2[ivar][iBin],
1782 nodeInfo.nTotS+nodeInfo.nTotB,
1783 nodeInfo.target[ivar][nBins[ivar]-1],nodeInfo.target2[ivar][nBins[ivar]-1]);
1784 } else {
1785 sepTmp = fSepType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin], nodeInfo.nSelB[ivar][iBin], nodeInfo.nTotS, nodeInfo.nTotB);
1786 }
1787 if (separationGain[ivar] < sepTmp) {
1788 separationGain[ivar] = sepTmp;
1789 cutIndex[ivar] = iBin;
1790 }
1791 }
1792 }
1793 }
1794 return 0;
1795 };
1796 TMVA::Config::Instance().GetThreadExecutor().Map(fvarMaxSep, varSeeds);
1797
1798 // you found the best separation cut for each variable, now compare the variables
1799 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1800 if (useVariable[ivar] ) {
1801 if (separationGainTotal < separationGain[ivar]) {
1802 separationGainTotal = separationGain[ivar];
1803 mxVar = ivar;
1804 }
1805 }
1806 }
1807
1808 if (mxVar >= 0) {
1809 if (DoRegression()) {
1810 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.nTotS+nodeInfo.nTotB,nodeInfo.target[0][nBins[mxVar]-1],nodeInfo.target2[0][nBins[mxVar]-1]));
1811 node->SetResponse(nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB));
1812 if ( almost_equal_double(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB), nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB))) {
1813 node->SetRMS(0);
1814
1815 }else{
1816 node->SetRMS(TMath::Sqrt(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB) - nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)));
1817 }
1818 }
1819 else {
1820 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.nTotS,nodeInfo.nTotB));
1821 if (mxVar >=0){
1822 if (nodeInfo.nSelS[mxVar][cutIndex[mxVar]]/nodeInfo.nTotS > nodeInfo.nSelB[mxVar][cutIndex[mxVar]]/nodeInfo.nTotB) cutType=kTRUE;
1823 else cutType=kFALSE;
1824 }
1825 }
1826 node->SetSelector((UInt_t)mxVar);
1827 node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
1828 node->SetCutType(cutType);
1829 node->SetSeparationGain(separationGainTotal);
1830 if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
1831 node->SetNFisherCoeff(0);
1832 fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1833 //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1834 }else{
1835 // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero. Might
1836 // be even less storage space on average than storing also the mapping used otherwise
1837 // can always be changed relatively easy
1838 node->SetNFisherCoeff(fNvars+1);
1839 for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
1840 node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
1841 // take 'fisher coeff. weighted estimate as variable importance, "Don't fill the offset coefficient though :)
1842 if (ivar<fNvars){
1843 fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1844 }
1845 }
1846 }
1847 }
1848 else {
1849 separationGainTotal = 0;
1850 }
1851
1852 // #### Now in TrainNodeInfo, but I got a malloc segfault when I tried to destruct arrays there.
1853 // #### So, I changed these from dynamic arrays to std::vector to fix this memory problem
1854 // #### so no need to destruct them anymore. I didn't see any performance drop as a result.
1855 for (UInt_t i=0; i<cNvars; i++) {
1856 // delete [] nodeInfo.nSelS[i];
1857 // delete [] nodeInfo.nSelB[i];
1858 // delete [] nodeInfo.nSelS_unWeighted[i];
1859 // delete [] nodeInfo.nSelB_unWeighted[i];
1860 // delete [] nodeInfo.target[i];
1861 // delete [] nodeInfo.target2[i];
1862 delete [] cutValues[i];
1863 }
1864 //delete [] nodeInfo.nSelS;
1865 //delete [] nodeInfo.nSelB;
1866 //delete [] nodeInfo.nSelS_unWeighted;
1867 //delete [] nodeInfo.nSelB_unWeighted;
1868 //delete [] nodeInfo.target;
1869 //delete [] nodeInfo.target2;
1870
1871 // #### left these as dynamic arrays as they were before
1872 // #### since I didn't need to mess with them for parallelization
1873 delete [] cutValues;
1874
1875 delete [] xmin;
1876 delete [] xmax;
1877
1878 delete [] useVariable;
1879 delete [] mapVariable;
1880
1881 delete [] separationGain;
1882 delete [] cutIndex;
1883
1884 delete [] nBins;
1885 delete [] binWidth;
1886 delete [] invBinWidth;
1887
1888 return separationGainTotal;
1889}
1890
1891// Standard version of DecisionTree::TrainNodeFast (multithreading is not enabled)
1892#else
1893Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
1895{
1896// #### OK let's comment this one to see how to parallelize it
1897 Double_t separationGainTotal = -1, sepTmp;
1898 Double_t *separationGain = new Double_t[fNvars+1];
1899 Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1900
1901 // #### initialize the sep gain and cut index values
1902 for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1903 separationGain[ivar]=-1;
1904 cutIndex[ivar]=-1;
1905 }
1906 // ### set up some other variables
1907 Int_t mxVar = -1;
1908 Bool_t cutType = kTRUE;
1909 Double_t nTotS, nTotB;
1910 Int_t nTotS_unWeighted, nTotB_unWeighted;
1911 UInt_t nevents = eventSample.size();
1912
1913
1914 // the +1 comes from the fact that I treat later on the Fisher output as an
1915 // additional possible variable.
1916 Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1917 UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1918
1919 std::vector<Double_t> fisherCoeff;
1920
1921 // #### set up a map to the subset of variables using two arrays
1922 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1923 UInt_t tmp=fUseNvars;
1924 GetRandomisedVariables(useVariable,mapVariable,tmp);
1925 }
1926 else {
1927 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1928 useVariable[ivar] = kTRUE;
1929 mapVariable[ivar] = ivar;
1930 }
1931 }
1932 // #### last variable entry is the fisher variable
1933 useVariable[fNvars] = kFALSE; //by default fisher is not used..
1934
1935 // #### Begin Fisher calculation
1936 Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1937 if (fUseFisherCuts) {
1938 useVariable[fNvars] = kTRUE; // that's were I store the "fisher MVA"
1939
1940 //use for the Fisher discriminant ONLY those variables that show
1941 //some reasonable linear correlation in either Signal or Background
1942 Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1943 UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1944 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1945 useVarInFisher[ivar] = kFALSE;
1946 mapVarInFisher[ivar] = ivar;
1947 }
1948
1949 std::vector<TMatrixDSym*>* covMatrices;
1950 covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1951 if (!covMatrices){
1952 Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
1953 fisherOK = kFALSE;
1954 }else{
1955 TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1956 TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1957 const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1958 const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1959
1960 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1961 for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1962 if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1963 ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1964 useVarInFisher[ivar] = kTRUE;
1965 useVarInFisher[jvar] = kTRUE;
1966 }
1967 }
1968 }
1969 // now as you know which variables you want to use, count and map them:
1970 // such that you can use an array/matrix filled only with THOSE variables
1971 // that you used
1972 UInt_t nFisherVars = 0;
1973 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1974 //now .. pick those variables that are used in the FIsher and are also
1975 // part of the "allowed" variables in case of Randomized Trees)
1976 if (useVarInFisher[ivar] && useVariable[ivar]) {
1977 mapVarInFisher[nFisherVars++]=ivar;
1978 // now exclud the the variables used in the Fisher cuts, and don't
1979 // use them anymore in the individual variable scan
1980 if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1981 }
1982 }
1983
1984
1985 fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1986 fisherOK = kTRUE;
1987 }
1988 delete [] useVarInFisher;
1989 delete [] mapVarInFisher;
1990
1991 }
1992 // #### End Fisher calculation
1993
1994
1995 UInt_t cNvars = fNvars;
1996 if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simple as additional variable
1997
1998 // #### OK now what's going on...
1999 // #### looks like we are setting up histograms
2000 UInt_t* nBins = new UInt_t [cNvars];
2001 Double_t* binWidth = new Double_t [cNvars];
2002 Double_t* invBinWidth = new Double_t [cNvars];
2003
2004 Double_t** nSelS = new Double_t* [cNvars];
2005 Double_t** nSelB = new Double_t* [cNvars];
2006 Double_t** nSelS_unWeighted = new Double_t* [cNvars];
2007 Double_t** nSelB_unWeighted = new Double_t* [cNvars];
2008 Double_t** target = new Double_t* [cNvars];
2009 Double_t** target2 = new Double_t* [cNvars];
2010 Double_t** cutValues = new Double_t* [cNvars];
2011
2012 // #### looping through the variables...
2013 for (UInt_t ivar=0; ivar<cNvars; ivar++) {
2014 // #### ncuts means that we need n+1 bins for each variable
2015 nBins[ivar] = fNCuts+1;
2016 if (ivar < fNvars) {
2017 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
2018 nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
2019 }
2020 }
2021
2022 // #### make some new arrays for each ith var, size=nbins for each array
2023 // #### integer features get the same number of bins as values, set later
2024 nSelS[ivar] = new Double_t [nBins[ivar]];
2025 nSelB[ivar] = new Double_t [nBins[ivar]];
2026 nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
2027 nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
2028 target[ivar] = new Double_t [nBins[ivar]];
2029 target2[ivar] = new Double_t [nBins[ivar]];
2030 cutValues[ivar] = new Double_t [nBins[ivar]];
2031
2032 }
2033
2034 // #### xmin and xmax for earch variable
2035 Double_t *xmin = new Double_t[cNvars];
2036 Double_t *xmax = new Double_t[cNvars];
2037
2038 // #### ok loop through each variable to initialize all the values
2039 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2040 if (ivar < fNvars){
2041 xmin[ivar]=node->GetSampleMin(ivar);
2042 xmax[ivar]=node->GetSampleMax(ivar);
2043 if (almost_equal_float(xmax[ivar], xmin[ivar])) {
2044 // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
2045 // std::cout << " will set useVariable[ivar]=false"<<std::endl;
2046 useVariable[ivar]=kFALSE;
2047 }
2048
2049 } else { // the fisher variable
2050 xmin[ivar]=999;
2051 xmax[ivar]=-999;
2052 // too bad, for the moment I don't know how to do this without looping
2053 // once to get the "min max" and then AGAIN to fill the histogram
2054 for (UInt_t iev=0; iev<nevents; iev++) {
2055 // returns the Fisher value (no fixed range)
2056 Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
2057 for (UInt_t jvar=0; jvar<fNvars; jvar++)
2058 result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2059 if (result > xmax[ivar]) xmax[ivar]=result;
2060 if (result < xmin[ivar]) xmin[ivar]=result;
2061 }
2062 }
2063 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
2064 nSelS[ivar][ibin]=0;
2065 nSelB[ivar][ibin]=0;
2066 nSelS_unWeighted[ivar][ibin]=0;
2067 nSelB_unWeighted[ivar][ibin]=0;
2068 target[ivar][ibin]=0;
2069 target2[ivar][ibin]=0;
2070 cutValues[ivar][ibin]=0;
2071 }
2072 }
2073
2074 // #### Nothing to parallelize here really, no loop through events
2075 // #### only figures out the bin edge values for the "histogram" arrays
2076 // fill the cut values for the scan:
2077 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2078
2079 if ( useVariable[ivar] ) {
2080
2081 //set the grid for the cut scan on the variables like this:
2082 //
2083 // | | | | | ... | |
2084 // xmin xmax
2085 //
2086 // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
2087 // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
2088 // --> nBins = fNCuts+1
2089 // (NOTE, the cuts at xmin or xmax would just give the whole sample and
2090 // hence can be safely omitted
2091
2092 binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
2093 invBinWidth[ivar] = 1./binWidth[ivar];
2094 if (ivar < fNvars) {
2095 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
2096 }
2097
2098 // std::cout << "ivar="<<ivar
2099 // <<" min="<<xmin[ivar]
2100 // << " max="<<xmax[ivar]
2101 // << " widht=" << istepSize
2102 // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
2103 for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
2104 cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
2105 // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
2106 }
2107 }
2108 }
2109
2110 // #### Loop through the events to get the total sig and background
2111 nTotS=0; nTotB=0;
2112 nTotS_unWeighted=0; nTotB_unWeighted=0;
2113 for (UInt_t iev=0; iev<nevents; iev++) {
2114
2115 Double_t eventWeight = eventSample[iev]->GetWeight();
2116 if (eventSample[iev]->GetClass() == fSigClass) {
2117 nTotS+=eventWeight;
2118 nTotS_unWeighted++; }
2119 else {
2120 nTotB+=eventWeight;
2121 nTotB_unWeighted++;
2122 }
2123
2124 // #### Count the number in each bin (fill array "histograms")
2125 Int_t iBin=-1;
2126 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2127 // now scan trough the cuts for each varable and find which one gives
2128 // the best separationGain at the current stage.
2129 if ( useVariable[ivar] ) {
2130 Double_t eventData;
2131 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
2132 else { // the fisher variable
2133 eventData = fisherCoeff[fNvars];
2134 for (UInt_t jvar=0; jvar<fNvars; jvar++)
2135 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2136
2137 }
2138 // #### figure out which bin the event belongs in ...
2139 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
2140 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
2141 if (eventSample[iev]->GetClass() == fSigClass) {
2142 nSelS[ivar][iBin]+=eventWeight;
2143 nSelS_unWeighted[ivar][iBin]++;
2144 }
2145 else {
2146 nSelB[ivar][iBin]+=eventWeight;
2147 nSelB_unWeighted[ivar][iBin]++;
2148 }
2149 if (DoRegression()) {
2150 target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
2151 target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
2152 }
2153 }
2154 }
2155 }
2156
2157 // now turn the "histogram" into a cumulative distribution
2158 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2159 if (useVariable[ivar]) {
2160 for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
2161 nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
2162 nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
2163 nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
2164 nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
2165 if (DoRegression()) {
2166 target[ivar][ibin] +=target[ivar][ibin-1] ;
2167 target2[ivar][ibin]+=target2[ivar][ibin-1];
2168 }
2169 }
2170 if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
2171 Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
2172 << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
2173 << " while eventsample size = " << eventSample.size()
2174 << Endl;
2175 }
2176 double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
2177 double totalSum=nTotS+nTotB;
2178 if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
2179 Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
2180 << lastBins
2181 << " while total number of events = " << totalSum
2182 << Endl;
2183 }
2184 }
2185 }
2186 // #### Loops over vars and bins, but not events, not worth parallelizing unless nbins is on the order of ndata/10 ish ...
2187 // now select the optimal cuts for each varable and find which one gives
2188 // the best separationGain at the current stage
2189 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2190 if (useVariable[ivar]) {
2191 for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
2192 // the separationGain is defined as the various indices (Gini, CorssEntropy, e.t.c)
2193 // calculated by the "SamplePurities" fom the branches that would go to the
2194 // left or the right from this node if "these" cuts were used in the Node:
2195 // hereby: nSelS and nSelB would go to the right branch
2196 // (nTotS - nSelS) + (nTotB - nSelB) would go to the left branch;
2197
2198 // only allow splits where both daughter nodes match the specified miniumum number
2199 // for this use the "unweighted" events, as you are interested in statistically
2200 // significant splits, which is determined by the actual number of entries
2201 // for a node, rather than the sum of event weights.
2202
2203 Double_t sl = nSelS_unWeighted[ivar][iBin];
2204 Double_t bl = nSelB_unWeighted[ivar][iBin];
2205 Double_t s = nTotS_unWeighted;
2206 Double_t b = nTotB_unWeighted;
2207 Double_t slW = nSelS[ivar][iBin];
2208 Double_t blW = nSelB[ivar][iBin];
2209 Double_t sW = nTotS;
2210 Double_t bW = nTotB;
2211 Double_t sr = s-sl;
2212 Double_t br = b-bl;
2213 Double_t srW = sW-slW;
2214 Double_t brW = bW-blW;
2215 // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
2216 if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
2217 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
2218 ) {
2219
2220 if (DoRegression()) {
2221 sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
2222 target[ivar][iBin],target2[ivar][iBin],
2223 nTotS+nTotB,
2224 target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
2225 } else {
2226 sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
2227 }
2228 if (separationGain[ivar] < sepTmp) {
2229 separationGain[ivar] = sepTmp;
2230 cutIndex[ivar] = iBin;
2231 }
2232 }
2233 }
2234 }
2235 }
2236
2237 //now you have found the best separation cut for each variable, now compare the variables
2238 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2239 if (useVariable[ivar] ) {
2240 if (separationGainTotal < separationGain[ivar]) {
2241 separationGainTotal = separationGain[ivar];
2242 mxVar = ivar;
2243 }
2244 }
2245 }
2246
2247 if (mxVar >= 0) {
2248 if (DoRegression()) {
2249 node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
2250 node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
2251 if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
2252 node->SetRMS(0);
2253 }else{
2254 node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
2255 }
2256 }
2257 else {
2258 node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
2259 if (mxVar >=0){
2260 if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
2261 else cutType=kFALSE;
2262 }
2263 }
2264 node->SetSelector((UInt_t)mxVar);
2265 node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
2266 node->SetCutType(cutType);
2267 node->SetSeparationGain(separationGainTotal);
2268 if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
2269 node->SetNFisherCoeff(0);
2270 fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2271 //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
2272 }else{
2273 // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero. Might
2274 // be even less storage space on average than storing also the mapping used otherwise
2275 // can always be changed relatively easy
2276 node->SetNFisherCoeff(fNvars+1);
2277 for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
2278 node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
2279 // take 'fisher coeff. weighted estimate as variable importance, "Don't fill the offset coefficient though :)
2280 if (ivar<fNvars){
2281 fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2282 }
2283 }
2284 }
2285 }
2286 else {
2287 separationGainTotal = 0;
2288 }
2289 // if (mxVar > -1) {
2290 // std::cout << "------------------------------------------------------------------"<<std::endl;
2291 // std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
2292 // std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2293 // std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2294 // std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
2295 // std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
2296 // << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
2297 // std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
2298 // std::cout << " separationGainTotal " << separationGainTotal << std::endl;
2299 // } else {
2300 // std::cout << "------------------------------------------------------------------"<<std::endl;
2301 // std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
2302 // }
2303 for (UInt_t i=0; i<cNvars; i++) {
2304 delete [] nSelS[i];
2305 delete [] nSelB[i];
2306 delete [] nSelS_unWeighted[i];
2307 delete [] nSelB_unWeighted[i];
2308 delete [] target[i];
2309 delete [] target2[i];
2310 delete [] cutValues[i];
2311 }
2312 delete [] nSelS;
2313 delete [] nSelB;
2314 delete [] nSelS_unWeighted;
2315 delete [] nSelB_unWeighted;
2316 delete [] target;
2317 delete [] target2;
2318 delete [] cutValues;
2319
2320 delete [] xmin;
2321 delete [] xmax;
2322
2323 delete [] useVariable;
2324 delete [] mapVariable;
2325
2326 delete [] separationGain;
2327 delete [] cutIndex;
2328
2329 delete [] nBins;
2330 delete [] binWidth;
2331 delete [] invBinWidth;
2332
2333 return separationGainTotal;
2334
2335}
2336#endif
2337
2338
2339////////////////////////////////////////////////////////////////////////////////
2340/// calculate the fisher coefficients for the event sample and the variables used
2341
2342std::vector<Double_t> TMVA::DecisionTree::GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher){
2343 std::vector<Double_t> fisherCoeff(fNvars+1);
2344
2345 // initialization of global matrices and vectors
2346 // average value of each variables for S, B, S+B
2347 TMatrixD* meanMatx = new TMatrixD( nFisherVars, 3 );
2348
2349 // the covariance 'within class' and 'between class' matrices
2350 TMatrixD* betw = new TMatrixD( nFisherVars, nFisherVars );
2351 TMatrixD* with = new TMatrixD( nFisherVars, nFisherVars );
2352 TMatrixD* cov = new TMatrixD( nFisherVars, nFisherVars );
2353
2354 //
2355 // compute mean values of variables in each sample, and the overall means
2356 //
2357
2358 // initialize internal sum-of-weights variables
2359 Double_t sumOfWeightsS = 0;
2360 Double_t sumOfWeightsB = 0;
2361
2362
2363 // init vectors
2364 Double_t* sumS = new Double_t[nFisherVars];
2365 Double_t* sumB = new Double_t[nFisherVars];
2366 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
2367
2368 UInt_t nevents = eventSample.size();
2369 // compute sample means
2370 for (UInt_t ievt=0; ievt<nevents; ievt++) {
2371
2372 // read the Training Event into "event"
2373 const Event * ev = eventSample[ievt];
2374
2375 // sum of weights
2376 Double_t weight = ev->GetWeight();
2377 if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
2378 else sumOfWeightsB += weight;
2379
2380 Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
2381 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2382 sum[ivar] += ev->GetValueFast( mapVarInFisher[ivar] )*weight;
2383 }
2384 }
2385 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2386 (*meanMatx)( ivar, 2 ) = sumS[ivar];
2387 (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
2388
2389 (*meanMatx)( ivar, 2 ) += sumB[ivar];
2390 (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
2391
2392 // signal + background
2393 (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
2394 }
2395
2396 delete [] sumS;
2397
2398 delete [] sumB;
2399
2400 // the matrix of covariance 'within class' reflects the dispersion of the
2401 // events relative to the center of gravity of their own class
2402
2403 // assert required
2404
2405 assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
2406
2407 // product matrices (x-<x>)(y-<y>) where x;y are variables
2408
2409 const Int_t nFisherVars2 = nFisherVars*nFisherVars;
2410 Double_t *sum2Sig = new Double_t[nFisherVars2];
2411 Double_t *sum2Bgd = new Double_t[nFisherVars2];
2412 Double_t *xval = new Double_t[nFisherVars2];
2413 memset(sum2Sig,0,nFisherVars2*sizeof(Double_t));
2414 memset(sum2Bgd,0,nFisherVars2*sizeof(Double_t));
2415
2416 // 'within class' covariance
2417 for (UInt_t ievt=0; ievt<nevents; ievt++) {
2418
2419 // read the Training Event into "event"
2420 // const Event* ev = eventSample[ievt];
2421 const Event* ev = eventSample.at(ievt);
2422
2423 Double_t weight = ev->GetWeight(); // may ignore events with negative weights
2424
2425 for (UInt_t x=0; x<nFisherVars; x++) {
2426 xval[x] = ev->GetValueFast( mapVarInFisher[x] );
2427 }
2428 Int_t k=0;
2429 for (UInt_t x=0; x<nFisherVars; x++) {
2430 for (UInt_t y=0; y<nFisherVars; y++) {
2431 if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
2432 else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
2433 k++;
2434 }
2435 }
2436 }
2437 Int_t k=0;
2438 for (UInt_t x=0; x<nFisherVars; x++) {
2439 for (UInt_t y=0; y<nFisherVars; y++) {
2440 (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
2441 k++;
2442 }
2443 }
2444
2445 delete [] sum2Sig;
2446 delete [] sum2Bgd;
2447 delete [] xval;
2448
2449
2450 // the matrix of covariance 'between class' reflects the dispersion of the
2451 // events of a class relative to the global center of gravity of all the class
2452 // hence the separation between classes
2453
2454
2455 Double_t prodSig, prodBgd;
2456
2457 for (UInt_t x=0; x<nFisherVars; x++) {
2458 for (UInt_t y=0; y<nFisherVars; y++) {
2459
2460 prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
2461 ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
2462 prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
2463 ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
2464
2465 (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
2466 }
2467 }
2468
2469
2470
2471 // compute full covariance matrix from sum of within and between matrices
2472 for (UInt_t x=0; x<nFisherVars; x++)
2473 for (UInt_t y=0; y<nFisherVars; y++)
2474 (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
2475
2476 // Fisher = Sum { [coeff]*[variables] }
2477 //
2478 // let Xs be the array of the mean values of variables for signal evts
2479 // let Xb be the array of the mean values of variables for backgd evts
2480 // let InvWith be the inverse matrix of the 'within class' correlation matrix
2481 //
2482 // then the array of Fisher coefficients is
2483 // [coeff] =TMath::Sqrt(fNsig*fNbgd)/fNevt*transpose{Xs-Xb}*InvWith
2484 TMatrixD* theMat = with; // Fishers original
2485 // TMatrixD* theMat = cov; // Mahalanobis
2486
2487 TMatrixD invCov( *theMat );
2488 if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
2489 Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
2490 << TMath::Abs(invCov.Determinant())
2491 << " did you use the variables that are linear combinations or highly correlated?"
2492 << Endl;
2493 }
2494 if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
2495 Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
2496 << TMath::Abs(invCov.Determinant())
2497 << " did you use the variables that are linear combinations?"
2498 << Endl;
2499 }
2500
2501 invCov.Invert();
2502
2503 // apply rescaling factor
2504 Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
2505
2506 // compute difference of mean values
2507 std::vector<Double_t> diffMeans( nFisherVars );
2508
2509 for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
2510 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2511 for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
2512 Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
2513 fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
2514 }
2515
2516 // rescale
2517 fisherCoeff[mapVarInFisher[ivar]] *= xfact;
2518 }
2519
2520 // offset correction
2521 Double_t f0 = 0.0;
2522 for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
2523 f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
2524 }
2525 f0 /= -2.0;
2526
2527 fisherCoeff[fNvars] = f0; //as we start counting variables from "zero", I store the fisher offset at the END
2528
2529 return fisherCoeff;
2530}
2531
2532////////////////////////////////////////////////////////////////////////////////
2533/// train a node by finding the single optimal cut for a single variable
2534/// that best separates signal and background (maximizes the separation gain)
2535
2538{
2539 Double_t nTotS = 0.0, nTotB = 0.0;
2540 Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
2541
2542 std::vector<TMVA::BDTEventWrapper> bdtEventSample;
2543
2544 // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
2545 // each spot in parallel no problem
2546 std::vector<Double_t> lCutValue( fNvars, 0.0 );
2547 std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
2548 std::vector<Char_t> lCutType( fNvars ); // <----- bool is stored (for performance reasons, no std::vector<bool> has been taken)
2549 lCutType.assign( fNvars, Char_t(kFALSE) );
2550
2551 // Initialize (un)weighted counters for signal & background
2552 // Construct a list of event wrappers that point to the original data
2553 for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
2554 if((*it)->GetClass() == fSigClass) { // signal or background event
2555 nTotS += (*it)->GetWeight();
2556 ++nTotS_unWeighted;
2557 }
2558 else {
2559 nTotB += (*it)->GetWeight();
2560 ++nTotB_unWeighted;
2561 }
2562 bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
2563 }
2564
2565 std::vector<Char_t> useVariable(fNvars); // <----- bool is stored (for performance reasons, no std::vector<bool> has been taken)
2566 useVariable.assign( fNvars, Char_t(kTRUE) );
2567
2568 for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
2569 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
2570 if (fUseNvars ==0 ) { // no number specified ... choose s.th. which hopefully works well
2571 // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
2572 fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
2573 }
2574 Int_t nSelectedVars = 0;
2575 while (nSelectedVars < fUseNvars) {
2576 Double_t bla = fMyTrandom->Rndm()*fNvars;
2577 useVariable[Int_t (bla)] = Char_t(kTRUE);
2578 nSelectedVars = 0;
2579 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
2580 if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
2581 }
2582 }
2583 }
2584 else {
2585 for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
2586 }
2587 for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
2588 if(!useVariable[ivar]) continue; // only optimze with selected variables
2589
2590 TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
2591
2592 std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
2593
2594
2595 Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
2596
2597 std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
2598 for( ; it != it_end; ++it ) {
2599 if((**it)->GetClass() == fSigClass ) // specify signal or background event
2600 sigWeightCtr += (**it)->GetWeight();
2601 else
2602 bkgWeightCtr += (**it)->GetWeight();
2603 // Store the accumulated signal (background) weights
2604 it->SetCumulativeWeight(false,bkgWeightCtr);
2605 it->SetCumulativeWeight(true,sigWeightCtr);
2606 }
2607
2608 const Double_t fPMin = 1.0e-6;
2609 Bool_t cutType = kFALSE;
2610 Long64_t index = 0;
2611 Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
2612
2613 // Locate the optimal cut for this (ivar-th) variable
2614 for( it = bdtEventSample.begin(); it != it_end; ++it ) {
2615 if( index == 0 ) { ++index; continue; }
2616 if( *(*it) == NULL ) {
2617 Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
2618 << index << ", and parent node=" << node->GetParent() << Endl;
2619 break;
2620 }
2621 dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
2622 norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
2623 // Only allow splits where both daughter nodes have the specified minimum number of events
2624 // Splits are only sensible when the data are ordered (eg. don't split inside a sequence of 0's)
2625 if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
2626
2627 sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
2628 if( sepTmp > separationGain ) {
2629 separationGain = sepTmp;
2630 cutValue = it->GetVal() - 0.5*dVal;
2631 Double_t nSelS = it->GetCumulativeWeight(true);
2632 Double_t nSelB = it->GetCumulativeWeight(false);
2633 // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
2634 // or by removing signal (enhancing background)
2635 if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
2636 else cutType = kFALSE;
2637 }
2638 }
2639 ++index;
2640 }
2641 lCutType[ivar] = Char_t(cutType);
2642 lCutValue[ivar] = cutValue;
2643 lSepGain[ivar] = separationGain;
2644 }
2645 Double_t separationGain = -1.0;
2646 Int_t iVarIndex = -1;
2647 for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
2648 if( lSepGain[ivar] > separationGain ) {
2649 iVarIndex = ivar;
2650 separationGain = lSepGain[ivar];
2651 }
2652 }
2653
2654 // #### storing the best values into the node
2655 if(iVarIndex >= 0) {
2656 node->SetSelector(iVarIndex);
2657 node->SetCutValue(lCutValue[iVarIndex]);
2658 node->SetSeparationGain(lSepGain[iVarIndex]);
2659 node->SetCutType(lCutType[iVarIndex]);
2660 fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
2661 }
2662 else {
2663 separationGain = 0.0;
2664 }
2665
2666 return separationGain;
2667}
2668
2669////////////////////////////////////////////////////////////////////////////////
2670/// get the pointer to the leaf node where a particular event ends up in...
2671/// (used in gradient boosting)
2672
2674{
2675 TMVA::DecisionTreeNode *current = (TMVA::DecisionTreeNode*)this->GetRoot();
2676 while(current->GetNodeType() == 0) { // intermediate node in a tree
2677 current = (current->GoesRight(e)) ?
2678 (TMVA::DecisionTreeNode*)current->GetRight() :
2679 (TMVA::DecisionTreeNode*)current->GetLeft();
2680 }
2681 return current;
2682}
2683
2684////////////////////////////////////////////////////////////////////////////////
2685/// the event e is put into the decision tree (starting at the root node)
2686/// and the output is NodeType (signal) or (background) of the final node (basket)
2687/// in which the given events ends up. I.e. the result of the classification if
2688/// the event for this decision tree.
2689
2691{
2692 TMVA::DecisionTreeNode *current = this->GetRoot();
2693 if (!current){
2694 Log() << kFATAL << "CheckEvent: started with undefined ROOT node" <<Endl;
2695 return 0; //keeps coverity happy that doesn't know that kFATAL causes an exit
2696 }
2697
2698 while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
2699 current = (current->GoesRight(*e)) ?
2700 current->GetRight() :
2701 current->GetLeft();
2702 if (!current) {
2703 Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
2704 }
2705
2706 }
2707
2708 if (DoRegression()) {
2709 // Note: This path is also taken for MethodBDT with analysis type
2710 // kClassification and kMulticlass when using GradBoost.
2711 // See TMVA::MethodBDT::InitGradBoost
2712 return current->GetResponse();
2713 } else {
2714 if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
2715 else return current->GetPurity();
2716 }
2717}
2718
2719////////////////////////////////////////////////////////////////////////////////
2720/// calculates the purity S/(S+B) of a given event sample
2721
2722Double_t TMVA::DecisionTree::SamplePurity( std::vector<TMVA::Event*> eventSample )
2723{
2724 Double_t sumsig=0, sumbkg=0, sumtot=0;
2725 for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
2726 if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
2727 else sumsig+=eventSample[ievt]->GetWeight();
2728 sumtot+=eventSample[ievt]->GetWeight();
2729 }
2730 // sanity check
2731 if (sumtot!= (sumsig+sumbkg)){
2732 Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
2733 << sumtot << " " << sumsig << " " << sumbkg << Endl;
2734 }
2735 if (sumtot>0) return sumsig/(sumsig + sumbkg);
2736 else return -1;
2737}
2738
2739////////////////////////////////////////////////////////////////////////////////
2740/// Return the relative variable importance, normalized to all
2741/// variables together having the importance 1. The importance in
2742/// evaluated as the total separation-gain that this variable had in
2743/// the decision trees (weighted by the number of events)
2744
2746{
2747 std::vector<Double_t> relativeImportance(fNvars);
2748 Double_t sum=0;
2749 for (UInt_t i=0; i< fNvars; i++) {
2750 sum += fVariableImportance[i];
2751 relativeImportance[i] = fVariableImportance[i];
2752 }
2753
2754 for (UInt_t i=0; i< fNvars; i++) {
2756 relativeImportance[i] /= sum;
2757 else
2758 relativeImportance[i] = 0;
2759 }
2760 return relativeImportance;
2761}
2762
2763////////////////////////////////////////////////////////////////////////////////
2764/// returns the relative importance of variable ivar
2765
2767{
2768 std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2769 if (ivar < fNvars) return relativeImportance[ivar];
2770 else {
2771 Log() << kFATAL << "<GetVariableImportance>" << Endl
2772 << "--- ivar = " << ivar << " is out of range " << Endl;
2773 }
2774
2775 return -1;
2776}
2777
bool almost_equal_double(double x, double y, int ulp=4)
bool almost_equal_float(float x, float y, int ulp=4)
ROOT::R::TRInterface & r
Definition: Object.C:4
#define d(i)
Definition: RSha256.hxx:102
#define b(i)
Definition: RSha256.hxx:100
#define f(i)
Definition: RSha256.hxx:104
#define e(i)
Definition: RSha256.hxx:103
int Int_t
Definition: RtypesCore.h:45
char Char_t
Definition: RtypesCore.h:33
unsigned int UInt_t
Definition: RtypesCore.h:46
const Bool_t kFALSE
Definition: RtypesCore.h:101
unsigned long ULong_t
Definition: RtypesCore.h:55
bool Bool_t
Definition: RtypesCore.h:63
double Double_t
Definition: RtypesCore.h:59
long long Long64_t
Definition: RtypesCore.h:80
float Float_t
Definition: RtypesCore.h:57
const Bool_t kTRUE
Definition: RtypesCore.h:100
#define ClassImp(name)
Definition: Rtypes.h:364
int type
Definition: TGX11.cxx:121
float xmin
Definition: THbookFile.cxx:95
float xmax
Definition: THbookFile.cxx:95
TMatrixT< Double_t > TMatrixD
Definition: TMatrixDfwd.h:22
static void SetVarIndex(Int_t iVar)
Base class for BinarySearch and Decision Trees.
Definition: BinaryTree.h:62
virtual void ReadXML(void *node, UInt_t tmva_Version_Code=TMVA_VERSION_CODE)
read attributes from XML
Definition: BinaryTree.cxx:144
void SetRoot(Node *r)
Definition: BinaryTree.h:80
MsgLogger & Log() const
Definition: BinaryTree.cxx:235
Executor & GetThreadExecutor()
Get executor class for multi-thread usage In case when MT is not enabled will return a serial executo...
Definition: Config.h:83
static Config & Instance()
static function: returns TMVA instance
Definition: Config.cxx:106
A class to prune a decision tree using the Cost Complexity method.
Class that contains all the data information.
Definition: DataSetInfo.h:62
void SetNEvents_unweighted(Float_t nev)
void SetSeparationGain(Float_t sep)
void SetNBkgEvents(Float_t b)
Double_t GetNSValidation() const
void SetFisherCoeff(Int_t ivar, Double_t coeff)
set fisher coefficients
void SetNSigEvents_unboosted(Float_t s)
void SetAlphaMinSubtree(Double_t g)
void IncrementNBkgEvents(Float_t b)
void SetNEvents_unboosted(Float_t nev)
Float_t GetNSigEvents(void) const
virtual void SetLeft(Node *l)
void SetTerminal(Bool_t s=kTRUE)
void SetResponse(Float_t r)
void SetSampleMax(UInt_t ivar, Float_t xmax)
set the maximum of variable ivar from the training sample that pass/end up in this node
void SetNBValidation(Double_t b)
void IncrementNEvents(Float_t nev)
void SetPurity(void)
return the S/(S+B) (purity) for the node REM: even if nodes with purity 0.01 are very PURE background...
void SetSubTreeR(Double_t r)
void AddToSumTarget2(Float_t t2)
virtual DecisionTreeNode * GetLeft() const
Double_t GetNodeR() const
virtual Bool_t GoesRight(const Event &) const
test event if it descends the tree at this node to the right
void SetNFisherCoeff(Int_t nvars)
Short_t GetSelector() const
void SetNSigEvents(Float_t s)
Float_t GetResponse(void) const
Float_t GetCutValue(void) const
Int_t GetNodeType(void) const
void SetNSigEvents_unweighted(Float_t s)
Double_t GetNBValidation() const
void SetAlpha(Double_t alpha)
void SetSeparationIndex(Float_t sep)
virtual void SetRight(Node *r)
void SetNBkgEvents_unboosted(Float_t b)
Float_t GetPurity(void) const
void IncrementNSigEvents(Float_t s)
Float_t GetSampleMax(UInt_t ivar) const
return the maximum of variable ivar from the training sample that pass/end up in this node
void SetCutValue(Float_t c)
Float_t GetNBkgEvents(void) const
Float_t GetSampleMin(UInt_t ivar) const
return the minimum of variable ivar from the training sample that pass/end up in this node
void SetSampleMin(UInt_t ivar, Float_t xmin)
set the minimum of variable ivar from the training sample that pass/end up in this node
void SetSelector(Short_t i)
virtual DecisionTreeNode * GetParent() const
void SetNBkgEvents_unweighted(Float_t b)
void SetNSValidation(Double_t s)
void AddToSumTarget(Float_t t)
void SetNEvents(Float_t nev)
virtual DecisionTreeNode * GetRight() const
Implementation of a Decision Tree.
Definition: DecisionTree.h:65
void FillTree(const EventList &eventSample)
fill the existing the decision tree structure by filling event in from the top node and see where the...
void PruneNode(TMVA::DecisionTreeNode *node)
prune away the subtree below the node
void ApplyValidationSample(const EventConstList *validationSample) const
run the validation sample through the (pruned) tree and fill in the nodes the variables NSValidation ...
Double_t TrainNodeFull(const EventConstList &eventSample, DecisionTreeNode *node)
train a node by finding the single optimal cut for a single variable that best separates signal and b...
TMVA::DecisionTreeNode * GetEventNode(const TMVA::Event &e) const
get the pointer to the leaf node where a particular event ends up in... (used in gradient boosting)
void GetRandomisedVariables(Bool_t *useVariable, UInt_t *variableMap, UInt_t &nVars)
UInt_t CleanTree(DecisionTreeNode *node=NULL)
remove those last splits that result in two leaf nodes that are both of the type (i....
std::vector< const TMVA::Event * > EventConstList
Definition: DecisionTree.h:74
Double_t CheckEvent(const TMVA::Event *, Bool_t UseYesNoLeaf=kFALSE) const
the event e is put into the decision tree (starting at the root node) and the output is NodeType (sig...
static const Int_t fgRandomSeed
Definition: DecisionTree.h:69
UInt_t BuildTree(const EventConstList &eventSample, DecisionTreeNode *node=NULL)
building the decision tree by recursively calling the splitting of one (root-) node into two daughter...
Double_t PruneTree(const EventConstList *validationSample=NULL)
prune (get rid of internal nodes) the Decision tree to avoid overtraining several different pruning m...
virtual ~DecisionTree(void)
destructor
Types::EAnalysisType fAnalysisType
Definition: DecisionTree.h:239
std::vector< Double_t > GetVariableImportance()
Return the relative variable importance, normalized to all variables together having the importance 1...
void CheckEventWithPrunedTree(const TMVA::Event *) const
pass a single validation event through a pruned decision tree on the way down the tree,...
void PruneNodeInPlace(TMVA::DecisionTreeNode *node)
prune a node temporarily (without actually deleting its descendants which allows testing the pruned t...
void FillEvent(const TMVA::Event &event, TMVA::DecisionTreeNode *node)
fill the existing the decision tree structure by filling event in from the top node and see where the...
UInt_t CountLeafNodes(TMVA::Node *n=NULL)
return the number of terminal nodes in the sub-tree below Node n
Double_t TestPrunedTreeQuality(const DecisionTreeNode *dt=NULL, Int_t mode=0) const
return the misclassification rate of a pruned tree a "pruned tree" may have set the variable "IsTermi...
void ClearTree()
clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree
static DecisionTree * CreateFromXML(void *node, UInt_t tmva_Version_Code=TMVA_VERSION_CODE)
re-create a new tree (decision tree or search tree) from XML
Double_t SamplePurity(EventList eventSample)
calculates the purity S/(S+B) of a given event sample
void SetParentTreeInNodes(Node *n=NULL)
descend a tree to find all its leaf nodes, fill max depth reached in the tree at the same time.
Node * GetNode(ULong_t sequence, UInt_t depth)
retrieve node from the tree.
std::vector< Double_t > GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher)
calculate the fisher coefficients for the event sample and the variables used
Double_t TrainNodeFast(const EventConstList &eventSample, DecisionTreeNode *node)
Decide how to split a node using one of the variables that gives the best separation of signal/backgr...
void DescendTree(Node *n=NULL)
descend a tree to find all its leaf nodes
RegressionVariance * fRegType
Definition: DecisionTree.h:212
DecisionTree(void)
default constructor using the GiniIndex as separation criterion, no restrictions on minium number of ...
Double_t GetSumWeights(const EventConstList *validationSample) const
calculate the normalization factor for a pruning validation sample
Double_t GetOriginalWeight() const
Definition: Event.h:84
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not.
Definition: Event.cxx:381
UInt_t GetClass() const
Definition: Event.h:86
Float_t GetValueFast(UInt_t ivar) const
Definition: Event.h:93
Float_t GetTarget(UInt_t itgt) const
Definition: Event.h:102
unsigned int GetPoolSize() const
Definition: Executor.h:100
auto MapReduce(F func, ROOT::TSeq< INTEGER > args, R redfunc) -> typename std::result_of< F(INTEGER)>::type
Wrap TExecutor::MapReduce functions.
Definition: Executor.h:146
auto Map(F func, unsigned nTimes) -> std::vector< typename std::result_of< F()>::type >
Wrap TExecutor::Map functions.
Definition: Executor.h:134
A helper class to prune a decision tree using the expected error (C4.5) method.
IPruneTool - a helper interface class to prune a decision tree.
Definition: IPruneTool.h:70
void SetPruneStrength(Double_t alpha)
Definition: IPruneTool.h:88
virtual PruningInfo * CalculatePruningInfo(DecisionTree *dt, const EventSample *testEvents=NULL, Bool_t isAutomatic=kFALSE)=0
Bool_t IsAutomatic() const
Definition: IPruneTool.h:95
Node for the BinarySearch or Decision Trees.
Definition: Node.h:58
UInt_t GetDepth() const
Definition: Node.h:116
std::vector< DecisionTreeNode * > PruneSequence
the regularization parameter for pruning
Definition: IPruneTool.h:47
Double_t PruneStrength
quality measure for a pruned subtree T of T_max
Definition: IPruneTool.h:46
Calculate the "SeparationGain" for Regression analysis separation criteria used in various training a...
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
const TMatrixD * GetCorrelationMatrix(const TMatrixD *covMat)
turns covariance into correlation matrix
Definition: Tools.cxx:336
void ReadAttr(void *node, const char *, T &value)
read attribute from xml
Definition: Tools.h:335
std::vector< TMatrixDSym * > * CalcCovarianceMatrices(const std::vector< Event * > &events, Int_t maxCls, VariableTransformBase *transformBase=0)
compute covariance matrices
Definition: Tools.cxx:1526
Singleton class for Global types used by TMVA.
Definition: Types.h:73
@ kClassification
Definition: Types.h:129
@ kRegression
Definition: Types.h:130
@ kDEBUG
Definition: Types.h:58
@ kERROR
Definition: Types.h:62
@ kWARNING
Definition: Types.h:61
@ kFATAL
Definition: Types.h:63
TMatrixT< Element > & Invert(Double_t *det=0)
Invert the matrix and calculate its determinant.
Definition: TMatrixT.cxx:1397
virtual Double_t Determinant() const
Return the matrix determinant.
Definition: TMatrixT.cxx:1362
Random number generator class based on M.
Definition: TRandom3.h:27
Double_t y[n]
Definition: legend1.C:17
Double_t x[n]
Definition: legend1.C:17
const Int_t n
Definition: legend1.C:16
TClass * GetClass(T *)
Definition: TClass.h:658
TSeq< unsigned int > TSeqU
Definition: TSeq.hxx:195
static constexpr double sr
static constexpr double s
Tools & gTools()
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
Short_t Max(Short_t a, Short_t b)
Definition: TMathBase.h:212
constexpr Double_t E()
Base of natural log:
Definition: TMath.h:96
Double_t Log(Double_t x)
Definition: TMath.h:760
Double_t Sqrt(Double_t x)
Definition: TMath.h:691
Short_t Min(Short_t a, Short_t b)
Definition: TMathBase.h:180
Short_t Abs(Short_t d)
Definition: TMathBase.h:120
BuildNodeInfo(Int_t fNvars, std::vector< Float_t > &inxmin, std::vector< Float_t > &inxmax)
std::vector< Float_t > xmin
BuildNodeInfo operator+(const BuildNodeInfo &other)
std::vector< Float_t > xmax
BuildNodeInfo(Int_t fNvars, const TMVA::Event *evt)
Double_t target2
Double_t nTotB_unWeighted
std::vector< std::vector< Double_t > > target2
std::vector< std::vector< Double_t > > nSelB_unWeighted
std::vector< std::vector< Double_t > > nSelB
Double_t nTotS_unWeighted
std::vector< std::vector< Double_t > > target
std::vector< std::vector< Double_t > > nSelS_unWeighted
TrainNodeInfo operator+(const TrainNodeInfo &other)
std::vector< std::vector< Double_t > > nSelS
TrainNodeInfo(Int_t cNvars_, UInt_t *nBins_)
auto * l
Definition: textangle.C:4
REAL epsilon
Definition: triangle.c:618