DecisionTree.cxx
1// @(#)root/tmva $Id$
2// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TMVA::DecisionTree *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Implementation of a Decision Tree *
12 * *
13 * Authors (alphabetical): *
14 * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15 * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
16 * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
17 * Eckhard v. Toerne <evt@uni-bonn.de> - U of Bonn, Germany *
18 * Jan Therhaag <Jan.Therhaag@cern.ch> - U of Bonn, Germany *
19 * *
20 * Copyright (c) 2005-2011: *
21 * CERN, Switzerland *
22 * U. of Victoria, Canada *
23 * MPI-K Heidelberg, Germany *
24 * U. of Bonn, Germany *
25 * *
26 * Redistribution and use in source and binary forms, with or without *
27 * modification, are permitted according to the terms listed in LICENSE *
28 * (http://mva.sourceforge.net/license.txt) *
29 * *
30 **********************************************************************************/
31
32/*! \class TMVA::DecisionTree
33\ingroup TMVA
34
35Implementation of a Decision Tree
36
37In a decision tree, successive decision nodes are used to categorize the
38events of the sample as either signal or background. Each node
39uses only a single discriminating variable to decide if the event is
40signal-like ("goes right") or background-like ("goes left"). This
41forms a tree-like structure with "baskets" at the end (leaf nodes),
42and an event is classified as either signal or background according to
43whether the basket where it ends up has been classified as signal or
44background during the training. Training of a decision tree is the
45process of defining the "cut criteria" for each node. The training
46starts with the root node. Here one takes the full training event
47sample and selects the variable and corresponding cut value that gives
48the best separation between signal and background at this stage. Using
49this cut criterion, the sample is then divided into two subsamples, a
50signal-like (right) and a background-like (left) sample. Two new nodes
51are then created, one for each of the two sub-samples, and they are
52constructed using the same mechanism as described for the root
53node. The division is stopped once a certain node has reached either a
54minimum number of events, or a minimum or maximum signal purity. These
55leaf nodes are then called "signal" or "background" if they contain
56more signal or background events, respectively, from the training sample.
57
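A minimal usage sketch (illustrative only: the constructor argument order is read
off the initialiser list below and the helper objects are assumptions, so check
DecisionTree.h before relying on it):

\code{.cpp}
std::vector<const TMVA::Event*> trainSample;            // filled with training events
TMVA::SeparationBase *sep = new TMVA::GiniIndex();      // separation criterion for the splits
TMVA::DecisionTree tree(sep, 2.5, 20, &dataSetInfo, 0); // minSize = 2.5% of events, nCuts = 20, signal class 0
tree.BuildTree(trainSample);                            // grow the tree on the training sample
Double_t response = tree.CheckEvent(evt);               // evt: const TMVA::Event*, returns the leaf purity/response
\endcode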
58*/
59
60#include <iostream>
61#include <algorithm>
62#include <vector>
63#include <limits>
64#include <fstream>
65#include <algorithm>
66#include <cassert>
67
68#include "TRandom3.h"
69#include "TMath.h"
70#include "TMatrix.h"
71#include "TStopwatch.h"
72
73#include "TMVA/MsgLogger.h"
74#include "TMVA/DecisionTree.h"
77
78#include "TMVA/Tools.h"
79#include "TMVA/Config.h"
80
81#include "TMVA/GiniIndex.h"
82#include "TMVA/CrossEntropy.h"
84#include "TMVA/SdivSqrtSplusB.h"
85#include "TMVA/Event.h"
87#include "TMVA/IPruneTool.h"
90
91const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds
92
93using std::vector;
94
96
97bool almost_equal_float(float x, float y, int ulp=4){
98 // the machine epsilon has to be scaled to the magnitude of the values used
99 // and multiplied by the desired precision in ULPs (units in the last place)
100 return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
101 // unless the result is subnormal
102 || std::abs(x-y) < std::numeric_limits<float>::min();
103}
104
105bool almost_equal_double(double x, double y, int ulp=4){
106 // the machine epsilon has to be scaled to the magnitude of the values used
107 // and multiplied by the desired precision in ULPs (units in the last place)
108 return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
109 // unless the result is subnormal
110 || std::abs(x-y) < std::numeric_limits<double>::min();
111}
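// Illustration (not part of the original code): 0.1 + 0.2 == 0.3 is false in
// IEEE-754 double arithmetic, but almost_equal_double(0.1 + 0.2, 0.3) returns true,
// since the difference is well within a few ULPs of the operands' magnitude.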
112
113////////////////////////////////////////////////////////////////////////////////
114/// default constructor using the GiniIndex as separation criterion,
115/// no restrictions on the minimum number of events in a leaf node or the
116/// separation gain in the node splitting
117
118TMVA::DecisionTree::DecisionTree():
119 BinaryTree(),
120 fNvars (0),
121 fNCuts (-1),
122 fUseFisherCuts (kFALSE),
123 fMinLinCorrForFisher (1),
124 fUseExclusiveVars (kTRUE),
125 fSepType (NULL),
126 fRegType (NULL),
127 fMinSize (0),
128 fMinNodeSize (1),
129 fMinSepGain (0),
130 fUseSearchTree(kFALSE),
131 fPruneStrength(0),
132 fPruneMethod (kNoPruning),
133 fNNodesBeforePruning(0),
134 fNodePurityLimit(0.5),
135 fRandomisedTree (kFALSE),
136 fUseNvars (0),
137 fUsePoissonNvars(kFALSE),
138 fMyTrandom (NULL),
139 fMaxDepth (999999),
140 fSigClass (0),
141 fTreeID (0),
142 fAnalysisType (Types::kClassification),
143 fDataSetInfo (NULL)
144
145{}
146
147////////////////////////////////////////////////////////////////////////////////
148/// constructor specifying the separation type, the min number of
149/// events in a node that is still subjected to further splitting, and the
150/// number of bins in the grid used in applying the cut for the node
151/// splitting.
152
153TMVA::DecisionTree::DecisionTree( TMVA::SeparationBase *sepType, Float_t minSize, Int_t nCuts, TMVA::DataSetInfo* dataInfo, UInt_t cls,
154 Bool_t randomisedTree, Int_t useNvars, Bool_t usePoissonNvars,
155 UInt_t nMaxDepth, Int_t iSeed, Float_t purityLimit, Int_t treeID):
156 BinaryTree(),
157 fNvars (0),
158 fNCuts (nCuts),
159 fUseFisherCuts (kFALSE),
160 fMinLinCorrForFisher (1),
161 fUseExclusiveVars (kTRUE),
162 fSepType (sepType),
163 fRegType (NULL),
164 fMinSize (0),
165 fMinNodeSize (minSize),
166 fMinSepGain (0),
167 fUseSearchTree (kFALSE),
168 fPruneStrength (0),
169 fPruneMethod (kNoPruning),
170 fNNodesBeforePruning(0),
171 fNodePurityLimit(purityLimit),
172 fRandomisedTree (randomisedTree),
173 fUseNvars (useNvars),
174 fUsePoissonNvars(usePoissonNvars),
175 fMyTrandom (new TRandom3(iSeed)),
176 fMaxDepth (nMaxDepth),
177 fSigClass (cls),
178 fTreeID (treeID),
179 fAnalysisType (Types::kClassification),
180 fDataSetInfo (dataInfo)
181{
182 if (sepType == NULL) { // it is interpreted as a regression tree, where
183 // currently the separation type (simple least square)
184 // cannot be chosen freely)
185 fAnalysisType = Types::kRegression;
186 fRegType = new RegressionVariance();
187 if ( nCuts <=0 ) {
188 fNCuts = 200;
189 Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
190 << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
191 << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
192 << Endl;
193 }
194 }else{
195 fAnalysisType = Types::kClassification;
196 }
197}
198
199////////////////////////////////////////////////////////////////////////////////
200/// copy constructor that creates a true copy, i.e. a completely independent tree
201/// the node copy will recursively copy all the nodes
202
203TMVA::DecisionTree::DecisionTree( const DecisionTree &d ):
204 BinaryTree(),
205 fNvars (d.fNvars),
206 fNCuts (d.fNCuts),
207 fUseFisherCuts (d.fUseFisherCuts),
208 fMinLinCorrForFisher (d.fMinLinCorrForFisher),
209 fUseExclusiveVars (d.fUseExclusiveVars),
210 fSepType (d.fSepType),
211 fRegType (d.fRegType),
212 fMinSize (d.fMinSize),
213 fMinNodeSize(d.fMinNodeSize),
214 fMinSepGain (d.fMinSepGain),
215 fUseSearchTree (d.fUseSearchTree),
216 fPruneStrength (d.fPruneStrength),
217 fPruneMethod (d.fPruneMethod),
218 fNodePurityLimit(d.fNodePurityLimit),
219 fRandomisedTree (d.fRandomisedTree),
220 fUseNvars (d.fUseNvars),
221 fUsePoissonNvars(d.fUsePoissonNvars),
222 fMyTrandom (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
223 fMaxDepth (d.fMaxDepth),
224 fSigClass (d.fSigClass),
225 fTreeID (d.fTreeID),
226 fAnalysisType(d.fAnalysisType),
227 fDataSetInfo (d.fDataSetInfo)
228{
229 this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
230 this->SetParentTreeInNodes();
231 fNNodes = d.fNNodes;
232
233}
234
235
236////////////////////////////////////////////////////////////////////////////////
237/// destructor
238
239TMVA::DecisionTree::~DecisionTree()
240{
241 // destruction of the tree nodes done in the "base class" BinaryTree
242
243 if (fMyTrandom) delete fMyTrandom;
244 if (fRegType) delete fRegType;
245}
246
247////////////////////////////////////////////////////////////////////////////////
248/// descend the tree and set the parent-tree pointer in each node; the maximum
249/// depth reached in the tree is filled in at the same time.
250
251void TMVA::DecisionTree::SetParentTreeInNodes( Node *n )
252{
253 if (n == NULL) { //default, start at the tree top, then descend recursively
254 n = this->GetRoot();
255 if (n == NULL) {
256 Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" <<Endl;
257 return ;
258 }
259 }
260
261 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
262 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
263 return;
264 } else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
265 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
266 return;
267 }
268 else {
269 if (this->GetLeftDaughter(n) != NULL) {
270 this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
271 }
272 if (this->GetRightDaughter(n) != NULL) {
273 this->SetParentTreeInNodes( this->GetRightDaughter(n) );
274 }
275 }
276 n->SetParentTree(this);
277 if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
278 return;
279}
280
281////////////////////////////////////////////////////////////////////////////////
282/// re-create a new tree (decision tree or search tree) from XML
283
284TMVA::DecisionTree* TMVA::DecisionTree::CreateFromXML(void* node, UInt_t tmva_Version_Code ) {
285 std::string type("");
286 gTools().ReadAttr(node,"type", type);
287 DecisionTree* dt = new DecisionTree();
288
289 dt->ReadXML( node, tmva_Version_Code );
290 return dt;
291}
292
293// #### Multithreaded DecisionTree::BuildTree
294#ifdef R__USE_IMT
295//====================================================================================
296// Had to define this struct to enable parallelization.
297// Using BuildNodeInfo, each thread can return the information needed.
298// After each thread returns we can then merge the results, hence the + operator.
299//====================================================================================
300struct BuildNodeInfo{
301
302 BuildNodeInfo(Int_t fNvars, const TMVA::Event* evt){
303
304 nvars = fNvars;
305 xmin = std::vector<Float_t>(nvars);
306 xmax = std::vector<Float_t>(nvars);
307
308 // #### the initial min and max for each feature
309 for (Int_t ivar=0; ivar<fNvars; ivar++) {
310 const Double_t val = evt->GetValueFast(ivar);
311 xmin[ivar]=val;
312 xmax[ivar]=val;
313 }
314 };
315
316 BuildNodeInfo(Int_t fNvars, std::vector<Float_t>& inxmin, std::vector<Float_t>& inxmax){
317
318 nvars = fNvars;
319 xmin = std::vector<Float_t>(nvars);
320 xmax = std::vector<Float_t>(nvars);
321
322 // #### the initial min and max for each feature
323 for (Int_t ivar=0; ivar<fNvars; ivar++) {
324 xmin[ivar]=inxmin[ivar];
325 xmax[ivar]=inxmax[ivar];
326 }
327 };
328 BuildNodeInfo(){};
329
330 Int_t nvars = 0;
331 Double_t s = 0;
332 Double_t suw = 0;
333 Double_t sub = 0;
334 Double_t b = 0;
335 Double_t buw = 0;
336 Double_t bub = 0;
337 Double_t target = 0;
338 Double_t target2 = 0;
339 std::vector<Float_t> xmin;
340 std::vector<Float_t> xmax;
341
342 // #### Define the addition operator for BuildNodeInfo
343 // #### Make sure both BuildNodeInfos have the same nvars if we add them
344 BuildNodeInfo operator+(const BuildNodeInfo& other)
345 {
346 BuildNodeInfo ret(nvars, xmin, xmax);
347 if(nvars != other.nvars)
348 {
349 std::cout << "!!! ERROR BuildNodeInfo1+BuildNodeInfo2 failure. Nvars1 != Nvars2." << std::endl;
350 return ret;
351 }
352 ret.s = s + other.s;
353 ret.suw = suw + other.suw;
354 ret.sub = sub + other.sub;
355 ret.b = b + other.b;
356 ret.buw = buw + other.buw;
357 ret.bub = bub + other.bub;
358 ret.target = target + other.target;
359 ret.target2 = target2 + other.target2;
360
361 // xmin is the min of both, xmax is the max of both
362 for(Int_t i=0; i<nvars; i++)
363 {
364 ret.xmin[i]=xmin[i]<other.xmin[i]?xmin[i]:other.xmin[i];
365 ret.xmax[i]=xmax[i]>other.xmax[i]?xmax[i]:other.xmax[i];
366 }
367 return ret;
368 };
369
370};
371//===========================================================================
372// Done with BuildNodeInfo declaration
373//===========================================================================
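// Illustrative sketch (not part of the original code): two partial results can be
// merged exactly as the std::accumulate reduction below does, e.g.
//
//   BuildNodeInfo partA(fNvars, firstEventOfA), partB(fNvars, firstEventOfB);
//   // ... each thread accumulates s, b, targets, xmin and xmax over its events ...
//   BuildNodeInfo total = partA + partB;   // sums the counts, takes min/max of the ranges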
374
375
376////////////////////////////////////////////////////////////////////////////////
377/// building the decision tree by recursively calling the splitting of
378/// one (root-) node into two daughter nodes (returns the number of nodes)
379
380UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
381 TMVA::DecisionTreeNode *node)
382{
383 if (node==NULL) {
384 //start with the root node
385 node = new TMVA::DecisionTreeNode();
386 fNNodes = 1;
387 this->SetRoot(node);
388 // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
389 this->GetRoot()->SetPos('s');
390 this->GetRoot()->SetDepth(0);
391 this->GetRoot()->SetParentTree(this);
392 fMinSize = fMinNodeSize/100. * eventSample.size();
393 if (GetTreeID()==0){
394 Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
395 Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
396 Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
397 }
398 }
399
400 UInt_t nevents = eventSample.size();
401
402 if (nevents > 0 ) {
403 if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
404 fVariableImportance.resize(fNvars);
405 }
406 else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
407
408 // sum up the totals
409 // sig and bkg for classification
410 // err and err2 for regression
411
412 // #### Set up prerequisite info for multithreading
413 UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
414 auto seeds = ROOT::TSeqU(nPartitions);
415
416 // #### need a lambda function to pass to TThreadExecutor::MapReduce (multi-threading)
417 auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0){
418
419 Int_t start = 1.0*partition/nPartitions*eventSample.size();
420 Int_t end = (partition+1.0)/nPartitions*eventSample.size();
421
422 BuildNodeInfo nodeInfof(fNvars, eventSample[0]);
423
424 for(Int_t iev=start; iev<end; iev++){
425 const TMVA::Event* evt = eventSample[iev];
426 const Double_t weight = evt->GetWeight();
427 const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
428 if (evt->GetClass() == fSigClass) {
429 nodeInfof.s += weight;
430 nodeInfof.suw += 1;
431 nodeInfof.sub += orgWeight;
432 }
433 else {
434 nodeInfof.b += weight;
435 nodeInfof.buw += 1;
436 nodeInfof.bub += orgWeight;
437 }
438 if ( DoRegression() ) {
439 const Double_t tgt = evt->GetTarget(0);
440 nodeInfof.target +=weight*tgt;
441 nodeInfof.target2+=weight*tgt*tgt;
442 }
443
444 // save the min and max for each feature
445 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
446 const Double_t val = evt->GetValueFast(ivar);
447 if (iev==start){
448 nodeInfof.xmin[ivar]=val;
449 nodeInfof.xmax[ivar]=val;
450 }
451 if (val < nodeInfof.xmin[ivar]) nodeInfof.xmin[ivar]=val;
452 if (val > nodeInfof.xmax[ivar]) nodeInfof.xmax[ivar]=val;
453 }
454 }
455 return nodeInfof;
456 };
457
458 // #### Need an initial struct to pass to std::accumulate
459 BuildNodeInfo nodeInfoInit(fNvars, eventSample[0]);
460
461 // #### Run the threads in parallel then merge the results
462 auto redfunc = [nodeInfoInit](std::vector<BuildNodeInfo> v) -> BuildNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
463 BuildNodeInfo nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
464 //NodeInfo nodeInfo(fNvars);
465
466 if (nodeInfo.s+nodeInfo.b < 0) {
467 Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
468 << "(Nsig="<<nodeInfo.s<<" Nbkg="<<nodeInfo.b<<" Probaby you use a Monte Carlo with negative weights. That should in principle "
469 << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
470 << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
471 << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
472 << "to allow for reasonable averaging!!!" << Endl
473 << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
474 << "with negative weight in the training." << Endl;
475 double nBkg=0.;
476 for (UInt_t i=0; i<eventSample.size(); i++) {
477 if (eventSample[i]->GetClass() != fSigClass) {
478 nBkg += eventSample[i]->GetWeight();
479 Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
480 << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
481 }
482 }
483 Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
484 }
485
486 node->SetNSigEvents(nodeInfo.s);
487 node->SetNBkgEvents(nodeInfo.b);
488 node->SetNSigEvents_unweighted(nodeInfo.suw);
489 node->SetNBkgEvents_unweighted(nodeInfo.buw);
490 node->SetNSigEvents_unboosted(nodeInfo.sub);
491 node->SetNBkgEvents_unboosted(nodeInfo.bub);
492 node->SetPurity();
493 if (node == this->GetRoot()) {
494 node->SetNEvents(nodeInfo.s+nodeInfo.b);
495 node->SetNEvents_unweighted(nodeInfo.suw+nodeInfo.buw);
496 node->SetNEvents_unboosted(nodeInfo.sub+nodeInfo.bub);
497 }
498
499 // save the min and max for each feature
500 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
501 node->SetSampleMin(ivar,nodeInfo.xmin[ivar]);
502 node->SetSampleMax(ivar,nodeInfo.xmax[ivar]);
503 }
504
505 // I now demand the minimum number of events for both daughter nodes. Hence if the number
506 // of events in the parent node is not at least two times as big, I don't even need to try
507 // splitting
508
509 // ask here for actual "events" independent of their weight.. OR the weighted events
510 // to exceed the min requested number of events per daughter node
511 // (NOTE: make sure that at the eventSample at the ROOT node has sum_of_weights == sample.size() !
512 // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
513 // std::cout << "------------------------------------------------------------------"<<std::endl;
514 // std::cout << "------------------------------------------------------------------"<<std::endl;
515 // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
516 if ((eventSample.size() >= 2*fMinSize && nodeInfo.s+nodeInfo.b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
517 && ( ( nodeInfo.s!=0 && nodeInfo.b !=0 && !DoRegression()) || ( (nodeInfo.s+nodeInfo.b)!=0 && DoRegression()) ) ) {
518
519 // Train the node and figure out the separation gain and split points
520 Double_t separationGain;
521 if (fNCuts > 0){
522 separationGain = this->TrainNodeFast(eventSample, node);
523 }
524 else {
525 separationGain = this->TrainNodeFull(eventSample, node);
526 }
527
528 // The node has been trained and there is nothing to be gained by splitting
529 if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
530 // no cut can actually do anything to improve the node
531 // hence, naturally, the current node is a leaf node
532 if (DoRegression()) {
533 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
534 node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
535 if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ){
536 node->SetRMS(0);
537 }else{
538 node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
539 }
540 }
541 else {
542 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
543 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
544 else node->SetNodeType(-1);
545 }
546 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
547 }
548 else {
549 // #### Couldn't parallelize this part (filtering events from mother node to daughter nodes)
550 // #### ... would need to avoid the push_back or use some threadsafe mutex locked version...
551 std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
552 std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
553
554 Double_t nRight=0, nLeft=0;
555 Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
556
557 for (UInt_t ie=0; ie< nevents ; ie++) {
558 if (node->GoesRight(*eventSample[ie])) {
559 rightSample.push_back(eventSample[ie]);
560 nRight += eventSample[ie]->GetWeight();
561 nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
562 }
563 else {
564 leftSample.push_back(eventSample[ie]);
565 nLeft += eventSample[ie]->GetWeight();
566 nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
567 }
568 }
569 // sanity check
570 if (leftSample.empty() || rightSample.empty()) {
571
572 Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
573 << "--- Hence new node == old node ... check" << Endl
574 << "--- left:" << leftSample.size()
575 << " right:" << rightSample.size() << Endl
576 << " while the separation is thought to be " << separationGain
577 << "\n when cutting on variable " << node->GetSelector()
578 << " at value " << node->GetCutValue()
579 << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
580 }
581
582 // continue building daughter nodes for the left and the right eventsample
583 TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
584 fNNodes++;
585 rightNode->SetNEvents(nRight);
586 rightNode->SetNEvents_unboosted(nRightUnBoosted);
587 rightNode->SetNEvents_unweighted(rightSample.size());
588
589 TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');
590
591 fNNodes++;
592 leftNode->SetNEvents(nLeft);
593 leftNode->SetNEvents_unboosted(nLeftUnBoosted);
594 leftNode->SetNEvents_unweighted(leftSample.size());
595
596 node->SetNodeType(0);
597 node->SetLeft(leftNode);
598 node->SetRight(rightNode);
599
600 this->BuildTree(rightSample, rightNode);
601 this->BuildTree(leftSample, leftNode );
602
603 }
604 }
605 else{ // it is a leaf node
606 if (DoRegression()) {
607 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
608 node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
609 if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b), nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ) {
610 node->SetRMS(0);
611 }else{
612 node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
613 }
614 }
615 else {
616 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
617 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
618 else node->SetNodeType(-1);
619 // loop through the event sample ending up in this node and check for events with negative weight
620 // those "cannot" be boosted normally. Hence, if there is one of those
621 // is misclassified, find randomly as many events with positive weights in this
622 // node as needed to get the same absolute number of weight, and mark them as
623 // "not to be boosted" in order to make up for not boosting the negative weight event
624 }
625
626
627 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
628 }
629
630 // if (IsRootNode) this->CleanTree();
631 return fNNodes;
632}
633
634// Standard DecisionTree::BuildTree (multithreading is not enabled)
635#else
636
637UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
638 TMVA::DecisionTreeNode *node)
639{
640 if (node==NULL) {
641 //start with the root node
642 node = new TMVA::DecisionTreeNode();
643 fNNodes = 1;
644 this->SetRoot(node);
645 // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
646 this->GetRoot()->SetPos('s');
647 this->GetRoot()->SetDepth(0);
648 this->GetRoot()->SetParentTree(this);
649 fMinSize = fMinNodeSize/100. * eventSample.size();
650 if (GetTreeID()==0){
651 Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
652 Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
653 Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
654 }
655 }
656
657 UInt_t nevents = eventSample.size();
658
659 if (nevents > 0 ) {
660 if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
661 fVariableImportance.resize(fNvars);
662 }
663 else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
664
665 Double_t s=0, b=0;
666 Double_t suw=0, buw=0;
667 Double_t sub=0, bub=0; // unboosted!
668 Double_t target=0, target2=0;
669 Float_t *xmin = new Float_t[fNvars];
670 Float_t *xmax = new Float_t[fNvars];
671
672 // initializing xmin and xmax for each variable
673 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
674 xmin[ivar]=xmax[ivar]=0;
675 }
676 // sum up the totals
677 // sig and bkg for classification
678 // err and err2 for regression
679 for (UInt_t iev=0; iev<eventSample.size(); iev++) {
680 const TMVA::Event* evt = eventSample[iev];
681 const Double_t weight = evt->GetWeight();
682 const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
683 if (evt->GetClass() == fSigClass) {
684 s += weight;
685 suw += 1;
686 sub += orgWeight;
687 }
688 else {
689 b += weight;
690 buw += 1;
691 bub += orgWeight;
692 }
693 if ( DoRegression() ) {
694 const Double_t tgt = evt->GetTarget(0);
695 target +=weight*tgt;
696 target2+=weight*tgt*tgt;
697 }
698
699 // save the min and max for each feature
700 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
701 const Double_t val = evt->GetValueFast(ivar);
702 if (iev==0) xmin[ivar]=xmax[ivar]=val;
703 if (val < xmin[ivar]) xmin[ivar]=val;
704 if (val > xmax[ivar]) xmax[ivar]=val;
705 }
706 }
707
708
709 if (s+b < 0) {
710 Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
711 << "(Nsig="<<s<<" Nbkg="<<b<<" Probaby you use a Monte Carlo with negative weights. That should in principle "
712 << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
713 << "minimul number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
714 << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
715 << "to allow for reasonable averaging!!!" << Endl
716 << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
717 << "with negative weight in the training." << Endl;
718 double nBkg=0.;
719 for (UInt_t i=0; i<eventSample.size(); i++) {
720 if (eventSample[i]->GetClass() != fSigClass) {
721 nBkg += eventSample[i]->GetWeight();
722 Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
723 << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
724 }
725 }
726 Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
727 }
728
729 node->SetNSigEvents(s);
730 node->SetNBkgEvents(b);
731 node->SetNSigEvents_unweighted(suw);
732 node->SetNBkgEvents_unweighted(buw);
733 node->SetNSigEvents_unboosted(sub);
734 node->SetNBkgEvents_unboosted(bub);
735 node->SetPurity();
736 if (node == this->GetRoot()) {
737 node->SetNEvents(s+b);
738 node->SetNEvents_unweighted(suw+buw);
739 node->SetNEvents_unboosted(sub+bub);
740 }
741
742 // save the min and max for each feature
743 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
744 node->SetSampleMin(ivar,xmin[ivar]);
745 node->SetSampleMax(ivar,xmax[ivar]);
746
747 }
748 delete[] xmin;
749 delete[] xmax;
750
751 // I now demand the minimum number of events for both daughter nodes. Hence if the number
752 // of events in the parent node is not at least two times as big, I don't even need to try
753 // splitting
754
755 // ask here for actual "events" independent of their weight.. OR the weighted events
756 // to exceed the min requested number of events per daughter node
757 // (NOTE: make sure that at the eventSample at the ROOT node has sum_of_weights == sample.size() !
758 // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
759 // std::cout << "------------------------------------------------------------------"<<std::endl;
760 // std::cout << "------------------------------------------------------------------"<<std::endl;
761 // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
762 if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
763 && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
764 Double_t separationGain;
765 if (fNCuts > 0){
766 separationGain = this->TrainNodeFast(eventSample, node);
767 } else {
768 separationGain = this->TrainNodeFull(eventSample, node);
769 }
770 if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
771 /// if (separationGain < 0.00000001) { // we could not gain anything, e.g. all events are in one bin,
772 // no cut can actually do anything to improve the node
773 // hence, naturally, the current node is a leaf node
774 if (DoRegression()) {
775 node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
776 node->SetResponse(target/(s+b));
777 if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
778 node->SetRMS(0);
779 }else{
780 node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
781 }
782 }
783 else {
784 node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
785
786 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
787 else node->SetNodeType(-1);
788 }
789 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
790
791 } else {
792
793 std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
794 std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
795
796 Double_t nRight=0, nLeft=0;
797 Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
798
799 for (UInt_t ie=0; ie< nevents ; ie++) {
800 if (node->GoesRight(*eventSample[ie])) {
801 rightSample.push_back(eventSample[ie]);
802 nRight += eventSample[ie]->GetWeight();
803 nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
804 }
805 else {
806 leftSample.push_back(eventSample[ie]);
807 nLeft += eventSample[ie]->GetWeight();
808 nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
809 }
810 }
811
812 // sanity check
813 if (leftSample.empty() || rightSample.empty()) {
814
815 Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
816 << "--- Hence new node == old node ... check" << Endl
817 << "--- left:" << leftSample.size()
818 << " right:" << rightSample.size() << Endl
819 << " while the separation is thought to be " << separationGain
820 << "\n when cutting on variable " << node->GetSelector()
821 << " at value " << node->GetCutValue()
822 << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
823 }
824
825 // continue building daughter nodes for the left and the right eventsample
826 TMVA::DecisionTreeNode *rightNode = new TMVA::DecisionTreeNode(node,'r');
827 fNNodes++;
828 rightNode->SetNEvents(nRight);
829 rightNode->SetNEvents_unboosted(nRightUnBoosted);
830 rightNode->SetNEvents_unweighted(rightSample.size());
831
832 TMVA::DecisionTreeNode *leftNode = new TMVA::DecisionTreeNode(node,'l');
833
834 fNNodes++;
835 leftNode->SetNEvents(nLeft);
836 leftNode->SetNEvents_unboosted(nLeftUnBoosted);
837 leftNode->SetNEvents_unweighted(leftSample.size());
838
839 node->SetNodeType(0);
840 node->SetLeft(leftNode);
841 node->SetRight(rightNode);
842
843 this->BuildTree(rightSample, rightNode);
844 this->BuildTree(leftSample, leftNode );
845
846 }
847 }
848 else{ // it is a leaf node
849 if (DoRegression()) {
850 node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
851 node->SetResponse(target/(s+b));
852 if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
853 node->SetRMS(0);
854 }else{
855 node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
856 }
857 }
858 else {
859 node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
860 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
861 else node->SetNodeType(-1);
862 // loop through the event sample ending up in this node and check for events with negative weight
863 // those "cannot" be boosted normally. Hence, if there is one of those
864 // is misclassified, find randomly as many events with positive weights in this
865 // node as needed to get the same absolute number of weight, and mark them as
866 // "not to be boosted" in order to make up for not boosting the negative weight event
867 }
868
869
870 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
871 }
872
873 // if (IsRootNode) this->CleanTree();
874 return fNNodes;
875}
876
877#endif
878
879////////////////////////////////////////////////////////////////////////////////
880/// fill the existing decision tree structure by filling the events
881/// in from the top node and seeing where they happen to end up
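/// A sketch of the intended use (assuming the statistics are reset first; names
/// other than the two methods are placeholders):
/// \code{.cpp}
/// tree.ClearTree();          // reset the S/B counts but keep the tree structure
/// tree.FillTree(newSample);  // newSample: std::vector<TMVA::Event*> with the new events
/// \endcode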
882
883void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )
884{
885 for (UInt_t i=0; i<eventSample.size(); i++) {
886 this->FillEvent(*(eventSample[i]),NULL);
887 }
888}
889
890////////////////////////////////////////////////////////////////////////////////
891/// fill the existing decision tree structure by filling the events
892/// in from the top node and seeing where they happen to end up
893
894void TMVA::DecisionTree::FillEvent( const TMVA::Event & event,
895 TMVA::DecisionTreeNode *node )
896{
897 if (node == NULL) { // that's the start, take the Root node
898 node = this->GetRoot();
899 }
900
901 node->IncrementNEvents( event.GetWeight() );
903
904 if (event.GetClass() == fSigClass) {
905 node->IncrementNSigEvents( event.GetWeight() );
907 }
908 else {
909 node->IncrementNBkgEvents( event.GetWeight() );
911 }
912 node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
913 node->GetNBkgEvents()));
914
915 if (node->GetNodeType() == 0) { //intermediate node --> go down
916 if (node->GoesRight(event))
917 this->FillEvent(event, node->GetRight());
918 else
919 this->FillEvent(event, node->GetLeft());
920 }
921}
922
923////////////////////////////////////////////////////////////////////////////////
924/// clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree
925
926void TMVA::DecisionTree::ClearTree()
927{
928 if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();
929
930}
931
932////////////////////////////////////////////////////////////////////////////////
933/// remove those last splits that result in two leaf nodes that
934/// are both of the same type (i.e. both signal or both background).
935/// This of course is only a reasonable thing to do when you use
936/// "YesOrNo" leaves, while it might lose something if you use the
937/// purity information in the nodes.
938/// --> hence it is not called automatically in the tree building
939
940UInt_t TMVA::DecisionTree::CleanTree( DecisionTreeNode *node )
941{
942 if (node==NULL) {
943 node = this->GetRoot();
944 }
945
946 DecisionTreeNode *l = node->GetLeft();
947 DecisionTreeNode *r = node->GetRight();
948
949 if (node->GetNodeType() == 0) {
950 this->CleanTree(l);
951 this->CleanTree(r);
952 if (l->GetNodeType() * r->GetNodeType() > 0) {
953
954 this->PruneNode(node);
955 }
956 }
957 // update the number of nodes after the cleaning
958 return this->CountNodes();
959
960}
961
962////////////////////////////////////////////////////////////////////////////////
963/// prune (get rid of internal nodes) the Decision tree to avoid overtraining
964/// several different pruning methods can be applied as selected by the
965/// variable "fPruneMethod".
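/// A hedged usage sketch (the setter names are taken from the data members used
/// below and from DecisionTree.h as commonly distributed; verify before use):
/// \code{.cpp}
/// tree.SetPruneMethod(TMVA::DecisionTree::kCostComplexityPruning);
/// tree.SetPruneStrength(5.0);                         // a non-positive value typically requests the automatic algorithm
/// Double_t alpha = tree.PruneTree(&validationSample); // validationSample: DecisionTree::EventConstList
/// \endcode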
966
967Double_t TMVA::DecisionTree::PruneTree( const EventConstList* validationSample )
968{
969 IPruneTool* tool(NULL);
970 PruningInfo* info(NULL);
971
972 if( fPruneMethod == kNoPruning ) return 0.0;
973
974 if (fPruneMethod == kExpectedErrorPruning)
975 // tool = new ExpectedErrorPruneTool(logfile);
976 tool = new ExpectedErrorPruneTool();
977 else if (fPruneMethod == kCostComplexityPruning)
978 {
979 tool = new CostComplexityPruneTool();
980 }
981 else {
982 Log() << kFATAL << "Selected pruning method not yet implemented "
983 << Endl;
984 }
985
986 if(!tool) return 0.0;
987
988 tool->SetPruneStrength(GetPruneStrength());
989 if(tool->IsAutomatic()) {
990 if(validationSample == NULL){
991 Log() << kFATAL << "Cannot automate the pruning algorithm without an "
992 << "independent validation sample!" << Endl;
993 }else if(validationSample->size() == 0) {
994 Log() << kFATAL << "Cannot automate the pruning algorithm with "
995 << "independent validation sample of ZERO events!" << Endl;
996 }
997 }
998
999 info = tool->CalculatePruningInfo(this,validationSample);
1000 Double_t pruneStrength=0;
1001 if(!info) {
1002 Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
1003 << Endl;
1004 } else {
1005 pruneStrength = info->PruneStrength;
1006
1007 // Log() << kDEBUG << "Optimal prune strength (alpha): " << pruneStrength
1008 // << " has quality index " << info->QualityIndex << Endl;
1009
1010
1011 for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {
1012
1013 PruneNode(info->PruneSequence[i]);
1014 }
1015 // update the number of nodes after the pruning
1016 this->CountNodes();
1017 }
1018
1019 delete tool;
1020 delete info;
1021
1022 return pruneStrength;
1023};
1024
1025
1026////////////////////////////////////////////////////////////////////////////////
1027/// run the validation sample through the (pruned) tree and fill in the nodes
1028/// the variables NSValidation and NBValidation (i.e. how many of the signal
1029/// and background events from the validation sample end up in each node).
1030/// This is then later used when asking for the "tree quality".
1031
1032void TMVA::DecisionTree::ApplyValidationSample( const EventConstList* validationSample ) const
1033{
1034 GetRoot()->ResetValidationData();
1035 for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
1036 CheckEventWithPrunedTree((*validationSample)[ievt]);
1037 }
1038}
1039
1040////////////////////////////////////////////////////////////////////////////////
1041/// return the misclassification rate of a pruned tree.
1042/// A "pruned tree" may have set the variable "IsTerminal" to "arbitrary" at
1043/// any node, hence this tree quality testing will stop there, and hence it tests
1044/// the pruned tree (while the full tree is still in place for normal/later use)
1045
1046Double_t TMVA::DecisionTree::TestPrunedTreeQuality( const DecisionTreeNode* n, Int_t mode ) const
1047{
1048 if (n == NULL) { // default, start at the tree top, then descend recursively
1049 n = this->GetRoot();
1050 if (n == NULL) {
1051 Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" <<Endl;
1052 return 0;
1053 }
1054 }
1055
1056 if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
1057 return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
1058 TestPrunedTreeQuality( n->GetRight(), mode ));
1059 }
1060 else { // terminal leaf (in a pruned subtree of T_max at least)
1061 if (DoRegression()) {
1062 Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
1063 return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
1064 }
1065 else {
1066 if (mode == 0) {
1067 if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
1068 return n->GetNBValidation();
1069 else
1070 return n->GetNSValidation();
1071 }
1072 else if ( mode == 1 ) {
1073 // calculate the weighted error using the pruning validation sample
1074 return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
1075 }
1076 else {
1077 throw std::string("Unknown ValidationQualityMode");
1078 }
1079 }
1080 }
1081}
1082
1083////////////////////////////////////////////////////////////////////////////////
1084/// pass a single validation event through a pruned decision tree
1085/// on the way down the tree, fill in all the "intermediate" information
1086/// that would normally be there from training.
1087
1088void TMVA::DecisionTree::CheckEventWithPrunedTree( const TMVA::Event* e ) const
1089{
1090 DecisionTreeNode* current = this->GetRoot();
1091 if (current == NULL) {
1092 Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" <<Endl;
1093 }
1094
1095 while(current != NULL) {
1096 if(e->GetClass() == fSigClass)
1097 current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
1098 else
1099 current->SetNBValidation(current->GetNBValidation() + e->GetWeight());
1100
1101 if (e->GetNTargets() > 0) {
1102 current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
1103 current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
1104 }
1105
1106 if (current->GetRight() == NULL || current->GetLeft() == NULL) {
1107 current = NULL;
1108 }
1109 else {
1110 if (current->GoesRight(*e))
1111 current = (TMVA::DecisionTreeNode*)current->GetRight();
1112 else
1113 current = (TMVA::DecisionTreeNode*)current->GetLeft();
1114 }
1115 }
1116}
1117
1118////////////////////////////////////////////////////////////////////////////////
1119/// calculate the normalization factor for a pruning validation sample
1120
1121Double_t TMVA::DecisionTree::GetSumWeights( const EventConstList* validationSample ) const
1122{
1123 Double_t sumWeights = 0.0;
1124 for( EventConstList::const_iterator it = validationSample->begin();
1125 it != validationSample->end(); ++it ) {
1126 sumWeights += (*it)->GetWeight();
1127 }
1128 return sumWeights;
1129}
1130
1131////////////////////////////////////////////////////////////////////////////////
1132/// return the number of terminal nodes in the sub-tree below Node n
1133
1134UInt_t TMVA::DecisionTree::CountLeafNodes( TMVA::Node *n )
1135{
1136 if (n == NULL) { // default, start at the tree top, then descend recursively
1137 n = this->GetRoot();
1138 if (n == NULL) {
1139 Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" <<Endl;
1140 return 0;
1141 }
1142 }
1143
1144 UInt_t countLeafs=0;
1145
1146 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
1147 countLeafs += 1;
1148 }
1149 else {
1150 if (this->GetLeftDaughter(n) != NULL) {
1151 countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
1152 }
1153 if (this->GetRightDaughter(n) != NULL) {
1154 countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
1155 }
1156 }
1157 return countLeafs;
1158}
1159
1160////////////////////////////////////////////////////////////////////////////////
1161/// descend a tree to find all its leaf nodes
1162
1163void TMVA::DecisionTree::DescendTree( Node* n )
1164{
1165 if (n == NULL) { // default, start at the tree top, then descend recursively
1166 n = this->GetRoot();
1167 if (n == NULL) {
1168 Log() << kFATAL << "DescendTree: started with undefined ROOT node" <<Endl;
1169 return ;
1170 }
1171 }
1172
1173 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
1174 // do nothing
1175 }
1176 else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
1177 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
1178 return;
1179 }
1180 else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
1181 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
1182 return;
1183 }
1184 else {
1185 if (this->GetLeftDaughter(n) != NULL) {
1186 this->DescendTree( this->GetLeftDaughter(n) );
1187 }
1188 if (this->GetRightDaughter(n) != NULL) {
1189 this->DescendTree( this->GetRightDaughter(n) );
1190 }
1191 }
1192}
1193
1194////////////////////////////////////////////////////////////////////////////////
1195/// prune away the subtree below the node
1196
1197void TMVA::DecisionTree::PruneNode( DecisionTreeNode *node )
1198{
1199 DecisionTreeNode *l = node->GetLeft();
1200 DecisionTreeNode *r = node->GetRight();
1201
1202 node->SetRight(NULL);
1203 node->SetLeft(NULL);
1204 node->SetSelector(-1);
1205 node->SetSeparationGain(-1);
1206 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
1207 else node->SetNodeType(-1);
1208 this->DeleteNode(l);
1209 this->DeleteNode(r);
1210 // update the stored number of nodes in the Tree
1211 this->CountNodes();
1212
1213}
1214
1215////////////////////////////////////////////////////////////////////////////////
1216/// prune a node temporarily (without actually deleting its descendants),
1217/// which allows testing the pruned tree quality for many different
1218/// pruning stages without "touching" the tree.
1219
1220void TMVA::DecisionTree::PruneNodeInPlace( DecisionTreeNode* node ) {
1221 if(node == NULL) return;
1222 node->SetNTerminal(1);
1223 node->SetSubTreeR( node->GetNodeR() );
1224 node->SetAlpha( std::numeric_limits<double>::infinity( ) );
1225 node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
1226 node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants FIXME not needed
1227}
1228
1229////////////////////////////////////////////////////////////////////////////////
1230/// retrieve node from the tree. Its position (up to a maximal tree depth of 64)
1231/// is coded as a sequence of left-right moves starting from the root, coded as
1232/// 0-1 bit patterns stored in the "long-integer" (i.e. 0:left ; 1:right
1233
1234TMVA::Node* TMVA::DecisionTree::GetNode( ULong_t sequence, UInt_t depth )
1235{
1236 Node* current = this->GetRoot();
1237
1238 for (UInt_t i =0; i < depth; i++) {
1239 ULong_t tmp = 1 << i;
1240 if ( tmp & sequence) current = this->GetRightDaughter(current);
1241 else current = this->GetLeftDaughter(current);
1242 }
1243
1244 return current;
1245}
1246
1247////////////////////////////////////////////////////////////////////////////////
1248///
1249
1250void TMVA::DecisionTree::GetRandomisedVariables(Bool_t *useVariable, UInt_t *mapVariable, UInt_t &useNvars){
1251 for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
1252 if (fUseNvars==0) { // no number specified ... choose s.th. which hopefully works well
1253 // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
1254 fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
1255 }
1256 if (fUsePoissonNvars) useNvars=TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t) fMyTrandom->Poisson(fUseNvars)));
1257 else useNvars = fUseNvars;
1258
1259 UInt_t nSelectedVars = 0;
1260 while (nSelectedVars < useNvars) {
1261 Double_t bla = fMyTrandom->Rndm()*fNvars;
1262 useVariable[Int_t (bla)] = kTRUE;
1263 nSelectedVars = 0;
1264 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1265 if (useVariable[ivar] == kTRUE) {
1266 mapVariable[nSelectedVars] = ivar;
1267 nSelectedVars++;
1268 }
1269 }
1270 }
1271 if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandomisedVariables()... sorry" << std::endl; std::exit(1);}
1272}
1273
1274// Multithreaded version of DecisionTree::TrainNodeFast
1275#ifdef R__USE_IMT
1276//====================================================================================
1277// Had to define this struct to enable parallelization in TrainNodeFast.
1278// Using TrainNodeInfo, each thread can return the information needed.
1279// After each thread returns we can then merge the results, hence the + operator.
1280//====================================================================================
1281struct TrainNodeInfo{
1282
1283 TrainNodeInfo(Int_t cNvars_, UInt_t* nBins_){
1284
1285 cNvars = cNvars_;
1286 nBins = nBins_;
1287
1288 nSelS = std::vector< std::vector<Double_t> >(cNvars);
1289 nSelB = std::vector< std::vector<Double_t> >(cNvars);
1290 nSelS_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
1291 nSelB_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
1292 target = std::vector< std::vector<Double_t> >(cNvars);
1293 target2 = std::vector< std::vector<Double_t> >(cNvars);
1294
1295 for(Int_t ivar=0; ivar<cNvars; ivar++){
1296 nSelS[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1297 nSelB[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1298 nSelS_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1299 nSelB_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1300 target[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1301 target2[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1302 }
1303 };
1304
1305 TrainNodeInfo(){};
1306
1307 // #### malloc problem if I define this and try to destruct xmin and xmax...
1308 // #### Maybe someone better at C++ can figure out why and fix this if it's a
1309 // ### serious memory problem
1310 //~TrainNodeInfo(){
1311 // delete [] xmin;
1312 // delete [] xmax;
1313 //};
1314
1315 Int_t cNvars = 0;
1316 UInt_t* nBins;
1317
1318 Double_t nTotS = 0;
1319 Double_t nTotS_unWeighted = 0;
1320 Double_t nTotB = 0;
1321 Double_t nTotB_unWeighted = 0;
1322
1323 std::vector< std::vector<Double_t> > nSelS;
1324 std::vector< std::vector<Double_t> > nSelB;
1325 std::vector< std::vector<Double_t> > nSelS_unWeighted;
1326 std::vector< std::vector<Double_t> > nSelB_unWeighted;
1327 std::vector< std::vector<Double_t> > target;
1328 std::vector< std::vector<Double_t> > target2;
1329
1330 // Define the addition operator for TrainNodeInfo
1331 // Make sure both TrainNodeInfos have the same nvars if we add them
1332 TrainNodeInfo operator+(const TrainNodeInfo& other)
1333 {
1334 TrainNodeInfo ret(cNvars, nBins);
1335
1336 // check that the two are compatible to add
1337 if(cNvars != other.cNvars)
1338 {
1339 std::cout << "!!! ERROR TrainNodeInfo1+TrainNodeInfo2 failure. cNvars1 != cNvars2." << std::endl;
1340 return ret;
1341 }
1342
1343 // add the signal, background, and target sums
1344 for (Int_t ivar=0; ivar<cNvars; ivar++) {
1345 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1346 ret.nSelS[ivar][ibin] = nSelS[ivar][ibin] + other.nSelS[ivar][ibin];
1347 ret.nSelB[ivar][ibin] = nSelB[ivar][ibin] + other.nSelB[ivar][ibin];
1348 ret.nSelS_unWeighted[ivar][ibin] = nSelS_unWeighted[ivar][ibin] + other.nSelS_unWeighted[ivar][ibin];
1349 ret.nSelB_unWeighted[ivar][ibin] = nSelB_unWeighted[ivar][ibin] + other.nSelB_unWeighted[ivar][ibin];
1350 ret.target[ivar][ibin] = target[ivar][ibin] + other.target[ivar][ibin];
1351 ret.target2[ivar][ibin] = target2[ivar][ibin] + other.target2[ivar][ibin];
1352 }
1353 }
1354
1355 ret.nTotS = nTotS + other.nTotS;
1356 ret.nTotS_unWeighted = nTotS_unWeighted + other.nTotS_unWeighted;
1357 ret.nTotB = nTotB + other.nTotB;
1358 ret.nTotB_unWeighted = nTotB_unWeighted + other.nTotB_unWeighted;
1359
1360 return ret;
1361 };
1362
1363};
1364//===========================================================================
1365// Done with TrainNodeInfo declaration
1366//===========================================================================
1367
1368////////////////////////////////////////////////////////////////////////////////
1369/// Decide how to split a node using one of the variables that gives
1370/// the best separation of signal/background. In order to do this, for each
1371/// variable a scan of the different cut values in a grid (grid = fNCuts) is
1372/// performed and the resulting separation gains are compared.
1373/// In addition to the individual variables, one can also ask for a Fisher
1374/// discriminant being built out of (some) of the variables and used as a
1375/// possible multivariate split.
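/// A hedged configuration sketch (setter names inferred from the data members
/// fUseFisherCuts, fMinLinCorrForFisher and fUseExclusiveVars; check DecisionTree.h):
/// \code{.cpp}
/// tree.SetUseFisherCuts(kTRUE);        // also scan a Fisher discriminant as split variable
/// tree.SetMinLinCorrForFisher(0.8);    // only variables with |correlation| above this enter the Fisher cut
/// tree.SetUseExclusiveVars(kFALSE);    // keep scanning the Fisher input variables individually as well
/// \endcode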
1376
1377Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
1378 TMVA::DecisionTreeNode *node )
1379{
1380 // #### OK let's comment this one to see how to parallelize it
1381 Double_t separationGainTotal = -1;
1382 Double_t *separationGain = new Double_t[fNvars+1];
1383 Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1384
1385 // #### initialize the sep gain and cut index values
1386 for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1387 separationGain[ivar]=-1;
1388 cutIndex[ivar]=-1;
1389 }
1390 // ### set up some other variables
1391 Int_t mxVar = -1;
1392 Bool_t cutType = kTRUE;
1393 UInt_t nevents = eventSample.size();
1394
1395
1396 // the +1 comes from the fact that I treat later on the Fisher output as an
1397 // additional possible variable.
1398 Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1399 UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1400
1401 std::vector<Double_t> fisherCoeff;
1402
1403 // #### set up a map to the subset of variables using two arrays
1404 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1405 UInt_t tmp=fUseNvars;
1406 GetRandomisedVariables(useVariable,mapVariable,tmp);
1407 }
1408 else {
1409 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1410 useVariable[ivar] = kTRUE;
1411 mapVariable[ivar] = ivar;
1412 }
1413 }
1414 // #### last variable entry is the fisher variable
1415 useVariable[fNvars] = kFALSE; //by default fisher is not used..
1416
1417 // #### Begin Fisher calculation
1418 Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1419 if (fUseFisherCuts) {
1420 useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1421
1422 //use for the Fisher discriminant ONLY those variables that show
1423 //some reasonable linear correlation in either Signal or Background
1424 Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1425 UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1426 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1427 useVarInFisher[ivar] = kFALSE;
1428 mapVarInFisher[ivar] = ivar;
1429 }
1430
1431 std::vector<TMatrixDSym*>* covMatrices;
1432 covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1433 if (!covMatrices){
1434 Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
1435 fisherOK = kFALSE;
1436 }else{
1437 TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1438 TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1439 const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1440 const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1441
1442 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1443 for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1444 if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1445 ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1446 useVarInFisher[ivar] = kTRUE;
1447 useVarInFisher[jvar] = kTRUE;
1448 }
1449 }
1450 }
1451 // now as you know which variables you want to use, count and map them:
1452 // such that you can use an array/matrix filled only with THOSE variables
1453 // that you used
1454 UInt_t nFisherVars = 0;
1455 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1456 //now .. pick those variables that are used in the Fisher and are also
1457 // part of the "allowed" variables in case of Randomized Trees)
1458 if (useVarInFisher[ivar] && useVariable[ivar]) {
1459 mapVarInFisher[nFisherVars++]=ivar;
1460 // now exclude the variables used in the Fisher cuts, and don't
1461 // use them anymore in the individual variable scan
1462 if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1463 }
1464 }
1465
1466
1467 fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1468 fisherOK = kTRUE;
1469 }
1470 delete [] useVarInFisher;
1471 delete [] mapVarInFisher;
1472
1473 }
1474 // #### End Fisher calculation
1475
1476
1477 UInt_t cNvars = fNvars;
1478 if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
1479
1480 // #### set up the binning info arrays
1481 // #### each var has its own binning since some may be integers
1482 UInt_t* nBins = new UInt_t [cNvars];
1483 Double_t* binWidth = new Double_t [cNvars];
1484 Double_t* invBinWidth = new Double_t [cNvars];
1485 Double_t** cutValues = new Double_t* [cNvars];
1486
1487 // #### set up the xmin and xmax arrays
1488 // #### each var has its own range
1489 Double_t *xmin = new Double_t[cNvars];
1490 Double_t *xmax = new Double_t[cNvars];
1491
1492 // construct and initialize binning/cuts
1493 for (UInt_t ivar=0; ivar<cNvars; ivar++) {
1494 // ncuts means that we need n+1 bins for each variable
1495 nBins[ivar] = fNCuts+1;
1496 if (ivar < fNvars) {
1497 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
1498 nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
1499 }
1500 }
1501
1502 cutValues[ivar] = new Double_t [nBins[ivar]];
1503 }
1504
1505 // init the range and cutvalues for each var now that we know the binning
1506 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1507 if (ivar < fNvars){
1508 xmin[ivar]=node->GetSampleMin(ivar);
1509 xmax[ivar]=node->GetSampleMax(ivar);
1510 if (almost_equal_float(xmax[ivar], xmin[ivar])) {
1511 // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
1512 // std::cout << " will set useVariable[ivar]=false"<<std::endl;
1513 useVariable[ivar]=kFALSE;
1514 }
1515
1516 } else { // the fisher variable
1517 xmin[ivar]=999;
1518 xmax[ivar]=-999;
1519 // too bad, for the moment I don't know how to do this without looping
1520 // once to get the "min max" and then AGAIN to fill the histogram
1521 for (UInt_t iev=0; iev<nevents; iev++) {
1522 // returns the Fisher value (no fixed range)
1523 Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
1524 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1525 result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1526 if (result > xmax[ivar]) xmax[ivar]=result;
1527 if (result < xmin[ivar]) xmin[ivar]=result;
1528 }
1529 }
1530 // this loop could take a long time if nbins is large
1531 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1532 cutValues[ivar][ibin]=0;
1533 }
1534 }
1535
1536 // ====================================================================
1537 // ====================================================================
1538 // Parallelized Version
1539 // ====================================================================
1540 // ====================================================================
1541
1542 // #### Figure out the cut values, loops through vars then through cuts
1543 // #### if ncuts is on the order of the amount of training data/10 - ish then we get some gains from parallelizing this
1544 // fill the cut values for the scan:
1545 auto varSeeds = ROOT::TSeqU(cNvars);
1546 auto fvarInitCuts = [this, &useVariable, &cutValues, &invBinWidth, &binWidth, &nBins, &xmin, &xmax](UInt_t ivar = 0){
1547
1548 if ( useVariable[ivar] ) {
1549
1550 //set the grid for the cut scan on the variables like this:
1551 //
1552 // | | | | | ... | |
1553 // xmin xmax
1554 //
1555 // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
1556 // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
1557 // --> nBins = fNCuts+1
1558 // (NOTE: the cuts at xmin or xmax would just give the whole sample and
1559 // hence can be safely omitted)
1560
1561 binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
1562 invBinWidth[ivar] = 1./binWidth[ivar];
1563 if (ivar < fNvars) {
1564 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
1565 }
1566
1567 // std::cout << "ivar="<<ivar
1568 // <<" min="<<xmin[ivar]
1569 // << " max="<<xmax[ivar]
1570 // << " width=" << istepSize
1571 // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
1572 for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
1573 cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
1574 // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
1575 }
1576 }
1577 return 0;
1578 };
1579 TMVA::Config::Instance().GetThreadExecutor().Map(fvarInitCuts, varSeeds);
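// Worked example of the cut grid above (illustrative only): assuming fNCuts = 4
// and a continuous variable with xmin = 0 and xmax = 10,
//   nBins     = fNCuts + 1 = 5
//   binWidth  = (10 - 0) / 5 = 2
//   cutValues = { 2, 4, 6, 8 }   // the nBins-1 = fNCuts interior cut positions
// Cuts placed exactly at xmin or xmax would keep or reject the whole sample and
// are therefore omitted; integer variables instead get one bin per integer value.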
1580
1581 // #### Loop through the events to get the total sig and background
1582 // #### Then loop through the vars to get the counts in each bin in each var
1583 // #### So we have a loop through the events and a loop through the vars, but no loop through the cuts: the bin index is computed directly
1584
1585 TrainNodeInfo nodeInfo(cNvars, nBins);
1587
1588 // #### When nbins is low compared to ndata this version of parallelization is faster, so use it
1589 // #### Parallelize by chunking the data into the same number of sections as we have processors
1590 if(eventSample.size() >= cNvars*fNCuts*nPartitions*2)
1591 {
1592 auto seeds = ROOT::TSeqU(nPartitions);
1593
1594 // need a lambda function to pass to TThreadExecutor::MapReduce
1595 auto f = [this, &eventSample, &fisherCoeff, &useVariable, &invBinWidth,
1596 &nBins, &xmin, &cNvars, &nPartitions](UInt_t partition = 0){
1597
1598 UInt_t start = 1.0*partition/nPartitions*eventSample.size();
1599 UInt_t end = (partition+1.0)/nPartitions*eventSample.size();
1600
1601 TrainNodeInfo nodeInfof(cNvars, nBins);
1602
1603 for(UInt_t iev=start; iev<end; iev++) {
1604
1605 Double_t eventWeight = eventSample[iev]->GetWeight();
1606 if (eventSample[iev]->GetClass() == fSigClass) {
1607 nodeInfof.nTotS+=eventWeight;
1608 nodeInfof.nTotS_unWeighted++; }
1609 else {
1610 nodeInfof.nTotB+=eventWeight;
1611 nodeInfof.nTotB_unWeighted++;
1612 }
1613
1614 // #### Count the number in each bin
1615 Int_t iBin=-1;
1616 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1617 // now scan through the cuts for each variable and find which one gives
1618 // the best separationGain at the current stage.
1619 if ( useVariable[ivar] ) {
1620 Double_t eventData;
1621 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1622 else { // the fisher variable
1623 eventData = fisherCoeff[fNvars];
1624 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1625 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1626
1627 }
1628 // #### figure out which bin it belongs in ...
1629 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1630 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1631 if (eventSample[iev]->GetClass() == fSigClass) {
1632 nodeInfof.nSelS[ivar][iBin]+=eventWeight;
1633 nodeInfof.nSelS_unWeighted[ivar][iBin]++;
1634 }
1635 else {
1636 nodeInfof.nSelB[ivar][iBin]+=eventWeight;
1637 nodeInfof.nSelB_unWeighted[ivar][iBin]++;
1638 }
1639 if (DoRegression()) {
1640 nodeInfof.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1641 nodeInfof.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1642 }
1643 }
1644 }
1645 }
1646 return nodeInfof;
1647 };
1648
1649 // #### Need an initial struct to pass to std::accumulate
1650 TrainNodeInfo nodeInfoInit(cNvars, nBins);
1651
1652 // #### Run the threads in parallel then merge the results
1653 auto redfunc = [nodeInfoInit](std::vector<TrainNodeInfo> v) -> TrainNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
1654 nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
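// Illustrative sketch of the chunking above: each partition processes a
// contiguous slice of the sample, e.g. with eventSample.size() = 1000 and
// nPartitions = 4, partition 2 covers events [500, 750). Every partition fills
// its own TrainNodeInfo histograms, and redfunc merges them via std::accumulate,
// which assumes TrainNodeInfo objects can be added together.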
1655 }
1656
1657
1658 // #### When nbins is close to the order of the data this version of parallelization is faster
1659 // #### Parallelize by vectorizing the variable loop
1660 else {
1661
1662 auto fvarFillNodeInfo = [this, &nodeInfo, &eventSample, &fisherCoeff, &useVariable, &invBinWidth, &nBins, &xmin](UInt_t ivar = 0){
1663
1664 for(UInt_t iev=0; iev<eventSample.size(); iev++) {
1665
1666 Int_t iBin=-1;
1667 Double_t eventWeight = eventSample[iev]->GetWeight();
1668
1669 // Only count the net signal and background once
1670 if(ivar==0){
1671 if (eventSample[iev]->GetClass() == fSigClass) {
1672 nodeInfo.nTotS+=eventWeight;
1673 nodeInfo.nTotS_unWeighted++; }
1674 else {
1675 nodeInfo.nTotB+=eventWeight;
1676 nodeInfo.nTotB_unWeighted++;
1677 }
1678 }
1679
1680 // Figure out which bin the event belongs in and increment the bin in each histogram vector appropriately
1681 if ( useVariable[ivar] ) {
1682 Double_t eventData;
1683 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1684 else { // the fisher variable
1685 eventData = fisherCoeff[fNvars];
1686 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1687 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1688
1689 }
1690 // #### figure out which bin it belongs in ...
1691 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1692 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1693 if (eventSample[iev]->GetClass() == fSigClass) {
1694 nodeInfo.nSelS[ivar][iBin]+=eventWeight;
1695 nodeInfo.nSelS_unWeighted[ivar][iBin]++;
1696 }
1697 else {
1698 nodeInfo.nSelB[ivar][iBin]+=eventWeight;
1699 nodeInfo.nSelB_unWeighted[ivar][iBin]++;
1700 }
1701 if (DoRegression()) {
1702 nodeInfo.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1703 nodeInfo.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1704 }
1705 }
1706 }
1707 return 0;
1708 };
1709
1710 TMVA::Config::Instance().GetThreadExecutor().Map(fvarFillNodeInfo, varSeeds);
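// Bin lookup used in both parallel variants above, shown with the example
// numbers from the cut-grid sketch (xmin = 0, invBinWidth = 0.5, nBins = 5):
//   eventData = 4.7  ->  invBinWidth*(eventData - xmin) = 2.35  ->  iBin = 2
// TMath::Max clamps values below xmin to bin 0 and TMath::Min clamps values at
// or above xmax to the last bin (nBins-1), so every event lands in some bin.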
1711 }
1712
1713 // now turn each "histogram" into a cumulative distribution
1714 // #### loops through the vars and then the bins, if the bins are on the order of the training data this is worth parallelizing
1715 // #### doesn't hurt otherwise, pretty unnoticeable
1716 auto fvarCumulative = [&nodeInfo, &useVariable, &nBins, this, &eventSample](UInt_t ivar = 0){
1717 if (useVariable[ivar]) {
1718 for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
1719 nodeInfo.nSelS[ivar][ibin]+=nodeInfo.nSelS[ivar][ibin-1];
1720 nodeInfo.nSelS_unWeighted[ivar][ibin]+=nodeInfo.nSelS_unWeighted[ivar][ibin-1];
1721 nodeInfo.nSelB[ivar][ibin]+=nodeInfo.nSelB[ivar][ibin-1];
1722 nodeInfo.nSelB_unWeighted[ivar][ibin]+=nodeInfo.nSelB_unWeighted[ivar][ibin-1];
1723 if (DoRegression()) {
1724 nodeInfo.target[ivar][ibin] +=nodeInfo.target[ivar][ibin-1] ;
1725 nodeInfo.target2[ivar][ibin]+=nodeInfo.target2[ivar][ibin-1];
1726 }
1727 }
1728 if (nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
1729 Log() << kFATAL << "Helge, you have a bug ....nodeInfo.nSelS_unw..+nodeInfo.nSelB_unw..= "
1730 << nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1]
1731 << " while eventsample size = " << eventSample.size()
1732 << Endl;
1733 }
1734 double lastBins=nodeInfo.nSelS[ivar][nBins[ivar]-1] +nodeInfo.nSelB[ivar][nBins[ivar]-1];
1735 double totalSum=nodeInfo.nTotS+nodeInfo.nTotB;
1736 if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
1737 Log() << kFATAL << "Helge, you have another bug ....nodeInfo.nSelS+nodeInfo.nSelB= "
1738 << lastBins
1739 << " while total number of events = " << totalSum
1740 << Endl;
1741 }
1742 }
1743 return 0;
1744 };
1745 TMVA::Config::Instance().GetThreadExecutor().Map(fvarCumulative, varSeeds);
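// After fvarCumulative, nSelS[ivar][iBin] no longer holds a single bin content
// but the running sum of bins 0..iBin, i.e. the signal weight falling on one
// side of cut position iBin. For example, per-bin signal counts {3, 1, 4, 2, 0}
// become {3, 4, 8, 10, 10}; the last entry must equal the total signal in the
// node, which is exactly what the two kFATAL consistency checks verify.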
1746
1747 // #### Again, if nbins is on the order of the training data (or within an order of magnitude or so), then this is worth parallelizing
1748 // now select the optimal cuts for each varable and find which one gives
1749 // the best separationGain at the current stage
1750 auto fvarMaxSep = [&nodeInfo, &useVariable, this, &separationGain, &cutIndex, &nBins] (UInt_t ivar = 0){
1751 if (useVariable[ivar]) {
1752 Double_t sepTmp;
1753 for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
1754 // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
1755 // calculated by the "SamplePurities" from the branches that would go to the
1756 // left or the right from this node if "these" cuts were used in the Node:
1757 // hereby: nodeInfo.nSelS and nodeInfo.nSelB would go to the right branch
1758 // (nodeInfo.nTotS - nodeInfo.nSelS) + (nodeInfo.nTotB - nodeInfo.nSelB) would go to the left branch;
1759
1760 // only allow splits where both daughter nodes match the specified minimum number
1761 // for this use the "unweighted" events, as you are interested in statistically
1762 // significant splits, which is determined by the actual number of entries
1763 // for a node, rather than the sum of event weights.
1764
1765 Double_t sl = nodeInfo.nSelS_unWeighted[ivar][iBin];
1766 Double_t bl = nodeInfo.nSelB_unWeighted[ivar][iBin];
1767 Double_t s = nodeInfo.nTotS_unWeighted;
1768 Double_t b = nodeInfo.nTotB_unWeighted;
1769 Double_t slW = nodeInfo.nSelS[ivar][iBin];
1770 Double_t blW = nodeInfo.nSelB[ivar][iBin];
1771 Double_t sW = nodeInfo.nTotS;
1772 Double_t bW = nodeInfo.nTotB;
1773 Double_t sr = s-sl;
1774 Double_t br = b-bl;
1775 Double_t srW = sW-slW;
1776 Double_t brW = bW-blW;
1777 // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
1778 if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
1779 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
1780 ) {
1781
1782 if (DoRegression()) {
1783 sepTmp = fRegType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin]+nodeInfo.nSelB[ivar][iBin],
1784 nodeInfo.target[ivar][iBin],nodeInfo.target2[ivar][iBin],
1785 nodeInfo.nTotS+nodeInfo.nTotB,
1786 nodeInfo.target[ivar][nBins[ivar]-1],nodeInfo.target2[ivar][nBins[ivar]-1]);
1787 } else {
1788 sepTmp = fSepType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin], nodeInfo.nSelB[ivar][iBin], nodeInfo.nTotS, nodeInfo.nTotB);
1789 }
1790 if (separationGain[ivar] < sepTmp) {
1791 separationGain[ivar] = sepTmp;
1792 cutIndex[ivar] = iBin;
1793 }
1794 }
1795 }
1796 }
1797 return 0;
1798 };
1799 TMVA::Config::Instance().GetThreadExecutor().Map(fvarMaxSep, varSeeds);
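// Rough numerical example of the separation gain maximised above, using the
// Gini index p*(1-p) for illustration (the actual index is whatever fSepType
// or fRegType was configured with):
//   parent:     s = 50, b = 50  -> p = 0.5, Gini = 0.25
//   one side:   s = 40, b = 10  -> p = 0.8, Gini = 0.16
//   other side: s = 10, b = 40  -> p = 0.2, Gini = 0.16
//   gain ~ 0.25 - 0.5*0.16 - 0.5*0.16 = 0.09
// fvarMaxSep keeps, per variable, the cut bin with the largest such gain,
// provided both daughters contain at least fMinSize events (weighted and
// unweighted).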
1800
1801 // you found the best separation cut for each variable, now compare the variables
1802 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1803 if (useVariable[ivar] ) {
1804 if (separationGainTotal < separationGain[ivar]) {
1805 separationGainTotal = separationGain[ivar];
1806 mxVar = ivar;
1807 }
1808 }
1809 }
1810
1811 if (mxVar >= 0) {
1812 if (DoRegression()) {
1813 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.nTotS+nodeInfo.nTotB,nodeInfo.target[0][nBins[mxVar]-1],nodeInfo.target2[0][nBins[mxVar]-1]));
1814 node->SetResponse(nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB));
1815 if ( almost_equal_double(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB), nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB))) {
1816 node->SetRMS(0);
1817
1818 }else{
1819 node->SetRMS(TMath::Sqrt(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB) - nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)));
1820 }
1821 }
1822 else {
1823 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.nTotS,nodeInfo.nTotB));
1824 if (mxVar >=0){
1825 if (nodeInfo.nSelS[mxVar][cutIndex[mxVar]]/nodeInfo.nTotS > nodeInfo.nSelB[mxVar][cutIndex[mxVar]]/nodeInfo.nTotB) cutType=kTRUE;
1826 else cutType=kFALSE;
1827 }
1828 }
1829 node->SetSelector((UInt_t)mxVar);
1830 node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
1831 node->SetCutType(cutType);
1832 node->SetSeparationGain(separationGainTotal);
1833 if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
1834 node->SetNFisherCoeff(0);
1835 fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1836 //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1837 }else{
1838 // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero). This might
1839 // even take less storage space on average than also storing the mapping used otherwise;
1840 // it can always be changed relatively easily
1841 node->SetNFisherCoeff(fNvars+1);
1842 for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
1843 node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
1844 // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though :)
1845 if (ivar<fNvars){
1846 fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1847 }
1848 }
1849 }
1850 }
1851 else {
1852 separationGainTotal = 0;
1853 }
1854
1855 // #### Now in TrainNodeInfo, but I got a malloc segfault when I tried to destruct arrays there.
1856 // #### So, I changed these from dynamic arrays to std::vector to fix this memory problem
1857 // #### so no need to destruct them anymore. I didn't see any performance drop as a result.
1858 for (UInt_t i=0; i<cNvars; i++) {
1859 // delete [] nodeInfo.nSelS[i];
1860 // delete [] nodeInfo.nSelB[i];
1861 // delete [] nodeInfo.nSelS_unWeighted[i];
1862 // delete [] nodeInfo.nSelB_unWeighted[i];
1863 // delete [] nodeInfo.target[i];
1864 // delete [] nodeInfo.target2[i];
1865 delete [] cutValues[i];
1866 }
1867 //delete [] nodeInfo.nSelS;
1868 //delete [] nodeInfo.nSelB;
1869 //delete [] nodeInfo.nSelS_unWeighted;
1870 //delete [] nodeInfo.nSelB_unWeighted;
1871 //delete [] nodeInfo.target;
1872 //delete [] nodeInfo.target2;
1873
1874 // #### left these as dynamic arrays as they were before
1875 // #### since I didn't need to mess with them for parallelization
1876 delete [] cutValues;
1877
1878 delete [] xmin;
1879 delete [] xmax;
1880
1881 delete [] useVariable;
1882 delete [] mapVariable;
1883
1884 delete [] separationGain;
1885 delete [] cutIndex;
1886
1887 delete [] nBins;
1888 delete [] binWidth;
1889 delete [] invBinWidth;
1890
1891 return separationGainTotal;
1892}
1893
1894// Standard version of DecisionTree::TrainNodeFast (multithreading is not enabled)
1895#else
1896 Double_t TMVA::DecisionTree::TrainNodeFast( const EventConstList & eventSample,
1897                                             TMVA::DecisionTreeNode *node )
1898 {
1899// #### OK let's comment this one to see how to parallelize it
1900 Double_t separationGainTotal = -1, sepTmp;
1901 Double_t *separationGain = new Double_t[fNvars+1];
1902 Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1903
1904 // #### initialize the sep gain and cut index values
1905 for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1906 separationGain[ivar]=-1;
1907 cutIndex[ivar]=-1;
1908 }
1909 // ### set up some other variables
1910 Int_t mxVar = -1;
1911 Bool_t cutType = kTRUE;
1912 Double_t nTotS, nTotB;
1913 Int_t nTotS_unWeighted, nTotB_unWeighted;
1914 UInt_t nevents = eventSample.size();
1915
1916
1917 // the +1 comes from the fact that I treat later on the Fisher output as an
1918 // additional possible variable.
1919 Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1920 UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1921
1922 std::vector<Double_t> fisherCoeff;
1923
1924 // #### set up a map to the subset of variables using two arrays
1925 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1926 UInt_t tmp=fUseNvars;
1927 GetRandomisedVariables(useVariable,mapVariable,tmp);
1928 }
1929 else {
1930 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1931 useVariable[ivar] = kTRUE;
1932 mapVariable[ivar] = ivar;
1933 }
1934 }
1935 // #### last variable entry is the fisher variable
1936 useVariable[fNvars] = kFALSE; //by default fisher is not used..
1937
1938 // #### Begin Fisher calculation
1939 Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1940 if (fUseFisherCuts) {
1941 useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1942
1943 //use for the Fisher discriminant ONLY those variables that show
1944 //some reasonable linear correlation in either Signal or Background
1945 Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1946 UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1947 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1948 useVarInFisher[ivar] = kFALSE;
1949 mapVarInFisher[ivar] = ivar;
1950 }
1951
1952 std::vector<TMatrixDSym*>* covMatrices;
1953 covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1954 if (!covMatrices){
1955 Log() << kWARNING << " in TrainNodeFast, the covariance matrices needed for the Fisher cuts returned an error --> reverting to normal cuts for this node" << Endl;
1956 fisherOK = kFALSE;
1957 }else{
1958 TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1959 TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1960 const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1961 const TMatrixD *b = gTools().GetCorrelationMatrix(bb);
1962
1963 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1964 for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1965 if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1966 ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1967 useVarInFisher[ivar] = kTRUE;
1968 useVarInFisher[jvar] = kTRUE;
1969 }
1970 }
1971 }
1972 // now as you know which variables you want to use, count and map them:
1973 // such that you can use an array/matrix filled only with THOSE variables
1974 // that you used
1975 UInt_t nFisherVars = 0;
1976 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1977 //now .. pick those variables that are used in the Fisher and are also
1978 // part of the "allowed" variables (in case of Randomised Trees)
1979 if (useVarInFisher[ivar] && useVariable[ivar]) {
1980 mapVarInFisher[nFisherVars++]=ivar;
1981 // now exclude the variables used in the Fisher cuts, and don't
1982 // use them anymore in the individual variable scan
1983 if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1984 }
1985 }
1986
1987
1988 fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1989 fisherOK = kTRUE;
1990 }
1991 delete [] useVarInFisher;
1992 delete [] mapVarInFisher;
1993
1994 }
1995 // #### End Fisher calculation
1996
1997
1998 UInt_t cNvars = fNvars;
1999 if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
2000
2001 // #### OK now what's going on...
2002 // #### looks like we are setting up histograms
2003 UInt_t* nBins = new UInt_t [cNvars];
2004 Double_t* binWidth = new Double_t [cNvars];
2005 Double_t* invBinWidth = new Double_t [cNvars];
2006
2007 Double_t** nSelS = new Double_t* [cNvars];
2008 Double_t** nSelB = new Double_t* [cNvars];
2009 Double_t** nSelS_unWeighted = new Double_t* [cNvars];
2010 Double_t** nSelB_unWeighted = new Double_t* [cNvars];
2011 Double_t** target = new Double_t* [cNvars];
2012 Double_t** target2 = new Double_t* [cNvars];
2013 Double_t** cutValues = new Double_t* [cNvars];
2014
2015 // #### looping through the variables...
2016 for (UInt_t ivar=0; ivar<cNvars; ivar++) {
2017 // #### ncuts means that we need n+1 bins for each variable
2018 nBins[ivar] = fNCuts+1;
2019 if (ivar < fNvars) {
2020 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
2021 nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
2022 }
2023 }
2024
2025 // #### make some new arrays for each ith var, size=nbins for each array
2026 // #### integer features get the same number of bins as values, set later
2027 nSelS[ivar] = new Double_t [nBins[ivar]];
2028 nSelB[ivar] = new Double_t [nBins[ivar]];
2029 nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
2030 nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
2031 target[ivar] = new Double_t [nBins[ivar]];
2032 target2[ivar] = new Double_t [nBins[ivar]];
2033 cutValues[ivar] = new Double_t [nBins[ivar]];
2034
2035 }
2036
2037 // #### xmin and xmax for each variable
2038 Double_t *xmin = new Double_t[cNvars];
2039 Double_t *xmax = new Double_t[cNvars];
2040
2041 // #### ok loop through each variable to initialize all the values
2042 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2043 if (ivar < fNvars){
2044 xmin[ivar]=node->GetSampleMin(ivar);
2045 xmax[ivar]=node->GetSampleMax(ivar);
2046 if (almost_equal_float(xmax[ivar], xmin[ivar])) {
2047 // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
2048 // std::cout << " will set useVariable[ivar]=false"<<std::endl;
2049 useVariable[ivar]=kFALSE;
2050 }
2051
2052 } else { // the fisher variable
2053 xmin[ivar]=999;
2054 xmax[ivar]=-999;
2055 // too bad, for the moment I don't know how to do this without looping
2056 // once to get the "min max" and then AGAIN to fill the histogram
2057 for (UInt_t iev=0; iev<nevents; iev++) {
2058 // returns the Fisher value (no fixed range)
2059 Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
2060 for (UInt_t jvar=0; jvar<fNvars; jvar++)
2061 result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2062 if (result > xmax[ivar]) xmax[ivar]=result;
2063 if (result < xmin[ivar]) xmin[ivar]=result;
2064 }
2065 }
2066 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
2067 nSelS[ivar][ibin]=0;
2068 nSelB[ivar][ibin]=0;
2069 nSelS_unWeighted[ivar][ibin]=0;
2070 nSelB_unWeighted[ivar][ibin]=0;
2071 target[ivar][ibin]=0;
2072 target2[ivar][ibin]=0;
2073 cutValues[ivar][ibin]=0;
2074 }
2075 }
2076
2077 // #### Nothing to parallelize here really, no loop through events
2078 // #### only figures out the bin edge values for the "histogram" arrays
2079 // fill the cut values for the scan:
2080 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2081
2082 if ( useVariable[ivar] ) {
2083
2084 //set the grid for the cut scan on the variables like this:
2085 //
2086 // | | | | | ... | |
2087 // xmin xmax
2088 //
2089 // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
2090 // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
2091 // --> nBins = fNCuts+1
2092 // (NOTE: the cuts at xmin or xmax would just give the whole sample and
2093 // hence can be safely omitted)
2094
2095 binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
2096 invBinWidth[ivar] = 1./binWidth[ivar];
2097 if (ivar < fNvars) {
2098 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
2099 }
2100
2101 // std::cout << "ivar="<<ivar
2102 // <<" min="<<xmin[ivar]
2103 // << " max="<<xmax[ivar]
2104 // << " widht=" << istepSize
2105 // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
2106 for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
2107 cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
2108 // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
2109 }
2110 }
2111 }
2112
2113 // #### Loop through the events to get the total sig and background
2114 nTotS=0; nTotB=0;
2115 nTotS_unWeighted=0; nTotB_unWeighted=0;
2116 for (UInt_t iev=0; iev<nevents; iev++) {
2117
2118 Double_t eventWeight = eventSample[iev]->GetWeight();
2119 if (eventSample[iev]->GetClass() == fSigClass) {
2120 nTotS+=eventWeight;
2121 nTotS_unWeighted++; }
2122 else {
2123 nTotB+=eventWeight;
2124 nTotB_unWeighted++;
2125 }
2126
2127 // #### Count the number in each bin (fill array "histograms")
2128 Int_t iBin=-1;
2129 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2130 // now scan through the cuts for each variable and find which one gives
2131 // the best separationGain at the current stage.
2132 if ( useVariable[ivar] ) {
2133 Double_t eventData;
2134 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
2135 else { // the fisher variable
2136 eventData = fisherCoeff[fNvars];
2137 for (UInt_t jvar=0; jvar<fNvars; jvar++)
2138 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2139
2140 }
2141 // #### figure out which bin the event belongs in ...
2142 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
2143 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
2144 if (eventSample[iev]->GetClass() == fSigClass) {
2145 nSelS[ivar][iBin]+=eventWeight;
2146 nSelS_unWeighted[ivar][iBin]++;
2147 }
2148 else {
2149 nSelB[ivar][iBin]+=eventWeight;
2150 nSelB_unWeighted[ivar][iBin]++;
2151 }
2152 if (DoRegression()) {
2153 target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
2154 target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
2155 }
2156 }
2157 }
2158 }
2159
2160 // now turn the "histogram" into a cumulative distribution
2161 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2162 if (useVariable[ivar]) {
2163 for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
2164 nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
2165 nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
2166 nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
2167 nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
2168 if (DoRegression()) {
2169 target[ivar][ibin] +=target[ivar][ibin-1] ;
2170 target2[ivar][ibin]+=target2[ivar][ibin-1];
2171 }
2172 }
2173 if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
2174 Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
2175 << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
2176 << " while eventsample size = " << eventSample.size()
2177 << Endl;
2178 }
2179 double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
2180 double totalSum=nTotS+nTotB;
2181 if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
2182 Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
2183 << lastBins
2184 << " while total number of events = " << totalSum
2185 << Endl;
2186 }
2187 }
2188 }
2189 // #### Loops over vars and bins, but not events, not worth parallelizing unless nbins is on the order of ndata/10 ish ...
2190 // now select the optimal cuts for each varable and find which one gives
2191 // the best separationGain at the current stage
2192 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2193 if (useVariable[ivar]) {
2194 for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
2195 // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
2196 // calculated by the "SamplePurities" from the branches that would go to the
2197 // left or the right from this node if "these" cuts were used in the Node:
2198 // hereby: nSelS and nSelB would go to the right branch
2199 // (nTotS - nSelS) + (nTotB - nSelB) would go to the left branch;
2200
2201 // only allow splits where both daughter nodes match the specified minimum number
2202 // for this use the "unweighted" events, as you are interested in statistically
2203 // significant splits, which is determined by the actual number of entries
2204 // for a node, rather than the sum of event weights.
2205
2206 Double_t sl = nSelS_unWeighted[ivar][iBin];
2207 Double_t bl = nSelB_unWeighted[ivar][iBin];
2208 Double_t s = nTotS_unWeighted;
2209 Double_t b = nTotB_unWeighted;
2210 Double_t slW = nSelS[ivar][iBin];
2211 Double_t blW = nSelB[ivar][iBin];
2212 Double_t sW = nTotS;
2213 Double_t bW = nTotB;
2214 Double_t sr = s-sl;
2215 Double_t br = b-bl;
2216 Double_t srW = sW-slW;
2217 Double_t brW = bW-blW;
2218 // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
2219 if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
2220 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
2221 ) {
2222
2223 if (DoRegression()) {
2224 sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
2225 target[ivar][iBin],target2[ivar][iBin],
2226 nTotS+nTotB,
2227 target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
2228 } else {
2229 sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
2230 }
2231 if (separationGain[ivar] < sepTmp) {
2232 separationGain[ivar] = sepTmp;
2233 cutIndex[ivar] = iBin;
2234 }
2235 }
2236 }
2237 }
2238 }
2239
2240 // now that you have found the best separation cut for each variable, compare the variables
2241 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2242 if (useVariable[ivar] ) {
2243 if (separationGainTotal < separationGain[ivar]) {
2244 separationGainTotal = separationGain[ivar];
2245 mxVar = ivar;
2246 }
2247 }
2248 }
2249
2250 if (mxVar >= 0) {
2251 if (DoRegression()) {
2252 node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
2253 node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
2254 if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
2255 node->SetRMS(0);
2256 }else{
2257 node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
2258 }
2259 }
2260 else {
2261 node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
2262 if (mxVar >=0){
2263 if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
2264 else cutType=kFALSE;
2265 }
2266 }
2267 node->SetSelector((UInt_t)mxVar);
2268 node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
2269 node->SetCutType(cutType);
2270 node->SetSeparationGain(separationGainTotal);
2271 if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
2272 node->SetNFisherCoeff(0);
2273 fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2274 //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
2275 }else{
2276 // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero). This might
2277 // even take less storage space on average than also storing the mapping used otherwise;
2278 // it can always be changed relatively easily
2279 node->SetNFisherCoeff(fNvars+1);
2280 for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
2281 node->SetFisherCoeff(ivar,fisherCoeff[ivar]);
2282 // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though :)
2283 if (ivar<fNvars){
2284 fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2285 }
2286 }
2287 }
2288 }
2289 else {
2290 separationGainTotal = 0;
2291 }
2292 // if (mxVar > -1) {
2293 // std::cout << "------------------------------------------------------------------"<<std::endl;
2294 // std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
2295 // std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2296 // std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2297 // std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
2298 // std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
2299 // << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
2300 // std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
2301 // std::cout << " separationGainTotal " << separationGainTotal << std::endl;
2302 // } else {
2303 // std::cout << "------------------------------------------------------------------"<<std::endl;
2304 // std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
2305 // }
2306 for (UInt_t i=0; i<cNvars; i++) {
2307 delete [] nSelS[i];
2308 delete [] nSelB[i];
2309 delete [] nSelS_unWeighted[i];
2310 delete [] nSelB_unWeighted[i];
2311 delete [] target[i];
2312 delete [] target2[i];
2313 delete [] cutValues[i];
2314 }
2315 delete [] nSelS;
2316 delete [] nSelB;
2317 delete [] nSelS_unWeighted;
2318 delete [] nSelB_unWeighted;
2319 delete [] target;
2320 delete [] target2;
2321 delete [] cutValues;
2322
2323 delete [] xmin;
2324 delete [] xmax;
2325
2326 delete [] useVariable;
2327 delete [] mapVariable;
2328
2329 delete [] separationGain;
2330 delete [] cutIndex;
2331
2332 delete [] nBins;
2333 delete [] binWidth;
2334 delete [] invBinWidth;
2335
2336 return separationGainTotal;
2337
2338}
2339#endif
2340
2341
2342////////////////////////////////////////////////////////////////////////////////
2343/// calculate the fisher coefficients for the event sample and the variables used
2344
2345std::vector<Double_t> TMVA::DecisionTree::GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher){
2346 std::vector<Double_t> fisherCoeff(fNvars+1);
2347
2348 // initialization of global matrices and vectors
2349 // average value of each variable for S, B, S+B
2350 TMatrixD* meanMatx = new TMatrixD( nFisherVars, 3 );
2351
2352 // the covariance 'within class' and 'between class' matrices
2353 TMatrixD* betw = new TMatrixD( nFisherVars, nFisherVars );
2354 TMatrixD* with = new TMatrixD( nFisherVars, nFisherVars );
2355 TMatrixD* cov = new TMatrixD( nFisherVars, nFisherVars );
2356
2357 //
2358 // compute mean values of variables in each sample, and the overall means
2359 //
2360
2361 // initialize internal sum-of-weights variables
2362 Double_t sumOfWeightsS = 0;
2363 Double_t sumOfWeightsB = 0;
2364
2365
2366 // init vectors
2367 Double_t* sumS = new Double_t[nFisherVars];
2368 Double_t* sumB = new Double_t[nFisherVars];
2369 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
2370
2371 UInt_t nevents = eventSample.size();
2372 // compute sample means
2373 for (UInt_t ievt=0; ievt<nevents; ievt++) {
2374
2375 // read the Training Event into "event"
2376 const Event * ev = eventSample[ievt];
2377
2378 // sum of weights
2379 Double_t weight = ev->GetWeight();
2380 if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
2381 else sumOfWeightsB += weight;
2382
2383 Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
2384 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2385 sum[ivar] += ev->GetValueFast( mapVarInFisher[ivar] )*weight;
2386 }
2387 }
2388 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2389 (*meanMatx)( ivar, 2 ) = sumS[ivar];
2390 (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
2391
2392 (*meanMatx)( ivar, 2 ) += sumB[ivar];
2393 (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
2394
2395 // signal + background
2396 (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
2397 }
2398
2399 delete [] sumS;
2400
2401 delete [] sumB;
2402
2403 // the matrix of covariance 'within class' reflects the dispersion of the
2404 // events relative to the center of gravity of their own class
2405
2406 // assert required
2407
2408 assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
2409
2410 // product matrices (x-<x>)(y-<y>) where x;y are variables
2411
2412 const Int_t nFisherVars2 = nFisherVars*nFisherVars;
2413 Double_t *sum2Sig = new Double_t[nFisherVars2];
2414 Double_t *sum2Bgd = new Double_t[nFisherVars2];
2415 Double_t *xval = new Double_t[nFisherVars2];
2416 memset(sum2Sig,0,nFisherVars2*sizeof(Double_t));
2417 memset(sum2Bgd,0,nFisherVars2*sizeof(Double_t));
2418
2419 // 'within class' covariance
2420 for (UInt_t ievt=0; ievt<nevents; ievt++) {
2421
2422 // read the Training Event into "event"
2423 // const Event* ev = eventSample[ievt];
2424 const Event* ev = eventSample.at(ievt);
2425
2426 Double_t weight = ev->GetWeight(); // may ignore events with negative weights
2427
2428 for (UInt_t x=0; x<nFisherVars; x++) {
2429 xval[x] = ev->GetValueFast( mapVarInFisher[x] );
2430 }
2431 Int_t k=0;
2432 for (UInt_t x=0; x<nFisherVars; x++) {
2433 for (UInt_t y=0; y<nFisherVars; y++) {
2434 if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
2435 else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
2436 k++;
2437 }
2438 }
2439 }
2440 Int_t k=0;
2441 for (UInt_t x=0; x<nFisherVars; x++) {
2442 for (UInt_t y=0; y<nFisherVars; y++) {
2443 (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
2444 k++;
2445 }
2446 }
2447
2448 delete [] sum2Sig;
2449 delete [] sum2Bgd;
2450 delete [] xval;
2451
2452
2453 // the matrix of covariance 'between class' reflects the dispersion of the
2454 // events of a class relative to the global center of gravity of all the classes,
2455 // hence the separation between classes
2456
2457
2458 Double_t prodSig, prodBgd;
2459
2460 for (UInt_t x=0; x<nFisherVars; x++) {
2461 for (UInt_t y=0; y<nFisherVars; y++) {
2462
2463 prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
2464 ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
2465 prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
2466 ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
2467
2468 (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
2469 }
2470 }
2471
2472
2473
2474 // compute full covariance matrix from sum of within and between matrices
2475 for (UInt_t x=0; x<nFisherVars; x++)
2476 for (UInt_t y=0; y<nFisherVars; y++)
2477 (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
2478
2479 // Fisher = Sum { [coeff]*[variables] }
2480 //
2481 // let Xs be the array of the mean values of variables for signal evts
2482 // let Xb be the array of the mean values of variables for backgd evts
2483 // let InvWith be the inverse matrix of the 'within class' correlation matrix
2484 //
2485 // then the array of Fisher coefficients is
2486 // [coeff] =TMath::Sqrt(fNsig*fNbgd)/fNevt*transpose{Xs-Xb}*InvWith
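// In index notation (illustration of the formula above), the code below computes
//   coeff_i = xfact * sum_j InvWith(i,j) * ( <x_j>_S - <x_j>_B )
//   xfact   = sqrt(W_S * W_B) / (W_S + W_B)
// and the offset stored at fisherCoeff[fNvars] is
//   F0 = -1/2 * sum_i coeff_i * ( <x_i>_S + <x_i>_B )
// so that the per-event Fisher value used in TrainNodeFast is
//   F(event) = F0 + sum_i coeff_i * x_i(event)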
2487 TMatrixD* theMat = with; // Fishers original
2488 // TMatrixD* theMat = cov; // Mahalanobis
2489
2490 TMatrixD invCov( *theMat );
2491 if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
2492 Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
2493 << TMath::Abs(invCov.Determinant())
2494 << " did you use the variables that are linear combinations or highly correlated?"
2495 << Endl;
2496 }
2497 if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
2498 Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
2499 << TMath::Abs(invCov.Determinant())
2500 << " did you use the variables that are linear combinations?"
2501 << Endl;
2502 }
2503
2504 invCov.Invert();
2505
2506 // apply rescaling factor
2507 Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);
2508
2509 // compute difference of mean values
2510 std::vector<Double_t> diffMeans( nFisherVars );
2511
2512 for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
2513 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2514 for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
2515 Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
2516 fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
2517 }
2518
2519 // rescale
2520 fisherCoeff[mapVarInFisher[ivar]] *= xfact;
2521 }
2522
2523 // offset correction
2524 Double_t f0 = 0.0;
2525 for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
2526 f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
2527 }
2528 f0 /= -2.0;
2529
2530 fisherCoeff[fNvars] = f0; //as we start counting variables from "zero", I store the fisher offset at the END
2531
2532 return fisherCoeff;
2533}
2534
2535////////////////////////////////////////////////////////////////////////////////
2536/// train a node by finding the single optimal cut for a single variable
2537/// that best separates signal and background (maximizes the separation gain)
2538
2541{
2542 Double_t nTotS = 0.0, nTotB = 0.0;
2543 Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
2544
2545 std::vector<TMVA::BDTEventWrapper> bdtEventSample;
2546
2547 // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
2548 // (each entry could be filled in parallel without conflicts)
2549 std::vector<Double_t> lCutValue( fNvars, 0.0 );
2550 std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
2551 std::vector<Char_t> lCutType( fNvars ); // <----- bool is stored (for performance reasons, std::vector<bool> is not used)
2552 lCutType.assign( fNvars, Char_t(kFALSE) );
2553
2554 // Initialize (un)weighted counters for signal & background
2555 // Construct a list of event wrappers that point to the original data
2556 for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
2557 if((*it)->GetClass() == fSigClass) { // signal or background event
2558 nTotS += (*it)->GetWeight();
2559 ++nTotS_unWeighted;
2560 }
2561 else {
2562 nTotB += (*it)->GetWeight();
2563 ++nTotB_unWeighted;
2564 }
2565 bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
2566 }
2567
2568 std::vector<Char_t> useVariable(fNvars); // <----- bool is stored (for performance reasons, std::vector<bool> is not used)
2569 useVariable.assign( fNvars, Char_t(kTRUE) );
2570
2571 for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
2572 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
2573 if (fUseNvars ==0 ) { // no number specified ... choose s.th. which hopefully works well
2574 // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
2575 fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
2576 }
2577 Int_t nSelectedVars = 0;
2578 while (nSelectedVars < fUseNvars) {
2579 Double_t bla = fMyTrandom->Rndm()*fNvars;
2580 useVariable[Int_t (bla)] = Char_t(kTRUE);
2581 nSelectedVars = 0;
2582 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
2583 if(useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
2584 }
2585 }
2586 }
2587 else {
2588 for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
2589 }
2590 for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
2591 if(!useVariable[ivar]) continue; // only optimize with selected variables
2592
2593 TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
2594
2595 std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
2596
2597
2598 Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
2599
2600 std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
2601 for( ; it != it_end; ++it ) {
2602 if((**it)->GetClass() == fSigClass ) // specify signal or background event
2603 sigWeightCtr += (**it)->GetWeight();
2604 else
2605 bkgWeightCtr += (**it)->GetWeight();
2606 // Store the accumulated signal (background) weights
2607 it->SetCumulativeWeight(false,bkgWeightCtr);
2608 it->SetCumulativeWeight(true,sigWeightCtr);
2609 }
2610
2611 const Double_t fPMin = 1.0e-6;
2612 Bool_t cutType = kFALSE;
2613 Long64_t index = 0;
2614 Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
2615
2616 // Locate the optimal cut for this (ivar-th) variable
2617 for( it = bdtEventSample.begin(); it != it_end; ++it ) {
2618 if( index == 0 ) { ++index; continue; }
2619 if( *(*it) == NULL ) {
2620 Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
2621 << index << ", and parent node=" << node->GetParent() << Endl;
2622 break;
2623 }
2624 dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
2625 norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
2626 // Only allow splits where both daughter nodes have the specified minimum number of events
2627 // Splits are only sensible when the data are ordered (eg. don't split inside a sequence of 0's)
2628 if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
2629
2630 sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
2631 if( sepTmp > separationGain ) {
2632 separationGain = sepTmp;
2633 cutValue = it->GetVal() - 0.5*dVal;
2634 Double_t nSelS = it->GetCumulativeWeight(true);
2635 Double_t nSelB = it->GetCumulativeWeight(false);
2636 // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
2637 // or by removing signal (enhancing background)
2638 if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
2639 else cutType = kFALSE;
2640 }
2641 }
2642 ++index;
2643 }
2644 lCutType[ivar] = Char_t(cutType);
2645 lCutValue[ivar] = cutValue;
2646 lSepGain[ivar] = separationGain;
2647 }
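// Illustration of the cut placement in the loop above: with the events sorted
// in the ivar-th variable, a candidate cut between neighbouring values v[i-1]
// and v[i] is placed halfway between them,
//   cutValue = v[i] - 0.5*(v[i] - v[i-1])
// e.g. neighbouring values 2.0 and 3.0 give a cut at 2.5. The relative-gap test
// |dVal/(0.5*norm + 1)| > fPMin skips splits between numerically identical
// values, and the fMinSize requirement keeps both daughter nodes populated.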
2648 Double_t separationGain = -1.0;
2649 Int_t iVarIndex = -1;
2650 for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
2651 if( lSepGain[ivar] > separationGain ) {
2652 iVarIndex = ivar;
2653 separationGain = lSepGain[ivar];
2654 }
2655 }
2656
2657 // #### storing the best values into the node
2658 if(iVarIndex >= 0) {
2659 node->SetSelector(iVarIndex);
2660 node->SetCutValue(lCutValue[iVarIndex]);
2661 node->SetSeparationGain(lSepGain[iVarIndex]);
2662 node->SetCutType(lCutType[iVarIndex]);
2663 fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
2664 }
2665 else {
2666 separationGain = 0.0;
2667 }
2668
2669 return separationGain;
2670}
2671
2672////////////////////////////////////////////////////////////////////////////////
2673/// get the pointer to the leaf node where a particular event ends up in...
2674/// (used in gradient boosting)
2675
2677{
2678 TMVA::DecisionTreeNode *current = (TMVA::DecisionTreeNode*)this->GetRoot();
2679 while(current->GetNodeType() == 0) { // intermediate node in a tree
2680 current = (current->GoesRight(e)) ?
2681 (TMVA::DecisionTreeNode*)current->GetRight() :
2682 (TMVA::DecisionTreeNode*)current->GetLeft();
2683 }
2684 return current;
2685}
2686
2687////////////////////////////////////////////////////////////////////////////////
2688/// the event e is put into the decision tree (starting at the root node)
2689/// and the output is NodeType (signal) or (background) of the final node (basket)
2690 /// in which the given event ends up, i.e. the result of the classification of
2691 /// the event for this decision tree.
2692
2694{
2695 TMVA::DecisionTreeNode *current = this->GetRoot();
2696 if (!current){
2697 Log() << kFATAL << "CheckEvent: started with undefined ROOT node" <<Endl;
2698 return 0; //keeps coverity happy that doesn't know that kFATAL causes an exit
2699 }
2700
2701 while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
2702 current = (current->GoesRight(*e)) ?
2703 current->GetRight() :
2704 current->GetLeft();
2705 if (!current) {
2706 Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
2707 }
2708
2709 }
2710
2711 if (DoRegression()) {
2712 // Note: This path is also taken for MethodBDT with analysis type
2713 // kClassification and kMulticlass when using GradBoost.
2714 // See TMVA::MethodBDT::InitGradBoost
2715 return current->GetResponse();
2716 } else {
2717 if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
2718 else return current->GetPurity();
2719 }
2720}
2721
2722////////////////////////////////////////////////////////////////////////////////
2723/// calculates the purity S/(S+B) of a given event sample
2724
2725Double_t TMVA::DecisionTree::SamplePurity( std::vector<TMVA::Event*> eventSample )
2726{
2727 Double_t sumsig=0, sumbkg=0, sumtot=0;
2728 for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
2729 if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
2730 else sumsig+=eventSample[ievt]->GetWeight();
2731 sumtot+=eventSample[ievt]->GetWeight();
2732 }
2733 // sanity check
2734 if (sumtot!= (sumsig+sumbkg)){
2735 Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
2736 << sumtot << " " << sumsig << " " << sumbkg << Endl;
2737 }
2738 if (sumtot>0) return sumsig/(sumsig + sumbkg);
2739 else return -1;
2740}
2741
2742////////////////////////////////////////////////////////////////////////////////
2743/// Return the relative variable importance, normalized to all
2744 /// variables together having the importance 1. The importance is
2745 /// evaluated as the total separation-gain that this variable had in
2746/// the decision trees (weighted by the number of events)
2747
2749{
2750 std::vector<Double_t> relativeImportance(fNvars);
2751 Double_t sum=0;
2752 for (UInt_t i=0; i< fNvars; i++) {
2753 sum += fVariableImportance[i];
2754 relativeImportance[i] = fVariableImportance[i];
2755 }
2756
2757 for (UInt_t i=0; i< fNvars; i++) {
2758 if (sum > std::numeric_limits<double>::epsilon())
2759 relativeImportance[i] /= sum;
2760 else
2761 relativeImportance[i] = 0;
2762 }
2763 return relativeImportance;
2764}
2765
2766////////////////////////////////////////////////////////////////////////////////
2767/// returns the relative importance of variable ivar
2768
2770{
2771 std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2772 if (ivar < fNvars) return relativeImportance[ivar];
2773 else {
2774 Log() << kFATAL << "<GetVariableImportance>" << Endl
2775 << "--- ivar = " << ivar << " is out of range " << Endl;
2776 }
2777
2778 return -1;
2779}
2780
return the number of terminal nodes in the sub-tree below Node n
Double_t TestPrunedTreeQuality(const DecisionTreeNode *dt=NULL, Int_t mode=0) const
return the misclassification rate of a pruned tree a "pruned tree" may have set the variable "IsTermi...
void ClearTree()
clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree
static DecisionTree * CreateFromXML(void *node, UInt_t tmva_Version_Code=TMVA_VERSION_CODE)
re-create a new tree (decision tree or search tree) from XML
Double_t SamplePurity(EventList eventSample)
calculates the purity S/(S+B) of a given event sample
void SetParentTreeInNodes(Node *n=NULL)
descend a tree to find all its leaf nodes, fill max depth reached in the tree at the same time.
Node * GetNode(ULong_t sequence, UInt_t depth)
retrieve node from the tree.
std::vector< Double_t > GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher)
calculate the fisher coefficients for the event sample and the variables used
Double_t TrainNodeFast(const EventConstList &eventSample, DecisionTreeNode *node)
Decide how to split a node using one of the variables that gives the best separation of signal/backgr...
void DescendTree(Node *n=NULL)
descend a tree to find all its leaf nodes
RegressionVariance * fRegType
Definition: DecisionTree.h:211
DecisionTree(void)
default constructor using the GiniIndex as separation criterion, no restrictions on minium number of ...
Double_t GetSumWeights(const EventConstList *validationSample) const
calculate the normalization factor for a pruning validation sample
Double_t GetOriginalWeight() const
Definition: Event.h:84
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not.
Definition: Event.cxx:382
UInt_t GetClass() const
Definition: Event.h:86
Float_t GetValueFast(UInt_t ivar) const
Definition: Event.h:93
Float_t GetTarget(UInt_t itgt) const
Definition: Event.h:102
unsigned int GetPoolSize() const
Definition: Executor.h:99
auto MapReduce(F func, ROOT::TSeq< INTEGER > args, R redfunc) -> typename std::result_of< F(INTEGER)>::type
Wrap TExecutor::MapReduce functions.
Definition: Executor.h:145
auto Map(F func, unsigned nTimes) -> std::vector< typename std::result_of< F()>::type >
Wrap TExecutor::Map functions.
Definition: Executor.h:133
A helper class to prune a decision tree using the expected error (C4.5) method.
IPruneTool - a helper interface class to prune a decision tree.
Definition: IPruneTool.h:70
void SetPruneStrength(Double_t alpha)
Definition: IPruneTool.h:88
virtual PruningInfo * CalculatePruningInfo(DecisionTree *dt, const EventSample *testEvents=NULL, Bool_t isAutomatic=kFALSE)=0
Bool_t IsAutomatic() const
Definition: IPruneTool.h:95
Node for the BinarySearch or Decision Trees.
Definition: Node.h:56
UInt_t GetDepth() const
Definition: Node.h:114
std::vector< DecisionTreeNode * > PruneSequence
the regularization parameter for pruning
Definition: IPruneTool.h:47
Double_t PruneStrength
quality measure for a pruned subtree T of T_max
Definition: IPruneTool.h:46
Calculate the "SeparationGain" for Regression analysis separation criteria used in various training a...
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
const TMatrixD * GetCorrelationMatrix(const TMatrixD *covMat)
turns covariance into correlation matrix
Definition: Tools.cxx:336
void ReadAttr(void *node, const char *, T &value)
read attribute from xml
Definition: Tools.h:337
std::vector< TMatrixDSym * > * CalcCovarianceMatrices(const std::vector< Event * > &events, Int_t maxCls, VariableTransformBase *transformBase=0)
compute covariance matrices
Definition: Tools.cxx:1526
Singleton class for Global types used by TMVA.
Definition: Types.h:73
@ kClassification
Definition: Types.h:128
@ kRegression
Definition: Types.h:129
TMatrixT< Element > & Invert(Double_t *det=0)
Invert the matrix and calculate its determinant.
Definition: TMatrixT.cxx:1396
virtual Double_t Determinant() const
Return the matrix determinant.
Definition: TMatrixT.cxx:1361
Random number generator class based on M.
Definition: TRandom3.h:27
Double_t y[n]
Definition: legend1.C:17
Double_t x[n]
Definition: legend1.C:17
const Int_t n
Definition: legend1.C:16
TClass * GetClass(T *)
Definition: TClass.h:608
TSeq< unsigned int > TSeqU
Definition: TSeq.hxx:195
static constexpr double sr
static constexpr double s
Tools & gTools()
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158
Short_t Max(Short_t a, Short_t b)
Definition: TMathBase.h:212
constexpr Double_t E()
Base of natural log:
Definition: TMath.h:97
Double_t Log(Double_t x)
Definition: TMath.h:750
Double_t Sqrt(Double_t x)
Definition: TMath.h:681
Short_t Min(Short_t a, Short_t b)
Definition: TMathBase.h:180
Short_t Abs(Short_t d)
Definition: TMathBase.h:120
auto * l
Definition: textangle.C:4
static long int sum(long int i)
Definition: Factory.cxx:2276
REAL epsilon
Definition: triangle.c:617