DecisionTree.cxx
1// @(#)root/tmva $Id$
2// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss, Eckhard von Toerne, Jan Therhaag
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TMVA::DecisionTree *
8 * *
9 * *
10 * Description: *
11 * Implementation of a Decision Tree *
12 * *
13 * Authors (alphabetical): *
14 * Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland *
15 * Helge Voss <Helge.Voss@cern.ch> - MPI-K Heidelberg, Germany *
16 * Kai Voss <Kai.Voss@cern.ch> - U. of Victoria, Canada *
17 * Eckhard v. Toerne <evt@uni-bonn.de> - U of Bonn, Germany *
18 * Jan Therhaag <Jan.Therhaag@cern.ch> - U of Bonn, Germany *
19 * *
20 * Copyright (c) 2005-2011: *
21 * CERN, Switzerland *
22 * U. of Victoria, Canada *
23 * MPI-K Heidelberg, Germany *
24 * U. of Bonn, Germany *
25 * *
26 * Redistribution and use in source and binary forms, with or without *
27 * modification, are permitted according to the terms listed in LICENSE *
28 * (http://mva.sourceforge.net/license.txt) *
29 * *
30 **********************************************************************************/
31
32/*! \class TMVA::DecisionTree
33\ingroup TMVA
34
35Implementation of a Decision Tree
36
37In a decision tree successive decision nodes are used to categorize the
38events out of the sample as either signal or background. Each node
39uses only a single discriminating variable to decide if the event is
40signal-like ("goes right") or background-like ("goes left"). This
41forms a tree-like structure with "baskets" at the end (leaf nodes),
42and an event is classified as either signal or background according to
43whether the basket where it ends up has been classified signal or
44background during the training. Training of a decision tree is the
45process of defining the "cut criteria" for each node. The training
46starts with the root node. Here one takes the full training event
47sample and selects the variable and corresponding cut value that gives
48the best separation between signal and background at this stage. Using
49this cut criterion, the sample is then divided into two subsamples, a
50signal-like (right) and a background-like (left) sample. Two new nodes
51are then created for each of the two sub-samples and they are
52constructed using the same mechanism as described for the root
53node. The division is stopped once a certain node has reached either a
54minimum number of events, or a minimum or maximum signal purity. These
55leaf nodes are then called "signal" or "background" depending on whether
56they contain more signal or background events from the training sample.
57
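A rough usage sketch (illustrative only; the constructor arguments and the way the
event sample is prepared are assumptions, not the exact API; see DecisionTree.h and
MethodBDT for the authoritative usage):

    std::vector<const TMVA::Event*> trainEvents;            // filled elsewhere (hypothetical)
    TMVA::DecisionTree tree(sepType, minNodeSizePercent,
                            nCuts, &dataSetInfo);           // e.g. sepType = new GiniIndex()
    tree.BuildTree(trainEvents);    // grow the tree recursively, starting from the root node
    tree.CleanTree();               // optionally merge splits that yield two leaves of the same type
    // Double_t alpha = tree.PruneTree(&validationEvents);  // optionally prune with an independent sample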
58*/
59
60#include <iostream>
61#include <algorithm>
62#include <vector>
63#include <limits>
64#include <cassert>
65
66#include "TRandom3.h"
67#include "TMath.h"
68#include "TMatrix.h"
69
70#include "TMVA/MsgLogger.h"
71#include "TMVA/DecisionTree.h"
74
75#include "TMVA/Tools.h"
76#include "TMVA/Config.h"
77
78#include "TMVA/GiniIndex.h"
79#include "TMVA/CrossEntropy.h"
81#include "TMVA/SdivSqrtSplusB.h"
82#include "TMVA/Event.h"
84#include "TMVA/IPruneTool.h"
87
88const Int_t TMVA::DecisionTree::fgRandomSeed = 0; // set nonzero for debugging and zero for random seeds
89
90using std::vector;
91
92
93bool almost_equal_float(float x, float y, int ulp=4){
94 // the machine epsilon has to be scaled to the magnitude of the values used
95 // and multiplied by the desired precision in ULPs (units in the last place)
96 return std::abs(x-y) < std::numeric_limits<float>::epsilon() * std::abs(x+y) * ulp
97 // unless the result is subnormal
98 || std::abs(x-y) < std::numeric_limits<float>::min();
99}
100
101bool almost_equal_double(double x, double y, int ulp=4){
102 // the machine epsilon has to be scaled to the magnitude of the values used
103 // and multiplied by the desired precision in ULPs (units in the last place)
104 return std::abs(x-y) < std::numeric_limits<double>::epsilon() * std::abs(x+y) * ulp
105 // unless the result is subnormal
106 || std::abs(x-y) < std::numeric_limits<double>::min();
107}
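// Illustration (sketch, not used below): 0.1 + 0.2 is stored as 0.30000000000000004, so the
// exact comparison (0.1 + 0.2 == 0.3) is false, while almost_equal_double(0.1 + 0.2, 0.3)
// returns true, because the two doubles differ by only about one ULP, well within the
// default 4-ULP tolerance.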
108
109////////////////////////////////////////////////////////////////////////////////
110/// default constructor using the GiniIndex as separation criterion,
111/// no restrictions on the minimum number of events in a leaf node or the
112/// separation gain in the node splitting
113
115 BinaryTree(),
116 fNvars (0),
117 fNCuts (-1),
118 fUseFisherCuts (kFALSE),
119 fMinLinCorrForFisher (1),
120 fUseExclusiveVars (kTRUE),
121 fSepType (NULL),
122 fRegType (NULL),
123 fMinSize (0),
124 fMinNodeSize (1),
125 fMinSepGain (0),
126 fUseSearchTree(kFALSE),
127 fPruneStrength(0),
128 fPruneMethod (kNoPruning),
129 fNNodesBeforePruning(0),
130 fNodePurityLimit(0.5),
131 fRandomisedTree (kFALSE),
132 fUseNvars (0),
133 fUsePoissonNvars(kFALSE),
134 fMyTrandom (NULL),
135 fMaxDepth (999999),
136 fSigClass (0),
137 fTreeID (0),
138 fAnalysisType (Types::kClassification),
139 fDataSetInfo (NULL)
140
141{}
142
143////////////////////////////////////////////////////////////////////////////////
144/// constructor specifying the separation type, the min number of
145/// events in a node that is still subjected to further splitting, and the
146/// number of bins in the grid used in applying the cut for the node
147/// splitting.
148
152 BinaryTree(),
153 fNvars (0),
154 fNCuts (nCuts),
155 fUseFisherCuts (kFALSE),
156 fMinLinCorrForFisher (1),
157 fUseExclusiveVars (kTRUE),
158 fSepType (sepType),
159 fRegType (NULL),
160 fMinSize (0),
161 fMinNodeSize (minSize),
162 fMinSepGain (0),
163 fUseSearchTree (kFALSE),
164 fPruneStrength (0),
165 fPruneMethod (kNoPruning),
166 fNNodesBeforePruning(0),
167 fNodePurityLimit(purityLimit),
168 fRandomisedTree (randomisedTree),
169 fUseNvars (useNvars),
170 fUsePoissonNvars(usePoissonNvars),
171 fMyTrandom (new TRandom3(iSeed)),
172 fMaxDepth (nMaxDepth),
173 fSigClass (cls),
174 fTreeID (treeID),
175 fAnalysisType (Types::kClassification),
176 fDataSetInfo (dataInfo)
177{
178 if (sepType == NULL) { // it is interpreted as a regression tree, where
179 // currently the separation type (simple least square)
180 // cannot be chosen freely)
183 if ( nCuts <=0 ) {
184 fNCuts = 200;
185 Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
186 << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
187 << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
188 << Endl;
189 }
190 }else{
192 }
193}
194
195////////////////////////////////////////////////////////////////////////////////
196/// copy constructor that creates a true copy, i.e. a completely independent tree;
197/// the node copy will recursively copy all the nodes
198
200 BinaryTree(),
201 fNvars (d.fNvars),
202 fNCuts (d.fNCuts),
203 fUseFisherCuts (d.fUseFisherCuts),
204 fMinLinCorrForFisher (d.fMinLinCorrForFisher),
205 fUseExclusiveVars (d.fUseExclusiveVars),
206 fSepType (d.fSepType),
207 fRegType (d.fRegType),
208 fMinSize (d.fMinSize),
209 fMinNodeSize(d.fMinNodeSize),
210 fMinSepGain (d.fMinSepGain),
211 fUseSearchTree (d.fUseSearchTree),
212 fPruneStrength (d.fPruneStrength),
213 fPruneMethod (d.fPruneMethod),
214 fNodePurityLimit(d.fNodePurityLimit),
215 fRandomisedTree (d.fRandomisedTree),
216 fUseNvars (d.fUseNvars),
217 fUsePoissonNvars(d.fUsePoissonNvars),
218 fMyTrandom (new TRandom3(fgRandomSeed)), // well, that means it's not an identical copy. But I only ever intend to really copy trees that are "outgrown" already.
219 fMaxDepth (d.fMaxDepth),
220 fSigClass (d.fSigClass),
221 fTreeID (d.fTreeID),
222 fAnalysisType(d.fAnalysisType),
223 fDataSetInfo (d.fDataSetInfo)
224{
225 this->SetRoot( new TMVA::DecisionTreeNode ( *((DecisionTreeNode*)(d.GetRoot())) ) );
226 this->SetParentTreeInNodes();
227 fNNodes = d.fNNodes;
228
229}
230
231
232////////////////////////////////////////////////////////////////////////////////
233/// destructor
234
236{
237 // destruction of the tree nodes done in the "base class" BinaryTree
238
239 if (fMyTrandom) delete fMyTrandom;
240 if (fRegType) delete fRegType;
241}
242
243////////////////////////////////////////////////////////////////////////////////
244/// descend a tree to find all its leaf nodes, filling in the maximum depth reached in the
245/// tree at the same time.
246
248{
249 if (n == NULL) { //default, start at the tree top, then descend recursively
250 n = this->GetRoot();
251 if (n == NULL) {
252 Log() << kFATAL << "SetParentTreeNodes: started with undefined ROOT node" <<Endl;
253 return ;
254 }
255 }
256
257 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
258 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
259 return;
260 } else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
261 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
262 return;
263 }
264 else {
265 if (this->GetLeftDaughter(n) != NULL) {
266 this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
267 }
268 if (this->GetRightDaughter(n) != NULL) {
269 this->SetParentTreeInNodes( this->GetRightDaughter(n) );
270 }
271 }
272 n->SetParentTree(this);
273 if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
274 return;
275}
276
277////////////////////////////////////////////////////////////////////////////////
278/// re-create a new tree (decision tree or search tree) from XML
279
281 std::string type("");
282 gTools().ReadAttr(node,"type", type);
284
285 dt->ReadXML( node, tmva_Version_Code );
286 return dt;
287}
288
289// #### Multithreaded DecisionTree::BuildTree
290#ifdef R__USE_IMT
291//====================================================================================
292// Had to define this struct to enable parallelization.
293// Using BuildNodeInfo, each thread can return the information needed.
294// After each thread returns we can then merge the results, hence the + operator.
295//====================================================================================
297
299
300 nvars = fNvars;
301 xmin = std::vector<Float_t>(nvars);
302 xmax = std::vector<Float_t>(nvars);
303
304 // #### the initial min and max for each feature
305 for (Int_t ivar=0; ivar<fNvars; ivar++) {
306 const Double_t val = evt->GetValueFast(ivar);
307 xmin[ivar]=val;
308 xmax[ivar]=val;
309 }
310 };
311
312 BuildNodeInfo(Int_t fNvars, std::vector<Float_t>& inxmin, std::vector<Float_t>& inxmax){
313
314 nvars = fNvars;
315 xmin = std::vector<Float_t>(nvars);
316 xmax = std::vector<Float_t>(nvars);
317
318 // #### the initial min and max for each feature
319 for (Int_t ivar=0; ivar<fNvars; ivar++) {
322 }
323 };
325
326 Int_t nvars = 0;
327 Double_t s = 0;
328 Double_t suw = 0;
329 Double_t sub = 0;
331 Double_t buw = 0;
332 Double_t bub = 0;
334 Double_t target2 = 0;
335 std::vector<Float_t> xmin;
336 std::vector<Float_t> xmax;
337
338 // #### Define the addition operator for BuildNodeInfo
339 // #### Make sure both BuildNodeInfos have the same nvars if we add them
341 {
342 BuildNodeInfo ret(nvars, xmin, xmax);
343 if(nvars != other.nvars)
344 {
345 std::cout << "!!! ERROR BuildNodeInfo1+BuildNodeInfo2 failure. Nvars1 != Nvars2." << std::endl;
346 return ret;
347 }
348 ret.s = s + other.s;
349 ret.suw = suw + other.suw;
350 ret.sub = sub + other.sub;
351 ret.b = b + other.b;
352 ret.buw = buw + other.buw;
353 ret.bub = bub + other.bub;
354 ret.target = target + other.target;
355 ret.target2 = target2 + other.target2;
356
357 // xmin is the min of both, xmax is the max of both
358 for(Int_t i=0; i<nvars; i++)
359 {
360 ret.xmin[i]=xmin[i]<other.xmin[i]?xmin[i]:other.xmin[i];
361 ret.xmax[i]=xmax[i]>other.xmax[i]?xmax[i]:other.xmax[i];
362 }
363 return ret;
364 };
365
366};
367//===========================================================================
368// Done with BuildNodeInfo declaration
369//===========================================================================
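// Rough sketch of how BuildNodeInfo is consumed further below (it mirrors the actual MapReduce
// call; the names perPartitionLambda, seeds and init are placeholders):
//
//   auto redfunc = [init](std::vector<BuildNodeInfo> v) { return std::accumulate(v.begin(), v.end(), init); };
//   BuildNodeInfo total = TMVA::Config::Instance().GetThreadExecutor().MapReduce(perPartitionLambda, seeds, redfunc);
//
// std::accumulate relies on the operator+ defined above to merge the per-thread partial sums
// (weighted/unweighted signal and background counts, regression targets, and the per-variable ranges).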
370
371
372////////////////////////////////////////////////////////////////////////////////
373/// building the decision tree by recursively calling the splitting of
374/// one (root-) node into two daughter nodes (returns the number of nodes)
375
376UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
378{
379 if (node==NULL) {
380 //start with the root node
381 node = new TMVA::DecisionTreeNode();
382 fNNodes = 1;
383 this->SetRoot(node);
384 // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
385 this->GetRoot()->SetPos('s');
386 this->GetRoot()->SetDepth(0);
387 this->GetRoot()->SetParentTree(this);
388 fMinSize = fMinNodeSize/100. * eventSample.size();
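      // e.g. MinNodeSize = 5 (i.e. 5%) with 100000 training events translates to fMinSize = 5000 events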
389 if (GetTreeID()==0){
390 Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
391 Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
392 Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
393 }
394 }
395
396 UInt_t nevents = eventSample.size();
397
398 if (nevents > 0 ) {
399 if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
400 fVariableImportance.resize(fNvars);
401 }
402 else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
403
404 // sum up the totals
405 // sig and bkg for classification
406 // err and err2 for regression
407
408 // #### Set up prerequisite info for multithreading
409 UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
411
412 // #### need a lambda function to pass to TThreadExecutor::MapReduce (multi-threading)
413 auto f = [this, &eventSample, &nPartitions](UInt_t partition = 0){
414
415 Int_t start = 1.0*partition/nPartitions*eventSample.size();
416 Int_t end = (partition+1.0)/nPartitions*eventSample.size();
417
419
420 for(Int_t iev=start; iev<end; iev++){
421 const TMVA::Event* evt = eventSample[iev];
422 const Double_t weight = evt->GetWeight();
423 const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
424 if (evt->GetClass() == fSigClass) {
425 nodeInfof.s += weight;
426 nodeInfof.suw += 1;
427 nodeInfof.sub += orgWeight;
428 }
429 else {
430 nodeInfof.b += weight;
431 nodeInfof.buw += 1;
432 nodeInfof.bub += orgWeight;
433 }
434 if ( DoRegression() ) {
435 const Double_t tgt = evt->GetTarget(0);
436 nodeInfof.target +=weight*tgt;
437 nodeInfof.target2+=weight*tgt*tgt;
438 }
439
440 // save the min and max for each feature
441 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
442 const Double_t val = evt->GetValueFast(ivar);
443 if (iev==start){
444 nodeInfof.xmin[ivar]=val;
445 nodeInfof.xmax[ivar]=val;
446 }
447 if (val < nodeInfof.xmin[ivar]) nodeInfof.xmin[ivar]=val;
448 if (val > nodeInfof.xmax[ivar]) nodeInfof.xmax[ivar]=val;
449 }
450 }
451 return nodeInfof;
452 };
453
454 // #### Need an initial struct to pass to std::accumulate
456
457 // #### Run the threads in parallel then merge the results
458 auto redfunc = [nodeInfoInit](std::vector<BuildNodeInfo> v) -> BuildNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
459 BuildNodeInfo nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
460 //NodeInfo nodeInfo(fNvars);
461
462 if (nodeInfo.s+nodeInfo.b < 0) {
463 Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
464 << "(Nsig="<<nodeInfo.s<<" Nbkg="<<nodeInfo.b<<" Probaby you use a Monte Carlo with negative weights. That should in principle "
465 << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
466 << "minimal number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
467 << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
468 << "to allow for reasonable averaging!!!" << Endl
469 << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
470 << "with negative weight in the training." << Endl;
471 double nBkg=0.;
472 for (UInt_t i=0; i<eventSample.size(); i++) {
473 if (eventSample[i]->GetClass() != fSigClass) {
474 nBkg += eventSample[i]->GetWeight();
475 Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
476 << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
477 }
478 }
479 Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
480 }
481
482 node->SetNSigEvents(nodeInfo.s);
483 node->SetNBkgEvents(nodeInfo.b);
488 node->SetPurity();
489 if (node == this->GetRoot()) {
490 node->SetNEvents(nodeInfo.s+nodeInfo.b);
493 }
494
495 // save the min and max for each feature
496 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
497 node->SetSampleMin(ivar,nodeInfo.xmin[ivar]);
498 node->SetSampleMax(ivar,nodeInfo.xmax[ivar]);
499 }
500
501 // I now demand the minimum number of events for both daughter nodes. Hence if the number
502 // of events in the parent node is not at least two times as big, I don't even need to try
503 // splitting
504
505 // ask here for actual "events" independent of their weight.. OR the weighted events
506 // to exceed the min requested number of events per daughter node
 507 // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
508 // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
509 // std::cout << "------------------------------------------------------------------"<<std::endl;
510 // std::cout << "------------------------------------------------------------------"<<std::endl;
511 // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
512 if ((eventSample.size() >= 2*fMinSize && nodeInfo.s+nodeInfo.b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
513 && ( ( nodeInfo.s!=0 && nodeInfo.b !=0 && !DoRegression()) || ( (nodeInfo.s+nodeInfo.b)!=0 && DoRegression()) ) ) {
514
515 // Train the node and figure out the separation gain and split points
517 if (fNCuts > 0){
518 separationGain = this->TrainNodeFast(eventSample, node);
519 }
520 else {
521 separationGain = this->TrainNodeFull(eventSample, node);
522 }
523
524 // The node has been trained and there is nothing to be gained by splitting
525 if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
526 // no cut can actually do anything to improve the node
527 // hence, naturally, the current node is a leaf node
528 if (DoRegression()) {
529 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
530 node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
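            // RMS^2 = <t^2> - <t>^2; if the two terms agree within floating-point rounding,
            // set the RMS to exactly zero instead of taking the square root of a tiny negative number.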
531 if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b),nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ){
532 node->SetRMS(0);
533 }else{
534 node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
535 }
536 }
537 else {
538 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
539 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
540 else node->SetNodeType(-1);
541 }
542 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
543 }
544 else {
545 // #### Couldn't parallelize this part (filtering events from mother node to daughter nodes)
546 // #### ... would need to avoid the push_back or use some threadsafe mutex locked version...
547 std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
548 std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
549
550 Double_t nRight=0, nLeft=0;
552
553 for (UInt_t ie=0; ie< nevents ; ie++) {
554 if (node->GoesRight(*eventSample[ie])) {
555 rightSample.push_back(eventSample[ie]);
556 nRight += eventSample[ie]->GetWeight();
557 nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
558 }
559 else {
560 leftSample.push_back(eventSample[ie]);
561 nLeft += eventSample[ie]->GetWeight();
562 nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
563 }
564 }
565 // sanity check
566 if (leftSample.empty() || rightSample.empty()) {
567
568 Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
569 << "--- Hence new node == old node ... check" << Endl
570 << "--- left:" << leftSample.size()
571 << " right:" << rightSample.size() << Endl
572 << " while the separation is thought to be " << separationGain
573 << "\n when cutting on variable " << node->GetSelector()
574 << " at value " << node->GetCutValue()
575 << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
576 }
577
578 // continue building daughter nodes for the left and the right eventsample
580 fNNodes++;
581 rightNode->SetNEvents(nRight);
582 rightNode->SetNEvents_unboosted(nRightUnBoosted);
583 rightNode->SetNEvents_unweighted(rightSample.size());
584
586
587 fNNodes++;
588 leftNode->SetNEvents(nLeft);
589 leftNode->SetNEvents_unboosted(nLeftUnBoosted);
590 leftNode->SetNEvents_unweighted(leftSample.size());
591
592 node->SetNodeType(0);
593 node->SetLeft(leftNode);
594 node->SetRight(rightNode);
595
596 this->BuildTree(rightSample, rightNode);
597 this->BuildTree(leftSample, leftNode );
598
599 }
600 }
601 else{ // it is a leaf node
602 if (DoRegression()) {
603 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.s+nodeInfo.b,nodeInfo.target,nodeInfo.target2));
604 node->SetResponse(nodeInfo.target/(nodeInfo.s+nodeInfo.b));
605 if( almost_equal_double(nodeInfo.target2/(nodeInfo.s+nodeInfo.b), nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)) ) {
606 node->SetRMS(0);
607 }else{
608 node->SetRMS(TMath::Sqrt(nodeInfo.target2/(nodeInfo.s+nodeInfo.b) - nodeInfo.target/(nodeInfo.s+nodeInfo.b)*nodeInfo.target/(nodeInfo.s+nodeInfo.b)));
609 }
610 }
611 else {
612 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.s,nodeInfo.b));
613 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
614 else node->SetNodeType(-1);
615 // loop through the event sample ending up in this node and check for events with negative weight
 616 // those "cannot" be boosted normally. Hence, if one of those
 617 // is misclassified, randomly find as many events with positive weights in this
618 // node as needed to get the same absolute number of weight, and mark them as
619 // "not to be boosted" in order to make up for not boosting the negative weight event
620 }
621
622
623 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
624 }
625
626 // if (IsRootNode) this->CleanTree();
627 return fNNodes;
628}
629
630// Standard DecisionTree::BuildTree (multithreading is not enabled)
631#else
632
633UInt_t TMVA::DecisionTree::BuildTree( const std::vector<const TMVA::Event*> & eventSample,
635{
636 if (node==NULL) {
637 //start with the root node
638 node = new TMVA::DecisionTreeNode();
639 fNNodes = 1;
640 this->SetRoot(node);
641 // have to use "s" for start as "r" for "root" would be the same as "r" for "right"
642 this->GetRoot()->SetPos('s');
643 this->GetRoot()->SetDepth(0);
644 this->GetRoot()->SetParentTree(this);
645 fMinSize = fMinNodeSize/100. * eventSample.size();
646 if (GetTreeID()==0){
647 Log() << kDEBUG << "\tThe minimal node size MinNodeSize=" << fMinNodeSize << " fMinNodeSize="<<fMinNodeSize<< "% is translated to an actual number of events = "<< fMinSize<< " for the training sample size of " << eventSample.size() << Endl;
648 Log() << kDEBUG << "\tNote: This number will be taken as absolute minimum in the node, " << Endl;
649 Log() << kDEBUG << " \tin terms of 'weighted events' and unweighted ones !! " << Endl;
650 }
651 }
652
653 UInt_t nevents = eventSample.size();
654
655 if (nevents > 0 ) {
656 if (fNvars==0) fNvars = eventSample[0]->GetNVariables(); // should have been set before, but ... well..
657 fVariableImportance.resize(fNvars);
658 }
659 else Log() << kFATAL << ":<BuildTree> eventsample Size == 0 " << Endl;
660
661 Double_t s=0, b=0;
662 Double_t suw=0, buw=0;
663 Double_t sub=0, bub=0; // unboosted!
664 Double_t target=0, target2=0;
665 Float_t *xmin = new Float_t[fNvars];
666 Float_t *xmax = new Float_t[fNvars];
667
668 // initializing xmin and xmax for each variable
669 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
670 xmin[ivar]=xmax[ivar]=0;
671 }
672 // sum up the totals
673 // sig and bkg for classification
674 // err and err2 for regression
675 for (UInt_t iev=0; iev<eventSample.size(); iev++) {
676 const TMVA::Event* evt = eventSample[iev];
677 const Double_t weight = evt->GetWeight();
678 const Double_t orgWeight = evt->GetOriginalWeight(); // unboosted!
679 if (evt->GetClass() == fSigClass) {
680 s += weight;
681 suw += 1;
682 sub += orgWeight;
683 }
684 else {
685 b += weight;
686 buw += 1;
687 bub += orgWeight;
688 }
689 if ( DoRegression() ) {
690 const Double_t tgt = evt->GetTarget(0);
691 target +=weight*tgt;
692 target2+=weight*tgt*tgt;
693 }
694
695 // save the min and max for each feature
696 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
697 const Double_t val = evt->GetValueFast(ivar);
698 if (iev==0) xmin[ivar]=xmax[ivar]=val;
699 if (val < xmin[ivar]) xmin[ivar]=val;
700 if (val > xmax[ivar]) xmax[ivar]=val;
701 }
702 }
703
704
705 if (s+b < 0) {
706 Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
707 << "(Nsig="<<s<<" Nbkg="<<b<<" Probaby you use a Monte Carlo with negative weights. That should in principle "
708 << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
709 << "minimul number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize="<<fMinNodeSize
710 << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
711 << "to allow for reasonable averaging!!!" << Endl
712 << " If this does not help.. maybe you want to try the option: NoNegWeightsInTraining which ignores events "
713 << "with negative weight in the training." << Endl;
714 double nBkg=0.;
715 for (UInt_t i=0; i<eventSample.size(); i++) {
716 if (eventSample[i]->GetClass() != fSigClass) {
717 nBkg += eventSample[i]->GetWeight();
718 Log() << kDEBUG << "Event "<< i<< " has (original) weight: " << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
719 << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
720 }
721 }
722 Log() << kDEBUG << " that gives in total: " << nBkg<<Endl;
723 }
724
725 node->SetNSigEvents(s);
726 node->SetNBkgEvents(b);
727 node->SetNSigEvents_unweighted(suw);
728 node->SetNBkgEvents_unweighted(buw);
729 node->SetNSigEvents_unboosted(sub);
730 node->SetNBkgEvents_unboosted(bub);
731 node->SetPurity();
732 if (node == this->GetRoot()) {
733 node->SetNEvents(s+b);
734 node->SetNEvents_unweighted(suw+buw);
735 node->SetNEvents_unboosted(sub+bub);
736 }
737
738 // save the min and max for each feature
739 for (UInt_t ivar=0; ivar<fNvars; ivar++) {
740 node->SetSampleMin(ivar,xmin[ivar]);
741 node->SetSampleMax(ivar,xmax[ivar]);
742
743 }
744 delete[] xmin;
745 delete[] xmax;
746
747 // I now demand the minimum number of events for both daughter nodes. Hence if the number
748 // of events in the parent node is not at least two times as big, I don't even need to try
749 // splitting
750
 751 // ask here for actual "events" independent of their weight.. OR the weighted events
 752 // to exceed the min requested number of events per daughter node
 753 // (NOTE: make sure that the eventSample at the ROOT node has sum_of_weights == sample.size() !)
754 // if ((eventSample.size() >= 2*fMinSize ||s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
755 // std::cout << "------------------------------------------------------------------"<<std::endl;
756 // std::cout << "------------------------------------------------------------------"<<std::endl;
757 // std::cout << " eveSampleSize = "<< eventSample.size() << " s+b="<<s+b << std::endl;
758 if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
759 && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
761 if (fNCuts > 0){
762 separationGain = this->TrainNodeFast(eventSample, node);
763 } else {
764 separationGain = this->TrainNodeFull(eventSample, node);
765 }
766 if (separationGain < std::numeric_limits<double>::epsilon()) { // we could not gain anything, e.g. all events are in one bin,
767 /// if (separationGain < 0.00000001) { // we could not gain anything, e.g. all events are in one bin,
768 // no cut can actually do anything to improve the node
769 // hence, naturally, the current node is a leaf node
770 if (DoRegression()) {
771 node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
772 node->SetResponse(target/(s+b));
773 if( almost_equal_double(target2/(s+b),target/(s+b)*target/(s+b)) ){
774 node->SetRMS(0);
775 }else{
776 node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
777 }
778 }
779 else {
780 node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
781
782 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
783 else node->SetNodeType(-1);
784 }
785 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
786
787 } else {
788
789 std::vector<const TMVA::Event*> leftSample; leftSample.reserve(nevents);
790 std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
791
792 Double_t nRight=0, nLeft=0;
794
795 for (UInt_t ie=0; ie< nevents ; ie++) {
796 if (node->GoesRight(*eventSample[ie])) {
797 rightSample.push_back(eventSample[ie]);
798 nRight += eventSample[ie]->GetWeight();
799 nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
800 }
801 else {
802 leftSample.push_back(eventSample[ie]);
803 nLeft += eventSample[ie]->GetWeight();
804 nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
805 }
806 }
807
808 // sanity check
809 if (leftSample.empty() || rightSample.empty()) {
810
811 Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
812 << "--- Hence new node == old node ... check" << Endl
813 << "--- left:" << leftSample.size()
814 << " right:" << rightSample.size() << Endl
815 << " while the separation is thought to be " << separationGain
816 << "\n when cutting on variable " << node->GetSelector()
817 << " at value " << node->GetCutValue()
818 << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
819 }
820
821 // continue building daughter nodes for the left and the right eventsample
823 fNNodes++;
824 rightNode->SetNEvents(nRight);
825 rightNode->SetNEvents_unboosted(nRightUnBoosted);
826 rightNode->SetNEvents_unweighted(rightSample.size());
827
829
830 fNNodes++;
831 leftNode->SetNEvents(nLeft);
832 leftNode->SetNEvents_unboosted(nLeftUnBoosted);
833 leftNode->SetNEvents_unweighted(leftSample.size());
834
835 node->SetNodeType(0);
836 node->SetLeft(leftNode);
837 node->SetRight(rightNode);
838
839 this->BuildTree(rightSample, rightNode);
840 this->BuildTree(leftSample, leftNode );
841
842 }
843 }
844 else{ // it is a leaf node
845 if (DoRegression()) {
846 node->SetSeparationIndex(fRegType->GetSeparationIndex(s+b,target,target2));
847 node->SetResponse(target/(s+b));
848 if( almost_equal_double(target2/(s+b), target/(s+b)*target/(s+b)) ) {
849 node->SetRMS(0);
850 }else{
851 node->SetRMS(TMath::Sqrt(target2/(s+b) - target/(s+b)*target/(s+b)));
852 }
853 }
854 else {
855 node->SetSeparationIndex(fSepType->GetSeparationIndex(s,b));
856 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
857 else node->SetNodeType(-1);
858 // loop through the event sample ending up in this node and check for events with negative weight
 859 // those "cannot" be boosted normally. Hence, if one of those
 860 // is misclassified, randomly find as many events with positive weights in this
861 // node as needed to get the same absolute number of weight, and mark them as
862 // "not to be boosted" in order to make up for not boosting the negative weight event
863 }
864
865
866 if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
867 }
868
869 // if (IsRootNode) this->CleanTree();
870 return fNNodes;
871}
872
873#endif
874
875////////////////////////////////////////////////////////////////////////////////
876/// fill the existing decision tree structure by passing events
877/// in from the top node and seeing where they happen to end up
878
879void TMVA::DecisionTree::FillTree( const std::vector<TMVA::Event*> & eventSample )
880{
881 for (UInt_t i=0; i<eventSample.size(); i++) {
882 this->FillEvent(*(eventSample[i]),NULL);
883 }
884}
885
886////////////////////////////////////////////////////////////////////////////////
887/// fill the existing decision tree structure by passing events
888/// in from the top node and seeing where they happen to end up
889
892{
893 if (node == NULL) { // that's the start, take the Root node
894 node = this->GetRoot();
895 }
896
897 node->IncrementNEvents( event.GetWeight() );
899
900 if (event.GetClass() == fSigClass) {
901 node->IncrementNSigEvents( event.GetWeight() );
903 }
904 else {
905 node->IncrementNBkgEvents( event.GetWeight() );
907 }
908 node->SetSeparationIndex(fSepType->GetSeparationIndex(node->GetNSigEvents(),
909 node->GetNBkgEvents()));
910
911 if (node->GetNodeType() == 0) { //intermediate node --> go down
912 if (node->GoesRight(event))
913 this->FillEvent(event, node->GetRight());
914 else
915 this->FillEvent(event, node->GetLeft());
916 }
917}
918
919////////////////////////////////////////////////////////////////////////////////
920/// clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree
921
923{
924 if (this->GetRoot()!=NULL) this->GetRoot()->ClearNodeAndAllDaughters();
925
926}
927
928////////////////////////////////////////////////////////////////////////////////
929/// remove those last splits that result in two leaf nodes that
930/// are both of the same type (i.e. both signal or both background).
931/// This of course is only a reasonable thing to do when you use
932/// "YesOrNo" leaves, while it might lose something if you use the
933/// purity information in the nodes.
934/// --> hence I don't call it automatically in the tree building
935
937{
938 if (node==NULL) {
939 node = this->GetRoot();
940 }
941
942 DecisionTreeNode *l = node->GetLeft();
943 DecisionTreeNode *r = node->GetRight();
944
945 if (node->GetNodeType() == 0) {
946 this->CleanTree(l);
947 this->CleanTree(r);
948 if (l->GetNodeType() * r->GetNodeType() > 0) {
949
950 this->PruneNode(node);
951 }
952 }
953 // update the number of nodes after the cleaning
954 return this->CountNodes();
955
956}
957
958////////////////////////////////////////////////////////////////////////////////
959/// prune (get rid of internal nodes) the decision tree to avoid overtraining.
960/// Several different pruning methods can be applied, as selected by the
961/// variable "fPruneMethod".
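///
/// A typical call sequence is sketched below (illustrative only; SetPruneMethod and
/// SetPruneStrength are assumed to be the corresponding setters declared in DecisionTree.h):
///
///     dt->SetPruneMethod(TMVA::DecisionTree::kCostComplexityPruning);
///     dt->SetPruneStrength(alpha);        // a non-positive value is assumed to request automatic optimisation
///     dt->PruneTree(&validationSample);   // returns the prune strength that was actually applied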
962
964{
967
968 if( fPruneMethod == kNoPruning ) return 0.0;
969
970 if (fPruneMethod == kExpectedErrorPruning)
971 // tool = new ExpectedErrorPruneTool(logfile);
973 else if (fPruneMethod == kCostComplexityPruning)
974 {
976 }
977 else {
978 Log() << kFATAL << "Selected pruning method not yet implemented "
979 << Endl;
980 }
981
982 if(!tool) return 0.0;
983
984 tool->SetPruneStrength(GetPruneStrength());
985 if(tool->IsAutomatic()) {
986 if(validationSample == NULL){
987 Log() << kFATAL << "Cannot automate the pruning algorithm without an "
988 << "independent validation sample!" << Endl;
989 }else if(validationSample->size() == 0) {
990 Log() << kFATAL << "Cannot automate the pruning algorithm with "
991 << "independent validation sample of ZERO events!" << Endl;
992 }
993 }
994
995 info = tool->CalculatePruningInfo(this,validationSample);
997 if(!info) {
998 Log() << kFATAL << "Error pruning tree! Check prune.log for more information."
999 << Endl;
1000 } else {
1001 pruneStrength = info->PruneStrength;
1002
1003 // Log() << kDEBUG << "Optimal prune strength (alpha): " << pruneStrength
1004 // << " has quality index " << info->QualityIndex << Endl;
1005
1006
1007 for (UInt_t i = 0; i < info->PruneSequence.size(); ++i) {
1008
1009 PruneNode(info->PruneSequence[i]);
1010 }
1011 // update the number of nodes after the pruning
1012 this->CountNodes();
1013 }
1014
1015 delete tool;
1016 delete info;
1017
1018 return pruneStrength;
1019};
1020
1021
1022////////////////////////////////////////////////////////////////////////////////
1023/// run the validation sample through the (pruned) tree and fill in the nodes
1024/// the variables NSValidation and NBValidation (i.e. how many of the Signal
1025/// and Background events from the validation sample end up in each node). This is then later used
1026/// when asking for the "tree quality".
1027
1029{
1030 GetRoot()->ResetValidationData();
1031 for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
1032 CheckEventWithPrunedTree((*validationSample)[ievt]);
1033 }
1034}
1035
1036////////////////////////////////////////////////////////////////////////////////
1037/// return the misclassification rate of a pruned tree.
1038/// A "pruned tree" may have set the variable "IsTerminal" to "arbitrary" at
1039/// any node, so this tree quality testing will stop there and hence tests
1040/// the pruned tree (while the full tree is still in place for normal/later use)
1041
1043{
1044 if (n == NULL) { // default, start at the tree top, then descend recursively
1045 n = this->GetRoot();
1046 if (n == NULL) {
1047 Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" <<Endl;
1048 return 0;
1049 }
1050 }
1051
1052 if( n->GetLeft() != NULL && n->GetRight() != NULL && !n->IsTerminal() ) {
1053 return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
1054 TestPrunedTreeQuality( n->GetRight(), mode ));
1055 }
1056 else { // terminal leaf (in a pruned subtree of T_max at least)
1057 if (DoRegression()) {
1058 Double_t sumw = n->GetNSValidation() + n->GetNBValidation();
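         // weighted squared deviation from the node response r:
         //   sum_i w_i*(t_i - r)^2 = sum_i w_i*t_i^2 - 2*r*sum_i w_i*t_i + r^2*sum_i w_i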
1059 return n->GetSumTarget2() - 2*n->GetSumTarget()*n->GetResponse() + sumw*n->GetResponse()*n->GetResponse();
1060 }
1061 else {
1062 if (mode == 0) {
1063 if (n->GetPurity() > this->GetNodePurityLimit()) // this is a signal leaf, according to the training
1064 return n->GetNBValidation();
1065 else
1066 return n->GetNSValidation();
1067 }
1068 else if ( mode == 1 ) {
1069 // calculate the weighted error using the pruning validation sample
1070 return (n->GetPurity() * n->GetNBValidation() + (1.0 - n->GetPurity()) * n->GetNSValidation());
1071 }
1072 else {
1073 throw std::string("Unknown ValidationQualityMode");
1074 }
1075 }
1076 }
1077}
1078
1079////////////////////////////////////////////////////////////////////////////////
1080/// pass a single validation event through a pruned decision tree.
1081/// On the way down the tree, fill in all the "intermediate" information
1082/// that would normally be there from training.
1083
1085{
1086 DecisionTreeNode* current = this->GetRoot();
1087 if (current == NULL) {
1088 Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" <<Endl;
1089 }
1090
1091 while(current != NULL) {
1092 if(e->GetClass() == fSigClass)
1093 current->SetNSValidation(current->GetNSValidation() + e->GetWeight());
1094 else
1095 current->SetNBValidation(current->GetNBValidation() + e->GetWeight());
1096
1097 if (e->GetNTargets() > 0) {
1098 current->AddToSumTarget(e->GetWeight()*e->GetTarget(0));
1099 current->AddToSumTarget2(e->GetWeight()*e->GetTarget(0)*e->GetTarget(0));
1100 }
1101
1102 if (current->GetRight() == NULL || current->GetLeft() == NULL) {
1103 current = NULL;
1104 }
1105 else {
1106 if (current->GoesRight(*e))
1107 current = (TMVA::DecisionTreeNode*)current->GetRight();
1108 else
1109 current = (TMVA::DecisionTreeNode*)current->GetLeft();
1110 }
1111 }
1112}
1113
1114////////////////////////////////////////////////////////////////////////////////
1115/// calculate the normalization factor for a pruning validation sample
1116
1118{
1119 Double_t sumWeights = 0.0;
1120 for( EventConstList::const_iterator it = validationSample->begin();
1121 it != validationSample->end(); ++it ) {
1122 sumWeights += (*it)->GetWeight();
1123 }
1124 return sumWeights;
1125}
1126
1127////////////////////////////////////////////////////////////////////////////////
1128/// return the number of terminal nodes in the sub-tree below Node n
1129
1131{
1132 if (n == NULL) { // default, start at the tree top, then descend recursively
1133 n = this->GetRoot();
1134 if (n == NULL) {
1135 Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" <<Endl;
1136 return 0;
1137 }
1138 }
1139
1141
1142 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
1143 countLeafs += 1;
1144 }
1145 else {
1146 if (this->GetLeftDaughter(n) != NULL) {
1147 countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
1148 }
1149 if (this->GetRightDaughter(n) != NULL) {
1150 countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
1151 }
1152 }
1153 return countLeafs;
1154}
1155
1156////////////////////////////////////////////////////////////////////////////////
1157/// descend a tree to find all its leaf nodes
1158
1160{
1161 if (n == NULL) { // default, start at the tree top, then descend recursively
1162 n = this->GetRoot();
1163 if (n == NULL) {
1164 Log() << kFATAL << "DescendTree: started with undefined ROOT node" <<Endl;
1165 return ;
1166 }
1167 }
1168
1169 if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
1170 // do nothing
1171 }
1172 else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
1173 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
1174 return;
1175 }
1176 else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
1177 Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
1178 return;
1179 }
1180 else {
1181 if (this->GetLeftDaughter(n) != NULL) {
1182 this->DescendTree( this->GetLeftDaughter(n) );
1183 }
1184 if (this->GetRightDaughter(n) != NULL) {
1185 this->DescendTree( this->GetRightDaughter(n) );
1186 }
1187 }
1188}
1189
1190////////////////////////////////////////////////////////////////////////////////
1191/// prune away the subtree below the node
1192
1194{
1195 DecisionTreeNode *l = node->GetLeft();
1196 DecisionTreeNode *r = node->GetRight();
1197
1198 node->SetRight(NULL);
1199 node->SetLeft(NULL);
1200 node->SetSelector(-1);
1201 node->SetSeparationGain(-1);
1202 if (node->GetPurity() > fNodePurityLimit) node->SetNodeType(1);
1203 else node->SetNodeType(-1);
1204 this->DeleteNode(l);
1205 this->DeleteNode(r);
1206 // update the stored number of nodes in the Tree
1207 this->CountNodes();
1208
1209}
1210
1211////////////////////////////////////////////////////////////////////////////////
1212/// prune a node temporarily (without actually deleting its descendants),
1213/// which allows testing the pruned tree quality for many different
1214/// pruning stages without "touching" the tree.
1215
1217 if(node == NULL) return;
1218 node->SetNTerminal(1);
1219 node->SetSubTreeR( node->GetNodeR() );
1220 node->SetAlpha( std::numeric_limits<double>::infinity( ) );
1221 node->SetAlphaMinSubtree( std::numeric_limits<double>::infinity( ) );
1222 node->SetTerminal(kTRUE); // set the node to be terminal without deleting its descendants FIXME not needed
1223}
1224
1225////////////////////////////////////////////////////////////////////////////////
1226/// retrieve node from the tree. Its position (up to a maximal tree depth of 64)
1227/// is coded as a sequence of left-right moves starting from the root, coded as
1228/// 0-1 bit patterns stored in the "long-integer" (i.e. 0:left ; 1:right
1229
1231{
1232 Node* current = this->GetRoot();
1233
1234 for (UInt_t i =0; i < depth; i++) {
1235 ULong_t tmp = 1 << i;
1236 if ( tmp & sequence) current = this->GetRightDaughter(current);
1237 else current = this->GetLeftDaughter(current);
1238 }
1239
1240 return current;
1241}
1242
1243////////////////////////////////////////////////////////////////////////////////
1244///
1245
1247 for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
 1248 if (fUseNvars==0) { // no number specified ... choose something which hopefully works well
1249 // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
1250 fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
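      // e.g. fNvars = 10 gives fUseNvars = UInt_t(3.16 + 0.6) = 3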
1251 }
1252 if (fUsePoissonNvars) useNvars=TMath::Min(fNvars,TMath::Max(UInt_t(1),(UInt_t) fMyTrandom->Poisson(fUseNvars)));
1253 else useNvars = fUseNvars;
1254
1256 while (nSelectedVars < useNvars) {
1257 Double_t bla = fMyTrandom->Rndm()*fNvars;
1258 useVariable[Int_t (bla)] = kTRUE;
1259 nSelectedVars = 0;
1260 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1261 if (useVariable[ivar] == kTRUE) {
1263 nSelectedVars++;
1264 }
1265 }
1266 }
1267 if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandisedVariables()... sorry" << std::endl; std::exit(1);}
1268}
1269
1270// Multithreaded version of DecisionTree::TrainNodeFast
1271#ifdef R__USE_IMT
1272//====================================================================================
1273// Had to define this struct to enable parallelization in TrainNodeFast.
1274// Using TrainNodeInfo, each thread can return the information needed.
1275// After each thread returns we can then merge the results, hence the + operator.
1276//====================================================================================
1278
1280
1281 cNvars = cNvars_;
1282 nBins = nBins_;
1283
1284 nSelS = std::vector< std::vector<Double_t> >(cNvars);
1285 nSelB = std::vector< std::vector<Double_t> >(cNvars);
1286 nSelS_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
1287 nSelB_unWeighted = std::vector< std::vector<Double_t> >(cNvars);
1288 target = std::vector< std::vector<Double_t> >(cNvars);
1289 target2 = std::vector< std::vector<Double_t> >(cNvars);
1290
1291 for(Int_t ivar=0; ivar<cNvars; ivar++){
1292 nSelS[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1293 nSelB[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1294 nSelS_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1295 nSelB_unWeighted[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1296 target[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1297 target2[ivar] = std::vector<Double_t>(nBins[ivar], 0);
1298 }
1299 };
1300
1302
1303 // #### malloc problem if I define this and try to destruct xmin and xmax...
1304 // #### Maybe someone better at C++ can figure out why and fix this if it's a
1305 // ### serious memory problem
1306 //~TrainNodeInfo(){
1307 // delete [] xmin;
1308 // delete [] xmax;
1309 //};
1310
1311 Int_t cNvars = 0;
1313
1314 Double_t nTotS = 0;
1315 Double_t nTotS_unWeighted = 0;
1316 Double_t nTotB = 0;
1317 Double_t nTotB_unWeighted = 0;
1318
1319 std::vector< std::vector<Double_t> > nSelS;
1320 std::vector< std::vector<Double_t> > nSelB;
1321 std::vector< std::vector<Double_t> > nSelS_unWeighted;
1322 std::vector< std::vector<Double_t> > nSelB_unWeighted;
1323 std::vector< std::vector<Double_t> > target;
1324 std::vector< std::vector<Double_t> > target2;
1325
1326 // Define the addition operator for TrainNodeInfo
1327 // Make sure both TrainNodeInfos have the same nvars if we add them
1329 {
1330 TrainNodeInfo ret(cNvars, nBins);
1331
1332 // check that the two are compatible to add
1333 if(cNvars != other.cNvars)
1334 {
1335 std::cout << "!!! ERROR TrainNodeInfo1+TrainNodeInfo2 failure. cNvars1 != cNvars2." << std::endl;
1336 return ret;
1337 }
1338
1339 // add the signal, background, and target sums
1340 for (Int_t ivar=0; ivar<cNvars; ivar++) {
1341 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1342 ret.nSelS[ivar][ibin] = nSelS[ivar][ibin] + other.nSelS[ivar][ibin];
1343 ret.nSelB[ivar][ibin] = nSelB[ivar][ibin] + other.nSelB[ivar][ibin];
1344 ret.nSelS_unWeighted[ivar][ibin] = nSelS_unWeighted[ivar][ibin] + other.nSelS_unWeighted[ivar][ibin];
1345 ret.nSelB_unWeighted[ivar][ibin] = nSelB_unWeighted[ivar][ibin] + other.nSelB_unWeighted[ivar][ibin];
1346 ret.target[ivar][ibin] = target[ivar][ibin] + other.target[ivar][ibin];
1347 ret.target2[ivar][ibin] = target2[ivar][ibin] + other.target2[ivar][ibin];
1348 }
1349 }
1350
1351 ret.nTotS = nTotS + other.nTotS;
1352 ret.nTotS_unWeighted = nTotS_unWeighted + other.nTotS_unWeighted;
1353 ret.nTotB = nTotB + other.nTotB;
1354 ret.nTotB_unWeighted = nTotB_unWeighted + other.nTotB_unWeighted;
1355
1356 return ret;
1357 };
1358
1359};
1360//===========================================================================
1361// Done with TrainNodeInfo declaration
1362//===========================================================================
1363
1364////////////////////////////////////////////////////////////////////////////////
1365/// Decide how to split a node using one of the variables that gives
1366/// the best separation of signal/background. In order to do this, for each
1367/// variable a scan of the different cut values in a grid (grid = fNCuts) is
1368/// performed and the resulting separation gains are compared.
1369/// in addition to the individual variables, one can also ask for a fisher
1370/// discriminant being built out of (some) of the variables and used as a
1371/// possible multivariate split.
1372
1375{
1376 // #### OK let's comment this one to see how to parallelize it
1378 Double_t *separationGain = new Double_t[fNvars+1];
1379 Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1380
1381 // #### initialize the sep gain and cut index values
1382 for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1383 separationGain[ivar]=-1;
1384 cutIndex[ivar]=-1;
1385 }
1386 // ### set up some other variables
1387 Int_t mxVar = -1;
1389 UInt_t nevents = eventSample.size();
1390
1391
1392 // the +1 comes from the fact that I treat later on the Fisher output as an
1393 // additional possible variable.
1394 Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1395 UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1396
1397 std::vector<Double_t> fisherCoeff;
1398
1399 // #### set up a map to the subset of variables using two arrays
1400 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1401 UInt_t tmp=fUseNvars;
1402 GetRandomisedVariables(useVariable,mapVariable,tmp);
1403 }
1404 else {
1405 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1408 }
1409 }
1410 // #### last variable entry is the fisher variable
1411 useVariable[fNvars] = kFALSE; //by default fisher is not used..
1412
1413 // #### Begin Fisher calculation
1414 Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1415 if (fUseFisherCuts) {
 1416 useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1417
1418 //use for the Fisher discriminant ONLY those variables that show
1419 //some reasonable linear correlation in either Signal or Background
1420 Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1421 UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1422 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1425 }
1426
1427 std::vector<TMatrixDSym*>* covMatrices;
1428 covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1429 if (!covMatrices){
1430 Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
1431 fisherOK = kFALSE;
1432 }else{
1433 TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1434 TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1435 const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1437
1438 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1439 for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1440 if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1441 ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1444 }
1445 }
1446 }
1447 // now as you know which variables you want to use, count and map them:
1448 // such that you can use an array/matrix filled only with THOSE variables
1449 // that you used
1450 UInt_t nFisherVars = 0;
1451 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
 1452 //now .. pick those variables that are used in the Fisher and are also
1453 // part of the "allowed" variables in case of Randomized Trees)
1456 // now exclude the variables used in the Fisher cuts, and don't
1457 // use them anymore in the individual variable scan
1458 if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1459 }
1460 }
1461
1462
1463 fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1464 fisherOK = kTRUE;
1465 }
1466 delete [] useVarInFisher;
1467 delete [] mapVarInFisher;
1468
1469 }
1470 // #### End Fisher calculation
1471
1472
1473 UInt_t cNvars = fNvars;
1474 if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simple as additional variable
1475
1476 // #### set up the binning info arrays
1477 // #### each var has its own binning since some may be integers
1478 UInt_t* nBins = new UInt_t [cNvars];
1479 Double_t* binWidth = new Double_t [cNvars];
1480 Double_t* invBinWidth = new Double_t [cNvars];
1481 Double_t** cutValues = new Double_t* [cNvars];
1482
1483 // #### set up the xmin and xmax arrays
1484 // #### each var has its own range
1485 Double_t *xmin = new Double_t[cNvars];
1486 Double_t *xmax = new Double_t[cNvars];
1487
 1488 // construct and initialize binning/cuts
1489 for (UInt_t ivar=0; ivar<cNvars; ivar++) {
1490 // ncuts means that we need n+1 bins for each variable
1491 nBins[ivar] = fNCuts+1;
1492 if (ivar < fNvars) {
1493 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
1494 nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
1495 }
1496 }
1497
1498 cutValues[ivar] = new Double_t [nBins[ivar]];
1499 }
1500
1501 // init the range and cutvalues for each var now that we know the binning
1502 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1503 if (ivar < fNvars){
1504 xmin[ivar]=node->GetSampleMin(ivar);
1505 xmax[ivar]=node->GetSampleMax(ivar);
1507 // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
1508 // std::cout << " will set useVariable[ivar]=false"<<std::endl;
1510 }
1511
1512 } else { // the fisher variable
1513 xmin[ivar]=999;
1514 xmax[ivar]=-999;
1515 // too bad, for the moment I don't know how to do this without looping
1516 // once to get the "min max" and then AGAIN to fill the histogram
1517 for (UInt_t iev=0; iev<nevents; iev++) {
1518 // returns the Fisher value (no fixed range)
1519 Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
1520 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1521 result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1522 if (result > xmax[ivar]) xmax[ivar]=result;
1523 if (result < xmin[ivar]) xmin[ivar]=result;
1524 }
1525 }
1526 // this loop could take a long time if nbins is large
1527 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
1528 cutValues[ivar][ibin]=0;
1529 }
1530 }
1531
1532 // ====================================================================
1533 // ====================================================================
1534 // Parallelized Version
1535 // ====================================================================
1536 // ====================================================================
1537
1538 // #### Figure out the cut values, loops through vars then through cuts
1539 // #### if ncuts is on the order of the amount of training data/10 - ish then we get some gains from parallelizing this
1540 // fill the cut values for the scan:
1541 auto varSeeds = ROOT::TSeqU(cNvars);
1542 auto fvarInitCuts = [this, &useVariable, &cutValues, &invBinWidth, &binWidth, &nBins, &xmin, &xmax](UInt_t ivar = 0){
1543
1544 if ( useVariable[ivar] ) {
1545
1546 //set the grid for the cut scan on the variables like this:
1547 //
1548 // | | | | | ... | |
1549 // xmin xmax
1550 //
1551 // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
1552 // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
1553 // --> nBins = fNCuts+1
1554       // (NOTE, the cuts at xmin or xmax would just give the whole sample and
1555       // hence can be safely omitted)
1556
1557 binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
1558 invBinWidth[ivar] = 1./binWidth[ivar];
1559 if (ivar < fNvars) {
1560 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
1561 }
1562
1563 // std::cout << "ivar="<<ivar
1564 // <<" min="<<xmin[ivar]
1565 // << " max="<<xmax[ivar]
1566 // << " width=" << istepSize
1567 // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
1568 for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
1569 cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
1570 // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
1571 }
1572 }
1573 return 0;
1574 };
1575 TMVA::Config::Instance().GetThreadExecutor().Map(fvarInitCuts, varSeeds);
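   // A minimal standalone sketch of the equidistant cut grid that fvarInitCuts fills above
   // (hypothetical helper, simplified from the member variables used here): nCuts cuts give
   // nCuts+1 bins, and the cuts sit only at the interior bin edges, since a cut at xmin or
   // xmax would not split the sample.
   //
   //   std::vector<double> MakeCutGrid(double xlo, double xhi, unsigned int nCuts) {
   //      std::vector<double> cuts(nCuts);
   //      const double width = (xhi - xlo) / double(nCuts + 1);           // = binWidth[ivar]
   //      for (unsigned int k = 0; k < nCuts; ++k)
   //         cuts[k] = xlo + (k + 1) * width;                             // = cutValues[ivar][k]
   //      return cuts;
   //   }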
1576
1577 // #### Loop through the events to get the total sig and background
1578 // #### Then loop through the vars to get the counts in each bin in each var
1579    // #### So we have a loop through the events and a loop through the vars, but no loop through the cuts: the bin index is computed directly
1580
1581 TrainNodeInfo nodeInfo(cNvars, nBins);
1582 UInt_t nPartitions = TMVA::Config::Instance().GetThreadExecutor().GetPoolSize();
1583
1584 // #### When nbins is low compared to ndata this version of parallelization is faster, so use it
1585 // #### Parallelize by chunking the data into the same number of sections as we have processors
1586 if(eventSample.size() >= cNvars*fNCuts*nPartitions*2)
1587 {
1589
1590 // need a lambda function to pass to TThreadExecutor::MapReduce
1591 auto f = [this, &eventSample, &fisherCoeff, &useVariable, &invBinWidth,
1592 &nBins, &xmin, &cNvars, &nPartitions](UInt_t partition = 0){
1593
1594 UInt_t start = 1.0*partition/nPartitions*eventSample.size();
1595 UInt_t end = (partition+1.0)/nPartitions*eventSample.size();
1596
1597 TrainNodeInfo nodeInfof(cNvars, nBins);
1598
1599 for(UInt_t iev=start; iev<end; iev++) {
1600
1601 Double_t eventWeight = eventSample[iev]->GetWeight();
1602 if (eventSample[iev]->GetClass() == fSigClass) {
1603 nodeInfof.nTotS+=eventWeight;
1604 nodeInfof.nTotS_unWeighted++; }
1605 else {
1606 nodeInfof.nTotB+=eventWeight;
1607 nodeInfof.nTotB_unWeighted++;
1608 }
1609
1610 // #### Count the number in each bin
1611 Int_t iBin=-1;
1612 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1613                // now scan through the cuts for each variable and find which one gives
1614                // the best separationGain at the current stage.
1615 if ( useVariable[ivar] ) {
1617 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1618 else { // the fisher variable
1619 eventData = fisherCoeff[fNvars];
1620 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1621 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1622
1623 }
1624 // #### figure out which bin it belongs in ...
1625 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1626 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1627 if (eventSample[iev]->GetClass() == fSigClass) {
1628 nodeInfof.nSelS[ivar][iBin]+=eventWeight;
1629 nodeInfof.nSelS_unWeighted[ivar][iBin]++;
1630 }
1631 else {
1632 nodeInfof.nSelB[ivar][iBin]+=eventWeight;
1633 nodeInfof.nSelB_unWeighted[ivar][iBin]++;
1634 }
1635 if (DoRegression()) {
1636 nodeInfof.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1637 nodeInfof.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1638 }
1639 }
1640 }
1641 }
1642 return nodeInfof;
1643 };
1644
1645    // #### Need an initial struct to pass to std::accumulate
1646 TrainNodeInfo nodeInfoInit(cNvars, nBins);
1647
1648 // #### Run the threads in parallel then merge the results
1649 auto redfunc = [nodeInfoInit](std::vector<TrainNodeInfo> v) -> TrainNodeInfo { return std::accumulate(v.begin(), v.end(), nodeInfoInit); };
1650 nodeInfo = TMVA::Config::Instance().GetThreadExecutor().MapReduce(f, seeds, redfunc);
1651 }
1652
1653
1654    // #### When nbins is of the same order as the data size, this version of parallelization is faster
1655 // #### Parallelize by vectorizing the variable loop
1656 else {
1657
1659
1660 for(UInt_t iev=0; iev<eventSample.size(); iev++) {
1661
1662 Int_t iBin=-1;
1663 Double_t eventWeight = eventSample[iev]->GetWeight();
1664
1665 // Only count the net signal and background once
1666 if(ivar==0){
1667 if (eventSample[iev]->GetClass() == fSigClass) {
1668 nodeInfo.nTotS+=eventWeight;
1669 nodeInfo.nTotS_unWeighted++; }
1670 else {
1671 nodeInfo.nTotB+=eventWeight;
1672 nodeInfo.nTotB_unWeighted++;
1673 }
1674 }
1675
1676 // Figure out which bin the event belongs in and increment the bin in each histogram vector appropriately
1677 if ( useVariable[ivar] ) {
1679 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
1680 else { // the fisher variable
1681 eventData = fisherCoeff[fNvars];
1682 for (UInt_t jvar=0; jvar<fNvars; jvar++)
1683 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
1684
1685 }
1686 // #### figure out which bin it belongs in ...
1687 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
1688 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
1689 if (eventSample[iev]->GetClass() == fSigClass) {
1690 nodeInfo.nSelS[ivar][iBin]+=eventWeight;
1691 nodeInfo.nSelS_unWeighted[ivar][iBin]++;
1692 }
1693 else {
1694 nodeInfo.nSelB[ivar][iBin]+=eventWeight;
1695 nodeInfo.nSelB_unWeighted[ivar][iBin]++;
1696 }
1697 if (DoRegression()) {
1698 nodeInfo.target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
1699 nodeInfo.target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
1700 }
1701 }
1702 }
1703 return 0;
1704 };
1705
1706 TMVA::Config::Instance().GetThreadExecutor().Map(fvarFillNodeInfo, varSeeds);
1707 }
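   // Whichever branch ran (chunking over events or mapping over variables), each event is
   // dropped into a bin by the same clamped index arithmetic. A minimal sketch of that
   // calculation (hypothetical helper, assuming a float variable):
   //
   //   int FindBin(double x, double xlo, double invWidth, unsigned int nBins) {
   //      int bin = int(invWidth * (x - xlo));             // distance from xmin in bin widths
   //      if (bin < 0) bin = 0;                            // guard against rounding below xmin
   //      if (bin > int(nBins) - 1) bin = int(nBins) - 1;  // values at/above xmax go to the last bin
   //      return bin;
   //   }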
1708
1709 // now turn each "histogram" into a cumulative distribution
1710    // #### loops through the vars and then the bins; if the number of bins is on the order of the training data size, this is worth parallelizing
1711 // #### doesn't hurt otherwise, pretty unnoticeable
1712 auto fvarCumulative = [&nodeInfo, &useVariable, &nBins, this, &eventSample](UInt_t ivar = 0){
1713 if (useVariable[ivar]) {
1714 for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
1715 nodeInfo.nSelS[ivar][ibin]+=nodeInfo.nSelS[ivar][ibin-1];
1716 nodeInfo.nSelS_unWeighted[ivar][ibin]+=nodeInfo.nSelS_unWeighted[ivar][ibin-1];
1717 nodeInfo.nSelB[ivar][ibin]+=nodeInfo.nSelB[ivar][ibin-1];
1718 nodeInfo.nSelB_unWeighted[ivar][ibin]+=nodeInfo.nSelB_unWeighted[ivar][ibin-1];
1719 if (DoRegression()) {
1720 nodeInfo.target[ivar][ibin] +=nodeInfo.target[ivar][ibin-1] ;
1721 nodeInfo.target2[ivar][ibin]+=nodeInfo.target2[ivar][ibin-1];
1722 }
1723 }
1724 if (nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
1725 Log() << kFATAL << "Helge, you have a bug ....nodeInfo.nSelS_unw..+nodeInfo.nSelB_unw..= "
1726 << nodeInfo.nSelS_unWeighted[ivar][nBins[ivar]-1] +nodeInfo.nSelB_unWeighted[ivar][nBins[ivar]-1]
1727 << " while eventsample size = " << eventSample.size()
1728 << Endl;
1729 }
1730 double lastBins=nodeInfo.nSelS[ivar][nBins[ivar]-1] +nodeInfo.nSelB[ivar][nBins[ivar]-1];
1731 double totalSum=nodeInfo.nTotS+nodeInfo.nTotB;
1733 Log() << kFATAL << "Helge, you have another bug ....nodeInfo.nSelS+nodeInfo.nSelB= "
1734 << lastBins
1735 << " while total number of events = " << totalSum
1736 << Endl;
1737 }
1738 }
1739 return 0;
1740 };
1741 TMVA::Config::Instance().GetThreadExecutor().Map(fvarCumulative, varSeeds);
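   // After fvarCumulative each per-variable "histogram" is a running sum, i.e. entry i now
   // counts everything in bins 0..i, so a single lookup gives the yield on one side of cut i.
   // The core of the transformation is just an in-place prefix sum (sketch with a
   // hypothetical array 'hist' of length nBins):
   //
   //   for (unsigned int i = 1; i < nBins; ++i) hist[i] += hist[i - 1];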
1742
1743    // #### Again, if nbins is on the order of the training data size (within an order of magnitude or so), then this is worth parallelizing
1744    // now select the optimal cuts for each variable and find which one gives
1745    // the best separationGain at the current stage
1746 auto fvarMaxSep = [&nodeInfo, &useVariable, this, &separationGain, &cutIndex, &nBins] (UInt_t ivar = 0){
1747 if (useVariable[ivar]) {
1749 for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
1750          // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
1751 // calculated by the "SamplePurities" from the branches that would go to the
1752 // left or the right from this node if "these" cuts were used in the Node:
1753 // hereby: nodeInfo.nSelS and nodeInfo.nSelB would go to the right branch
1754 // (nodeInfo.nTotS - nodeInfo.nSelS) + (nodeInfo.nTotB - nodeInfo.nSelB) would go to the left branch;
1755
1756 // only allow splits where both daughter nodes match the specified minimum number
1757 // for this use the "unweighted" events, as you are interested in statistically
1758 // significant splits, which is determined by the actual number of entries
1759 // for a node, rather than the sum of event weights.
1760
1761 Double_t sl = nodeInfo.nSelS_unWeighted[ivar][iBin];
1762 Double_t bl = nodeInfo.nSelB_unWeighted[ivar][iBin];
1763 Double_t s = nodeInfo.nTotS_unWeighted;
1764 Double_t b = nodeInfo.nTotB_unWeighted;
1765 Double_t slW = nodeInfo.nSelS[ivar][iBin];
1766 Double_t blW = nodeInfo.nSelB[ivar][iBin];
1767 Double_t sW = nodeInfo.nTotS;
1768 Double_t bW = nodeInfo.nTotB;
1769 Double_t sr = s-sl;
1770 Double_t br = b-bl;
1771 Double_t srW = sW-slW;
1772 Double_t brW = bW-blW;
1773 // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
1774 if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
1775 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
1776 ) {
1777
1778 if (DoRegression()) {
1779 sepTmp = fRegType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin]+nodeInfo.nSelB[ivar][iBin],
1780 nodeInfo.target[ivar][iBin],nodeInfo.target2[ivar][iBin],
1781 nodeInfo.nTotS+nodeInfo.nTotB,
1782 nodeInfo.target[ivar][nBins[ivar]-1],nodeInfo.target2[ivar][nBins[ivar]-1]);
1783 } else {
1784 sepTmp = fSepType->GetSeparationGain(nodeInfo.nSelS[ivar][iBin], nodeInfo.nSelB[ivar][iBin], nodeInfo.nTotS, nodeInfo.nTotB);
1785 }
1786 if (separationGain[ivar] < sepTmp) {
1788 cutIndex[ivar] = iBin;
1789 }
1790 }
1791 }
1792 }
1793 return 0;
1794 };
1795 TMVA::Config::Instance().GetThreadExecutor().Map(fvarMaxSep, varSeeds);
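   // fvarMaxSep asks fSepType (classification) or fRegType (regression) for the gain of every
   // candidate cut. As an illustration only, a schematic Gini-based gain for a split sending
   // (sl, bl) to one daughter and the rest to the other -- the exact normalisation used by
   // fSepType->GetSeparationGain may differ:
   //
   //   double Gini(double s, double b) {
   //      if (s + b <= 0) return 0;
   //      double p = s / (s + b);          // signal purity
   //      return 2.0 * p * (1.0 - p);      // maximal for a 50/50 mixture, zero for a pure node
   //   }
   //
   //   double SchematicGain(double sl, double bl, double s, double b) {
   //      double nl = sl + bl, nr = (s - sl) + (b - bl), n = s + b;
   //      return Gini(s, b) - (nl / n) * Gini(sl, bl) - (nr / n) * Gini(s - sl, b - bl);
   //   }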
1796
1797    // now that you have found the best separation cut for each variable, compare the variables
1798 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
1799 if (useVariable[ivar] ) {
1802 mxVar = ivar;
1803 }
1804 }
1805 }
1806
1807 if (mxVar >= 0) {
1808 if (DoRegression()) {
1809 node->SetSeparationIndex(fRegType->GetSeparationIndex(nodeInfo.nTotS+nodeInfo.nTotB,nodeInfo.target[0][nBins[mxVar]-1],nodeInfo.target2[0][nBins[mxVar]-1]));
1810 node->SetResponse(nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB));
1811 if ( almost_equal_double(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB), nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB))) {
1812 node->SetRMS(0);
1813
1814 }else{
1815 node->SetRMS(TMath::Sqrt(nodeInfo.target2[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB) - nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)*nodeInfo.target[0][nBins[mxVar]-1]/(nodeInfo.nTotS+nodeInfo.nTotB)));
1816 }
1817 }
1818 else {
1819 node->SetSeparationIndex(fSepType->GetSeparationIndex(nodeInfo.nTotS,nodeInfo.nTotB));
1820 if (mxVar >=0){
1821 if (nodeInfo.nSelS[mxVar][cutIndex[mxVar]]/nodeInfo.nTotS > nodeInfo.nSelB[mxVar][cutIndex[mxVar]]/nodeInfo.nTotB) cutType=kTRUE;
1822 else cutType=kFALSE;
1823 }
1824 }
1825 node->SetSelector((UInt_t)mxVar);
1827 node->SetCutType(cutType);
1829 if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
1830 node->SetNFisherCoeff(0);
1831 fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1832 //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1833 }else{
1834          // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero). Might
1835          // be even less storage space on average than also storing the mapping used otherwise;
1836          // can always be changed relatively easily
1837 node->SetNFisherCoeff(fNvars+1);
1838 for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
1840             // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though :)
1841 if (ivar<fNvars){
1842 fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nodeInfo.nTotS+nodeInfo.nTotB) * (nodeInfo.nTotS+nodeInfo.nTotB) ;
1843 }
1844 }
1845 }
1846 }
1847 else {
1849 }
1850
1851    // #### These arrays now live in TrainNodeInfo, but I got a malloc segfault when I tried to destruct them there.
1852    // #### So, I changed them from dynamic arrays to std::vector to fix this memory problem,
1853    // #### so there is no need to destruct them anymore. I didn't see any performance drop as a result.
1854 for (UInt_t i=0; i<cNvars; i++) {
1855 // delete [] nodeInfo.nSelS[i];
1856 // delete [] nodeInfo.nSelB[i];
1857 // delete [] nodeInfo.nSelS_unWeighted[i];
1858 // delete [] nodeInfo.nSelB_unWeighted[i];
1859 // delete [] nodeInfo.target[i];
1860 // delete [] nodeInfo.target2[i];
1861 delete [] cutValues[i];
1862 }
1863 //delete [] nodeInfo.nSelS;
1864 //delete [] nodeInfo.nSelB;
1865 //delete [] nodeInfo.nSelS_unWeighted;
1866 //delete [] nodeInfo.nSelB_unWeighted;
1867 //delete [] nodeInfo.target;
1868 //delete [] nodeInfo.target2;
1869
1870 // #### left these as dynamic arrays as they were before
1871 // #### since I didn't need to mess with them for parallelization
1872 delete [] cutValues;
1873
1874 delete [] xmin;
1875 delete [] xmax;
1876
1877 delete [] useVariable;
1878 delete [] mapVariable;
1879
1880 delete [] separationGain;
1881 delete [] cutIndex;
1882
1883 delete [] nBins;
1884 delete [] binWidth;
1885 delete [] invBinWidth;
1886
1887 return separationGainTotal;
1888}
1889
1890// Standard version of DecisionTree::TrainNodeFast (multithreading is not enabled)
1891#else
1894{
1895// #### OK let's comment this one to see how to parallelize it
1897 Double_t *separationGain = new Double_t[fNvars+1];
1898 Int_t *cutIndex = new Int_t[fNvars+1]; //-1;
1899
1900 // #### initialize the sep gain and cut index values
1901 for (UInt_t ivar=0; ivar <= fNvars; ivar++) {
1902 separationGain[ivar]=-1;
1903 cutIndex[ivar]=-1;
1904 }
1905 // ### set up some other variables
1906 Int_t mxVar = -1;
1908 Double_t nTotS, nTotB;
1909 Int_t nTotS_unWeighted, nTotB_unWeighted;
1910 UInt_t nevents = eventSample.size();
1911
1912
1913    // the +1 comes from the fact that I later treat the Fisher output as an
1914 // additional possible variable.
1915 Bool_t *useVariable = new Bool_t[fNvars+1]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1916 UInt_t *mapVariable = new UInt_t[fNvars+1]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1917
1918 std::vector<Double_t> fisherCoeff;
1919
1920 // #### set up a map to the subset of variables using two arrays
1921 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
1922 UInt_t tmp=fUseNvars;
1923 GetRandomisedVariables(useVariable,mapVariable,tmp);
1924 }
1925 else {
1926 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1929 }
1930 }
1931 // #### last variable entry is the fisher variable
1932 useVariable[fNvars] = kFALSE; //by default fisher is not used..
1933
1934 // #### Begin Fisher calculation
1935 Bool_t fisherOK = kFALSE; // flag to show that the fisher discriminant could be calculated correctly or not;
1936 if (fUseFisherCuts) {
1937       useVariable[fNvars] = kTRUE; // that's where I store the "fisher MVA"
1938
1939 //use for the Fisher discriminant ONLY those variables that show
1940 //some reasonable linear correlation in either Signal or Background
1941 Bool_t *useVarInFisher = new Bool_t[fNvars]; // for performance reasons instead of std::vector<Bool_t> useVariable(fNvars);
1942 UInt_t *mapVarInFisher = new UInt_t[fNvars]; // map the subset of variables used in randomised trees to the original variable number (used in the Event() )
1943 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1946 }
1947
1948 std::vector<TMatrixDSym*>* covMatrices;
1949 covMatrices = gTools().CalcCovarianceMatrices( eventSample, 2 ); // currently for 2 classes only
1950 if (!covMatrices){
1951 Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
1952 fisherOK = kFALSE;
1953 }else{
1954 TMatrixD *ss = new TMatrixD(*(covMatrices->at(0)));
1955 TMatrixD *bb = new TMatrixD(*(covMatrices->at(1)));
1956 const TMatrixD *s = gTools().GetCorrelationMatrix(ss);
1958
1959 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1960 for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
1961 if ( ( TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
1962 ( TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
1965 }
1966 }
1967 }
1968 // now as you know which variables you want to use, count and map them:
1969 // such that you can use an array/matrix filled only with THOSE variables
1970 // that you used
1971 UInt_t nFisherVars = 0;
1972 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
1973          // now pick those variables that are used in the Fisher and are also
1974          // part of the "allowed" variables (in case of Randomized Trees)
1977          // now exclude the variables used in the Fisher cuts, and don't
1978 // use them anymore in the individual variable scan
1979 if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
1980 }
1981 }
1982
1983
1984 fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
1985 fisherOK = kTRUE;
1986 }
1987 delete [] useVarInFisher;
1988 delete [] mapVarInFisher;
1989
1990 }
1991 // #### End Fisher calculation
1992
1993
1994 UInt_t cNvars = fNvars;
1995    if (fUseFisherCuts && fisherOK) cNvars++; // use the Fisher output simply as an additional variable
1996
1997 // #### OK now what's going on...
1998 // #### looks like we are setting up histograms
1999 UInt_t* nBins = new UInt_t [cNvars];
2000 Double_t* binWidth = new Double_t [cNvars];
2001 Double_t* invBinWidth = new Double_t [cNvars];
2002
2003 Double_t** nSelS = new Double_t* [cNvars];
2004 Double_t** nSelB = new Double_t* [cNvars];
2005 Double_t** nSelS_unWeighted = new Double_t* [cNvars];
2006 Double_t** nSelB_unWeighted = new Double_t* [cNvars];
2007 Double_t** target = new Double_t* [cNvars];
2008 Double_t** target2 = new Double_t* [cNvars];
2009 Double_t** cutValues = new Double_t* [cNvars];
2010
2011 // #### looping through the variables...
2012 for (UInt_t ivar=0; ivar<cNvars; ivar++) {
2013 // #### ncuts means that we need n+1 bins for each variable
2014 nBins[ivar] = fNCuts+1;
2015 if (ivar < fNvars) {
2016 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
2017 nBins[ivar] = node->GetSampleMax(ivar) - node->GetSampleMin(ivar) + 1;
2018 }
2019 }
2020
2021 // #### make some new arrays for each ith var, size=nbins for each array
2022 // #### integer features get the same number of bins as values, set later
2023 nSelS[ivar] = new Double_t [nBins[ivar]];
2024 nSelB[ivar] = new Double_t [nBins[ivar]];
2025 nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
2026 nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
2027 target[ivar] = new Double_t [nBins[ivar]];
2028 target2[ivar] = new Double_t [nBins[ivar]];
2029 cutValues[ivar] = new Double_t [nBins[ivar]];
2030
2031 }
2032
2033    // #### xmin and xmax for each variable
2034 Double_t *xmin = new Double_t[cNvars];
2035 Double_t *xmax = new Double_t[cNvars];
2036
2037 // #### ok loop through each variable to initialize all the values
2038 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2039 if (ivar < fNvars){
2040 xmin[ivar]=node->GetSampleMin(ivar);
2041 xmax[ivar]=node->GetSampleMax(ivar);
2043 // std::cout << " variable " << ivar << " has no proper range in (xmax[ivar]-xmin[ivar] = " << xmax[ivar]-xmin[ivar] << std::endl;
2044 // std::cout << " will set useVariable[ivar]=false"<<std::endl;
2046 }
2047
2048 } else { // the fisher variable
2049 xmin[ivar]=999;
2050 xmax[ivar]=-999;
2051 // too bad, for the moment I don't know how to do this without looping
2052 // once to get the "min max" and then AGAIN to fill the histogram
2053 for (UInt_t iev=0; iev<nevents; iev++) {
2054 // returns the Fisher value (no fixed range)
2055 Double_t result = fisherCoeff[fNvars]; // the fisher constant offset
2056 for (UInt_t jvar=0; jvar<fNvars; jvar++)
2057 result += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2058 if (result > xmax[ivar]) xmax[ivar]=result;
2059 if (result < xmin[ivar]) xmin[ivar]=result;
2060 }
2061 }
2062 for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
2063 nSelS[ivar][ibin]=0;
2064 nSelB[ivar][ibin]=0;
2065 nSelS_unWeighted[ivar][ibin]=0;
2066 nSelB_unWeighted[ivar][ibin]=0;
2067 target[ivar][ibin]=0;
2068 target2[ivar][ibin]=0;
2069 cutValues[ivar][ibin]=0;
2070 }
2071 }
2072
2073 // #### Nothing to parallelize here really, no loop through events
2074 // #### only figures out the bin edge values for the "histogram" arrays
2075 // fill the cut values for the scan:
2076 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2077
2078 if ( useVariable[ivar] ) {
2079
2080 //set the grid for the cut scan on the variables like this:
2081 //
2082 // | | | | | ... | |
2083 // xmin xmax
2084 //
2085 // cut 0 1 2 3 ... fNCuts-1 (counting from zero)
2086 // bin 0 1 2 3 ..... nBins-1=fNCuts (counting from zero)
2087 // --> nBins = fNCuts+1
2088       // (NOTE, the cuts at xmin or xmax would just give the whole sample and
2089       // hence can be safely omitted)
2090
2091 binWidth[ivar] = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
2092 invBinWidth[ivar] = 1./binWidth[ivar];
2093 if (ivar < fNvars) {
2094 if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') { invBinWidth[ivar] = 1; binWidth[ivar] = 1; }
2095 }
2096
2097 // std::cout << "ivar="<<ivar
2098 // <<" min="<<xmin[ivar]
2099 // << " max="<<xmax[ivar]
2100       //            << " width=" << istepSize
2101 // << " nBins["<<ivar<<"]="<<nBins[ivar]<<std::endl;
2102 for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
2103 cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*binWidth[ivar];
2104 // std::cout << " cutValues["<<ivar<<"]["<<icut<<"]=" << cutValues[ivar][icut] << std::endl;
2105 }
2106 }
2107 }
2108
2109 // #### Loop through the events to get the total sig and background
2110 nTotS=0; nTotB=0;
2111 nTotS_unWeighted=0; nTotB_unWeighted=0;
2112 for (UInt_t iev=0; iev<nevents; iev++) {
2113
2114 Double_t eventWeight = eventSample[iev]->GetWeight();
2115 if (eventSample[iev]->GetClass() == fSigClass) {
2116 nTotS+=eventWeight;
2117 nTotS_unWeighted++; }
2118 else {
2119 nTotB+=eventWeight;
2120 nTotB_unWeighted++;
2121 }
2122
2123 // #### Count the number in each bin (fill array "histograms")
2124 Int_t iBin=-1;
2125 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2126          // now scan through the cuts for each variable and find which one gives
2127          // the best separationGain at the current stage.
2128 if ( useVariable[ivar] ) {
2130 if (ivar < fNvars) eventData = eventSample[iev]->GetValueFast(ivar);
2131 else { // the fisher variable
2132 eventData = fisherCoeff[fNvars];
2133 for (UInt_t jvar=0; jvar<fNvars; jvar++)
2134 eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValueFast(jvar);
2135
2136 }
2137 // #### figure out which bin the event belongs in ...
2138 // "maximum" is nbins-1 (the "-1" because we start counting from 0 !!
2139 iBin = TMath::Min(Int_t(nBins[ivar]-1),TMath::Max(0,int (invBinWidth[ivar]*(eventData-xmin[ivar]) ) ));
2140 if (eventSample[iev]->GetClass() == fSigClass) {
2141 nSelS[ivar][iBin]+=eventWeight;
2142 nSelS_unWeighted[ivar][iBin]++;
2143 }
2144 else {
2145 nSelB[ivar][iBin]+=eventWeight;
2146 nSelB_unWeighted[ivar][iBin]++;
2147 }
2148 if (DoRegression()) {
2149 target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
2150 target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
2151 }
2152 }
2153 }
2154 }
2155
2156 // now turn the "histogram" into a cumulative distribution
2157 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2158 if (useVariable[ivar]) {
2159 for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
2160 nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
2161 nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
2162 nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
2163 nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
2164 if (DoRegression()) {
2165 target[ivar][ibin] +=target[ivar][ibin-1] ;
2166 target2[ivar][ibin]+=target2[ivar][ibin-1];
2167 }
2168 }
2169 if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
2170 Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
2171 << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
2172 << " while eventsample size = " << eventSample.size()
2173 << Endl;
2174 }
2175 double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
2176 double totalSum=nTotS+nTotB;
2178 Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
2179 << lastBins
2180 << " while total number of events = " << totalSum
2181 << Endl;
2182 }
2183 }
2184 }
2185    // #### Loops over vars and bins, but not events; not worth parallelizing unless nbins is on the order of ndata/10 or so ...
2186    // now select the optimal cuts for each variable and find which one gives
2187    // the best separationGain at the current stage
2188 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2189 if (useVariable[ivar]) {
2190 for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) { // the last bin contains "all events" -->skip
2191          // the separationGain is defined as the various indices (Gini, CrossEntropy, etc.)
2192          // calculated by the "SamplePurities" from the branches that would go to the
2193 // left or the right from this node if "these" cuts were used in the Node:
2194 // hereby: nSelS and nSelB would go to the right branch
2195 // (nTotS - nSelS) + (nTotB - nSelB) would go to the left branch;
2196
2197          // only allow splits where both daughter nodes match the specified minimum number
2198 // for this use the "unweighted" events, as you are interested in statistically
2199 // significant splits, which is determined by the actual number of entries
2200 // for a node, rather than the sum of event weights.
2201
2202 Double_t sl = nSelS_unWeighted[ivar][iBin];
2203 Double_t bl = nSelB_unWeighted[ivar][iBin];
2204 Double_t s = nTotS_unWeighted;
2205 Double_t b = nTotB_unWeighted;
2206 Double_t slW = nSelS[ivar][iBin];
2207 Double_t blW = nSelB[ivar][iBin];
2208 Double_t sW = nTotS;
2209 Double_t bW = nTotB;
2210 Double_t sr = s-sl;
2211 Double_t br = b-bl;
2212 Double_t srW = sW-slW;
2213 Double_t brW = bW-blW;
2214 // std::cout << "sl="<<sl << " bl="<<bl<<" fMinSize="<<fMinSize << "sr="<<sr << " br="<<br <<std::endl;
2215 if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
2216 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
2217 ) {
2218
2219 if (DoRegression()) {
2220 sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
2221 target[ivar][iBin],target2[ivar][iBin],
2222 nTotS+nTotB,
2223 target[ivar][nBins[ivar]-1],target2[ivar][nBins[ivar]-1]);
2224 } else {
2225 sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
2226 }
2227 if (separationGain[ivar] < sepTmp) {
2229 cutIndex[ivar] = iBin;
2230 }
2231 }
2232 }
2233 }
2234 }
2235
2236    // now that you have found the best separation cut for each variable, compare the variables
2237 for (UInt_t ivar=0; ivar < cNvars; ivar++) {
2238 if (useVariable[ivar] ) {
2241 mxVar = ivar;
2242 }
2243 }
2244 }
2245
2246 if (mxVar >= 0) {
2247 if (DoRegression()) {
2248 node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB,target[0][nBins[mxVar]-1],target2[0][nBins[mxVar]-1]));
2249 node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
2250 if ( almost_equal_double(target2[0][nBins[mxVar]-1]/(nTotS+nTotB), target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB))) {
2251 node->SetRMS(0);
2252 }else{
2253 node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB) - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
2254 }
2255 }
2256 else {
2257 node->SetSeparationIndex(fSepType->GetSeparationIndex(nTotS,nTotB));
2258 if (mxVar >=0){
2259 if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType=kTRUE;
2260 else cutType=kFALSE;
2261 }
2262 }
2263 node->SetSelector((UInt_t)mxVar);
2265 node->SetCutType(cutType);
2267 if (mxVar < (Int_t) fNvars){ // the fisher cut is actually not used in this node, hence don't need to store fisher components
2268 node->SetNFisherCoeff(0);
2269 fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2270 //for (UInt_t ivar=0; ivar<fNvars; ivar++) fVariableImportance[ivar] += separationGain[ivar]*separationGain[ivar] * (nTotS+nTotB) * (nTotS+nTotB) ;
2271 }else{
2272          // allocate Fisher coefficients (use fNvars, and set the non-used ones to zero). Might
2273          // be even less storage space on average than also storing the mapping used otherwise;
2274          // can always be changed relatively easily
2275 node->SetNFisherCoeff(fNvars+1);
2276 for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
2278             // take the Fisher-coefficient-weighted estimate as variable importance; don't fill the offset coefficient though :)
2279 if (ivar<fNvars){
2280 fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB) ;
2281 }
2282 }
2283 }
2284 }
2285 else {
2287 }
2288 // if (mxVar > -1) {
2289 // std::cout << "------------------------------------------------------------------"<<std::endl;
2290 // std::cout << "cutting on Var: " << mxVar << " with cutIndex " << cutIndex[mxVar] << " being: " << cutValues[mxVar][cutIndex[mxVar]] << std::endl;
2291 // std::cout << " nSelS = " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (right) sum:= " << nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2292 // std::cout << " nSelS = " << nTotS_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] << " nSelB = " << nTotB_unWeighted-nSelB_unWeighted[mxVar][cutIndex[mxVar]] << " (left) sum:= " << nTotS_unWeighted + nTotB_unWeighted - nSelS_unWeighted[mxVar][cutIndex[mxVar]] - nSelB_unWeighted[mxVar][cutIndex[mxVar]] << std::endl;
2293 // std::cout << " nSelS = " << nSelS[mxVar][cutIndex[mxVar]] << " nSelB = " << nSelB[mxVar][cutIndex[mxVar]] << std::endl;
2294 // std::cout << " s/s+b " << nSelS_unWeighted[mxVar][cutIndex[mxVar]]/( nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nSelB_unWeighted[mxVar][cutIndex[mxVar]])
2295 // << " s/s+b " << (nTotS - nSelS_unWeighted[mxVar][cutIndex[mxVar]])/( nTotS-nSelS_unWeighted[mxVar][cutIndex[mxVar]] + nTotB-nSelB_unWeighted[mxVar][cutIndex[mxVar]]) << std::endl;
2296 // std::cout << " nTotS = " << nTotS << " nTotB = " << nTotB << std::endl;
2297 // std::cout << " separationGainTotal " << separationGainTotal << std::endl;
2298 // } else {
2299 // std::cout << "------------------------------------------------------------------"<<std::endl;
2300 // std::cout << " obviously didn't find new mxVar " << mxVar << std::endl;
2301 // }
2302 for (UInt_t i=0; i<cNvars; i++) {
2303 delete [] nSelS[i];
2304 delete [] nSelB[i];
2305 delete [] nSelS_unWeighted[i];
2306 delete [] nSelB_unWeighted[i];
2307 delete [] target[i];
2308 delete [] target2[i];
2309 delete [] cutValues[i];
2310 }
2311 delete [] nSelS;
2312 delete [] nSelB;
2313 delete [] nSelS_unWeighted;
2314 delete [] nSelB_unWeighted;
2315 delete [] target;
2316 delete [] target2;
2317 delete [] cutValues;
2318
2319 delete [] xmin;
2320 delete [] xmax;
2321
2322 delete [] useVariable;
2323 delete [] mapVariable;
2324
2325 delete [] separationGain;
2326 delete [] cutIndex;
2327
2328 delete [] nBins;
2329 delete [] binWidth;
2330 delete [] invBinWidth;
2331
2332 return separationGainTotal;
2333
2334}
2335#endif
2336
2337
2338////////////////////////////////////////////////////////////////////////////////
2339/// calculate the fisher coefficients for the event sample and the variables used
2340
2342 std::vector<Double_t> fisherCoeff(fNvars+1);
2343
2344 // initialization of global matrices and vectors
2345    // average value of each variable for S, B, S+B
2347
2348 // the covariance 'within class' and 'between class' matrices
2352
2353 //
2354 // compute mean values of variables in each sample, and the overall means
2355 //
2356
2357 // initialize internal sum-of-weights variables
2360
2361
2362 // init vectors
2365 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }
2366
2367 UInt_t nevents = eventSample.size();
2368 // compute sample means
2369 for (UInt_t ievt=0; ievt<nevents; ievt++) {
2370
2371 // read the Training Event into "event"
2372 const Event * ev = eventSample[ievt];
2373
2374 // sum of weights
2375 Double_t weight = ev->GetWeight();
2376 if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
2377 else sumOfWeightsB += weight;
2378
2379 Double_t* sum = ev->GetClass() == fSigClass ? sumS : sumB;
2380 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2381 sum[ivar] += ev->GetValueFast( mapVarInFisher[ivar] )*weight;
2382 }
2383 }
2384 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2385 (*meanMatx)( ivar, 2 ) = sumS[ivar];
2386 (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;
2387
2388 (*meanMatx)( ivar, 2 ) += sumB[ivar];
2389 (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
2390
2391 // signal + background
2392 (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
2393 }
2394
2395 delete [] sumS;
2396
2397 delete [] sumB;
2398
2399 // the matrix of covariance 'within class' reflects the dispersion of the
2400 // events relative to the center of gravity of their own class
2401
2402 // assert required
2403
2404 assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
2405
2406    // product matrices (x-<x>)(y-<y>) where x, y are variables
2407
2414
2415 // 'within class' covariance
2416 for (UInt_t ievt=0; ievt<nevents; ievt++) {
2417
2418 // read the Training Event into "event"
2419 // const Event* ev = eventSample[ievt];
2420 const Event* ev = eventSample.at(ievt);
2421
2422 Double_t weight = ev->GetWeight(); // may ignore events with negative weights
2423
2424 for (UInt_t x=0; x<nFisherVars; x++) {
2425 xval[x] = ev->GetValueFast( mapVarInFisher[x] );
2426 }
2427 Int_t k=0;
2428 for (UInt_t x=0; x<nFisherVars; x++) {
2429 for (UInt_t y=0; y<nFisherVars; y++) {
2430 if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
2431 else sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
2432 k++;
2433 }
2434 }
2435 }
2436 Int_t k=0;
2437 for (UInt_t x=0; x<nFisherVars; x++) {
2438 for (UInt_t y=0; y<nFisherVars; y++) {
2439 (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
2440 k++;
2441 }
2442 }
2443
2444 delete [] sum2Sig;
2445 delete [] sum2Bgd;
2446 delete [] xval;
2447
2448
2449 // the matrix of covariance 'between class' reflects the dispersion of the
2450    // events of a class relative to the global center of gravity of all the classes,
2451 // hence the separation between classes
2452
2453
2455
2456 for (UInt_t x=0; x<nFisherVars; x++) {
2457 for (UInt_t y=0; y<nFisherVars; y++) {
2458
2459 prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2))*
2460 ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
2461 prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2))*
2462 ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
2463
2465 }
2466 }
2467
2468
2469
2470 // compute full covariance matrix from sum of within and between matrices
2471 for (UInt_t x=0; x<nFisherVars; x++)
2472 for (UInt_t y=0; y<nFisherVars; y++)
2473 (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
2474
2475 // Fisher = Sum { [coeff]*[variables] }
2476 //
2477 // let Xs be the array of the mean values of variables for signal evts
2478 // let Xb be the array of the mean values of variables for backgd evts
2479    // let InvWith be the inverse matrix of the 'within class' covariance matrix
2480 //
2481 // then the array of Fisher coefficients is
2482 // [coeff] =TMath::Sqrt(fNsig*fNbgd)/fNevt*transpose{Xs-Xb}*InvWith
2483 TMatrixD* theMat = with; // Fishers original
2484 // TMatrixD* theMat = cov; // Mahalanobis
2485
2487 if ( TMath::Abs(invCov.Determinant()) < 10E-24 ) {
2488 Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
2489 << TMath::Abs(invCov.Determinant())
2490 << " did you use the variables that are linear combinations or highly correlated?"
2491 << Endl;
2492 }
2493 if ( TMath::Abs(invCov.Determinant()) < 10E-120 ) {
2494 Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
2495 << TMath::Abs(invCov.Determinant())
2496 << " did you use the variables that are linear combinations?"
2497 << Endl;
2498 }
2499
2500 invCov.Invert();
2501
2502 // apply rescaling factor
2504
2505 // compute difference of mean values
2506 std::vector<Double_t> diffMeans( nFisherVars );
2507
2508 for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
2509 for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
2510 for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
2511 Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
2513 }
2514
2515 // rescale
2517 }
2518
2519 // offset correction
2520 Double_t f0 = 0.0;
2521 for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
2522 f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
2523 }
2524 f0 /= -2.0;
2525
2526 fisherCoeff[fNvars] = f0; //as we start counting variables from "zero", I store the fisher offset at the END
2527
2528 return fisherCoeff;
2529}
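// In compact form, the coefficients assembled above are (an algebraic summary of the code in
// this function, not an additional member function):
//
//   c_i = sqrt(N_S * N_B) / N * [ W^{-1} (mean_S - mean_B) ]_i      (i = 1..nFisherVars)
//   c_0 = -1/2 * sum_i c_i * (mean_S[i] + mean_B[i])                (offset, stored last)
//
// with W the 'within class' matrix, so the Fisher value of an event is
// F(x) = c_0 + sum_i c_i * x_i, which is what the per-event loops in TrainNodeFast evaluate
// when the Fisher "variable" is scanned.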
2530
2531////////////////////////////////////////////////////////////////////////////////
2532/// train a node by finding the single optimal cut for a single variable
2533/// that best separates signal and background (maximizes the separation gain)
2534
2537{
2538 Double_t nTotS = 0.0, nTotB = 0.0;
2539 Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;
2540
2541 std::vector<TMVA::BDTEventWrapper> bdtEventSample;
2542
2543 // List of optimal cuts, separation gains, and cut types (removed background or signal) - one for each variable
2544    // (each entry is independent, so they could be filled in parallel with no problem)
2545 std::vector<Double_t> lCutValue( fNvars, 0.0 );
2546 std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
2547    std::vector<Char_t> lCutType( fNvars ); // <----- bool is stored (for performance reasons, std::vector<bool> is not used)
2548 lCutType.assign( fNvars, Char_t(kFALSE) );
2549
2550 // Initialize (un)weighted counters for signal & background
2551 // Construct a list of event wrappers that point to the original data
2552 for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
2553 if((*it)->GetClass() == fSigClass) { // signal or background event
2554 nTotS += (*it)->GetWeight();
2555 ++nTotS_unWeighted;
2556 }
2557 else {
2558 nTotB += (*it)->GetWeight();
2559 ++nTotB_unWeighted;
2560 }
2561 bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
2562 }
2563
2564    std::vector<Char_t> useVariable(fNvars); // <----- bool is stored (for performance reasons, std::vector<bool> is not used)
2565 useVariable.assign( fNvars, Char_t(kTRUE) );
2566
2567 for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar]=Char_t(kFALSE);
2568 if (fRandomisedTree) { // choose for each node splitting a random subset of variables to choose from
2569       if (fUseNvars ==0 ) { // no number specified ... choose something which hopefully works well
2570 // watch out, should never happen as it is initialised automatically in MethodBDT already!!!
2571 fUseNvars = UInt_t(TMath::Sqrt(fNvars)+0.6);
2572 }
2573 Int_t nSelectedVars = 0;
2574 while (nSelectedVars < fUseNvars) {
2575 Double_t bla = fMyTrandom->Rndm()*fNvars;
2576 useVariable[Int_t (bla)] = Char_t(kTRUE);
2577 nSelectedVars = 0;
2578 for (UInt_t ivar=0; ivar < fNvars; ivar++) {
2580 }
2581 }
2582 }
2583 else {
2584 for (UInt_t ivar=0; ivar < fNvars; ivar++) useVariable[ivar] = Char_t(kTRUE);
2585 }
2586 for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) { // loop over all discriminating variables
2587       if(!useVariable[ivar]) continue; // only optimize with selected variables
2588
2589 TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
2590
2591 std::sort( bdtEventSample.begin(),bdtEventSample.end() ); // sort the event data
2592
2593
2594 Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
2595
2596 std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
2597 for( ; it != it_end; ++it ) {
2598 if((**it)->GetClass() == fSigClass ) // specify signal or background event
2599 sigWeightCtr += (**it)->GetWeight();
2600 else
2601 bkgWeightCtr += (**it)->GetWeight();
2602 // Store the accumulated signal (background) weights
2603 it->SetCumulativeWeight(false,bkgWeightCtr);
2604 it->SetCumulativeWeight(true,sigWeightCtr);
2605 }
2606
2607 const Double_t fPMin = 1.0e-6;
2609 Long64_t index = 0;
2610 Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
2611
2612 // Locate the optimal cut for this (ivar-th) variable
2613 for( it = bdtEventSample.begin(); it != it_end; ++it ) {
2614 if( index == 0 ) { ++index; continue; }
2615 if( *(*it) == NULL ) {
2616 Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
2617 << index << ", and parent node=" << node->GetParent() << Endl;
2618 break;
2619 }
2620 dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
2621 norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
2622 // Only allow splits where both daughter nodes have the specified minimum number of events
2623 // Splits are only sensible when the data are ordered (eg. don't split inside a sequence of 0's)
2624 if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
2625
2626 sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false), sigWeightCtr, bkgWeightCtr );
2627 if( sepTmp > separationGain ) {
2629 cutValue = it->GetVal() - 0.5*dVal;
2630 Double_t nSelS = it->GetCumulativeWeight(true);
2631 Double_t nSelB = it->GetCumulativeWeight(false);
2632 // Indicate whether this cut is improving the node purity by removing background (enhancing signal)
2633 // or by removing signal (enhancing background)
2634 if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
2635 else cutType = kFALSE;
2636 }
2637 }
2638 ++index;
2639 }
2643 }
2644 Double_t separationGain = -1.0;
2645 Int_t iVarIndex = -1;
2646 for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
2647 if( lSepGain[ivar] > separationGain ) {
2648 iVarIndex = ivar;
2650 }
2651 }
2652
2653 // #### storing the best values into the node
2654 if(iVarIndex >= 0) {
2655 node->SetSelector(iVarIndex);
2659 fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
2660 }
2661 else {
2662 separationGain = 0.0;
2663 }
2664
2665 return separationGain;
2666}
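// TrainNodeFull above avoids binning altogether: per variable it sorts the events and scans
// every boundary between neighbouring values. A schematic of that per-variable pass
// (hypothetical names; weights and cumulative sums simplified):
//
//   std::sort(sample.begin(), sample.end(), byVariableValue);   // step 1: order by the variable
//   double runS = 0, runB = 0;
//   for (auto& ev : sample) {                                   // step 2: running class weights
//      (ev.isSignal ? runS : runB) += ev.weight;
//      ev.cumS = runS; ev.cumB = runB;
//   }
//   // step 3: for positions leaving at least fMinSize events on each side, try a cut halfway
//   //         between neighbouring (distinct) values, evaluate the separation gain of the
//   //         accumulated weights, and keep the best cut for this variable.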
2667
2668////////////////////////////////////////////////////////////////////////////////
2669 /// get the pointer to the leaf node where a particular event ends up...
2670/// (used in gradient boosting)
2671
2673{
2674 TMVA::DecisionTreeNode *current = (TMVA::DecisionTreeNode*)this->GetRoot();
2675 while(current->GetNodeType() == 0) { // intermediate node in a tree
2676 current = (current->GoesRight(e)) ?
2677 (TMVA::DecisionTreeNode*)current->GetRight() :
2678 (TMVA::DecisionTreeNode*)current->GetLeft();
2679 }
2680 return current;
2681}
2682
2683////////////////////////////////////////////////////////////////////////////////
2684/// the event e is put into the decision tree (starting at the root node)
2685/// and the output is NodeType (signal) or (background) of the final node (basket)
2686 /// in which the given event ends up. I.e. the result of the classification of
2687 /// the event for this decision tree.
2688
2690{
2691 TMVA::DecisionTreeNode *current = this->GetRoot();
2692 if (!current){
2693 Log() << kFATAL << "CheckEvent: started with undefined ROOT node" <<Endl;
2694 return 0; //keeps coverity happy that doesn't know that kFATAL causes an exit
2695 }
2696
2697 while (current->GetNodeType() == 0) { // intermediate node in a (pruned) tree
2698 current = (current->GoesRight(*e)) ?
2699 current->GetRight() :
2700 current->GetLeft();
2701 if (!current) {
2702 Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" <<Endl;
2703 }
2704
2705 }
2706
2707 if (DoRegression()) {
2708 // Note: This path is also taken for MethodBDT with analysis type
2709 // kClassification and kMulticlass when using GradBoost.
2710 // See TMVA::MethodBDT::InitGradBoost
2711 return current->GetResponse();
2712 } else {
2713 if (UseYesNoLeaf) return Double_t ( current->GetNodeType() );
2714 else return current->GetPurity();
2715 }
2716}
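// Minimal usage sketch (the tree and event names are hypothetical):
//
//   Double_t score = tree->CheckEvent(&event, /* UseYesNoLeaf = */ kFALSE);
//   // for classification: kFALSE -> leaf purity S/(S+B), kTRUE -> hard node type +1/-1;
//   // for regression the leaf response is returned instead.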
2717
2718////////////////////////////////////////////////////////////////////////////////
2719/// calculates the purity S/(S+B) of a given event sample
2720
2722{
2723 Double_t sumsig=0, sumbkg=0, sumtot=0;
2724 for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
2725 if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
2726 else sumsig+=eventSample[ievt]->GetWeight();
2727 sumtot+=eventSample[ievt]->GetWeight();
2728 }
2729 // sanity check
2730 if (sumtot!= (sumsig+sumbkg)){
2731 Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
2732 << sumtot << " " << sumsig << " " << sumbkg << Endl;
2733 }
2734 if (sumtot>0) return sumsig/(sumsig + sumbkg);
2735 else return -1;
2736}
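// Illustrative example (made-up weights): a sample with summed signal weight 30 and summed
// background weight 10 has purity 30 / (30 + 10) = 0.75; an empty sample returns -1.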
2737
2738////////////////////////////////////////////////////////////////////////////////
2739/// Return the relative variable importance, normalized to all
2740 /// variables together having the importance 1. The importance is
2741 /// evaluated as the total separation-gain that this variable had in
2742/// the decision trees (weighted by the number of events)
2743
2745{
2746 std::vector<Double_t> relativeImportance(fNvars);
2747 Double_t sum=0;
2748 for (UInt_t i=0; i< fNvars; i++) {
2749 sum += fVariableImportance[i];
2750 relativeImportance[i] = fVariableImportance[i];
2751 }
2752
2753 for (UInt_t i=0; i< fNvars; i++) {
2754 if (sum > std::numeric_limits<double>::epsilon())
2755 relativeImportance[i] /= sum;
2756 else
2757 relativeImportance[i] = 0;
2758 }
2759 return relativeImportance;
2760}
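// Illustrative example (made-up numbers): raw importances fVariableImportance = {4, 1, 0}
// are returned as {0.8, 0.2, 0.0}, i.e. each entry divided by the sum so that the variables
// together carry importance 1.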
2761
2762////////////////////////////////////////////////////////////////////////////////
2763/// returns the relative importance of variable ivar
2764
2766{
2767 std::vector<Double_t> relativeImportance = this->GetVariableImportance();
2768 if (ivar < fNvars) return relativeImportance[ivar];
2769 else {
2770 Log() << kFATAL << "<GetVariableImportance>" << Endl
2771 << "--- ivar = " << ivar << " is out of range " << Endl;
2772 }
2773
2774 return -1;
2775}
2776
bool almost_equal_double(double x, double y, int ulp=4)
bool almost_equal_float(float x, float y, int ulp=4)
#define d(i)
Definition RSha256.hxx:102
#define b(i)
Definition RSha256.hxx:100
#define f(i)
Definition RSha256.hxx:104
#define e(i)
Definition RSha256.hxx:103
int Int_t
Signed integer 4 bytes (int)
Definition RtypesCore.h:59
char Char_t
Character 1 byte (char)
Definition RtypesCore.h:51
unsigned long ULong_t
Unsigned long integer 4 bytes (unsigned long). Size depends on architecture.
Definition RtypesCore.h:69
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int)
Definition RtypesCore.h:60
float Float_t
Float 4 bytes (float)
Definition RtypesCore.h:71
constexpr Bool_t kFALSE
Definition RtypesCore.h:108
double Double_t
Double 8 bytes.
Definition RtypesCore.h:73
constexpr Bool_t kTRUE
Definition RtypesCore.h:107
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t target
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
Option_t Option_t TPoint TPoint const char mode
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
float xmin
float xmax
TMatrixT< Double_t > TMatrixD
Definition TMatrixDfwd.h:23
const_iterator begin() const
const_iterator end() const
static void SetVarIndex(Int_t iVar)
Base class for BinarySearch and Decision Trees.
Definition BinaryTree.h:62
UInt_t fNNodes
total number of nodes in the tree (counted)
Definition BinaryTree.h:117
void SetRoot(Node *r)
Definition BinaryTree.h:80
MsgLogger & Log() const
static Config & Instance()
static function: returns TMVA instance
Definition Config.cxx:97
A class to prune a decision tree using the Cost Complexity method.
Class that contains all the data information.
Definition DataSetInfo.h:62
void SetNEvents_unweighted(Float_t nev)
set the number of unweighted events that entered the node (during training), if traininfo defined
void SetNodeType(Int_t t)
set node type: 1 signal node, -1 bkg leave, 0 intermediate Node
void SetSeparationGain(Float_t sep)
set the separation, or information gained BY this node's selection, if traininfo defined
void SetNBkgEvents(Float_t b)
set the sum of the backgr weights in the node, if traininfo defined
void SetCutType(Bool_t t)
set true: if event variable > cutValue ==> signal , false otherwise
Double_t GetNSValidation() const
return number of signal events from the pruning validation sample, or -1 if traininfo undefined
void IncrementNEvents_unweighted()
increment the number of events that entered the node (during training), if traininfo defined
void SetFisherCoeff(Int_t ivar, Double_t coeff)
set fisher coefficients
void SetNSigEvents_unboosted(Float_t s)
set the sum of the unboosted signal events in the node, if traininfo defined
void SetAlphaMinSubtree(Double_t g)
set the minimum alpha in the tree rooted at this node, if traininfo defined
void IncrementNBkgEvents(Float_t b)
increment the sum of the backgr weights in the node, if traininfo defined
void SetNEvents_unboosted(Float_t nev)
set the number of unboosted events that entered the node (during training), if traininfo defined
Float_t GetNSigEvents(void) const
return the sum of the signal weights in the node, or -1 if traininfo undefined
DecisionTreeNode * GetLeft() const override
void SetTerminal(Bool_t s=kTRUE)
void SetLeft(Node *l) override
DecisionTreeNode * GetRight() const override
void SetResponse(Float_t r)
set the response of the node (for regression)
void SetSampleMax(UInt_t ivar, Float_t xmax)
set the maximum of variable ivar from the training sample that pass/end up in this node,...
void SetNBValidation(Double_t b)
set number of background events from the pruning validation sample, if traininfo defined
void IncrementNEvents(Float_t nev)
void SetPurity(void)
return the S/(S+B) (purity) for the node REM: even if nodes with purity 0.01 are very PURE background...
void SetSubTreeR(Double_t r)
set the resubstitution estimate, R(T_t), of the tree rooted at this node, if traininfo defined
void AddToSumTarget2(Float_t t2)
add to sum target 2, if traininfo defined
DecisionTreeNode * GetParent() const override
Double_t GetNodeR() const
return the node resubstitution estimate, R(t), for Cost Complexity pruning, or -1 if traininfo undefi...
void SetNFisherCoeff(Int_t nvars)
Short_t GetSelector() const
return index of variable used for discrimination at this node
void SetNSigEvents(Float_t s)
set the sum of the signal weights in the node, if traininfo defined
Float_t GetResponse(void) const
return the response of the node (for regression)
Float_t GetCutValue(void) const
return the cut value applied at this node
Bool_t GoesRight(const Event &) const override
test event if it descends the tree at this node to the right
Int_t GetNodeType(void) const
return node type: 1 signal node, -1 bkg leave, 0 intermediate Node
void IncrementNBkgEvents_unweighted()
increment the sum of the backgr weights in the node, if traininfo defined
void SetNSigEvents_unweighted(Float_t s)
set the sum of the unweighted signal events in the node, if traininfo defined
void SetRight(Node *r) override
Double_t GetNBValidation() const
return number of background events from the pruning validation sample, or -1 if traininfo undefined
void SetAlpha(Double_t alpha)
set the critical point alpha, if traininfo defined
void SetSeparationIndex(Float_t sep)
set the chosen index, measure of "purity" (separation between S and B) AT this node,...
void SetRMS(Float_t r)
set the RMS of the response of the node (for regression)
void IncrementNSigEvents_unweighted()
increment the sum of the signal weights in the node, if traininfo defined
void SetNBkgEvents_unboosted(Float_t b)
set the sum of the unboosted backgr events in the node, if traininfo defined
Float_t GetPurity(void) const
return S/(S+B) (purity) at this node (from training)
void IncrementNSigEvents(Float_t s)
increment the sum of the signal weights in the node, if traininfo defined
Float_t GetSampleMax(UInt_t ivar) const
return the maximum of variable ivar from the training sample that pass/end up in this node,...
void SetCutValue(Float_t c)
set the cut value applied at this node
Float_t GetNBkgEvents(void) const
return the sum of the backgr weights in the node, or -1 if traininfo undefined
Float_t GetSampleMin(UInt_t ivar) const
return the minimum of variable ivar from the training sample that pass/end up in this node,...
void SetSampleMin(UInt_t ivar, Float_t xmin)
set the minimum of variable ivar from the training sample that pass/end up in this node,...
void SetSelector(Short_t i)
set index of variable used for discrimination at this node
void SetNBkgEvents_unweighted(Float_t b)
set the sum of the unweighted backgr events in the node, if traininfo defined
void SetNSValidation(Double_t s)
set number of signal events from the pruning validation sample, if traininfo defined
void AddToSumTarget(Float_t t)
add to sum target, if traininfo defined
void SetNTerminal(Int_t n)
set number of terminal nodes in the subtree rooted here, if traininfo defined
void SetNEvents(Float_t nev)
set the number of events that entered the node (during training), if traininfo defined
Implementation of a Decision Tree.
UInt_t BuildTree(const EventConstList &eventSample, DecisionTreeNode *node=nullptr)
building the decision tree by recursively calling the splitting of one (root-) node into two daughter...
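A minimal sketch (not part of this file) of growing a tree from an already prepared event list; the function name GrowTree is hypothetical, and all configuration beyond the default constructor (depth limits, minimum node size, choice of separation criterion) is deliberately omitted:

#include <vector>
#include "TMVA/DecisionTree.h"
#include "TMVA/Event.h"

// Hypothetical example: build a tree from a vector of const Event pointers
// (the EventConstList typedef listed below) and return the node count.
UInt_t GrowTree(const std::vector<const TMVA::Event*>& events)
{
   TMVA::DecisionTree tree;          // default ctor: GiniIndex separation criterion
   return tree.BuildTree(events);    // recursive splitting, starting at the root node
}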
void FillTree(const EventList &eventSample)
fill the existing decision tree structure by filling events in from the top node and see where the...
void PruneNode(TMVA::DecisionTreeNode *node)
prune away the subtree below the node
void ApplyValidationSample(const EventConstList *validationSample) const
run the validation sample through the (pruned) tree and fill in the nodes the variables NSValidation ...
Double_t TrainNodeFull(const EventConstList &eventSample, DecisionTreeNode *node)
train a node by finding the single optimal cut for a single variable that best separates signal and b...
TMVA::DecisionTreeNode * GetEventNode(const TMVA::Event &e) const
get the pointer to the leaf node where a particular event ends up in... (used in gradient boosting)
void GetRandomisedVariables(Bool_t *useVariable, UInt_t *variableMap, UInt_t &nVars)
void SetParentTreeInNodes(Node *n=nullptr)
descend a tree to find all its leaf nodes, fill max depth reached in the tree at the same time.
void DescendTree(Node *n=nullptr)
descend a tree to find all its leaf nodes
static DecisionTree * CreateFromXML(void *node, UInt_t tmva_Version_Code=262657)
re-create a new tree (decision tree or search tree) from XML
std::vector< const TMVA::Event * > EventConstList
Double_t CheckEvent(const TMVA::Event *, Bool_t UseYesNoLeaf=kFALSE) const
the event e is put into the decision tree (starting at the root node) and the output is NodeType (sig...
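A short hedged sketch (not part of this file) of calling CheckEvent on a trained tree; the helper name Classify is hypothetical:

#include "TMVA/DecisionTree.h"
#include "TMVA/Event.h"

// Hypothetical helper: classify one event with a trained tree.
// With UseYesNoLeaf=kTRUE the output follows the leaf's node type,
// otherwise it follows the leaf value described above.
Double_t Classify(const TMVA::DecisionTree& tree, const TMVA::Event& event,
                  Bool_t useYesNoLeaf = kFALSE)
{
   return tree.CheckEvent(&event, useYesNoLeaf);
}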
static const Int_t fgRandomSeed
Int_t fNCuts
number of grid points in variable cut scans
UInt_t CleanTree(DecisionTreeNode *node=nullptr)
remove those last splits that result in two leaf nodes that are both of the type (i....
virtual ~DecisionTree(void)
destructor
Types::EAnalysisType fAnalysisType
kClassification(=0=false) or kRegression(=1=true)
std::vector< Double_t > GetVariableImportance()
Return the relative variable importance, normalized to all variables together having the importance 1...
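For illustration (not part of this file), a small sketch that prints the importance vector; the helper name PrintImportance is hypothetical:

#include <iostream>
#include <vector>
#include "TMVA/DecisionTree.h"

// Hypothetical helper: dump the per-variable importance of a trained tree.
void PrintImportance(TMVA::DecisionTree& tree)
{
   std::vector<Double_t> importance = tree.GetVariableImportance();
   for (UInt_t ivar = 0; ivar < importance.size(); ++ivar) {
      // values are normalised so that all variables together sum to 1
      std::cout << "variable " << ivar << " : " << importance[ivar] << std::endl;
   }
}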
void CheckEventWithPrunedTree(const TMVA::Event *) const
pass a single validation event through a pruned decision tree on the way down the tree,...
void PruneNodeInPlace(TMVA::DecisionTreeNode *node)
prune a node temporarily (without actually deleting its descendants which allows testing the pruned t...
Double_t TestPrunedTreeQuality(const DecisionTreeNode *dt=nullptr, Int_t mode=0) const
return the misclassification rate of a pruned tree; a "pruned tree" may have set the variable "IsTermi...
Double_t PruneTree(const EventConstList *validationSample=nullptr)
prune (get rid of internal nodes of) the decision tree to avoid overtraining; several different pruning m...
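A hedged sketch (not part of this file) of pruning with an explicit validation sample; the helper name PruneWithValidation is hypothetical, and the meaning of the returned Double_t is not spelled out on this page, so it is simply passed through:

#include <vector>
#include "TMVA/DecisionTree.h"
#include "TMVA/Event.h"

// Hypothetical helper: prune a trained tree using a validation sample
// (an EventConstList, i.e. std::vector<const TMVA::Event*>).
Double_t PruneWithValidation(TMVA::DecisionTree& tree,
                             const std::vector<const TMVA::Event*>& validationSample)
{
   return tree.PruneTree(&validationSample);
}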
void FillEvent(const TMVA::Event &event, TMVA::DecisionTreeNode *node)
fill the existing decision tree structure by filling events in from the top node and see where the...
void ClearTree()
clear the tree nodes (their S/N, Nevents etc), just keep the structure of the tree
Double_t SamplePurity(EventList eventSample)
calculates the purity S/(S+B) of a given event sample
Node * GetNode(ULong_t sequence, UInt_t depth)
retrieve node from the tree.
std::vector< Double_t > GetFisherCoefficients(const EventConstList &eventSample, UInt_t nFisherVars, UInt_t *mapVarInFisher)
calculate the fisher coefficients for the event sample and the variables used
UInt_t CountLeafNodes(TMVA::Node *n=nullptr)
return the number of terminal nodes in the sub-tree below Node n
Double_t TrainNodeFast(const EventConstList &eventSample, DecisionTreeNode *node)
Decide how to split a node using one of the variables that gives the best separation of signal/backgr...
RegressionVariance * fRegType
the separation criteria used in Regression
DecisionTree(void)
default constructor using the GiniIndex as separation criterion, no restrictions on minimum number of ...
Double_t GetSumWeights(const EventConstList *validationSample) const
calculate the normalization factor for a pruning validation sample
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is set or not.
Definition Event.cxx:389
UInt_t GetClass() const
Definition Event.h:86
A helper class to prune a decision tree using the expected error (C4.5) method.
IPruneTool - a helper interface class to prune a decision tree.
Definition IPruneTool.h:70
Node for the BinarySearch or Decision Trees.
Definition Node.h:58
UInt_t GetDepth() const
Definition Node.h:116
Calculate the "SeparationGain" for Regression analysis separation criteria used in various training a...
An interface to calculate the "SeparationGain" for different separation criteria used in various trai...
const TMatrixD * GetCorrelationMatrix(const TMatrixD *covMat)
turns covariance into correlation matrix
Definition Tools.cxx:324
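A minimal sketch (not part of this file) of turning a covariance matrix into a correlation matrix with the Tools helper listed here; the function name ToCorrelation is hypothetical:

#include "TMatrixD.h"
#include "TMVA/Tools.h"

// Hypothetical helper: element-wise, corr(i,j) = cov(i,j) / sqrt(cov(i,i)*cov(j,j)).
const TMatrixD* ToCorrelation(const TMatrixD& cov)
{
   return TMVA::gTools().GetCorrelationMatrix(&cov);
}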
void ReadAttr(void *node, const char *, T &value)
read attribute from xml
Definition Tools.h:329
std::vector< TMatrixDSym * > * CalcCovarianceMatrices(const std::vector< Event * > &events, Int_t maxCls, VariableTransformBase *transformBase=nullptr)
compute covariance matrices
Definition Tools.cxx:1513
Singleton class for Global types used by TMVA.
Definition Types.h:71
@ kClassification
Definition Types.h:127
@ kRegression
Definition Types.h:128
@ kDEBUG
Definition Types.h:56
@ kERROR
Definition Types.h:60
@ kWARNING
Definition Types.h:59
@ kFATAL
Definition Types.h:61
Random number generator class based on M. Matsumoto and T. Nishimura's Mersenne Twister.
Definition TRandom3.h:27
TClass * GetClass(T *)
Definition TClass.h:679
TSeq< unsigned int > TSeqU
Definition TSeq.hxx:204
Tools & gTools()
MsgLogger & Endl(MsgLogger &ml)
Definition MsgLogger.h:148
Short_t Max(Short_t a, Short_t b)
Returns the largest of a and b.
Definition TMathBase.h:251
Double_t Log(Double_t x)
Returns the natural logarithm of x.
Definition TMath.h:767
Double_t Sqrt(Double_t x)
Returns the square root of x.
Definition TMath.h:673
Short_t Min(Short_t a, Short_t b)
Returns the smallest of a and b.
Definition TMathBase.h:199
Short_t Abs(Short_t d)
Returns the absolute value of parameter Short_t d.
Definition TMathBase.h:124
BuildNodeInfo(Int_t fNvars, std::vector< Float_t > &inxmin, std::vector< Float_t > &inxmax)
std::vector< Float_t > xmin
BuildNodeInfo operator+(const BuildNodeInfo &other)
std::vector< Float_t > xmax
BuildNodeInfo(Int_t fNvars, const TMVA::Event *evt)
std::vector< std::vector< Double_t > > target2
std::vector< std::vector< Double_t > > nSelB_unWeighted
std::vector< std::vector< Double_t > > nSelB
std::vector< std::vector< Double_t > > target
std::vector< std::vector< Double_t > > nSelS_unWeighted
TrainNodeInfo operator+(const TrainNodeInfo &other)
std::vector< std::vector< Double_t > > nSelS
TrainNodeInfo(Int_t cNvars_, UInt_t *nBins_)