TMVA::DecisionTree::DecisionTree():
   // ...
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   // ...
   fUseSearchTree (kFALSE),
   // ...
   fPruneMethod (kNoPruning),
   fNNodesBeforePruning (0),
   fNodePurityLimit (0.5),
   fRandomisedTree (kFALSE),
   // ...
   fUsePoissonNvars (kFALSE),
   // ...
   fAnalysisType (Types::kClassification),
   // ...
{
   // ...
}

//_______________________________________________________________________
// constructor specifying the separation type and the training parameters
TMVA::DecisionTree::DecisionTree( /* ... */ ):
   // ...
   fMinLinCorrForFisher (1),
   fUseExclusiveVars (kTRUE),
   // ...
   fMinNodeSize (minSize),
   // ...
   fPruneMethod (kNoPruning),
   fNNodesBeforePruning (0),
   fNodePurityLimit (purityLimit),
   fRandomisedTree (randomisedTree),
   fUseNvars (useNvars),
   fUsePoissonNvars (usePoissonNvars),
   // ...
   fMaxDepth (nMaxDepth),
   // ...
   fAnalysisType (Types::kClassification),
   fDataSetInfo (dataInfo)
{
   if (sepType == NULL) {
      // ...
      Log() << kWARNING << " You had chosen the training mode using optimal cuts, not\n"
            << " based on a grid of " << fNCuts << " by setting the option NCuts < 0\n"
            << " as this doesn't exist yet, I set it to " << fNCuts << " and use the grid"
            << Endl;
   }
   // ...
}
//_______________________________________________________________________
// copy constructor
TMVA::DecisionTree::DecisionTree( const DecisionTree &d ):
   // ...
   fUseFisherCuts (d.fUseFisherCuts),
   fMinLinCorrForFisher (d.fMinLinCorrForFisher),
   fUseExclusiveVars (d.fUseExclusiveVars),
   fSepType (d.fSepType),
   fRegType (d.fRegType),
   fMinSize (d.fMinSize),
   fMinNodeSize (d.fMinNodeSize),
   fMinSepGain (d.fMinSepGain),
   fUseSearchTree (d.fUseSearchTree),
   fPruneStrength (d.fPruneStrength),
   fPruneMethod (d.fPruneMethod),
   fNodePurityLimit (d.fNodePurityLimit),
   fRandomisedTree (d.fRandomisedTree),
   fUseNvars (d.fUseNvars),
   fUsePoissonNvars (d.fUsePoissonNvars),
   fMyTrandom (new TRandom3(fgRandomSeed)),
   fMaxDepth (d.fMaxDepth),
   fSigClass (d.fSigClass),
   // ...
   fAnalysisType (d.fAnalysisType),
   fDataSetInfo (d.fDataSetInfo)
//_______________________________________________________________________
// destructor
   if (fMyTrandom) delete fMyTrandom;
   if (fRegType) delete fRegType;
   // ...
//_______________________________________________________________________
// TMVA::DecisionTree::SetParentTreeInNodes: descend the tree to set the
// parent-tree pointer in all nodes, recording the maximum depth on the way.
   if (n == NULL) {
      Log() << kFATAL << "SetParentTreeInNodes: started with undefined ROOT node" << Endl;
      return;
   }
   // ...
   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   // ...
   if (this->GetLeftDaughter(n) != NULL) {
      this->SetParentTreeInNodes( this->GetLeftDaughter(n) );
   }
   if (this->GetRightDaughter(n) != NULL) {
      this->SetParentTreeInNodes( this->GetRightDaughter(n) );
   }
   if (n->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(n->GetDepth());
//_______________________________________________________________________
// TMVA::DecisionTree::CreateFromXML: re-create a tree (decision tree or
// search tree) from XML.
   std::string type("");
   // ...
   dt->ReadXML( node, tmva_Version_Code );
//_______________________________________________________________________
// TMVA::DecisionTree::BuildTree: build the decision tree by recursively
// splitting one (root) node into two daughter nodes.
   this->GetRoot()->SetPos('s');
   this->GetRoot()->SetDepth(0);
   this->GetRoot()->SetParentTree(this);
   fMinSize = fMinNodeSize/100. * eventSample.size();
   // ...
   Log() << kINFO << "The minimal node size MinNodeSize=" << fMinNodeSize
         << "% is translated to an actual number of events = " << fMinSize
         << " for the training sample size of " << eventSample.size() << Endl;
   Log() << kINFO << "Note: This number will be taken as absolute minimum in the node, " << Endl;
   Log() << kINFO << "      in terms of 'weighted events' and unweighted ones !! " << Endl;
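// --- Illustrative sketch, not part of DecisionTree.cxx: the MinNodeSize ---
// --- translation above as standalone arithmetic. E.g. MinNodeSize=5 (%) ---
// --- and 10000 training events give 5/100 * 10000 = 500 events.         ---
static double MinSizeFromPercent(double minNodeSizePercent, unsigned long nTrainingEvents)
{
   // mirrors: fMinSize = fMinNodeSize/100. * eventSample.size();
   return minNodeSizePercent / 100. * double(nTrainingEvents);
}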
   UInt_t nevents = eventSample.size();

   if (nevents > 0) {
      if (fNvars==0) fNvars = eventSample[0]->GetNVariables();
      fVariableImportance.resize(fNvars);
   }
   else Log() << kFATAL << ":<BuildTree> eventSample Size == 0 " << Endl;
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      xmin[ivar]=xmax[ivar]=0;
   }
   // ...
   for (UInt_t iev=0; iev<eventSample.size(); iev++) {
      // ...
      if ( DoRegression() ) {
         // ...
         target2+=weight*tgt*tgt;
      }
      // ...
      for (UInt_t ivar=0; ivar<fNvars; ivar++) {
         // ...
         if (iev==0) xmin[ivar]=xmax[ivar]=val;
         if (val < xmin[ivar]) xmin[ivar]=val;
         if (val > xmax[ivar]) xmax[ivar]=val;
      }
   }
      Log() << kWARNING << " One of the Decision Tree nodes has negative total number of signal or background events. "
            << "(Nsig=" << s << " Nbkg=" << b
            << " Probably you use a Monte Carlo with negative weights. That should in principle "
            << "be fine as long as on average you end up with something positive. For this you have to make sure that the "
            << "minimum number of (unweighted) events demanded for a tree node (currently you use: MinNodeSize=" << fMinNodeSize
            << "% of training events, you can set this via the BDT option string when booking the classifier) is large enough "
            << "to allow for reasonable averaging!!!" << Endl
            << " If this does not help ... maybe you want to try the option: NoNegWeightsInTraining which ignores events "
            << "with negative weight in the training." << Endl;
      // ...
      for (UInt_t i=0; i<eventSample.size(); i++) {
         if (eventSample[i]->GetClass() != fSigClass) {
            nBkg += eventSample[i]->GetWeight();
            Log() << kDEBUG << "Event " << i << " has (original) weight: "
                  << eventSample[i]->GetWeight()/eventSample[i]->GetBoostWeight()
                  << " boostWeight: " << eventSample[i]->GetBoostWeight() << Endl;
         }
      }
   if (node == this->GetRoot()) {
      // ...
   }
   // ...
   for (UInt_t ivar=0; ivar<fNvars; ivar++) {
      // ...
   }
   // ...
   // split the node only if there are enough events for two daughter nodes,
   // the maximum depth is not yet reached, and the sample is not yet pure:
   if ((eventSample.size() >= 2*fMinSize && s+b >= 2*fMinSize) && node->GetDepth() < fMaxDepth
       && ( ( s!=0 && b !=0 && !DoRegression()) || ( (s+b)!=0 && DoRegression()) ) ) {
      // ...
      separationGain = this->TrainNodeFast(eventSample, node);   // scan a grid of NCuts cut values
      // ...
      separationGain = this->TrainNodeFull(eventSample, node);   // scan all event values
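// --- Illustrative sketch, not part of DecisionTree.cxx: the stopping rule ---
// --- tested above, paraphrased with simplified names (an assumption-level ---
// --- rewrite, not TMVA API).                                              ---
static bool ShouldSplit(double s, double b, unsigned long nEvents,
                        unsigned int depth, unsigned int maxDepth,
                        double minSize, bool doRegression)
{
   if (nEvents < 2*minSize || s + b < 2*minSize) return false; // need enough events for two daughters
   if (depth >= maxDepth) return false;                        // depth limit reached
   // classification needs both classes still present; regression only nonzero total weight
   return doRegression ? (s + b) != 0 : (s != 0 && b != 0);
}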
      if (DoRegression()) {
         // ...
      }
      // ...
      if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
      std::vector<const TMVA::Event*> leftSample;  leftSample.reserve(nevents);
      std::vector<const TMVA::Event*> rightSample; rightSample.reserve(nevents);
      // ...
      Double_t nRightUnBoosted=0, nLeftUnBoosted=0;
      // ...
      for (UInt_t ie=0; ie< nevents ; ie++) {
         if (node->GoesRight(*eventSample[ie])) {
            rightSample.push_back(eventSample[ie]);
            nRight += eventSample[ie]->GetWeight();
            nRightUnBoosted += eventSample[ie]->GetOriginalWeight();
         }
         else {
            leftSample.push_back(eventSample[ie]);
            nLeft += eventSample[ie]->GetWeight();
            nLeftUnBoosted += eventSample[ie]->GetOriginalWeight();
         }
      }
      if (leftSample.empty() || rightSample.empty()) {
         Log() << kERROR << "<TrainNode> all events went to the same branch" << Endl
               << "--- Hence new node == old node ... check" << Endl
               << "--- left:" << leftSample.size()
               << " right:" << rightSample.size() << Endl
               << " while the separation is thought to be " << separationGain
               << kFATAL << "--- this should never happen, please write a bug report to Helge.Voss@cern.ch" << Endl;
      }
      if (DoRegression()) {
         // ...
      }
      // ...
   if (node->GetDepth() > this->GetTotalTreeDepth()) this->SetTotalTreeDepth(node->GetDepth());
//_______________________________________________________________________
// TMVA::DecisionTree::FillTree: fill the existing decision tree structure
// with the given event sample.
   for (UInt_t i=0; i<eventSample.size(); i++) {
      this->FillEvent(*(eventSample[i]), NULL);
   }
//_______________________________________________________________________
// TMVA::DecisionTree::FillEvent: fill the decision tree structure by sending
// an event in from the top node and seeing where it ends up.
      node = this->GetRoot();
   // ...
   if (event.GetClass() == fSigClass) {
      // ...
   }
   // ...
      this->FillEvent(event, dynamic_cast<TMVA::DecisionTreeNode*>(node->GetRight()));
   // ...
      this->FillEvent(event, dynamic_cast<TMVA::DecisionTreeNode*>(node->GetLeft()));
//_______________________________________________________________________
// TMVA::DecisionTree::ClearTree: clear the tree nodes (their S/N, Nevents
// etc.), keeping only the structure of the tree.
   if (this->GetRoot() != NULL) this->GetRoot()->ClearNodeAndAllDaughters();
//_______________________________________________________________________
// TMVA::DecisionTree::CleanTree: remove those last splits that result in two
// leaf nodes of the same type.
      node = this->GetRoot();
   // ...
      this->PruneNode(node);
   // ...
   return this->CountNodes();
//_______________________________________________________________________
// TMVA::DecisionTree::PruneTree: prune (remove internal nodes of) the
// decision tree to avoid overtraining; several pruning methods can be applied.
   if( fPruneMethod == kNoPruning ) return 0.0;

   if (fPruneMethod == kExpectedErrorPruning)
      // ...
   else if (fPruneMethod == kCostComplexityPruning)
      // ...
   else {
      Log() << kFATAL << "Selected pruning method not yet implemented " << Endl;
   }
   // ...
   if(!tool) return 0.0;
   // ...
   if(validationSample == NULL){
      Log() << kFATAL << "Cannot automate the pruning algorithm without an "
            << "independent validation sample!" << Endl;
   }
   else if(validationSample->size() == 0) {
      Log() << kFATAL << "Cannot automate the pruning algorithm with "
            << "independent validation sample of ZERO events!" << Endl;
   }
   // ...
      Log() << kFATAL << "Error pruning tree! Check prune.log for more information." << Endl;
   // ...
   return pruneStrength;
//_______________________________________________________________________
// TMVA::DecisionTree::ApplyValidationSample: run the validation sample through
// the (pruned) tree and fill in the nodes the variables NSValidation and
// NBValidation.
   GetRoot()->ResetValidationData();
   for (UInt_t ievt=0; ievt < validationSample->size(); ievt++) {
      CheckEventWithPrunedTree((*validationSample)[ievt]);
   }
//_______________________________________________________________________
// TMVA::DecisionTree::TestPrunedTreeQuality: return the misclassification rate
// of a pruned tree (nodes may be flagged "IsTerminal" without being deleted).
      Log() << kFATAL << "TestPrunedTreeQuality: started with undefined ROOT node" << Endl;
   // ...
      return (TestPrunedTreeQuality( n->GetLeft(), mode ) +
              TestPrunedTreeQuality( n->GetRight(), mode ));
   // ...
      if (DoRegression()) {
         // ...
      }
      // ...
         if (n->GetPurity() > this->GetNodePurityLimit())
            // ...
      // ...
      else if ( mode == 1 ) {
         // ...
      }
      else throw std::string("Unknown ValidationQualityMode");
//_______________________________________________________________________
// TMVA::DecisionTree::CheckEventWithPrunedTree: pass a single validation event
// through a pruned decision tree on its way down the tree.
   if (current == NULL) {
      Log() << kFATAL << "CheckEventWithPrunedTree: started with undefined ROOT node" << Endl;
   }

   while (current != NULL) {
      // ...
   }
//_______________________________________________________________________
// TMVA::DecisionTree::GetSumWeights: calculate the normalisation factor for a
// pruning validation sample.
   for( EventConstList::const_iterator it = validationSample->begin();
        it != validationSample->end(); ++it ) {
      sumWeights += (*it)->GetWeight();
   }
//_______________________________________________________________________
// TMVA::DecisionTree::CountLeafNodes: return the number of terminal nodes in
// the sub-tree below Node n.
      Log() << kFATAL << "CountLeafNodes: started with undefined ROOT node" << Endl;
   // ...
   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // ... (this is a leaf node)
   }
   // ...
   if (this->GetLeftDaughter(n) != NULL) {
      countLeafs += this->CountLeafNodes( this->GetLeftDaughter(n) );
   }
   if (this->GetRightDaughter(n) != NULL) {
      countLeafs += this->CountLeafNodes( this->GetRightDaughter(n) );
   }
//_______________________________________________________________________
// TMVA::DecisionTree::DescendTree: descend the tree to find all its leaf nodes.
      Log() << kFATAL << "DescendTree: started with undefined ROOT node" << Endl;
   // ...
   if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) == NULL) ) {
      // ... (this is a leaf node)
   }
   else if ((this->GetLeftDaughter(n) == NULL) && (this->GetRightDaughter(n) != NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else if ((this->GetLeftDaughter(n) != NULL) && (this->GetRightDaughter(n) == NULL) ) {
      Log() << kFATAL << " Node with only one daughter?? Something went wrong" << Endl;
      return;
   }
   else {
      if (this->GetLeftDaughter(n) != NULL) {
         this->DescendTree( this->GetLeftDaughter(n) );
      }
      if (this->GetRightDaughter(n) != NULL) {
         this->DescendTree( this->GetRightDaughter(n) );
      }
   }
//_______________________________________________________________________
// TMVA::DecisionTree::PruneNodeInPlace: prune a node temporarily, without
// deleting its descendants, which allows testing the pruned tree quality.
   if(node == NULL) return;
//_______________________________________________________________________
// TMVA::DecisionTree::GetNode: retrieve a node from the tree by its
// bit-encoded path ("sequence") and depth.
   Node* current = this->GetRoot();
   // ...
   for (UInt_t i = 0; i < depth; i++) {
      ULong_t tmp = 1 << i;
      if ( tmp & sequence) current = this->GetRightDaughter(current);
      else current = this->GetLeftDaughter(current);
   }
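// --- Illustrative sketch, not part of DecisionTree.cxx: bit i of          ---
// --- 'sequence' (counted from the least significant end) selects the      ---
// --- daughter at depth i, e.g. sequence 0b101 at depth 3 descends         ---
// --- right, left, right.                                                  ---
static const char* StepDirection(unsigned long sequence, unsigned int depthIndex)
{
   unsigned long mask = 1UL << depthIndex;   // the bit probed at this depth
   return (mask & sequence) ? "right" : "left";
}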
//_______________________________________________________________________
// TMVA::DecisionTree::GetRandomisedVariables: for a randomised tree, pick a
// random subset of the variables for the split search at this node.
   for (UInt_t ivar=0; ivar<fNvars; ivar++) useVariable[ivar]=kFALSE;
   // ... (with fUsePoissonNvars, useNvars is drawn as a Poisson fluctuation around fUseNvars)
   else useNvars = fUseNvars;

   UInt_t nSelectedVars = 0;
   while (nSelectedVars < useNvars) {
      Double_t bla = fMyTrandom->Rndm()*fNvars;
      useVariable[Int_t(bla)] = kTRUE;
      nSelectedVars = 0;
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         if (useVariable[ivar] == kTRUE) {
            mapVariable[nSelectedVars] = ivar;
            nSelectedVars++;
         }
      }
   }
   if (nSelectedVars != useNvars) { std::cout << "Bug in TrainNode - GetRandomisedVariables()... sorry" << std::endl; std::exit(1); }
//_______________________________________________________________________
// TMVA::DecisionTree::TrainNodeFast: decide how to split a node using the
// variable that gives the best separation of signal and background.
   Double_t separationGainTotal = -1, sepTmp;
   // ...
   for (UInt_t ivar=0; ivar <= fNvars; ivar++) {   // the extra slot is for a possible Fisher variable
      separationGain[ivar]=-1;
      // ...
   }
   // ...
   Int_t nTotS_unWeighted, nTotB_unWeighted;
   UInt_t nevents = eventSample.size();
   // ...
   std::vector<Double_t> fisherCoeff;
   // ...
   if (fRandomisedTree) {
      UInt_t tmp = fUseNvars;
      GetRandomisedVariables(useVariable, mapVariable, tmp);
   }
   else {
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVariable[ivar] = kTRUE;
         mapVariable[ivar] = ivar;
      }
   }
   useVariable[fNvars] = kFALSE;   // the Fisher variable is only enabled below
   if (fUseFisherCuts) {
      useVariable[fNvars] = kTRUE;   // an additional "variable": the Fisher discriminant
      // ...
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         useVarInFisher[ivar] = kFALSE;
         mapVarInFisher[ivar] = ivar;
      }

      std::vector<TMatrixDSym*>* covMatrices;
      // ...
         Log() << kWARNING << " in TrainNodeFast, the covariance Matrices needed for the Fisher-Cuts returned error --> revert to just normal cuts for this node" << Endl;
      // ...
      // use in the Fisher discriminant only variables with linear correlations above fMinLinCorrForFisher
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         for (UInt_t jvar=ivar+1; jvar < fNvars; jvar++) {
            if ( (TMath::Abs( (*s)(ivar, jvar)) > fMinLinCorrForFisher) ||
                 (TMath::Abs( (*b)(ivar, jvar)) > fMinLinCorrForFisher) ){
               useVarInFisher[ivar] = kTRUE;
               useVarInFisher[jvar] = kTRUE;
            }
         }
      }
      // ...
      UInt_t nFisherVars = 0;
      for (UInt_t ivar=0; ivar < fNvars; ivar++) {
         if (useVarInFisher[ivar] && useVariable[ivar]) {
            mapVarInFisher[nFisherVars++] = ivar;
            // with exclusive variables, variables used in the Fisher cut are not scanned again individually
            if (fUseExclusiveVars) useVariable[ivar] = kFALSE;
         }
      }

      fisherCoeff = this->GetFisherCoefficients(eventSample, nFisherVars, mapVarInFisher);
      // ...
      delete [] useVarInFisher;
      delete [] mapVarInFisher;
   }
   // ...
   UInt_t cNvars = fNvars;
   if (fUseFisherCuts && fisherOK) cNvars++;   // the Fisher discriminant becomes one more cut variable
   for (UInt_t ivar=0; ivar<cNvars; ivar++) {
      nBins[ivar] = fNCuts+1;
      if (ivar < fNvars) {
         if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') {
            // ... (integer variables are binned with unit steps)
         }
      }

      nSelS[ivar] = new Double_t [nBins[ivar]];
      nSelB[ivar] = new Double_t [nBins[ivar]];
      nSelS_unWeighted[ivar] = new Double_t [nBins[ivar]];
      nSelB_unWeighted[ivar] = new Double_t [nBins[ivar]];
      target[ivar] = new Double_t [nBins[ivar]];
      target2[ivar] = new Double_t [nBins[ivar]];
      cutValues[ivar] = new Double_t [nBins[ivar]];
   }
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      // ...
         useVariable[ivar]=kFALSE;   // the variable is useless if all events have the same value
      // ...
         for (UInt_t iev=0; iev<nevents; iev++) {
            // the Fisher variable: evaluate the linear combination for this event
            Double_t result = fisherCoeff[fNvars];
            for (UInt_t jvar=0; jvar<fNvars; jvar++)
               result += fisherCoeff[jvar]*(eventSample[iev])->GetValue(jvar);
            if (result > xmax[ivar]) xmax[ivar]=result;
            if (result < xmin[ivar]) xmin[ivar]=result;
         }
      // ...
      for (UInt_t ibin=0; ibin<nBins[ivar]; ibin++) {
         nSelS[ivar][ibin]=0;
         nSelB[ivar][ibin]=0;
         nSelS_unWeighted[ivar][ibin]=0;
         nSelB_unWeighted[ivar][ibin]=0;
         target[ivar][ibin]=0;
         target2[ivar][ibin]=0;
         cutValues[ivar][ibin]=0;
      }
   }
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      // ...
      if ( useVariable[ivar] ) {
         // set the grid of cut candidates: equal-width steps between xmin and xmax
         Double_t istepSize = ( xmax[ivar] - xmin[ivar] ) / Double_t(nBins[ivar]);
         if (ivar < fNvars) {
            if (fDataSetInfo->GetVariableInfo(ivar).GetVarType() == 'I') istepSize = 1;
         }
         // ...
         for (UInt_t icut=0; icut<nBins[ivar]-1; icut++) {
            cutValues[ivar][icut]=xmin[ivar]+(Double_t(icut+1))*istepSize;
         }
      }
   }
   // ...
   nTotS_unWeighted=0; nTotB_unWeighted=0;
   for (UInt_t iev=0; iev<nevents; iev++) {
      Double_t eventWeight = eventSample[iev]->GetWeight();
      if (eventSample[iev]->GetClass() == fSigClass) {
         // ... (accumulate nTotS and nTotS_unWeighted; otherwise nTotB and nTotB_unWeighted)
      }
      // ...
      for (UInt_t ivar=0; ivar < cNvars; ivar++) {
         // ...
         if ( useVariable[ivar] ) {
            Double_t eventData;
            if (ivar < fNvars) eventData = eventSample[iev]->GetValue(ivar);
            else {   // the Fisher discriminant for this event
               eventData = fisherCoeff[fNvars];
               for (UInt_t jvar=0; jvar<fNvars; jvar++)
                  eventData += fisherCoeff[jvar]*(eventSample[iev])->GetValue(jvar);
            }
            // ...
            // find the bin this event falls into and count it there
            iBin = TMath::Min(Int_t(nBins[ivar]-1),
                              TMath::Max(0, int (nBins[ivar]*(eventData-xmin[ivar])/(xmax[ivar]-xmin[ivar]) ) ));
            if (eventSample[iev]->GetClass() == fSigClass) {
               nSelS[ivar][iBin]+=eventWeight;
               nSelS_unWeighted[ivar][iBin]++;
            }
            else {
               nSelB[ivar][iBin]+=eventWeight;
               nSelB_unWeighted[ivar][iBin]++;
            }
            if (DoRegression()) {
               target[ivar][iBin] +=eventWeight*eventSample[iev]->GetTarget(0);
               target2[ivar][iBin]+=eventWeight*eventSample[iev]->GetTarget(0)*eventSample[iev]->GetTarget(0);
            }
         }
      }
   }
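// --- Illustrative sketch, not part of DecisionTree.cxx: the equal-width   ---
// --- bin index used above, in isolation: nBins*(x-xmin)/(xmax-xmin),      ---
// --- clamped to the valid range [0, nBins-1].                             ---
static int BinIndex(double x, double xmin, double xmax, int nBins)
{
   int iBin = int(nBins * (x - xmin) / (xmax - xmin));
   if (iBin < 0)         iBin = 0;
   if (iBin > nBins - 1) iBin = nBins - 1;
   return iBin;
}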
   // turn the bin contents into cumulative distributions
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar]) {
         for (UInt_t ibin=1; ibin < nBins[ivar]; ibin++) {
            nSelS[ivar][ibin]+=nSelS[ivar][ibin-1];
            nSelS_unWeighted[ivar][ibin]+=nSelS_unWeighted[ivar][ibin-1];
            nSelB[ivar][ibin]+=nSelB[ivar][ibin-1];
            nSelB_unWeighted[ivar][ibin]+=nSelB_unWeighted[ivar][ibin-1];
            if (DoRegression()) {
               target[ivar][ibin] +=target[ivar][ibin-1];
               target2[ivar][ibin]+=target2[ivar][ibin-1];
            }
         }
         if (nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1] != eventSample.size()) {
            Log() << kFATAL << "Helge, you have a bug ....nSelS_unw..+nSelB_unw..= "
                  << nSelS_unWeighted[ivar][nBins[ivar]-1] +nSelB_unWeighted[ivar][nBins[ivar]-1]
                  << " while eventsample size = " << eventSample.size()
                  << Endl;
         }
         double lastBins=nSelS[ivar][nBins[ivar]-1] +nSelB[ivar][nBins[ivar]-1];
         double totalSum=nTotS+nTotB;
         if (TMath::Abs(lastBins-totalSum)/totalSum>0.01) {
            Log() << kFATAL << "Helge, you have another bug ....nSelS+nSelB= "
                  << lastBins
                  << " while total number of events = " << totalSum
                  << Endl;
         }
      }
   }
   // now select the optimal cut for each variable by scanning all bin boundaries
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar]) {
         for (UInt_t iBin=0; iBin<nBins[ivar]-1; iBin++) {
            // ...
            // event counts left of this cut candidate (the right side follows by subtraction)
            Double_t sl = nSelS_unWeighted[ivar][iBin];
            Double_t bl = nSelB_unWeighted[ivar][iBin];
            // ... (sr, br and the weighted counterparts slW, blW, srW, brW are derived analogously)
            // require the minimum number of events on both sides of the cut
            if ( ((sl+bl)>=fMinSize && (sr+br)>=fMinSize)
                 && ((slW+blW)>=fMinSize && (srW+brW)>=fMinSize)
                 ) {
               if (DoRegression()) {
                  sepTmp = fRegType->GetSeparationGain(nSelS[ivar][iBin]+nSelB[ivar][iBin],
                                                       target[ivar][iBin], target2[ivar][iBin],
                                                       nTotS+nTotB,
                                                       target[ivar][nBins[ivar]-1], target2[ivar][nBins[ivar]-1]);
               }
               else {
                  sepTmp = fSepType->GetSeparationGain(nSelS[ivar][iBin], nSelB[ivar][iBin], nTotS, nTotB);
               }
               if (separationGain[ivar] < sepTmp) {
                  separationGain[ivar] = sepTmp;
                  cutIndex[ivar] = iBin;
               }
            }
         }
      }
   }
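// --- Illustrative sketch, not part of DecisionTree.cxx: a separation gain  ---
// --- in the style of fSepType->GetSeparationGain(), written out for the    ---
// --- Gini index p*(1-p). TMVA's SeparationBase may normalise differently;  ---
// --- this only shows the idea.                                             ---
static double GiniIndexOf(double s, double b)
{
   double n = s + b;
   return (n > 0) ? (s / n) * (b / n) : 0.0;   // p*(1-p) with p = s/(s+b)
}

static double GiniSeparationGain(double sLeft, double bLeft, double sTot, double bTot)
{
   double sRight = sTot - sLeft, bRight = bTot - bLeft;
   double nTot = sTot + bTot, nLeft = sLeft + bLeft, nRight = nTot - nLeft;
   if (nTot <= 0) return 0.0;
   // parent impurity minus the weighted impurities of the two daughters
   return GiniIndexOf(sTot, bTot)
          - (nLeft  / nTot) * GiniIndexOf(sLeft,  bLeft)
          - (nRight / nTot) * GiniIndexOf(sRight, bRight);
}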
   // pick the variable with the overall best separation gain
   Int_t mxVar = -1;
   for (UInt_t ivar=0; ivar < cNvars; ivar++) {
      if (useVariable[ivar] ) {
         if (separationGainTotal < separationGain[ivar]) {
            separationGainTotal = separationGain[ivar];
            mxVar = ivar;
         }
      }
   }
   if (mxVar >= 0) {
      if (DoRegression()) {
         node->SetSeparationIndex(fRegType->GetSeparationIndex(nTotS+nTotB, target[0][nBins[mxVar]-1], target2[0][nBins[mxVar]-1]));
         node->SetResponse(target[0][nBins[mxVar]-1]/(nTotS+nTotB));
         // ...
         node->SetRMS(TMath::Sqrt(target2[0][nBins[mxVar]-1]/(nTotS+nTotB)
                                  - target[0][nBins[mxVar]-1]/(nTotS+nTotB)*target[0][nBins[mxVar]-1]/(nTotS+nTotB)));
      }
      else {
         // cut type: kTRUE if the cut selects mostly signal on the "right"
         Bool_t cutType = kFALSE;
         if (nSelS[mxVar][cutIndex[mxVar]]/nTotS > nSelB[mxVar][cutIndex[mxVar]]/nTotB) cutType = kTRUE;
         // ...
      }
      node->SetCutValue(cutValues[mxVar][cutIndex[mxVar]]);
      // ...
      if (mxVar < (Int_t) fNvars){   // an individual input variable was chosen
         // ...
         fVariableImportance[mxVar] += separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB);
      }
      else {   // the Fisher discriminant was chosen: spread its importance over the input variables
         // ...
         for (UInt_t ivar=0; ivar<=fNvars; ivar++) {
            // ...
            fVariableImportance[ivar] += fisherCoeff[ivar]*fisherCoeff[ivar]*separationGainTotal*separationGainTotal * (nTotS+nTotB) * (nTotS+nTotB);
         }
      }
   }
   else {
      separationGainTotal = 0;   // no cut improved the separation: this node stays a leaf
   }
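// --- Illustrative sketch, not part of DecisionTree.cxx: the RMS stored in  ---
// --- the regression branch above is the usual sqrt(<t^2> - <t>^2) with     ---
// --- weighted sums (assumes <cmath>).                                      ---
static double ResponseRMS(double sumW, double sumT, double sumT2)
{
   double mean = sumT / sumW;                    // <t>
   return std::sqrt(sumT2 / sumW - mean*mean);   // sqrt(<t^2> - <t>^2)
}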
   // clean up the helper arrays
   for (UInt_t i=0; i<cNvars; i++) {
      delete [] nSelS[i];
      delete [] nSelB[i];
      delete [] nSelS_unWeighted[i];
      delete [] nSelB_unWeighted[i];
      delete [] target[i];
      delete [] target2[i];
      delete [] cutValues[i];
   }
   delete [] nSelS;
   delete [] nSelB;
   delete [] nSelS_unWeighted;
   delete [] nSelB_unWeighted;
   // ...
   delete [] cutValues;
   // ...
   delete [] useVariable;
   delete [] mapVariable;
   // ...
   delete [] separationGain;
   // ...
   return separationGainTotal;
//_______________________________________________________________________
// TMVA::DecisionTree::GetFisherCoefficients: calculate the Fisher coefficients
// for the event sample and the variables used.
   std::vector<Double_t> fisherCoeff(fNvars+1);
   // ...
   // get the mean values of the variables for signal and background
   for (UInt_t ivar=0; ivar<nFisherVars; ivar++) { sumS[ivar] = sumB[ivar] = 0; }

   UInt_t nevents = eventSample.size();
   // ...
   for (UInt_t ievt=0; ievt<nevents; ievt++) {
      // ...
      const Event * ev = eventSample[ievt];
      // ...
      Double_t weight = ev->GetWeight();
      if (ev->GetClass() == fSigClass) sumOfWeightsS += weight;
      else sumOfWeightsB += weight;
      // ...
      Double_t* sum = (ev->GetClass() == fSigClass) ? sumS : sumB;   // accumulate into this class's sums
      for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
         sum[ivar] += ev->GetValue( mapVarInFisher[ivar] )*weight;
      }
   }
   for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
      // column 2 holds the overall mean, columns 0 and 1 the signal and background means
      (*meanMatx)( ivar, 2 ) = sumS[ivar];
      (*meanMatx)( ivar, 0 ) = sumS[ivar]/sumOfWeightsS;

      (*meanMatx)( ivar, 2 ) += sumB[ivar];
      (*meanMatx)( ivar, 1 ) = sumB[ivar]/sumOfWeightsB;
      // ...
      (*meanMatx)( ivar, 2 ) /= (sumOfWeightsS + sumOfWeightsB);
   }
   // ...
   assert( sumOfWeightsS > 0 && sumOfWeightsB > 0 );
   // the matrix of covariance WITHIN each class
   const Int_t nFisherVars2 = nFisherVars*nFisherVars;
   // ...
   memset(sum2Sig, 0, nFisherVars2*sizeof(Double_t));
   memset(sum2Bgd, 0, nFisherVars2*sizeof(Double_t));

   for (UInt_t ievt=0; ievt<nevents; ievt++) {
      // ...
      const Event* ev = eventSample.at(ievt);
      // ... (x and y index the variable pair, k runs over the pairs)
      if ( ev->GetClass() == fSigClass ) sum2Sig[k] += ( (xval[x] - (*meanMatx)(x, 0))*(xval[y] - (*meanMatx)(y, 0)) )*weight;
      else                               sum2Bgd[k] += ( (xval[x] - (*meanMatx)(x, 1))*(xval[y] - (*meanMatx)(y, 1)) )*weight;
   }
   // ...
   (*with)(x, y) = sum2Sig[k]/sumOfWeightsS + sum2Bgd[k]/sumOfWeightsB;
   // the matrix of covariance BETWEEN the classes
   prodSig = ( ((*meanMatx)(x, 0) - (*meanMatx)(x, 2)) *
               ((*meanMatx)(y, 0) - (*meanMatx)(y, 2)) );
   prodBgd = ( ((*meanMatx)(x, 1) - (*meanMatx)(x, 2)) *
               ((*meanMatx)(y, 1) - (*meanMatx)(y, 2)) );
   // ...
   (*betw)(x, y) = (sumOfWeightsS*prodSig + sumOfWeightsB*prodBgd) / (sumOfWeightsS + sumOfWeightsB);
   // ...
   // total covariance: within-class plus between-class scatter
   (*cov)(x, y) = (*with)(x, y) + (*betw)(x, y);
      Log() << kWARNING << "FisherCoeff matrix is almost singular with determinant="
            // ...
            << " did you use variables that are linear combinations or highly correlated?"
            << Endl;
   // ...
      Log() << kFATAL << "FisherCoeff matrix is singular with determinant="
            // ...
            << " did you use variables that are linear combinations?"
            << Endl;
   Double_t xfact = TMath::Sqrt( sumOfWeightsS*sumOfWeightsB ) / (sumOfWeightsS + sumOfWeightsB);

   // Fisher coefficients
   std::vector<Double_t> diffMeans( nFisherVars );

   for (UInt_t ivar=0; ivar<=fNvars; ivar++) fisherCoeff[ivar] = 0;
   for (UInt_t ivar=0; ivar<nFisherVars; ivar++) {
      for (UInt_t jvar=0; jvar<nFisherVars; jvar++) {
         Double_t d = (*meanMatx)(jvar, 0) - (*meanMatx)(jvar, 1);
         fisherCoeff[mapVarInFisher[ivar]] += invCov(ivar, jvar)*d;
      }
      // ...
      // rescale the projection
      fisherCoeff[mapVarInFisher[ivar]] *= xfact;
   }
   // offset: centre the projected signal and background means around zero
   Double_t f0 = 0.0;
   for (UInt_t ivar=0; ivar<nFisherVars; ivar++){
      f0 += fisherCoeff[mapVarInFisher[ivar]]*((*meanMatx)(ivar, 0) + (*meanMatx)(ivar, 1));
   }
   f0 /= -2.0;

   fisherCoeff[fNvars] = f0;
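// --- Illustrative sketch, not part of DecisionTree.cxx: the computation    ---
// --- above is w = invCov * (mu_S - mu_B), rescaled by xfact, plus an       ---
// --- offset f0 that centres the projected class means. Reduced to one      ---
// --- variable (so the inverse covariance is 1/withinVar; assumes <cmath>;  ---
// --- the offset convention is my reading of the code above).               ---
static void Fisher1D(double muS, double muB, double withinVar,
                     double wS, double wB, double& coeff, double& offset)
{
   double xfact = std::sqrt(wS * wB) / (wS + wB);
   coeff  = (muS - muB) / withinVar * xfact;
   offset = -0.5 * coeff * (muS + muB);
}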
//_______________________________________________________________________
// TMVA::DecisionTree::TrainNodeFull: find the optimal cut by scanning the
// actual event values instead of a fixed grid of cut candidates.
   Double_t nTotS = 0.0, nTotB = 0.0;
   Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;

   std::vector<TMVA::BDTEventWrapper> bdtEventSample;

   // one optimal cut value, separation gain and cut type per variable
   std::vector<Double_t> lCutValue( fNvars, 0.0 );
   std::vector<Double_t> lSepGain( fNvars, -1.0e6 );
   std::vector<Char_t> lCutType( fNvars );
   // ...
   for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
      if((*it)->GetClass() == fSigClass) {
         nTotS += (*it)->GetWeight();
         ++nTotS_unWeighted;
      }
      else {
         nTotB += (*it)->GetWeight();
         ++nTotB_unWeighted;
      }
      // ...
   }

   std::vector<Char_t> useVariable(fNvars);
   // ...
   if (fRandomisedTree) {
      if (fUseNvars == 0) {
         // ...
      }
      Int_t nSelectedVars = 0;
      while (nSelectedVars < fUseNvars) {
         Double_t bla = fMyTrandom->Rndm()*fNvars;
         useVariable[Int_t(bla)] = Char_t(kTRUE);
         nSelectedVars = 0;
         for (UInt_t ivar=0; ivar < fNvars; ivar++) {
            if (useVariable[ivar] == Char_t(kTRUE)) nSelectedVars++;
         }
      }
   }
   for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {   // loop over the discriminating variables
      if(!useVariable[ivar]) continue;               // only optimise cuts on selected variables
      // ...
      std::sort( bdtEventSample.begin(), bdtEventSample.end() );   // sort the events by this variable's value
      // ...
      Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
      std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
      for( ; it != it_end; ++it ) {
         if((**it)->GetClass() == fSigClass )   // accumulate signal and background weights from the left
            sigWeightCtr += (**it)->GetWeight();
         else
            bkgWeightCtr += (**it)->GetWeight();
         // store the accumulated signal and background weights at this event
         it->SetCumulativeWeight(false, bkgWeightCtr);
         it->SetCumulativeWeight(true, sigWeightCtr);
      }
      // ...
      Double_t separationGain = -1.0, sepTmp = 0.0, cutValue = 0.0, dVal = 0.0, norm = 0.0;
      Int_t index = 0;
      // ...
      for( it = bdtEventSample.begin(); it != it_end; ++it ) {
         if( index == 0 ) { ++index; continue; }   // the first event offers no gap to cut in
         if( *(*it) == NULL ) {
            Log() << kFATAL << "In TrainNodeFull(): have a null event! Where index="
                  << index << ", and parent node=" << node->GetParent() << Endl;
         }
         dVal = bdtEventSample[index].GetVal() - bdtEventSample[index-1].GetVal();
         norm = TMath::Abs(bdtEventSample[index].GetVal() + bdtEventSample[index-1].GetVal());
         // only allow splits where both daughter nodes keep the required minimum
         // number of events and where the two neighbouring values actually differ
         if( index >= fMinSize && (nTotS_unWeighted + nTotB_unWeighted) - index >= fMinSize
             && TMath::Abs(dVal/(0.5*norm + 1)) > fPMin ) {
            sepTmp = fSepType->GetSeparationGain( it->GetCumulativeWeight(true), it->GetCumulativeWeight(false),
                                                  sigWeightCtr, bkgWeightCtr );
            if( sepTmp > separationGain ) {
               separationGain = sepTmp;
               cutValue = it->GetVal() - 0.5*dVal;   // cut in the middle of the gap
               Double_t nSelS = it->GetCumulativeWeight(true);
               Double_t nSelB = it->GetCumulativeWeight(false);
               // ...
               if( nSelS/sigWeightCtr > nSelB/bkgWeightCtr ) cutType = kTRUE;
               else cutType = kFALSE;
            }
         }
         ++index;
      }
      lCutType[ivar] = Char_t(cutType);
      lCutValue[ivar] = cutValue;
      lSepGain[ivar] = separationGain;
   }
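// --- Illustrative sketch, not part of DecisionTree.cxx: the                ---
// --- sort-and-accumulate scan above with plain structs (assumes            ---
// --- <algorithm>, <vector>, <cstddef> and GiniSeparationGain from the      ---
// --- sketch further up; no fMinSize / fPMin guards, unlike the original).  ---
struct SketchEvent { double val, weight; bool isSignal; };

static double BestCutValue(std::vector<SketchEvent>& evs)
{
   std::sort(evs.begin(), evs.end(),
             [](const SketchEvent& a, const SketchEvent& b){ return a.val < b.val; });
   double sTot = 0, bTot = 0;
   for (const SketchEvent& e : evs) (e.isSignal ? sTot : bTot) += e.weight;
   double sLeft = 0, bLeft = 0, bestGain = -1, bestCut = 0;
   for (std::size_t i = 1; i < evs.size(); ++i) {
      const SketchEvent& prev = evs[i-1];
      (prev.isSignal ? sLeft : bLeft) += prev.weight;   // cumulative weights left of the gap
      double gain = GiniSeparationGain(sLeft, bLeft, sTot, bTot);
      if (gain > bestGain) {
         bestGain = gain;
         bestCut  = 0.5 * (prev.val + evs[i].val);      // cut halfway between neighbours
      }
   }
   return bestCut;
}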
   // find the best cut among all variables
   Double_t separationGain = -1.0;
   Int_t iVarIndex = -1;
   for( UInt_t ivar = 0; ivar < fNvars; ivar++ ) {
      if( lSepGain[ivar] > separationGain ) {
         iVarIndex = ivar;
         separationGain = lSepGain[ivar];
      }
   }

   if(iVarIndex >= 0) {
      // ... (set the node's selector, cut value and cut type)
      fVariableImportance[iVarIndex] += separationGain*separationGain * (nTotS+nTotB) * (nTotS+nTotB);
   }
   else {
      separationGain = 0.0;
   }

   return separationGain;
//_______________________________________________________________________
// TMVA::DecisionTree::CheckEvent: the event is sent through the tree, starting
// at the root node, and the output is the node type (signal/background) or,
// for regression, the response.
      Log() << kFATAL << "CheckEvent: started with undefined ROOT node" << Endl;
   // ...
      Log() << kFATAL << "DT::CheckEvent: inconsistent tree structure" << Endl;
   if ( DoRegression() ){
      // ...
   }
   // ...

//_______________________________________________________________________
// TMVA::DecisionTree::SamplePurity: calculate the purity S/(S+B) of a given
// event sample.
   Double_t sumsig=0, sumbkg=0, sumtot=0;
   for (UInt_t ievt=0; ievt<eventSample.size(); ievt++) {
      if (eventSample[ievt]->GetClass() != fSigClass) sumbkg+=eventSample[ievt]->GetWeight();
      else sumsig+=eventSample[ievt]->GetWeight();
      sumtot+=eventSample[ievt]->GetWeight();
   }
   // sanity check
   if (sumtot != (sumsig+sumbkg)){
      Log() << kFATAL << "<SamplePurity> sumtot != sumsig+sumbkg"
            << sumtot << " " << sumsig << " " << sumbkg << Endl;
   }
   if (sumtot>0) return sumsig/(sumsig + sumbkg);
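// --- Illustrative sketch, not part of DecisionTree.cxx: purity is just     ---
// --- the signal fraction, e.g. S=300 and B=100 give 300/400 = 0.75.        ---
static double Purity(double sumsig, double sumbkg)
{
   return (sumsig + sumbkg > 0) ? sumsig / (sumsig + sumbkg)
                                : -1;   // -1 is this sketch's sentinel for an empty sample
}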
//_______________________________________________________________________
// TMVA::DecisionTree::GetVariableImportance: return the relative variable
// importance, normalised so that all variables together have importance 1.
   std::vector<Double_t> relativeImportance(fNvars);
   Double_t sum=0;
   for (UInt_t i=0; i< fNvars; i++) {
      sum += fVariableImportance[i];
      relativeImportance[i] = fVariableImportance[i];
   }

   for (UInt_t i=0; i< fNvars; i++) {
      if (sum > 0)
         relativeImportance[i] /= sum;
      else
         relativeImportance[i] = 0;
   }
   return relativeImportance;
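// --- Illustrative worked example, not part of DecisionTree.cxx: raw        ---
// --- importances {4, 1, 0} sum to 5 and normalise to {0.8, 0.2, 0.0},      ---
// --- so all variables together carry importance 1.                         ---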
//_______________________________________________________________________
// TMVA::DecisionTree::GetVariableImportance(ivar): return the importance of a
// single variable.
   std::vector<Double_t> relativeImportance = this->GetVariableImportance();
   if (ivar < fNvars) return relativeImportance[ivar];
   else {
      Log() << kFATAL << "<GetVariableImportance>" << Endl
            << "--- ivar = " << ivar << " is out of range " << Endl;
   }