82 if (a<b) {
Int_t tmp =
a; a=
b; b=tmp; }
95 fVerboseLevel(TString(
"Info")),
96 fScaleWithPreselEff(0),
100 fLogger( new
MsgLogger(
"DataSetFactory", kINFO) )
109 std::vector<TTreeFormula*>::const_iterator formIt;
149 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"Build DataSet consisting of one Event with dynamically changing variables" <<
Endl;
159 std::vector<Float_t*>* evdyn =
new std::vector<Float_t*>(0);
163 if (varinfos.empty())
164 Log() << kFATAL <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"Dynamic data set cannot be built, since no variable informations are present. Apparently no variables have been set. This should not happen, please contact the TMVA authors." <<
Endl;
166 std::vector<VariableInfo>::iterator it = varinfos.begin(), itEnd=varinfos.end();
167 for (;it!=itEnd;++it) {
170 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"The link to the external variable is NULL while I am trying to build a dynamic data set. In this case fTmpEvent from MethodBase HAS TO BE USED in the method to get useful values in variables." <<
Endl;
171 else evdyn->push_back (external);
175 it = spectatorinfos.begin();
176 for (;it!=spectatorinfos.end();it++) evdyn->push_back( (
Float_t*)(*it).GetExternalLink() );
178 TMVA::Event * ev =
new Event((
const std::vector<Float_t*>*&)evdyn, varinfos.size());
179 std::vector<Event*>* newEventVector =
new std::vector<Event*>;
180 newEventVector->push_back(ev);
184 ds->SetCurrentEvent( 0 );
186 delete newEventVector;
203 std::vector< TString >* classList = dataInput.
GetClassList();
204 for (std::vector<TString>::iterator it = classList->begin(); it< classList->end(); it++) {
215 InitOptions( dsi, eventCounts, normMode, splitSeed, splitMode , mixMode );
221 splitMode, mixMode, normMode, splitSeed );
224 if (showCollectedOutput) {
245 const TString& expression,
251 Log() << kFATAL <<
"Expression " << expression.Data()
252 <<
" could not be resolved to a valid formula. " <<
Endl;
254 Log() << kWARNING <<
"Expression: " << expression.Data()
255 <<
" does not provide data for this event. " 256 <<
"This event is not taken into account. --> please check if you use as a variable " 257 <<
"an entry of an array which is not filled for some events " 258 <<
"(e.g. arr[4] when arr has only 3 elements)." <<
Endl;
259 Log() << kWARNING <<
"If you want to take the event into account you can do something like: " 260 <<
"\"Alt$(arr[4],0)\" where in cases where arr doesn't have a 4th element, " 261 <<
" 0 is taken as an alternative." <<
Endl;
264 if( expression.Contains(
"$") )
268 for (
int i = 0, iEnd = ttf->
GetNcodes (); i < iEnd; ++i)
288 TTree *tr = tinfo.
GetTree()->GetTree();
290 tr->SetBranchStatus(
"*",1);
295 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"transform input variables" <<
Endl;
296 std::vector<TTreeFormula*>::const_iterator formIt, formItEnd;
311 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"transform regression targets" <<
Endl;
324 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"transform spectator variables" <<
Endl;
338 for (formIt =
fCutFormulas.begin(), formItEnd =
fCutFormulas.end(); formIt!=formItEnd; formIt++)
if (*formIt)
delete *formIt;
342 const TString tmpCutExp(tmpCut.GetTitle());
358 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"transform weights" <<
Endl;
371 ttf =
new TTreeFormula(
"FormulaWeight", tmpWeight, tr );
387 tr->SetBranchStatus(
"*",0);
388 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"enable branches: input variables" <<
Endl;
397 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"enable branches: targets" <<
Endl;
404 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"enable branches: spectators" <<
Endl;
411 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"enable branches: cuts" <<
Endl;
419 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"enable branches: weights" <<
Endl;
427 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"tree initialized" <<
Endl;
447 for (
UInt_t ivar=0; ivar<nvar ; ivar++) { min[ivar] = FLT_MAX; max[ivar] = -FLT_MAX; }
448 for (
UInt_t ivar=0; ivar<ntgts; ivar++) { tgmin[ivar] = FLT_MAX; tgmax[ivar] = -FLT_MAX; }
449 for (
UInt_t ivar=0; ivar<nvis; ivar++) { vmin[ivar] = FLT_MAX; vmax[ivar] = -FLT_MAX; }
455 for (
UInt_t ivar=0; ivar<nvar; ivar++) {
457 if (v<min[ivar]) min[ivar] =
v;
458 if (v>max[ivar]) max[ivar] =
v;
460 for (
UInt_t itgt=0; itgt<ntgts; itgt++) {
462 if (v<tgmin[itgt]) tgmin[itgt] =
v;
463 if (v>tgmax[itgt]) tgmax[itgt] =
v;
465 for (
UInt_t ivis=0; ivis<nvis; ivis++) {
467 if (v<vmin[ivis]) vmin[ivis] =
v;
468 if (v>vmax[ivis]) vmax[ivis] =
v;
472 for (
UInt_t ivar=0; ivar<nvar; ivar++) {
475 if(
TMath::Abs(max[ivar]-min[ivar]) <= FLT_MIN )
478 for (
UInt_t ivar=0; ivar<ntgts; ivar++) {
481 if(
TMath::Abs(tgmax[ivar]-tgmin[ivar]) <= FLT_MIN )
484 for (
UInt_t ivar=0; ivar<nvis; ivar++) {
511 for (ivar=0; ivar<nvar; ivar++) {
512 for (jvar=0; jvar<nvar; jvar++) {
514 Double_t d = (*mat)(ivar, ivar)*(*mat)(jvar, jvar);
515 if (d > 0) (*mat)(ivar, jvar) /=
sqrt(d);
517 Log() << kWARNING <<
Form(
"Dataset[%s] : ",
DataSetInfo().
GetName())<<
"<GetCorrelationMatrix> Zero variances for variables " 518 <<
"(" << ivar <<
", " << jvar <<
") = " << d
520 (*mat)(ivar, jvar) = 0;
526 for (ivar=0; ivar<nvar; ivar++) (*mat)(ivar, ivar) = 1.0;
537 UInt_t ivar = 0, jvar = 0;
544 for (ivar=0; ivar<nvar; ivar++) {
546 for (jvar=0; jvar<nvar; jvar++) mat2(ivar, jvar) = 0;
554 if (ev->
GetClass() != classNumber )
continue;
559 for (ivar=0; ivar<nvar; ivar++) {
562 vec(ivar) += xi*weight;
563 mat2(ivar, ivar) += (xi*xi*weight);
565 for (jvar=ivar+1; jvar<nvar; jvar++) {
567 mat2(ivar, jvar) += (xi*xj*weight);
572 for (ivar=0; ivar<nvar; ivar++)
573 for (jvar=ivar+1; jvar<nvar; jvar++)
574 mat2(jvar, ivar) = mat2(ivar, jvar);
578 for (ivar=0; ivar<nvar; ivar++) {
579 for (jvar=0; jvar<nvar; jvar++) {
580 (*mat)(ivar, jvar) = mat2(ivar, jvar)/ic - vec(ivar)*vec(jvar)/(ic*ic);
602 splitSpecs.SetConfigDescription(
"Configuration options given in the \"PrepareForTrainingAndTesting\" call; these options define the creation of the data sets used for training and expert validation by TMVA" );
604 splitMode =
"Random";
605 splitSpecs.DeclareOptionRef( splitMode,
"SplitMode",
606 "Method of picking training and testing events (default: random)" );
607 splitSpecs.AddPreDefVal(TString(
"Random"));
608 splitSpecs.AddPreDefVal(TString(
"Alternate"));
609 splitSpecs.AddPreDefVal(TString(
"Block"));
611 mixMode =
"SameAsSplitMode";
612 splitSpecs.DeclareOptionRef( mixMode,
"MixMode",
613 "Method of mixing events of different classes into one dataset (default: SameAsSplitMode)" );
614 splitSpecs.AddPreDefVal(TString(
"SameAsSplitMode"));
615 splitSpecs.AddPreDefVal(TString(
"Random"));
616 splitSpecs.AddPreDefVal(TString(
"Alternate"));
617 splitSpecs.AddPreDefVal(TString(
"Block"));
620 splitSpecs.DeclareOptionRef( splitSeed,
"SplitSeed",
621 "Seed for random event shuffling" );
623 normMode =
"EqualNumEvents";
624 splitSpecs.DeclareOptionRef( normMode,
"NormMode",
625 "Overall renormalisation of event-by-event weights used in the training (NumEvents: average weight of 1 per event, independently for signal and background; EqualNumEvents: average weight of 1 per event for signal, and sum of weights for background equal to sum of weights for signal)" );
626 splitSpecs.AddPreDefVal(TString(
"None"));
627 splitSpecs.AddPreDefVal(TString(
"NumEvents"));
628 splitSpecs.AddPreDefVal(TString(
"EqualNumEvents"));
630 splitSpecs.DeclareOptionRef(
fScaleWithPreselEff=
kFALSE,
"ScaleWithPreselEff",
"Scale the number of requested events by the eff. of the preselection cuts (or not)" );
637 TString titleTrain = TString().Format(
"Number of training events of class %s (default: 0 = all)",clName.Data()).
Data();
638 TString titleTest = TString().Format(
"Number of test events of class %s (default: 0 = all)",clName.Data()).
Data();
639 TString titleSplit = TString().Format(
"Split in training and test events of class %s (default: 0 = deactivated)",clName.Data()).
Data();
641 splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTrainingEventsRequested, TString(
"nTrain_")+clName, titleTrain );
642 splitSpecs.DeclareOptionRef( nEventRequests.at(cl).nTestingEventsRequested , TString(
"nTest_")+clName , titleTest );
643 splitSpecs.DeclareOptionRef( nEventRequests.at(cl).TrainTestSplitRequested , TString(
"TrainTestSplit_")+clName , titleTest );
646 splitSpecs.DeclareOptionRef(
fVerbose,
"V",
"Verbosity (default: true)" );
648 splitSpecs.DeclareOptionRef(
fVerboseLevel=TString(
"Info"),
"VerboseLevel",
"VerboseLevel (Debug/Verbose/Info)" );
649 splitSpecs.AddPreDefVal(TString(
"Debug"));
650 splitSpecs.AddPreDefVal(TString(
"Verbose"));
651 splitSpecs.AddPreDefVal(TString(
"Info"));
653 splitSpecs.ParseOptions();
654 splitSpecs.CheckForUnusedOptions();
663 splitMode.ToUpper(); mixMode.ToUpper(); normMode.ToUpper();
666 <<
"\tSplitmode is: \"" << splitMode <<
"\" the mixmode is: \"" << mixMode <<
"\"" <<
Endl;
667 if (mixMode==
"SAMEASSPLITMODE") mixMode = splitMode;
668 else if (mixMode!=splitMode)
669 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"DataSet splitmode="<<splitMode
670 <<
" differs from mixmode="<<mixMode<<Endl;
694 for (
size_t i=0; i<nclasses; i++) {
695 eventCounts[i].varAvLength =
new Float_t[nvars];
696 for (
UInt_t ivar=0; ivar<nvars; ivar++)
697 eventCounts[i].varAvLength[ivar] = 0;
707 std::map<TString, int> nanInfWarnings;
708 std::map<TString, int> nanInfErrors;
712 for (
UInt_t cl=0; cl<nclasses; cl++) {
716 EventStats& classEventCounts = eventCounts[cl];
724 TString currentFileName(
"");
730 std::vector<Float_t> vars(nvars);
731 std::vector<Float_t> tgts(ntgts);
732 std::vector<Float_t> vis(nvis);
739 Bool_t isChain = (TString(
"TChain") == currentInfo.
GetTree()->ClassName());
740 currentInfo.
GetTree()->LoadTree(0);
748 for (
Long64_t evtIdx = 0; evtIdx < nEvts; evtIdx++) {
749 currentInfo.
GetTree()->LoadTree(evtIdx);
753 if (currentInfo.
GetTree()->GetTree()->GetDirectory()->GetFile()->GetName() != currentFileName) {
754 currentFileName = currentInfo.
GetTree()->GetTree()->GetDirectory()->GetFile()->GetName();
758 currentInfo.
GetTree()->GetEntry(evtIdx);
759 Int_t sizeOfArrays = 1;
760 Int_t prevArrExpr = 0;
765 for (
UInt_t ivar=0; ivar<nvars; ivar++) {
768 if (ndata == 1)
continue;
770 varIsArray[ivar] =
kTRUE;
771 if (sizeOfArrays == 1) {
772 sizeOfArrays =
ndata;
775 else if (sizeOfArrays!=ndata) {
776 Log() << kERROR <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"ERROR while preparing training and testing trees:" <<
Endl;
777 Log() <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
" multiple array-type expressions of different length were encountered" <<
Endl;
778 Log() <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
" location of error: event " << evtIdx
779 <<
" in tree " << currentInfo.
GetTree()->GetName()
780 <<
" of file " << currentInfo.
GetTree()->GetCurrentFile()->GetName() <<
Endl;
782 <<
Form(
"Dataset[%s] : ",dsi.
GetName()) << ndata <<
" entries, while" <<
Endl;
785 Log() << kFATAL <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"Need to abort" << Endl;
790 for (
Int_t idata = 0; idata<sizeOfArrays; idata++) {
793 auto checkNanInf = [&](std::map<TString, int> &msgMap,
Float_t value,
const char *what,
const char *formulaTitle) {
795 contains_NaN_or_inf =
kTRUE;
796 ++msgMap[
TString::Format(
"Dataset[%s] : %s expression resolves to indeterminate value (NaN): %s", dsi.
GetName(), what, formulaTitle)];
798 contains_NaN_or_inf =
kTRUE;
799 ++msgMap[
TString::Format(
"Dataset[%s] : %s expression resolves to infinite value (+inf or -inf): %s", dsi.
GetName(), what, formulaTitle)];
813 checkNanInf(nanInfErrors, cutVal,
"Cut", formula->
GetTitle());
817 auto &nanMessages = cutVal < 0.5 ? nanInfWarnings : nanInfErrors;
820 for (
UInt_t ivar=0; ivar<nvars; ivar++) {
823 vars[ivar] = (ndata == 1 ?
826 checkNanInf(nanMessages, vars[ivar],
"Input", formula->
GetTitle());
830 for (
UInt_t itrgt=0; itrgt<ntgts; itrgt++) {
833 tgts[itrgt] = (ndata == 1 ?
836 checkNanInf(nanMessages, tgts[itrgt],
"Target", formula->
GetTitle());
840 for (
UInt_t itVis=0; itVis<nvis; itVis++) {
843 vis[itVis] = (ndata == 1 ?
846 checkNanInf(nanMessages, vis[itVis],
"Spectator", formula->
GetTitle());
855 weight *= (ndata == 1 ?
858 checkNanInf(nanMessages, weight,
"Weight", formula->
GetTitle());
868 if (cutVal<0.5)
continue;
877 if (contains_NaN_or_inf) {
878 Log() << kWARNING <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"NaN or +-inf in Event " << evtIdx <<
Endl;
879 if (sizeOfArrays>1)
Log() << kWARNING <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
" rejected" <<
Endl;
889 event_v.push_back(
new Event(vars, tgts , vis, cl , weight));
892 currentInfo.
GetTree()->ResetBranchAddresses();
896 if (!nanInfWarnings.empty()) {
897 Log() << kWARNING <<
"Found events with NaN and/or +-inf values" <<
Endl;
898 for (
const auto &warning : nanInfWarnings) {
899 auto &
log =
Log() << kWARNING << warning.first;
900 if (warning.second > 1)
log <<
" (" << warning.second <<
" times)";
903 Log() << kWARNING <<
"These NaN and/or +-infs were all removed by the specified cut, continuing." <<
Endl;
907 if (!nanInfErrors.empty()) {
908 Log() << kWARNING <<
"Found events with NaN and/or +-inf values (not removed by cut)" <<
Endl;
909 for (
const auto &error : nanInfErrors) {
910 auto &
log =
Log() << kWARNING << error.first;
911 if (error.second > 1)
log <<
" (" << error.second <<
" times)";
914 Log() << kFATAL <<
"How am I supposed to train a NaN or +-inf?!" <<
Endl;
920 Log() << kHEADER <<
Form(
"[%s] : ",dsi.
GetName()) <<
"Number of events in input trees" <<
Endl;
921 Log() << kDEBUG <<
"(after possible flattening of arrays):" <<
Endl;
928 <<
" -- number of events : " 929 << std::setw(5) << eventCounts[cl].nEvBeforeCut
930 <<
" / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvBeforeCut <<
Endl;
936 <<
" tree -- total number of entries: " 942 <<
"\tPreselection: (will affect number of requested training and testing events)" <<
Endl;
945 <<
"\tPreselection: (will NOT affect number of requested training and testing events)" <<
Endl;
953 <<
" -- number of events passed: " 954 << std::setw(5) << eventCounts[cl].nEvAfterCut
955 <<
" / sum of weights: " << std::setw(5) << eventCounts[cl].nWeEvAfterCut <<
Endl;
958 <<
" -- efficiency : " 959 << std::setw(6) << eventCounts[cl].nWeEvAfterCut/eventCounts[cl].nWeEvBeforeCut <<
Endl;
963 <<
" No preselection cuts applied on event classes" <<
Endl;
976 const TString& splitMode,
977 const TString& mixMode,
978 const TString& normMode,
986 if (splitMode.Contains(
"RANDOM" ) ) {
990 if( ! unspecifiedEvents.empty() ) {
991 Log() << kDEBUG <<
"randomly shuffling " 992 << unspecifiedEvents.size()
993 <<
" events of class " << cls
994 <<
" which are not yet associated to testing or training" <<
Endl;
995 std::random_shuffle( unspecifiedEvents.begin(),
996 unspecifiedEvents.end(),
1003 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"SPLITTING ========" <<
Endl;
1005 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"---- class " << cls <<
Endl;
1006 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"check number of training/testing events, requested and available number of events and for class " << cls <<
Endl;
1013 Int_t availableTraining = eventVectorTraining.size();
1014 Int_t availableTesting = eventVectorTesting.size();
1015 Int_t availableUndefined = eventVectorUndefined.size();
1019 presel_scale = eventCounts[cls].cutScaling();
1020 if (presel_scale < 1)
1021 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
" you have opted for scaling the number of requested training/testing events\n to be scaled by the preselection efficiency"<<
Endl;
1024 if (eventCounts[cls].cutScaling() < 1)
1025 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
" you have opted for interpreting the requested number of training/testing events\n to be the number of events AFTER your preselection cuts" <<
Endl;
1032 if(eventCounts[cls].TrainTestSplitRequested < 1.0 && eventCounts[cls].TrainTestSplitRequested > 0.0){
1033 eventCounts[cls].nTrainingEventsRequested =
Int_t(eventCounts[cls].TrainTestSplitRequested*(availableTraining+availableTesting+availableUndefined));
1034 eventCounts[cls].nTestingEventsRequested =
Int_t(0);
1036 else if(eventCounts[cls].TrainTestSplitRequested != 0.0)
Log() << kFATAL <<
Form(
"The option TrainTestSplit_<class> has to be in range (0, 1] but is set to %f.",eventCounts[cls].TrainTestSplitRequested) <<
Endl;
1037 Int_t requestedTraining =
Int_t(eventCounts[cls].nTrainingEventsRequested * presel_scale);
1038 Int_t requestedTesting =
Int_t(eventCounts[cls].nTestingEventsRequested * presel_scale);
1040 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"events in training trees : " << availableTraining << Endl;
1041 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"events in testing trees : " << availableTesting << Endl;
1042 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"events in unspecified trees : " << availableUndefined << Endl;
1043 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"requested for training : " << requestedTraining << Endl;;
1046 Log() <<
" ( " << eventCounts[cls].nTrainingEventsRequested
1047 <<
" * " << presel_scale <<
" preselection efficiency)" <<
Endl;
1050 Log() << kDEBUG <<
"requested for testing : " << requestedTesting;
1052 Log() <<
" ( " << eventCounts[cls].nTestingEventsRequested
1053 <<
" * " << presel_scale <<
" preselection efficiency)" <<
Endl;
1104 Int_t useForTesting(0),useForTraining(0);
1105 Int_t allAvailable(availableUndefined + availableTraining + availableTesting);
1107 if( (requestedTraining == 0) && (requestedTesting == 0)){
1111 if ( availableUndefined >=
TMath::Abs(availableTraining - availableTesting) ) {
1113 useForTraining = useForTesting = allAvailable/2;
1116 useForTraining = availableTraining;
1117 useForTesting = availableTesting;
1118 if (availableTraining < availableTesting)
1119 useForTraining += availableUndefined;
1121 useForTesting += availableUndefined;
1123 requestedTraining = useForTraining;
1124 requestedTesting = useForTesting;
1127 else if (requestedTesting == 0){
1129 useForTraining =
TMath::Max(requestedTraining,availableTraining);
1130 if (allAvailable < useForTraining) {
1131 Log() << kFATAL <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"More events requested for training (" 1132 << requestedTraining <<
") than available (" 1133 << allAvailable <<
")!" << Endl;
1135 useForTesting = allAvailable - useForTraining;
1136 requestedTesting = useForTesting;
1139 else if (requestedTraining == 0){
1140 useForTesting =
TMath::Max(requestedTesting,availableTesting);
1141 if (allAvailable < useForTesting) {
1142 Log() << kFATAL <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"More events requested for testing (" 1143 << requestedTesting <<
") than available (" 1144 << allAvailable <<
")!" << Endl;
1146 useForTraining= allAvailable - useForTesting;
1147 requestedTraining = useForTraining;
1156 Int_t stillNeedForTraining =
TMath::Max(requestedTraining-availableTraining,0);
1157 Int_t stillNeedForTesting =
TMath::Max(requestedTesting-availableTesting,0);
1159 int NFree = availableUndefined - stillNeedForTraining - stillNeedForTesting;
1160 if (NFree <0) NFree = 0;
1161 useForTraining =
TMath::Max(requestedTraining,availableTraining) + NFree/2;
1162 useForTesting= allAvailable - useForTraining;
1165 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"determined event sample size to select training sample from="<<useForTraining<<Endl;
1166 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"determined event sample size to select test sample from="<<useForTesting<<Endl;
1171 if( splitMode ==
"ALTERNATE" ){
1172 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"split 'ALTERNATE'" << Endl;
1173 Int_t nTraining = availableTraining;
1174 Int_t nTesting = availableTesting;
1175 for( EventVector::iterator it = eventVectorUndefined.begin(), itEnd = eventVectorUndefined.end(); it != itEnd; ){
1177 if( nTraining <= requestedTraining ){
1178 eventVectorTraining.insert( eventVectorTraining.end(), (*it) );
1183 eventVectorTesting.insert( eventVectorTesting.end(), (*it) );
1188 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"split '" << splitMode <<
"'" << Endl;
1191 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"availableundefined : " << availableUndefined << Endl;
1192 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"useForTraining : " << useForTraining << Endl;
1193 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"useForTesting : " << useForTesting << Endl;
1194 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"availableTraining : " << availableTraining << Endl;
1195 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"availableTesting : " << availableTesting << Endl;
1197 if( availableUndefined<(useForTraining-availableTraining) ||
1198 availableUndefined<(useForTesting -availableTesting ) ||
1199 availableUndefined<(useForTraining+useForTesting-availableTraining-availableTesting ) ){
1200 Log() << kFATAL <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"More events requested than available!" << Endl;
1204 if (useForTraining>availableTraining){
1205 eventVectorTraining.insert( eventVectorTraining.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTraining- availableTraining );
1206 eventVectorUndefined.erase( eventVectorUndefined.begin(), eventVectorUndefined.begin() + useForTraining- availableTraining);
1208 if (useForTesting>availableTesting){
1209 eventVectorTesting.insert( eventVectorTesting.end() , eventVectorUndefined.begin(), eventVectorUndefined.begin()+ useForTesting- availableTesting );
1212 eventVectorUndefined.clear();
1215 if (splitMode.Contains(
"RANDOM" )){
1216 UInt_t sizeTraining = eventVectorTraining.size();
1217 if( sizeTraining >
UInt_t(requestedTraining) ){
1218 std::vector<UInt_t> indicesTraining( sizeTraining );
1222 std::random_shuffle( indicesTraining.begin(), indicesTraining.end(), rndm );
1224 indicesTraining.erase( indicesTraining.begin()+sizeTraining-
UInt_t(requestedTraining), indicesTraining.end() );
1226 for( std::vector<UInt_t>::iterator it = indicesTraining.begin(), itEnd = indicesTraining.end(); it != itEnd; ++it ){
1227 delete eventVectorTraining.at( (*it) );
1228 eventVectorTraining.at( (*it) ) =
NULL;
1231 eventVectorTraining.erase( std::remove( eventVectorTraining.begin(), eventVectorTraining.end(), (
void*)
NULL ), eventVectorTraining.end() );
1234 UInt_t sizeTesting = eventVectorTesting.size();
1235 if( sizeTesting >
UInt_t(requestedTesting) ){
1236 std::vector<UInt_t> indicesTesting( sizeTesting );
1240 std::random_shuffle( indicesTesting.begin(), indicesTesting.end(), rndm );
1242 indicesTesting.erase( indicesTesting.begin()+sizeTesting-
UInt_t(requestedTesting), indicesTesting.end() );
1244 for( std::vector<UInt_t>::iterator it = indicesTesting.begin(), itEnd = indicesTesting.end(); it != itEnd; ++it ){
1245 delete eventVectorTesting.at( (*it) );
1246 eventVectorTesting.at( (*it) ) =
NULL;
1249 eventVectorTesting.erase( std::remove( eventVectorTesting.begin(), eventVectorTesting.end(), (
void*)
NULL ), eventVectorTesting.end() );
1253 if( eventVectorTraining.size() <
UInt_t(requestedTraining) )
1254 Log() << kWARNING <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"DataSetFactory/requested number of training samples larger than size of eventVectorTraining.\n" 1255 <<
"There is probably an issue. Please contact the TMVA developers." << Endl;
1256 std::for_each( eventVectorTraining.begin()+requestedTraining, eventVectorTraining.end(), DeleteFunctor<Event>() );
1257 eventVectorTraining.erase(eventVectorTraining.begin()+requestedTraining,eventVectorTraining.end());
1259 if( eventVectorTesting.size() <
UInt_t(requestedTesting) )
1260 Log() << kWARNING <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"DataSetFactory/requested number of testing samples larger than size of eventVectorTesting.\n" 1261 <<
"There is probably an issue. Please contact the TMVA developers." << Endl;
1262 std::for_each( eventVectorTesting.begin()+requestedTesting, eventVectorTesting.end(), DeleteFunctor<Event>() );
1263 eventVectorTesting.erase(eventVectorTesting.begin()+requestedTesting,eventVectorTesting.end());
1269 Int_t trainingSize = 0;
1270 Int_t testingSize = 0;
1284 trainingEventVector->reserve( trainingSize );
1285 testingEventVector->reserve( testingSize );
1291 Log() << kDEBUG <<
" MIXING ============= " <<
Endl;
1293 if( mixMode ==
"ALTERNATE" ){
1298 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"Training sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1301 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"Testing sample: You are trying to mix events in alternate mode although the classes have different event numbers. This works but the alternation stops at the last event of the smaller class."<<Endl;
1304 typedef EventVector::iterator EvtVecIt;
1305 EvtVecIt itEvent, itEventEnd;
1308 Log() << kDEBUG <<
"insert class 0 into training and test vector" <<
Endl;
1310 testingEventVector->insert( testingEventVector->end(), tmpEventVector[
Types::kTesting].at(0).begin(), tmpEventVector[
Types::kTesting].at(0).end() );
1315 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"insert class " << cls << Endl;
1317 itTarget = trainingEventVector->begin() - 1;
1319 for( itEvent = tmpEventVector[
Types::kTraining].at(cls).begin(), itEventEnd = tmpEventVector[
Types::kTraining].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1321 if( (trainingEventVector->end() - itTarget) <
Int_t(cls+1) ) {
1322 itTarget = trainingEventVector->end();
1323 trainingEventVector->insert( itTarget, itEvent, itEventEnd );
1327 trainingEventVector->insert( itTarget, (*itEvent) );
1331 itTarget = testingEventVector->begin() - 1;
1333 for( itEvent = tmpEventVector[
Types::kTesting].at(cls).begin(), itEventEnd = tmpEventVector[
Types::kTesting].at(cls).end(); itEvent != itEventEnd; ++itEvent ){
1335 if( ( testingEventVector->end() - itTarget ) <
Int_t(cls+1) ) {
1336 itTarget = testingEventVector->end();
1337 testingEventVector->insert( itTarget, itEvent, itEventEnd );
1341 testingEventVector->insert( itTarget, (*itEvent) );
1358 trainingEventVector->insert( trainingEventVector->end(), tmpEventVector[
Types::kTraining].at(cls).begin(), tmpEventVector[
Types::kTraining].at(cls).end() );
1359 testingEventVector->insert ( testingEventVector->end(), tmpEventVector[
Types::kTesting].at(cls).begin(), tmpEventVector[
Types::kTesting].at(cls).end() );
1375 if (mixMode ==
"RANDOM") {
1376 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"shuffling events"<<Endl;
1378 std::random_shuffle( trainingEventVector->begin(), trainingEventVector->end(), rndm );
1379 std::random_shuffle( testingEventVector->begin(), testingEventVector->end(), rndm );
1382 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"trainingEventVector " << trainingEventVector->size() <<
Endl;
1383 Log() << kDEBUG <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"testingEventVector " << testingEventVector->size() <<
Endl;
1395 Log() << kFATAL <<
"Dataset " << std::string(dsi.
GetName()) <<
" does not have any training events, I better stop here and let you fix that one first " << Endl;
1399 Log() << kERROR <<
"Dataset " << std::string(dsi.
GetName()) <<
" does not have any testing events, guess that will cause problems later..but for now, I continue " << Endl;
1402 delete trainingEventVector;
1403 delete testingEventVector;
1420 const TString& normMode )
1427 Int_t trainingSize = 0;
1428 Int_t testingSize = 0;
1436 Double_t trainingSumSignalWeights = 0;
1437 Double_t trainingSumBackgrWeights = 0;
1438 Double_t testingSumSignalWeights = 0;
1439 Double_t testingSumBackgrWeights = 0;
1444 trainingSizePerClass.at(cls) = tmpEventVector[
Types::kTraining].at(cls).size();
1445 testingSizePerClass.at(cls) = tmpEventVector[
Types::kTesting].at(cls).size();
1447 trainingSize += trainingSizePerClass.back();
1448 testingSize += testingSizePerClass.back();
1462 trainingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[
Types::kTraining].at(cls).begin(),
1469 testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[
Types::kTesting].at(cls).begin(),
1477 trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1478 testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1480 trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1481 testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1501 if (normMode ==
"NONE") {
1502 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"No weight renormalisation applied: use original global and event weights" <<
Endl;
1508 else if (normMode ==
"NUMEVENTS") {
1510 <<
"\tWeight renormalisation mode: \"NumEvents\": renormalises all event classes " <<
Endl;
1512 <<
" such that the effective (weighted) number of events in each class equals the respective " <<
Endl;
1514 <<
" number of events (entries) that you demanded in PrepareTrainingAndTestTree(\"\",\"nTrain_Signal=.. )" <<
Endl;
1516 <<
" ... i.e. such that Sum[i=1..N_j]{w_i} = N_j, j=0,1,2..." <<
Endl;
1518 <<
" ... (note that N_j is the sum of TRAINING events (nTrain_j...with j=Signal,Background.." <<
Endl;
1520 <<
" ..... Testing events are not renormalised nor included in the renormalisation factor! )"<<
Endl;
1526 renormFactor.at(cls) = ((
Float_t)trainingSizePerClass.at(cls) )/
1527 (trainingSumWeightsPerClass.at(cls)) ;
1530 else if (normMode ==
"EQUALNUMEVENTS") {
1536 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"Weight renormalisation mode: \"EqualNumEvents\": renormalises all event classes ..." <<
Endl;
1537 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
" such that the effective (weighted) number of events in each class is the same " <<
Endl;
1538 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
" (and equals the number of events (entries) given for class=0 )" <<
Endl;
1539 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ..." <<
Endl;
1540 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
"... (note that N_j is the sum of TRAINING events" <<
Endl;
1541 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) <<
" ..... Testing events are not renormalised nor included in the renormalisation factor!)" <<
Endl;
1544 UInt_t referenceClass = 0;
1546 renormFactor.at(cls) =
Float_t(trainingSizePerClass.at(referenceClass))/
1547 (trainingSumWeightsPerClass.at(cls));
1551 Log() << kFATAL <<
Form(
"Dataset[%s] : ",dsi.
GetName())<<
"<PrepareForTrainingAndTesting> Unknown NormMode: " << normMode <<
Endl;
1559 <<
"--> Rescale " << setiosflags(ios::left) << std::setw(maxL)
1561 for (EventVector::iterator it = tmpEventVector[
Types::kTraining].at(cls).begin(),
1562 itEnd = tmpEventVector[
Types::kTraining].at(cls).end(); it != itEnd; ++it){
1563 (*it)->SetWeight ((*it)->GetWeight() * renormFactor.at(cls));
1574 <<
"Number of training and testing events" <<
Endl;
1575 Log() << kDEBUG <<
"\tafter rescaling:" <<
Endl;
1577 <<
"---------------------------------------------------------------------------" <<
Endl;
1579 trainingSumSignalWeights = 0;
1580 trainingSumBackgrWeights = 0;
1581 testingSumSignalWeights = 0;
1582 testingSumBackgrWeights = 0;
1586 trainingSumWeightsPerClass.at(cls) = (std::accumulate( tmpEventVector[
Types::kTraining].at(cls).begin(),
1593 testingSumWeightsPerClass.at(cls) = std::accumulate( tmpEventVector[
Types::kTesting].at(cls).begin(),
1602 trainingSumSignalWeights += trainingSumWeightsPerClass.at(cls);
1603 testingSumSignalWeights += testingSumWeightsPerClass.at(cls);
1605 trainingSumBackgrWeights += trainingSumWeightsPerClass.at(cls);
1606 testingSumBackgrWeights += testingSumWeightsPerClass.at(cls);
1612 << setiosflags(ios::left) << std::setw(maxL)
1614 <<
"training events : " << trainingSizePerClass.at(cls) <<
Endl;
1615 Log() << kDEBUG <<
"\t(sum of weights: " << trainingSumWeightsPerClass.at(cls) <<
")" 1616 <<
" - requested were " << eventCounts[cls].nTrainingEventsRequested <<
" events" <<
Endl;
1618 << setiosflags(ios::left) << std::setw(maxL)
1620 <<
"testing events : " << testingSizePerClass.at(cls) <<
Endl;
1621 Log() << kDEBUG <<
"\t(sum of weights: " << testingSumWeightsPerClass.at(cls) <<
")" 1622 <<
" - requested were " << eventCounts[cls].nTestingEventsRequested <<
" events" <<
Endl;
1624 << setiosflags(ios::left) << std::setw(maxL)
1626 <<
"training and testing events: " 1627 << (trainingSizePerClass.at(cls)+testingSizePerClass.at(cls)) << Endl;
1628 Log() << kDEBUG <<
"\t(sum of weights: " 1629 << (trainingSumWeightsPerClass.at(cls)+testingSumWeightsPerClass.at(cls)) <<
")" << Endl;
1630 if(eventCounts[cls].nEvAfterCut<eventCounts[cls].nEvBeforeCut) {
1631 Log() << kINFO <<
Form(
"Dataset[%s] : ",dsi.
GetName()) << setiosflags(ios::left) << std::setw(maxL)
1633 <<
"due to the preselection a scaling factor has been applied to the numbers of requested events: " 1634 << eventCounts[cls].cutScaling() <<
Endl;
virtual const char * GetName() const
Returns name of object.
A TLeaf describes individual elements of a TBranch. See TBranch structure in TTree.
UInt_t GetNVariables() const
std::vector< EventVector > EventVectorOfClasses
void SetTrainingSumBackgrWeights(Double_t trainingSumBackgrWeights)
MsgLogger & Endl(MsgLogger &ml)
const TString & GetInternalName() const
std::vector< VariableInfo > & GetSpectatorInfos()
std::vector< TTreeFormula * > fInputFormulas
std::vector< TTreeFormula * > fCutFormulas
std::vector< Double_t > ValuePerClass
UInt_t GetNVariables() const
access the number of variables through the datasetinfo
void SetTrainingSumSignalWeights(Double_t trainingSumSignalWeights)
void SetTestingSumBackgrWeights(Double_t testingSumBackgrWeights)
void BuildEventVector(DataSetInfo &dsi, DataInputHandler &dataInput, EventVectorOfClassesOfTreeType &eventsmap, EvtStatsPerClass &eventCounts)
build empty event vectors; distributes events between kTraining/kTesting/kMaxTreeType ...
void generate(R &r, TH1D *h)
UInt_t GetNClasses() const
void CalcMinMax(DataSet *, DataSetInfo &dsi)
compute minimum and maximum values for the data set
const TString & GetExpression() const
std::vector< int > NumberPerClass
std::map< Types::ETreeType, EventVectorOfClasses > EventVectorOfClassesOfTreeType
void InitOptions(DataSetInfo &dsi, EvtStatsPerClass &eventsmap, TString &normMode, UInt_t &splitSeed, TString &splitMode, TString &mixMode)
the dataset splitting
DataSet * BuildDynamicDataSet(DataSetInfo &)
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and returns a TString...
UInt_t GetNSpectators() const
access the number of spectators through the datasetinfo
MsgLogger & Log() const
message logger
std::vector< TTreeFormula * > fWeightFormula
void SetTestingSumSignalWeights(Double_t testingSumSignalWeights)
std::vector< std::vector< double > > Data
void PrintCorrelationMatrix(const TString &className)
calculates the correlation matrices for signal and background, prints them to standard output...
void SetMinType(EMsgType minType)
void RenormEvents(DataSetInfo &dsi, EventVectorOfClassesOfTreeType &eventsmap, const EvtStatsPerClass &eventCounts, const TString &normMode)
renormalisation of the TRAINING event weights
Class that contains all the data information.
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is set or not...
Long64_t GetNTrainingEvents() const
void ChangeToNewTree(TreeInfo &, const DataSetInfo &)
While the data gets copied into the local training and testing trees, the input tree can change (for ...
TMatrixT< Double_t > TMatrixD
void SetCorrelationMatrix(const TString &className, TMatrixD *matrix)
Class that contains all the data information.
DataSetFactory()
constructor
TMatrixD * CalcCorrelationMatrix(DataSet *, const UInt_t classNumber)
computes correlation matrix for variables "theVars" in tree; "theType" defines the required event "ty...
Float_t GetTarget(UInt_t itgt) const
Int_t LargestCommonDivider(Int_t a, Int_t b)
UInt_t GetNTargets() const
Bool_t fScaleWithPreselEff
Types::ETreeType GetTreeType() const
ClassInfo * GetClassInfo(Int_t clNum) const
std::vector< TTreeFormula * > fSpectatorFormulas
DataSet * CreateDataSet(DataSetInfo &, DataInputHandler &)
steering the creation of a new dataset
VariableInfo & GetTargetInfo(Int_t i)
char * Form(const char *fmt,...)
UInt_t GetNSpectators(bool all=kTRUE) const
UInt_t GetSignalClassIndex()
std::vector< TTreeFormula * > fTargetFormulas
std::vector< Event *> EventVector
Long64_t GetNTestEvents() const
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
DataSet * BuildInitialDataSet(DataSetInfo &, TMVA::DataInputHandler &)
if no entries, then create a DataSet with one Event which uses dynamic variables (pointers to variabl...
~DataSetFactory()
destructor
const TString & GetClassName() const
VariableInfo & GetSpectatorInfo(Int_t i)
compose_binary_t< F, G, H > compose_binary(const F &_f, const G &_g, const H &_h)
void SetEventCollection(std::vector< Event *> *, Types::ETreeType, Bool_t deleteEvents=true)
Sets the event collection (by DataSetFactory)
VariableInfo & GetVariableInfo(Int_t i)
ClassInfo * AddClass(const TString &className)
Int_t GetClassNameMaxLength() const
virtual const char * GetName() const
Returns name of object.
Long64_t GetNClassEvents(Int_t type, UInt_t classNumber)
ostringstream derivative to redirect and format output
void SetConfigName(const char *n)
virtual const char * GetTitle() const
Returns title of object.
Abstract ClassifierFactory template that handles arbitrary types.
const TCut & GetCut() const
std::vector< EventStats > EvtStatsPerClass
const TString & GetSplitOptions() const
Short_t Max(Short_t a, Short_t b)
Double_t GetOriginalWeight() const
Bool_t CheckTTreeFormula(TTreeFormula *ttf, const TString &expression, Bool_t &hasDollar)
checks a TTreeFormula for problems
void SetNumber(const UInt_t index)
you should not use this method at all Int_t Int_t Double_t Double_t Double_t Int_t Double_t Double_t Double_t Double_t b
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
const TString & GetWeight() const
DataSet * MixEvents(DataSetInfo &dsi, EventVectorOfClassesOfTreeType &eventsmap, EvtStatsPerClass &eventCounts, const TString &splitMode, const TString &mixMode, const TString &normMode, UInt_t splitSeed)
Select and distribute unassigned events to kTraining and kTesting.
TBranch * GetBranch() const
virtual const char * GetName() const
Returns name of object.
virtual Bool_t IsOnTerminalBranch() const
Double_t GetWeight() const
UInt_t GetNTargets() const
access the number of targets through the datasetinfo
Float_t GetSpectator(UInt_t ivar) const
return spectator content
void SetNormalization(const TString &norm)
TMatrixD * CalcCovarianceMatrix(DataSet *, const UInt_t classNumber)
compute covariance matrix
const Event * GetEvent() const
std::vector< VariableInfo > & GetVariableInfos()
virtual const char * GetTitle() const
Returns title of object.