76 : MethodBase(jobName, Types::kDNN, methodTitle, theData, theOption), fWeightInitialization(), fOutputFunction(),
77 fLayoutString(), fErrorStrategy(), fTrainingStrategyString(), fWeightInitializationString(),
78 fArchitectureString(), fTrainingSettings(), fResume(false), fSettings()
87 : MethodBase( Types::kDNN, theData, theWeightFile),
88 fWeightInitialization(), fOutputFunction(), fLayoutString(), fErrorStrategy(),
89 fTrainingStrategyString(), fWeightInitializationString(), fArchitectureString(),
90 fTrainingSettings(), fResume(false), fSettings()
92 fWeightInitialization = DNN::EInitialization::kGauss;
93 fOutputFunction = DNN::EOutputFunction::kSigmoid;
125 <<
"MethodDNN is deprecated and it will be removed in future ROOT version. "
126 "Please use MethodDL ( TMVA::kDL)"
147 DeclareOptionRef(fLayoutString=
"SOFTSIGN|(N+100)*2,LINEAR",
149 "Layout of the network.");
151 DeclareOptionRef(fValidationSize =
"20%",
"ValidationSize",
152 "Part of the training data to use for "
153 "validation. Specify as 0.2 or 20% to use a "
154 "fifth of the data set as validation set. "
155 "Specify as 100 to use exactly 100 events. "
158 DeclareOptionRef(fErrorStrategy=
"CROSSENTROPY",
160 "Loss function: Mean squared error (regression)"
161 " or cross entropy (binary classification).");
162 AddPreDefVal(
TString(
"CROSSENTROPY"));
163 AddPreDefVal(
TString(
"SUMOFSQUARES"));
164 AddPreDefVal(
TString(
"MUTUALEXCLUSIVE"));
166 DeclareOptionRef(fWeightInitializationString=
"XAVIER",
167 "WeightInitialization",
168 "Weight initialization strategy");
169 AddPreDefVal(
TString(
"XAVIER"));
170 AddPreDefVal(
TString(
"XAVIERUNIFORM"));
172 DeclareOptionRef(fArchitectureString =
"CPU",
"Architecture",
"Which architecture to perform the training on.");
173 AddPreDefVal(
TString(
"STANDARD"));
176 AddPreDefVal(
TString(
"OPENCL"));
179 fTrainingStrategyString =
"LearningRate=1e-1,"
182 "ConvergenceSteps=50,"
188 "DropRepetitions=5|LearningRate=1e-4,"
191 "ConvergenceSteps=50,"
196 "DropConfig=0.0+0.5+0.5,"
198 "Multithreading=True",
200 "Defines the training strategies.");
210 LayoutVector_t layout;
211 const TString layerDelimiter(
",");
212 const TString subDelimiter(
"|");
214 const size_t inputSize = GetNvar();
216 TObjArray* layerStrings = layoutString.Tokenize(layerDelimiter);
217 TIter nextLayer (layerStrings);
220 for (; layerString !=
nullptr; layerString = (
TObjString*) nextLayer()) {
225 TIter nextToken (subStrings);
228 for (; token !=
nullptr; token = (
TObjString *) nextToken()) {
234 if (strActFnc ==
"RELU") {
236 }
else if (strActFnc ==
"TANH") {
238 }
else if (strActFnc ==
"SYMMRELU") {
240 }
else if (strActFnc ==
"SOFTSIGN") {
242 }
else if (strActFnc ==
"SIGMOID") {
244 }
else if (strActFnc ==
"LINEAR") {
246 }
else if (strActFnc ==
"GAUSS") {
255 strNumNodes.ReplaceAll (
"N", strN);
256 strNumNodes.ReplaceAll (
"n", strN);
258 numNodes = fml.Eval (inputSize);
264 layout.push_back(std::make_pair(numNodes, activationFunction));
277 KeyValueVector_t blockKeyValues;
278 const TString keyValueDelim (
"=");
280 TObjArray* blockStrings = parseString.Tokenize (blockDelim);
281 TIter nextBlock (blockStrings);
284 for (; blockString !=
nullptr; blockString = (
TObjString *) nextBlock())
286 blockKeyValues.push_back (std::map<TString,TString>());
287 std::map<TString,TString>& currentBlock = blockKeyValues.back ();
290 TIter nextToken (subStrings);
293 for (; token !=
nullptr; token = (
TObjString *)nextToken())
296 int delimPos = strKeyValue.First (keyValueDelim.Data ());
302 TString strValue =
TString (strKeyValue (delimPos+1, strKeyValue.Length ()));
307 currentBlock.insert (std::make_pair (strKey, strValue));
310 return blockKeyValues;
318 std::map<TString, TString>::const_iterator it = keyValueMap.find (key);
319 if (it == keyValueMap.end()) {
343 return value.
Atoi ();
349double fetchValue (
const std::map<TString,TString>& keyValueMap,
350 TString key,
double defaultValue)
356 return value.
Atof ();
375bool fetchValue (
const std::map<TString,TString>& keyValueMap,
376 TString key,
bool defaultValue)
383 if (value ==
"TRUE" || value ==
"T" || value ==
"1") {
392std::vector<double>
fetchValue(
const std::map<TString, TString> & keyValueMap,
394 std::vector<double> defaultValue)
397 if (parseString ==
"") {
401 std::vector<double> values;
403 const TString tokenDelim (
"+");
405 TIter nextToken (tokenStrings);
407 for (; tokenString != NULL; tokenString = (
TObjString*)nextToken ()) {
408 std::stringstream sstr;
411 sstr >> currentValue;
412 values.push_back (currentValue);
421 if (IgnoreEventsWithNegWeightsInTraining()) {
423 <<
"Will ignore negative events in training!"
427 if (fArchitectureString ==
"STANDARD") {
428 Log() << kERROR <<
"The STANDARD architecture has been deprecated. "
429 "Please use Architecture=CPU or Architecture=CPU."
430 "See the TMVA Users' Guide for instructions if you "
431 "encounter problems."
433 Log() << kFATAL <<
"The STANDARD architecture has been deprecated. "
434 "Please use Architecture=CPU or Architecture=CPU."
435 "See the TMVA Users' Guide for instructions if you "
436 "encounter problems."
440 if (fArchitectureString ==
"OPENCL") {
441 Log() << kERROR <<
"The OPENCL architecture has not been implemented yet. "
442 "Please use Architecture=CPU or Architecture=CPU for the "
443 "time being. See the TMVA Users' Guide for instructions "
444 "if you encounter problems."
446 Log() << kFATAL <<
"The OPENCL architecture has not been implemented yet. "
447 "Please use Architecture=CPU or Architecture=CPU for the "
448 "time being. See the TMVA Users' Guide for instructions "
449 "if you encounter problems."
453 if (fArchitectureString ==
"GPU") {
455 Log() << kERROR <<
"CUDA backend not enabled. Please make sure "
456 "you have CUDA installed and it was successfully "
459 Log() << kFATAL <<
"CUDA backend not enabled. Please make sure "
460 "you have CUDA installed and it was successfully "
466 if (fArchitectureString ==
"CPU") {
468 Log() << kERROR <<
"Multi-core CPU backend not enabled. Please make sure "
469 "you have a BLAS implementation and it was successfully "
470 "detected by CMake as well that the imt CMake flag is set."
472 Log() << kFATAL <<
"Multi-core CPU backend not enabled. Please make sure "
473 "you have a BLAS implementation and it was successfully "
474 "detected by CMake as well that the imt CMake flag is set."
484 size_t inputSize = GetNVariables ();
485 size_t outputSize = 1;
487 outputSize = GetNTargets();
489 outputSize = DataInfo().GetNClasses();
492 fNet.SetBatchSize(1);
493 fNet.SetInputWidth(inputSize);
495 auto itLayout = std::begin (fLayout);
496 auto itLayoutEnd = std::end (fLayout)-1;
497 for ( ; itLayout != itLayoutEnd; ++itLayout) {
498 fNet.AddLayer((*itLayout).first, (*itLayout).second);
509 if (fErrorStrategy ==
"SUMOFSQUARES") {
510 fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
512 if (fErrorStrategy ==
"CROSSENTROPY") {
517 if (fErrorStrategy !=
"SUMOFSQUARES") {
518 Log () << kWARNING <<
"For regression only SUMOFSQUARES is a valid "
519 <<
" neural net error function. Setting error function to "
520 <<
" SUMOFSQUARES now." <<
Endl;
522 fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
525 if (fErrorStrategy ==
"SUMOFSQUARES") {
526 fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
528 if (fErrorStrategy ==
"CROSSENTROPY") {
531 if (fErrorStrategy ==
"MUTUALEXCLUSIVE") {
532 fNet.SetLossFunction(ELossFunction::kSoftmaxCrossEntropy);
541 if (fWeightInitializationString ==
"XAVIER") {
544 else if (fWeightInitializationString ==
"XAVIERUNIFORM") {
556 GetNumValidationSamples();
558 KeyValueVector_t strategyKeyValues = ParseKeyValueString(fTrainingStrategyString,
562 std::cout <<
"Parsed Training DNN string " << fTrainingStrategyString << std::endl;
563 std::cout <<
"STring has size " << strategyKeyValues.size() << std::endl;
564 for (
auto&
block : strategyKeyValues) {
574 std::vector<Double_t>());
594 fTrainingSettings.push_back(settings);
609 Int_t nValidationSamples = 0;
614 if (fValidationSize.EndsWith(
"%")) {
619 Double_t valSizeAsDouble = fValidationSize.Atof() / 100.0;
620 nValidationSamples = GetEventCollection(
Types::kTraining).size() * valSizeAsDouble;
622 Log() << kFATAL <<
"Cannot parse number \"" << fValidationSize
623 <<
"\". Expected string like \"20%\" or \"20.0%\"." <<
Endl;
625 }
else if (fValidationSize.IsFloat()) {
626 Double_t valSizeAsDouble = fValidationSize.Atof();
628 if (valSizeAsDouble < 1.0) {
630 nValidationSamples = GetEventCollection(
Types::kTraining).size() * valSizeAsDouble;
633 nValidationSamples = valSizeAsDouble;
636 Log() << kFATAL <<
"Cannot parse number \"" << fValidationSize <<
"\". Expected string like \"0.2\" or \"100\"."
642 if (nValidationSamples < 0) {
643 Log() << kFATAL <<
"Validation size \"" << fValidationSize <<
"\" is negative." <<
Endl;
646 if (nValidationSamples == 0) {
647 Log() << kFATAL <<
"Validation size \"" << fValidationSize <<
"\" is zero." <<
Endl;
650 if (nValidationSamples >= (
Int_t)trainingSetSize) {
651 Log() << kFATAL <<
"Validation size \"" << fValidationSize
652 <<
"\" is larger than or equal in size to training set (size=\"" << trainingSetSize <<
"\")." <<
Endl;
655 return nValidationSamples;
662 if (fInteractive && fInteractive->NotInitialized()){
663 std::vector<TString> titles = {
"Error on training set",
"Error on test set"};
664 fInteractive->Init(titles);
670 size_t nValidationSamples = GetNumValidationSamples();
671 size_t nTrainingSamples = GetEventCollection(
Types::kTraining).size() - nValidationSamples;
672 size_t nTestSamples = nValidationSamples;
674 if (nTrainingSamples < settings.batchSize ||
675 nValidationSamples < settings.batchSize ||
676 nTestSamples < settings.batchSize) {
677 Log() << kFATAL <<
"Number of samples in the datasets are train: "
678 << nTrainingSamples <<
" valid: " << nValidationSamples
679 <<
" test: " << nTestSamples <<
". "
680 <<
"One of these is smaller than the batch size of "
681 << settings.batchSize <<
". Please increase the batch"
682 <<
" size to be at least the same size as the smallest"
683 <<
" of these values." <<
Endl;
687 if (fArchitectureString ==
"GPU") {
689 if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
692 }
else if (fArchitectureString == "OPENCL") {
   // Compare against the upper-case spelling: that is how the value is
   // registered with AddPreDefVal and how it is tested in the option checks
   // elsewhere in this file. The mixed-case "OpenCL" never matched it.
   Log() << kFATAL << "OpenCL backend not yet supported." << Endl;
}
else if (fArchitectureString ==
"CPU") {
697 if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
702 Log() << kINFO <<
"Using Standard Implementation.";
704 std::vector<Pattern> trainPattern;
705 std::vector<Pattern> testPattern;
707 size_t nValidationSamples = GetNumValidationSamples();
708 size_t nTrainingSamples = GetEventCollection(
Types::kTraining).size() - nValidationSamples;
710 const std::vector<TMVA::Event *> &allData = GetEventCollection(
Types::kTraining);
711 const std::vector<TMVA::Event *> eventCollectionTraining{allData.begin(), allData.begin() + nTrainingSamples};
712 const std::vector<TMVA::Event *> eventCollectionTesting{allData.begin() + nTrainingSamples, allData.end()};
714 for (
auto &event : eventCollectionTraining) {
715 const std::vector<Float_t>& values =
event->GetValues();
717 double outputValue =
event->GetClass () == 0 ? 0.9 : 0.1;
718 trainPattern.push_back(
Pattern (values.begin(),
721 event->GetWeight()));
722 trainPattern.back().addInput(1.0);
724 std::vector<Float_t> oneHot(DataInfo().GetNClasses(), 0.0);
725 oneHot[
event->GetClass()] = 1.0;
726 trainPattern.push_back(
Pattern (values.begin(), values.end(),
727 oneHot.cbegin(), oneHot.cend(),
728 event->GetWeight()));
729 trainPattern.back().addInput(1.0);
731 const std::vector<Float_t>& targets =
event->GetTargets ();
732 trainPattern.push_back(
Pattern(values.begin(),
736 event->GetWeight ()));
737 trainPattern.back ().addInput (1.0);
741 for (
auto &event : eventCollectionTesting) {
742 const std::vector<Float_t>& values =
event->GetValues();
744 double outputValue =
event->GetClass () == 0 ? 0.9 : 0.1;
745 testPattern.push_back(
Pattern (values.begin(),
748 event->GetWeight()));
749 testPattern.back().addInput(1.0);
751 std::vector<Float_t> oneHot(DataInfo().GetNClasses(), 0.0);
752 oneHot[
event->GetClass()] = 1.0;
753 testPattern.push_back(
Pattern (values.begin(), values.end(),
754 oneHot.cbegin(), oneHot.cend(),
755 event->GetWeight()));
756 testPattern.back().addInput(1.0);
758 const std::vector<Float_t>& targets =
event->GetTargets ();
759 testPattern.push_back(
Pattern(values.begin(),
763 event->GetWeight ()));
764 testPattern.back ().addInput (1.0);
769 std::vector<double> weights;
776 for (
size_t i = 0; i < fNet.GetDepth(); i++) {
781 case EActivationFunction::kRelu:
g = EnumFunction::RELU;
break;
784 case EActivationFunction::kFastTanh:
g = EnumFunction::TANH;
break;
785 case EActivationFunction::kSymmRelu:
g = EnumFunction::SYMMRELU;
break;
786 case EActivationFunction::kSoftSign:
g = EnumFunction::SOFTSIGN;
break;
789 if (i < fNet.GetDepth() - 1) {
793 switch(fOutputFunction) {
802 switch(fNet.GetLossFunction()) {
803 case ELossFunction::kMeanSquaredError:
809 case ELossFunction::kSoftmaxCrossEntropy:
814 switch(fWeightInitialization) {
817 std::back_inserter(weights));
821 std::back_inserter(weights));
825 std::back_inserter(weights));
830 for (
auto s : fTrainingSettings) {
833 switch(
s.regularization) {
835 case ERegularization::kL1:
r = EnumRegularization::L1;
break;
836 case ERegularization::kL2:
r = EnumRegularization::L2;
break;
840 s.testInterval,
s.weightDecay,
r,
842 s.momentum, 1,
s.multithreading);
843 std::shared_ptr<Settings> ptrSettings(settings);
844 ptrSettings->setMonitoring (0);
846 <<
"Training with learning rate = " << ptrSettings->learningRate ()
847 <<
", momentum = " << ptrSettings->momentum ()
848 <<
", repetitions = " << ptrSettings->repetitions ()
851 ptrSettings->setProgressLimits ((idxSetting)*100.0/(fSettings.size ()),
852 (idxSetting+1)*100.0/(fSettings.size ()));
854 const std::vector<double>& dropConfig = ptrSettings->dropFractions ();
855 if (!dropConfig.empty ()) {
856 Log () << kINFO <<
"Drop configuration" <<
Endl
857 <<
" drop repetitions = " << ptrSettings->dropRepetitions()
862 for (
auto f : dropConfig) {
863 Log () << kINFO <<
" Layer " << idx <<
" = " <<
f <<
Endl;
869 ptrSettings->momentum(),
870 ptrSettings->repetitions());
871 net.
train(weights, trainPattern, testPattern, minimizer, *ptrSettings.get());
876 size_t weightIndex = 0;
877 for (
size_t l = 0;
l < fNet.GetDepth();
l++) {
878 auto & layerWeights = fNet.GetLayer(
l).GetWeights();
879 for (
Int_t j = 0; j < layerWeights.GetNcols(); j++) {
880 for (
Int_t i = 0; i < layerWeights.GetNrows(); i++) {
881 layerWeights(i,j) = weights[weightIndex];
885 auto & layerBiases = fNet.GetLayer(
l).GetBiases();
887 for (
Int_t i = 0; i < layerBiases.GetNrows(); i++) {
888 layerBiases(i,0) = weights[weightIndex];
892 for (
Int_t i = 0; i < layerBiases.GetNrows(); i++) {
893 layerBiases(i,0) = 0.0;
897 if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
907 Log() << kINFO <<
"Start of neural network training on GPU." <<
Endl <<
Endl;
909 size_t nValidationSamples = GetNumValidationSamples();
910 size_t nTrainingSamples = GetEventCollection(
Types::kTraining).size() - nValidationSamples;
911 size_t nTestSamples = nValidationSamples;
913 Log() << kDEBUG <<
"Using " << nValidationSamples <<
" validation samples." <<
Endl;
914 Log() << kDEBUG <<
"Using " << nTestSamples <<
" training samples." <<
Endl;
916 size_t trainingPhase = 1;
917 fNet.Initialize(fWeightInitialization);
921 fInteractive->ClearGraphs();
930 std::vector<Double_t> dropoutVector(settings.dropoutProbabilities);
931 for (
auto & p : dropoutVector) {
937 auto testNet = net.
CreateClone(settings.batchSize);
939 Log() << kINFO <<
"Training phase " << trainingPhase <<
" of "
940 << fTrainingSettings.size() <<
":" <<
Endl;
947 const std::vector<Event *> trainingInputData =
948 std::vector<Event *>(allData.begin(), allData.begin() + nTrainingSamples);
949 const std::vector<Event *> testInputData =
950 std::vector<Event *>(allData.begin() + nTrainingSamples, allData.end());
952 if (trainingInputData.size() != nTrainingSamples) {
953 Log() << kFATAL <<
"Inconsistent training sample size" <<
Endl;
955 if (testInputData.size() != nTestSamples) {
956 Log() << kFATAL <<
"Inconsistent test sample size" <<
Endl;
960 TMVAInput_t trainingTuple = std::tie(trainingInputData, DataInfo());
961 TMVAInput_t testTuple = std::tie(testInputData, DataInfo());
962 DataLoader_t trainingData(trainingTuple, nTrainingSamples,
965 DataLoader_t testData(testTuple, nTestSamples, testNet.GetBatchSize(),
969 settings.convergenceSteps,
970 settings.testInterval);
972 std::vector<TNet<TCuda<>>> nets{};
973 std::vector<TBatch<TCuda<>>> batches{};
974 nets.reserve(nThreads);
975 for (
size_t i = 0; i < nThreads; i++) {
977 for (
size_t j = 0; j < net.
GetDepth(); j++)
979 auto &masterLayer = net.
GetLayer(j);
980 auto &layer = nets.back().GetLayer(j);
982 masterLayer.GetWeights());
984 masterLayer.GetBiases());
988 bool converged =
false;
989 size_t stepCount = 0;
990 size_t batchesInEpoch = nTrainingSamples / net.
GetBatchSize();
992 std::chrono::time_point<std::chrono::system_clock> start, end;
993 start = std::chrono::system_clock::now();
996 Log() << std::setw(10) <<
"Epoch" <<
" | "
997 << std::setw(12) <<
"Train Err."
998 << std::setw(12) <<
"Test Err."
999 << std::setw(12) <<
"GFLOP/s"
1000 << std::setw(12) <<
"Conv. Steps" <<
Endl;
1010 trainingData.Shuffle();
1011 for (
size_t i = 0; i < batchesInEpoch; i += nThreads) {
1013 for (
size_t j = 0; j < nThreads; j++) {
1014 batches.reserve(nThreads);
1015 batches.push_back(trainingData.GetBatch());
1017 if (settings.momentum > 0.0) {
1018 minimizer.
StepMomentum(net, nets, batches, settings.momentum);
1020 minimizer.
Step(net, nets, batches);
1028 for (
auto batch : testData) {
1029 auto inputMatrix = batch.GetInput();
1030 auto outputMatrix = batch.GetOutput();
1031 testError += testNet.Loss(inputMatrix, outputMatrix);
1033 testError /= (
Double_t) (nTestSamples / settings.batchSize);
1036 fTrainHistory.AddValue(
"testError",stepCount,testError);
1038 end = std::chrono::system_clock::now();
1042 for (
auto batch : trainingData) {
1043 auto inputMatrix = batch.GetInput();
1044 auto outputMatrix = batch.GetOutput();
1045 trainingError += net.
Loss(inputMatrix, outputMatrix);
1047 trainingError /= (
Double_t) (nTrainingSamples / settings.batchSize);
1049 fTrainHistory.AddValue(
"trainingError",stepCount,trainingError);
1052 std::chrono::duration<double> elapsed_seconds = end - start;
1053 double seconds = elapsed_seconds.count();
1054 double nFlops = (
double) (settings.testInterval * batchesInEpoch);
1058 start = std::chrono::system_clock::now();
1061 fInteractive->AddPoint(stepCount, trainingError, testError);
1064 if (fExitFromTraining)
break;
1066 Log() << std::setw(10) << stepCount <<
" | "
1067 << std::setw(12) << trainingError
1068 << std::setw(12) << testError
1069 << std::setw(12) << nFlops / seconds
1085 Log() << kFATAL <<
"CUDA backend not enabled. Please make sure "
1086 "you have CUDA installed and it was successfully "
1087 "detected by CMAKE." <<
Endl;
1097 Log() << kINFO <<
"Start of neural network training on CPU." <<
Endl <<
Endl;
1099 size_t nValidationSamples = GetNumValidationSamples();
1100 size_t nTrainingSamples = GetEventCollection(
Types::kTraining).size() - nValidationSamples;
1101 size_t nTestSamples = nValidationSamples;
1103 Log() << kDEBUG <<
"Using " << nValidationSamples <<
" validation samples." <<
Endl;
1104 Log() << kDEBUG <<
"Using " << nTestSamples <<
" training samples." <<
Endl;
1106 fNet.Initialize(fWeightInitialization);
1108 size_t trainingPhase = 1;
1112 fInteractive->ClearGraphs();
1115 Log() <<
"Training phase " << trainingPhase <<
" of "
1116 << fTrainingSettings.size() <<
":" <<
Endl;
1124 std::vector<Double_t> dropoutVector(settings.dropoutProbabilities);
1125 for (
auto & p : dropoutVector) {
1130 auto testNet = net.
CreateClone(settings.batchSize);
1135 const std::vector<Event *> &allData = GetEventCollection(
Types::kTraining);
1136 const std::vector<Event *> trainingInputData =
1137 std::vector<Event *>(allData.begin(), allData.begin() + nTrainingSamples);
1138 const std::vector<Event *> testInputData =
1139 std::vector<Event *>(allData.begin() + nTrainingSamples, allData.end());
1141 if (trainingInputData.size() != nTrainingSamples) {
1142 Log() << kFATAL <<
"Inconsistent training sample size" <<
Endl;
1144 if (testInputData.size() != nTestSamples) {
1145 Log() << kFATAL <<
"Inconsistent test sample size" <<
Endl;
1148 size_t nThreads = 1;
1149 TMVAInput_t trainingTuple = std::tie(trainingInputData, DataInfo());
1150 TMVAInput_t testTuple = std::tie(testInputData, DataInfo());
1151 DataLoader_t trainingData(trainingTuple, nTrainingSamples,
1154 DataLoader_t testData(testTuple, nTestSamples, testNet.GetBatchSize(),
1158 settings.convergenceSteps,
1159 settings.testInterval);
1161 std::vector<TNet<TCpu<>>> nets{};
1162 std::vector<TBatch<TCpu<>>> batches{};
1163 nets.reserve(nThreads);
1164 for (
size_t i = 0; i < nThreads; i++) {
1165 nets.push_back(net);
1166 for (
size_t j = 0; j < net.
GetDepth(); j++)
1168 auto &masterLayer = net.
GetLayer(j);
1169 auto &layer = nets.back().GetLayer(j);
1171 masterLayer.GetWeights());
1173 masterLayer.GetBiases());
1177 bool converged =
false;
1178 size_t stepCount = 0;
1179 size_t batchesInEpoch = nTrainingSamples / net.
GetBatchSize();
1181 std::chrono::time_point<std::chrono::system_clock> start, end;
1182 start = std::chrono::system_clock::now();
1184 if (!fInteractive) {
1185 Log() << std::setw(10) <<
"Epoch" <<
" | "
1186 << std::setw(12) <<
"Train Err."
1187 << std::setw(12) <<
"Test Err."
1188 << std::setw(12) <<
"GFLOP/s"
1189 << std::setw(12) <<
"Conv. Steps" <<
Endl;
1198 trainingData.Shuffle();
1199 for (
size_t i = 0; i < batchesInEpoch; i += nThreads) {
1201 for (
size_t j = 0; j < nThreads; j++) {
1202 batches.reserve(nThreads);
1203 batches.push_back(trainingData.GetBatch());
1205 if (settings.momentum > 0.0) {
1206 minimizer.
StepMomentum(net, nets, batches, settings.momentum);
1208 minimizer.
Step(net, nets, batches);
1216 for (
auto batch : testData) {
1217 auto inputMatrix = batch.GetInput();
1218 auto outputMatrix = batch.GetOutput();
1219 auto weightMatrix = batch.GetWeights();
1220 testError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
1222 testError /= (
Double_t) (nTestSamples / settings.batchSize);
1225 fTrainHistory.AddValue(
"testError",stepCount,testError);
1227 end = std::chrono::system_clock::now();
1231 for (
auto batch : trainingData) {
1232 auto inputMatrix = batch.GetInput();
1233 auto outputMatrix = batch.GetOutput();
1234 auto weightMatrix = batch.GetWeights();
1235 trainingError += net.
Loss(inputMatrix, outputMatrix, weightMatrix);
1237 trainingError /= (
Double_t) (nTrainingSamples / settings.batchSize);
1240 fTrainHistory.AddValue(
"trainingError",stepCount,trainingError);
1243 fInteractive->AddPoint(stepCount, trainingError, testError);
1245 if (fExitFromTraining)
break;
1249 std::chrono::duration<double> elapsed_seconds = end - start;
1250 double seconds = elapsed_seconds.count();
1251 double nFlops = (
double) (settings.testInterval * batchesInEpoch);
1255 start = std::chrono::system_clock::now();
1258 fInteractive->AddPoint(stepCount, trainingError, testError);
1261 if (fExitFromTraining)
break;
1263 Log() << std::setw(10) << stepCount <<
" | "
1264 << std::setw(12) << trainingError
1265 << std::setw(12) << testError
1266 << std::setw(12) << nFlops / seconds
1277 auto & layer = fNet.GetLayer(
l);
1284 Log() << kFATAL <<
"Multi-core CPU backend not enabled. Please make sure "
1285 "you have a BLAS implementation and it was successfully "
1286 "detected by CMake as well that the imt CMake flag is set." <<
Endl;
1294 size_t nVariables = GetEvent()->GetNVariables();
1298 const std::vector<Float_t>& inputValues = GetEvent()->GetValues();
1299 for (
size_t i = 0; i < nVariables; i++) {
1300 X(0,i) = inputValues[i];
1303 fNet.Prediction(YHat, X, fOutputFunction);
1311 size_t nVariables = GetEvent()->GetNVariables();
1314 const Event *ev = GetEvent();
1315 const std::vector<Float_t>& inputValues = ev->
GetValues();
1316 for (
size_t i = 0; i < nVariables; i++) {
1317 X(0,i) = inputValues[i];
1320 size_t nTargets = std::max(1u, ev->
GetNTargets());
1322 std::vector<Float_t>
output(nTargets);
1323 auto net = fNet.CreateClone(1);
1324 net.Prediction(YHat, X, fOutputFunction);
1326 for (
size_t i = 0; i < nTargets; i++)
1329 if (fRegressionReturnVal == NULL) {
1330 fRegressionReturnVal =
new std::vector<Float_t>();
1332 fRegressionReturnVal->clear();
1335 for (
size_t i = 0; i < nTargets; ++i) {
1339 const Event* evT2 = GetTransformationHandler().InverseTransform(evT);
1340 for (
size_t i = 0; i < nTargets; ++i) {
1341 fRegressionReturnVal->push_back(evT2->
GetTarget(i));
1344 return *fRegressionReturnVal;
1349 size_t nVariables = GetEvent()->GetNVariables();
1351 Matrix_t YHat(1, DataInfo().GetNClasses());
1352 if (fMulticlassReturnVal == NULL) {
1353 fMulticlassReturnVal =
new std::vector<Float_t>(DataInfo().GetNClasses());
1356 const std::vector<Float_t>& inputValues = GetEvent()->GetValues();
1357 for (
size_t i = 0; i < nVariables; i++) {
1358 X(0,i) = inputValues[i];
1361 fNet.Prediction(YHat, X, fOutputFunction);
1362 for (
size_t i = 0; i < (size_t) YHat.GetNcols(); i++) {
1363 (*fMulticlassReturnVal)[i] = YHat(0, i);
1365 return *fMulticlassReturnVal;
1373 Int_t inputWidth = fNet.GetInputWidth();
1374 Int_t depth = fNet.GetDepth();
1375 char lossFunction =
static_cast<char>(fNet.GetLossFunction());
1377 gTools().StringFromInt(inputWidth));
1381 TString(
static_cast<char>(fOutputFunction)));
1383 for (
Int_t i = 0; i < depth; i++) {
1384 const auto& layer = fNet.GetLayer(i);
1386 int activationFunction =
static_cast<int>(layer.GetActivationFunction());
1389 WriteMatrixXML(layerxml,
"Weights", layer.GetWeights());
1390 WriteMatrixXML(layerxml,
"Biases", layer.GetBiases());
1404 fNet.SetBatchSize(1);
1406 size_t inputWidth, depth;
1409 char lossFunctionChar;
1411 char outputFunctionChar;
1414 fNet.SetInputWidth(inputWidth);
1415 fNet.SetLossFunction(
static_cast<ELossFunction>(lossFunctionChar));
1418 size_t previousWidth = inputWidth;
1420 for (
size_t i = 0; i < depth; i++) {
1436 ReadMatrixXML(layerXML,
"Weights", weights);
1437 ReadMatrixXML(layerXML,
"Biases", biases);
1438 fNet.GetLayer(i).GetWeights() = weights;
1439 fNet.GetLayer(i).GetBiases() = biases;
1442 previousWidth =
width;
1456 fRanking =
new Ranking( GetName(),
"Importance" );
1457 for (
UInt_t ivar=0; ivar<GetNvar(); ivar++) {
1458 fRanking->AddRank(
Rank( GetInputLabel(ivar), 1.0));
1482 Log() << col <<
"--- Short description:" << colres <<
Endl;
1484 Log() <<
"The DNN neural network is a feedforward" <<
Endl;
1485 Log() <<
"multilayer perceptron implementation. The DNN has a user-" <<
Endl;
1486 Log() <<
"defined hidden layer architecture, where the number of input (output)" <<
Endl;
1487 Log() <<
"nodes is determined by the input variables (output classes, i.e., " <<
Endl;
1488 Log() <<
"signal and one background, regression or multiclass). " <<
Endl;
1490 Log() << col <<
"--- Performance optimisation:" << colres <<
Endl;
1493 const char* txt =
"The DNN supports various options to improve performance in terms of training speed and \n \
1494reduction of overfitting: \n \
1496 - different training settings can be stacked. Such that the initial training \n\
1497 is done with a large learning rate and a large drop out fraction whilst \n \
1498 in a later stage learning rate and drop out can be reduced. \n \
1501 initial training stage: 0.0 for the first layer, 0.5 for later layers. \n \
1502 later training stage: 0.1 or 0.0 for all layers \n \
1503 final training stage: 0.0] \n \
1504 Drop out is a technique where a at each training cycle a fraction of arbitrary \n \
1505 nodes is disabled. This reduces co-adaptation of weights and thus reduces overfitting. \n \
1506 - L1 and L2 regularization are available \n \
1508 [recommended 10 - 150] \n \
1509 Arbitrary mini-batch sizes can be chosen. \n \
1510 - Multithreading \n \
1511 [recommended: True] \n \
1512 Multithreading can be turned on. The minibatches are distributed to the available \n \
1513 cores. The algorithm is lock-free (\"Hogwild!\"-style) for each cycle. \n \
1517 - example: \"TANH|(N+30)*2,TANH|(N+30),LINEAR\" \n \
1519 . two hidden layers (separated by \",\") \n \
1520 . the activation function is TANH (other options: RELU, SOFTSIGN, LINEAR) \n \
1521 . the activation function for the output layer is LINEAR \n \
1522 . the first hidden layer has (N+30)*2 nodes where N is the number of input neurons \n \
1523 . the second hidden layer has N+30 nodes, where N is the number of input neurons \n \
1524 . the number of nodes in the output layer is determined by the number of output nodes \n \
1525 and can therefore not be chosen freely. \n \
1527 \"ErrorStrategy\": \n \
1529 The error of the neural net is determined by a sum-of-squares error function \n \
1530 For regression, this is the only possible choice. \n \
1532 The error of the neural net is determined by a cross entropy function. The \n \
1533 output values are automatically (internally) transformed into probabilities \n \
1534 using a sigmoid function. \n \
1535 For signal/background classification this is the default choice. \n \
1536 For multiclass using cross entropy more than one or no output classes \n \
1537 can be equally true or false (e.g. Event 0: A and B are true, Event 1: \n \
1538 A and C is true, Event 2: C is true, ...) \n \
1539 - MUTUALEXCLUSIVE \n \
1540 In multiclass settings, exactly one of the output classes can be true (e.g. either A or B or C) \n \
1542 \"WeightInitialization\" \n \
1545 \"Xavier Glorot & Yoshua Bengio\"-style of initializing the weights. The weights are chosen randomly \n \
1546 such that the variance of the values of the nodes is preserved for each layer. \n \
1547 - XAVIERUNIFORM \n \
1548 The same as XAVIER, but with uniformly distributed weights instead of gaussian weights \n \
1550 Random values scaled by the layer size \n \
1552 \"TrainingStrategy\" \n \
1553 - example: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFraction=0.0,DropRepetitions=5\" \n \
1554 - explanation: two stacked training settings separated by \"|\" \n \
1555 . first training setting: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5\" \n \
1556 . second training setting : \"LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFractions=0.0,DropRepetitions=5\" \n \
1557 . LearningRate : \n \
1558 - recommended for classification: 0.1 initially, 1e-4 later \n \
1559 - recommended for regression: 1e-4 and less \n \
1561 preserve a fraction of the momentum for the next training batch [fraction = 0.0 - 1.0] \n \
1562 . Repetitions : \n \
1563 train \"Repetitions\" repetitions with the same minibatch before switching to the next one \n \
1564 . ConvergenceSteps : \n \
1565 Assume that convergence is reached after \"ConvergenceSteps\" cycles where no improvement \n \
1566 of the error on the test samples has been found. (Mind that only at each \"TestRepetitions\" \n \
1567 cycle the test samples are evaluated and thus the convergence is checked) \n \
1569 Size of the mini-batches. \n \
1570 . TestRepetitions \n \
1571 Perform testing the neural net on the test samples each \"TestRepetitions\" cycle \n \
1573 If \"Renormalize\" is set to L1 or L2, \"WeightDecay\" provides the renormalization factor \n \
1575 NONE, L1 (|w|) or L2 (w^2) \n \
1577 Drop a fraction of arbitrary nodes of each of the layers according to the values given \n \
1578 in the DropConfig. \n \
1579 [example: DropConfig=0.0+0.5+0.3 \n \
1580 meaning: drop no nodes in layer 0 (input layer), half of the nodes in layer 1 and 30% of the nodes \n \
1582 recommended: leave all the nodes turned on for the input layer (layer 0) \n \
1583 turn off half of the nodes in later layers for the initial training; leave all nodes \n \
1584 turned on (0.0) in later training stages] \n \
1585 . DropRepetitions \n \
1586 Each \"DropRepetitions\" cycle the configuration of which nodes are dropped is changed \n \
1587 [recommended : 1] \n \
1588 . Multithreading \n \
1589 turn on multithreading [recommended: True] \n \
#define REGISTER_METHOD(CLASS)
for example
include TDocParser_001 C image html pict1_TDocParser_001 png width
Bool_t WriteOptionsReference() const
Layer defines the layout of a layer.
void setInputSize(size_t sizeInput)
set the input size of the DNN
void SetIpythonInteractive(IPythonInteractive *fI, bool *fE, UInt_t *M, UInt_t *C)
double train(std::vector< double > &weights, std::vector< Pattern > &trainPattern, const std::vector< Pattern > &testPattern, Minimizer &minimizer, Settings &settings)
start the training
void setErrorFunction(ModeErrorFunction eErrorFunction)
which error function is to be used
void initializeWeights(WeightInitializationStrategy eInitStrategy, OutIterator itWeight)
initialize the weights with the given strategy
void addLayer(Layer &layer)
add a layer (layout)
void setOutputSize(size_t sizeOutput)
set the output size of the DNN
Settings for the training of the neural net.
Steepest Gradient Descent algorithm (SGD)
static void Copy(Matrix_t &B, const Matrix_t &A)
static void Copy(Matrix_t &B, const Matrix_t &A)
bool HasConverged()
Increases the minimization step counter by the test error evaluation period and uses the current inte...
void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights)
Perform a single optimization step on a given batch.
size_t GetTestInterval() const
void StepMomentum(Net_t &master, std::vector< Net_t > &nets, std::vector< TBatch< Architecture_t > > &batches, Scalar_t momentum)
Same as the Step(...) method for multiple batches but uses momentum.
size_t GetConvergenceCount() const
size_t GetConvergenceSteps() const
Generic neural network class.
void SetWeightDecay(Scalar_t weightDecay)
Scalar_t Loss(const Matrix_t &Y, const Matrix_t &weights, bool includeRegularization=true) const
Evaluate the loss function of the net using the activations that are currently stored in the output l...
void SetRegularization(ERegularization R)
size_t GetOutputWidth() const
void InitializeGradients()
Initialize the gradients in the net to zero.
TNet< Architecture_t, TSharedLayer< Architecture_t > > CreateClone(size_t batchSize)
Create a clone that uses the same weight and bias matrices but potentially a different batch size.
size_t GetBatchSize() const
void SetDropoutProbabilities(const std::vector< Double_t > &probabilities)
size_t GetInputWidth() const
Layer_t & GetLayer(size_t i)
void SetTarget(UInt_t itgt, Float_t value)
set the target value (dimension itgt) to value
UInt_t GetNTargets() const
accessor to the number of targets
std::vector< Float_t > & GetValues()
Float_t GetTarget(UInt_t itgt) const
Deep Neural Network Implementation.
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
virtual const std::vector< Float_t > & GetMulticlassValues()
UInt_t GetNumValidationSamples()
void ReadWeightsFromXML(void *wghtnode)
std::vector< std::map< TString, TString > > KeyValueVector_t
typename Architecture_t::Matrix_t Matrix_t
void ReadWeightsFromStream(std::istream &i)
LayoutVector_t ParseLayoutString(TString layerSpec)
void MakeClassSpecific(std::ostream &, const TString &) const
MethodDNN(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption)
DNN::EInitialization fWeightInitialization
virtual Double_t GetMvaValue(Double_t *err=0, Double_t *errUpper=0)
const Ranking * CreateRanking()
KeyValueVector_t ParseKeyValueString(TString parseString, TString blockDelim, TString tokenDelim)
DNN::EOutputFunction fOutputFunction
void AddWeightsXMLTo(void *parent) const
void GetHelpMessage() const
virtual const std::vector< Float_t > & GetRegressionValues()
Ranking for variables in method (implementation)
Collectable string class.
const TString & GetString() const
Int_t Atoi() const
Return integer value of string.
TSubString Strip(EStripType s=kTrailing, char c=' ') const
Return a substring of self stripped at beginning and/or end.
Double_t Atof() const
Return floating-point value contained in string.
Bool_t IsFloat() const
Returns kTRUE if string contains a floating point or integer number.
const char * Data() const
void ToUpper()
Change string to upper case.
TObjArray * Tokenize(const TString &delim) const
This function is used to isolate sequential tokens in a TString.
Bool_t BeginsWith(const char *s, ECaseCompare cmp=kExact) const
static TString Itoa(Int_t value, Int_t base)
Converts an Int_t to a TString with respect to the base specified (2-36).
XMLNodePointer_t NewChild(XMLNodePointer_t parent, XMLNsPointer_t ns, const char *name, const char *content=nullptr)
create new child element for parent node
XMLNodePointer_t GetChild(XMLNodePointer_t xmlnode, Bool_t realnode=kTRUE)
returns first child of xmlnode
XMLAttrPointer_t NewAttr(XMLNodePointer_t xmlnode, XMLNsPointer_t, const char *name, const char *value)
creates new attribute for xmlnode, namespaces are not supported for attributes
static constexpr double s
EOutputFunction
Enum that represents output functions.
auto regularization(const typename Architecture_t::Matrix_t &A, ERegularization R) -> decltype(Architecture_t::L1Regularization(A))
Evaluate the regularization functional for a given weight matrix.
EActivationFunction
Enum that represents layer activation functions.
ELossFunction
Enum that represents objective functions for the net, i.e.
std::tuple< const std::vector< Event * > &, const DataSetInfo & > TMVAInput_t
create variable transformations
TString fetchValue(const std::map< TString, TString > &keyValueMap, TString key)
MsgLogger & Endl(MsgLogger &ml)
DNN::ERegularization regularization
std::vector< Double_t > dropoutProbabilities
static void output(int code)