81 const TString &theOption)
82 :
MethodBase(jobName,
Types::kDNN, methodTitle, theData, theOption), fWeightInitialization(), fOutputFunction(),
83 fLayoutString(), fErrorStrategy(), fTrainingStrategyString(), fWeightInitializationString(),
84 fArchitectureString(), fTrainingSettings(), fResume(false), fSettings()
93 : MethodBase( Types::kDNN, theData, theWeightFile),
94 fWeightInitialization(), fOutputFunction(), fLayoutString(), fErrorStrategy(),
95 fTrainingStrategyString(), fWeightInitializationString(), fArchitectureString(),
96 fTrainingSettings(), fResume(false), fSettings()
98 fWeightInitialization = DNN::EInitialization::kGauss;
99 fOutputFunction = DNN::EOutputFunction::kSigmoid;
131 <<
"MethodDNN is deprecated and it will be removed in future ROOT version. "
132 "Please use MethodDL ( TMVA::kDL)"
153 DeclareOptionRef(fLayoutString=
"SOFTSIGN|(N+100)*2,LINEAR",
155 "Layout of the network.");
157 DeclareOptionRef(fValidationSize =
"20%",
"ValidationSize",
158 "Part of the training data to use for "
159 "validation. Specify as 0.2 or 20% to use a "
160 "fifth of the data set as validation set. "
161 "Specify as 100 to use exactly 100 events. "
164 DeclareOptionRef(fErrorStrategy=
"CROSSENTROPY",
166 "Loss function: Mean squared error (regression)"
167 " or cross entropy (binary classification).");
168 AddPreDefVal(TString(
"CROSSENTROPY"));
169 AddPreDefVal(TString(
"SUMOFSQUARES"));
170 AddPreDefVal(TString(
"MUTUALEXCLUSIVE"));
172 DeclareOptionRef(fWeightInitializationString=
"XAVIER",
173 "WeightInitialization",
174 "Weight initialization strategy");
175 AddPreDefVal(TString(
"XAVIER"));
176 AddPreDefVal(TString(
"XAVIERUNIFORM"));
178 DeclareOptionRef(fArchitectureString =
"CPU",
"Architecture",
"Which architecture to perform the training on.");
179 AddPreDefVal(TString(
"STANDARD"));
180 AddPreDefVal(TString(
"CPU"));
181 AddPreDefVal(TString(
"GPU"));
182 AddPreDefVal(TString(
"OPENCL"));
185 fTrainingStrategyString =
"LearningRate=1e-1,"
188 "ConvergenceSteps=50,"
194 "DropRepetitions=5|LearningRate=1e-4,"
197 "ConvergenceSteps=50,"
202 "DropConfig=0.0+0.5+0.5,"
204 "Multithreading=True",
206 "Defines the training strategies.");
216 LayoutVector_t layout;
217 const TString layerDelimiter(
",");
218 const TString subDelimiter(
"|");
220 const size_t inputSize = GetNvar();
222 TObjArray* layerStrings = layoutString.Tokenize(layerDelimiter);
223 TIter nextLayer (layerStrings);
224 TObjString* layerString = (TObjString*)nextLayer ();
226 for (; layerString !=
nullptr; layerString = (TObjString*) nextLayer()) {
231 TIter nextToken (subStrings);
232 TObjString* token = (TObjString *) nextToken();
234 for (; token !=
nullptr; token = (TObjString *) nextToken()) {
240 if (strActFnc ==
"RELU") {
242 }
else if (strActFnc ==
"TANH") {
244 }
else if (strActFnc ==
"SYMMRELU") {
246 }
else if (strActFnc ==
"SOFTSIGN") {
248 }
else if (strActFnc ==
"SIGMOID") {
250 }
else if (strActFnc ==
"LINEAR") {
252 }
else if (strActFnc ==
"GAUSS") {
259 TString strNumNodes (token->
GetString ());
261 strNumNodes.ReplaceAll (
"N", strN);
262 strNumNodes.ReplaceAll (
"n", strN);
263 TFormula fml (
"tmp",strNumNodes);
264 numNodes = fml.Eval (inputSize);
270 layout.push_back(std::make_pair(numNodes, activationFunction));
283 KeyValueVector_t blockKeyValues;
284 const TString keyValueDelim (
"=");
286 TObjArray* blockStrings = parseString.Tokenize (blockDelim);
287 TIter nextBlock (blockStrings);
288 TObjString* blockString = (TObjString *) nextBlock();
290 for (; blockString !=
nullptr; blockString = (TObjString *) nextBlock())
292 blockKeyValues.push_back (std::map<TString,TString>());
293 std::map<TString,TString>& currentBlock = blockKeyValues.back ();
296 TIter nextToken (subStrings);
297 TObjString* token = (TObjString*)nextToken ();
299 for (; token !=
nullptr; token = (TObjString *)nextToken())
301 TString strKeyValue (token->
GetString ());
302 int delimPos = strKeyValue.First (keyValueDelim.Data ());
306 TString strKey = TString (strKeyValue (0, delimPos));
308 TString strValue = TString (strKeyValue (delimPos+1, strKeyValue.Length ()));
313 currentBlock.insert (std::make_pair (strKey, strValue));
316 return blockKeyValues;
324 std::map<TString, TString>::const_iterator it = keyValueMap.find (key);
325 if (it == keyValueMap.end()) {
349 return value.Atoi ();
355double fetchValue (
const std::map<TString,TString>& keyValueMap,
356 TString key,
double defaultValue)
362 return value.Atof ();
381bool fetchValue (
const std::map<TString,TString>& keyValueMap,
382 TString key,
bool defaultValue)
398std::vector<double>
fetchValue(
const std::map<TString, TString> & keyValueMap,
400 std::vector<double> defaultValue)
403 if (parseString ==
"") {
407 std::vector<double> values;
409 const TString tokenDelim (
"+");
411 TIter nextToken (tokenStrings);
413 for (; tokenString != NULL; tokenString = (
TObjString*)nextToken ()) {
414 std::stringstream sstr;
417 sstr >> currentValue;
418 values.push_back (currentValue);
429 <<
"Will ignore negative events in training!"
434 Log() << kERROR <<
"The STANDARD architecture has been deprecated. "
435 "Please use Architecture=CPU or Architecture=CPU."
436 "See the TMVA Users' Guide for instructions if you "
437 "encounter problems."
439 Log() << kFATAL <<
"The STANDARD architecture has been deprecated. "
440 "Please use Architecture=CPU or Architecture=CPU."
441 "See the TMVA Users' Guide for instructions if you "
442 "encounter problems."
447 Log() << kERROR <<
"The OPENCL architecture has not been implemented yet. "
448 "Please use Architecture=CPU or Architecture=CPU for the "
449 "time being. See the TMVA Users' Guide for instructions "
450 "if you encounter problems."
452 Log() << kFATAL <<
"The OPENCL architecture has not been implemented yet. "
453 "Please use Architecture=CPU or Architecture=CPU for the "
454 "time being. See the TMVA Users' Guide for instructions "
455 "if you encounter problems."
461 Log() << kERROR <<
"CUDA backend not enabled. Please make sure "
462 "you have CUDA installed and it was successfully "
465 Log() << kFATAL <<
"CUDA backend not enabled. Please make sure "
466 "you have CUDA installed and it was successfully "
474 Log() << kERROR <<
"Multi-core CPU backend not enabled. Please make sure "
475 "you have a BLAS implementation and it was successfully "
476 "detected by CMake as well that the imt CMake flag is set."
478 Log() << kFATAL <<
"Multi-core CPU backend not enabled. Please make sure "
479 "you have a BLAS implementation and it was successfully "
480 "detected by CMake as well that the imt CMake flag is set."
491 size_t outputSize = 1;
495 outputSize =
DataInfo().GetNClasses();
498 fNet.SetBatchSize(1);
499 fNet.SetInputWidth(inputSize);
501 auto itLayout = std::begin (
fLayout);
502 auto itLayoutEnd = std::end (
fLayout)-1;
503 for ( ; itLayout != itLayoutEnd; ++itLayout) {
504 fNet.AddLayer((*itLayout).first, (*itLayout).second);
506 fNet.AddLayer(outputSize, EActivationFunction::kIdentity);
516 fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
519 fNet.SetLossFunction(ELossFunction::kCrossEntropy);
524 Log () << kWARNING <<
"For regression only SUMOFSQUARES is a valid "
525 <<
" neural net error function. Setting error function to "
526 <<
" SUMOFSQUARES now." <<
Endl;
528 fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
532 fNet.SetLossFunction(ELossFunction::kMeanSquaredError);
535 fNet.SetLossFunction(ELossFunction::kCrossEntropy);
538 fNet.SetLossFunction(ELossFunction::kSoftmaxCrossEntropy);
569 std::cout <<
"STring has size " << strategyKeyValues.size() << std::endl;
570 for (
auto& block : strategyKeyValues) {
580 std::vector<Double_t>());
615 Int_t nValidationSamples = 0;
620 if (fValidationSize.EndsWith(
"%")) {
625 Double_t valSizeAsDouble = fValidationSize.Atof() / 100.0;
626 nValidationSamples = GetEventCollection(
Types::kTraining).size() * valSizeAsDouble;
628 Log() << kFATAL <<
"Cannot parse number \"" << fValidationSize
629 <<
"\". Expected string like \"20%\" or \"20.0%\"." <<
Endl;
631 }
else if (fValidationSize.IsFloat()) {
632 Double_t valSizeAsDouble = fValidationSize.Atof();
634 if (valSizeAsDouble < 1.0) {
636 nValidationSamples = GetEventCollection(
Types::kTraining).size() * valSizeAsDouble;
639 nValidationSamples = valSizeAsDouble;
642 Log() <<
kFATAL <<
"Cannot parse number \"" << fValidationSize <<
"\". Expected string like \"0.2\" or \"100\"."
648 if (nValidationSamples < 0) {
649 Log() <<
kFATAL <<
"Validation size \"" << fValidationSize <<
"\" is negative." <<
Endl;
652 if (nValidationSamples == 0) {
653 Log() <<
kFATAL <<
"Validation size \"" << fValidationSize <<
"\" is zero." <<
Endl;
656 if (nValidationSamples >= (
Int_t)trainingSetSize) {
657 Log() <<
kFATAL <<
"Validation size \"" << fValidationSize
658 <<
"\" is larger than or equal in size to training set (size=\"" << trainingSetSize <<
"\")." <<
Endl;
661 return nValidationSamples;
669 std::vector<TString> titles = {
"Error on training set",
"Error on test set"};
678 size_t nTestSamples = nValidationSamples;
680 if (nTrainingSamples < settings.batchSize ||
681 nValidationSamples < settings.batchSize ||
682 nTestSamples < settings.batchSize) {
683 Log() << kFATAL <<
"Number of samples in the datasets are train: "
684 << nTrainingSamples <<
" valid: " << nValidationSamples
685 <<
" test: " << nTestSamples <<
". "
686 <<
"One of these is smaller than the batch size of "
687 << settings.batchSize <<
". Please increase the batch"
688 <<
" size to be at least the same size as the smallest"
689 <<
" of these values." <<
Endl;
699 Log() << kFATAL <<
"OpenCL backend not yet supported." <<
Endl;
708 Log() << kINFO <<
"Using Standard Implementation.";
710 std::vector<Pattern> trainPattern;
711 std::vector<Pattern> testPattern;
717 const std::vector<TMVA::Event *> eventCollectionTraining{allData.begin(), allData.begin() + nTrainingSamples};
718 const std::vector<TMVA::Event *> eventCollectionTesting{allData.begin() + nTrainingSamples, allData.end()};
720 for (
auto &event : eventCollectionTraining) {
721 const std::vector<Float_t>& values =
event->GetValues();
723 double outputValue =
event->GetClass () == 0 ? 0.9 : 0.1;
724 trainPattern.push_back(
Pattern (values.begin(),
727 event->GetWeight()));
728 trainPattern.back().addInput(1.0);
730 std::vector<Float_t> oneHot(
DataInfo().GetNClasses(), 0.0);
731 oneHot[
event->GetClass()] = 1.0;
732 trainPattern.push_back(
Pattern (values.begin(), values.end(),
733 oneHot.cbegin(), oneHot.cend(),
734 event->GetWeight()));
735 trainPattern.back().addInput(1.0);
737 const std::vector<Float_t>& targets =
event->GetTargets ();
738 trainPattern.push_back(
Pattern(values.begin(),
742 event->GetWeight ()));
743 trainPattern.back ().addInput (1.0);
747 for (
auto &event : eventCollectionTesting) {
748 const std::vector<Float_t>& values =
event->GetValues();
750 double outputValue =
event->GetClass () == 0 ? 0.9 : 0.1;
751 testPattern.push_back(
Pattern (values.begin(),
754 event->GetWeight()));
755 testPattern.back().addInput(1.0);
757 std::vector<Float_t> oneHot(
DataInfo().GetNClasses(), 0.0);
758 oneHot[
event->GetClass()] = 1.0;
759 testPattern.push_back(
Pattern (values.begin(), values.end(),
760 oneHot.cbegin(), oneHot.cend(),
761 event->GetWeight()));
762 testPattern.back().addInput(1.0);
764 const std::vector<Float_t>& targets =
event->GetTargets ();
765 testPattern.push_back(
Pattern(values.begin(),
769 event->GetWeight ()));
770 testPattern.back ().addInput (1.0);
775 std::vector<double> weights;
779 net.setInputSize(
fNet.GetInputWidth() + 1);
780 net.setOutputSize(
fNet.GetOutputWidth() + 1);
782 for (
size_t i = 0;
i <
fNet.GetDepth();
i++) {
786 case EActivationFunction::kIdentity:
g = EnumFunction::LINEAR;
break;
787 case EActivationFunction::kRelu:
g = EnumFunction::RELU;
break;
788 case EActivationFunction::kSigmoid:
g = EnumFunction::SIGMOID;
break;
789 case EActivationFunction::kTanh:
g = EnumFunction::TANH;
break;
790 case EActivationFunction::kFastTanh:
g = EnumFunction::TANH;
break;
791 case EActivationFunction::kSymmRelu:
g = EnumFunction::SYMMRELU;
break;
792 case EActivationFunction::kSoftSign:
g = EnumFunction::SOFTSIGN;
break;
793 case EActivationFunction::kGauss:
g = EnumFunction::GAUSS;
break;
795 if (
i <
fNet.GetDepth() - 1) {
796 net.addLayer(
Layer(
fNet.GetLayer(
i).GetWidth(),
g));
800 case EOutputFunction::kIdentity:
h = ModeOutputValues::DIRECT;
break;
801 case EOutputFunction::kSigmoid:
h = ModeOutputValues::SIGMOID;
break;
802 case EOutputFunction::kSoftmax:
h = ModeOutputValues::SOFTMAX;
break;
808 switch(
fNet.GetLossFunction()) {
809 case ELossFunction::kMeanSquaredError:
810 net.setErrorFunction(ModeErrorFunction::SUMOFSQUARES);
812 case ELossFunction::kCrossEntropy:
813 net.setErrorFunction(ModeErrorFunction::CROSSENTROPY);
815 case ELossFunction::kSoftmaxCrossEntropy:
816 net.setErrorFunction(ModeErrorFunction::CROSSENTROPY_MUTUALEXCLUSIVE);
821 case EInitialization::kGauss:
822 net.initializeWeights(WeightInitializationStrategy::XAVIER,
823 std::back_inserter(weights));
825 case EInitialization::kUniform:
826 net.initializeWeights(WeightInitializationStrategy::XAVIERUNIFORM,
827 std::back_inserter(weights));
830 net.initializeWeights(WeightInitializationStrategy::XAVIER,
831 std::back_inserter(weights));
839 switch(s.regularization) {
840 case ERegularization::kNone:
r = EnumRegularization::NONE;
break;
841 case ERegularization::kL1:
r = EnumRegularization::L1;
break;
842 case ERegularization::kL2:
r = EnumRegularization::L2;
break;
846 s.testInterval, s.weightDecay,
r,
847 MinimizerType::fSteepest, s.learningRate,
848 s.momentum, 1, s.multithreading);
849 std::shared_ptr<Settings> ptrSettings(settings);
850 ptrSettings->setMonitoring (0);
852 <<
"Training with learning rate = " << ptrSettings->learningRate ()
853 <<
", momentum = " << ptrSettings->momentum ()
854 <<
", repetitions = " << ptrSettings->repetitions ()
857 ptrSettings->setProgressLimits ((idxSetting)*100.0/(
fSettings.size ()),
858 (idxSetting+1)*100.0/(
fSettings.size ()));
860 const std::vector<double>& dropConfig = ptrSettings->dropFractions ();
861 if (!dropConfig.empty ()) {
862 Log () << kINFO <<
"Drop configuration" <<
Endl
863 <<
" drop repetitions = " << ptrSettings->dropRepetitions()
868 for (
auto f : dropConfig) {
869 Log () << kINFO <<
" Layer " << idx <<
" = " <<
f <<
Endl;
875 ptrSettings->momentum(),
876 ptrSettings->repetitions());
877 net.train(weights, trainPattern, testPattern, minimizer, *ptrSettings.get());
882 size_t weightIndex = 0;
883 for (
size_t l = 0;
l <
fNet.GetDepth();
l++) {
884 auto & layerWeights =
fNet.GetLayer(
l).GetWeights();
885 for (
Int_t j = 0; j < layerWeights.GetNcols(); j++) {
886 for (
Int_t i = 0;
i < layerWeights.GetNrows();
i++) {
887 layerWeights(
i,j) = weights[weightIndex];
891 auto & layerBiases =
fNet.GetLayer(
l).GetBiases();
893 for (
Int_t i = 0;
i < layerBiases.GetNrows();
i++) {
894 layerBiases(
i,0) = weights[weightIndex];
898 for (
Int_t i = 0;
i < layerBiases.GetNrows();
i++) {
899 layerBiases(
i,0) = 0.0;
913 Log() << kINFO <<
"Start of neural network training on GPU." <<
Endl <<
Endl;
917 size_t nTestSamples = nValidationSamples;
919 Log() << kDEBUG <<
"Using " << nValidationSamples <<
" validation samples." <<
Endl;
920 Log() << kDEBUG <<
"Using " << nTestSamples <<
" training samples." <<
Endl;
922 size_t trainingPhase = 1;
931 net.SetWeightDecay(settings.weightDecay);
932 net.SetRegularization(settings.regularization);
936 std::vector<Double_t> dropoutVector(settings.dropoutProbabilities);
937 for (
auto & p : dropoutVector) {
940 net.SetDropoutProbabilities(dropoutVector);
942 net.InitializeGradients();
943 auto testNet = net.CreateClone(settings.batchSize);
945 Log() << kINFO <<
"Training phase " << trainingPhase <<
" of "
953 const std::vector<Event *> trainingInputData =
954 std::vector<Event *>(allData.begin(), allData.begin() + nTrainingSamples);
955 const std::vector<Event *> testInputData =
956 std::vector<Event *>(allData.begin() + nTrainingSamples, allData.end());
958 if (trainingInputData.size() != nTrainingSamples) {
959 Log() << kFATAL <<
"Inconsistent training sample size" <<
Endl;
961 if (testInputData.size() != nTestSamples) {
962 Log() << kFATAL <<
"Inconsistent test sample size" <<
Endl;
968 DataLoader_t trainingData(trainingTuple, nTrainingSamples,
969 net.GetBatchSize(), net.GetInputWidth(),
970 net.GetOutputWidth(), nThreads);
971 DataLoader_t testData(testTuple, nTestSamples, testNet.GetBatchSize(),
972 net.GetInputWidth(), net.GetOutputWidth(),
975 settings.convergenceSteps,
976 settings.testInterval);
978 std::vector<TNet<TCuda<>>> nets{};
979 std::vector<TBatch<TCuda<>>> batches{};
980 nets.reserve(nThreads);
981 for (
size_t i = 0;
i < nThreads;
i++) {
983 for (
size_t j = 0; j < net.GetDepth(); j++)
985 auto &masterLayer = net.GetLayer(j);
986 auto &layer = nets.back().GetLayer(j);
988 masterLayer.GetWeights());
990 masterLayer.GetBiases());
994 bool converged =
false;
995 size_t stepCount = 0;
996 size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
998 std::chrono::time_point<std::chrono::system_clock> start, end;
999 start = std::chrono::system_clock::now();
1002 Log() << std::setw(10) <<
"Epoch" <<
" | "
1003 << std::setw(12) <<
"Train Err."
1004 << std::setw(12) <<
"Test Err."
1005 << std::setw(12) <<
"GFLOP/s"
1006 << std::setw(12) <<
"Conv. Steps" <<
Endl;
1007 std::string separator(62,
'-');
1016 trainingData.Shuffle();
1017 for (
size_t i = 0;
i < batchesInEpoch;
i += nThreads) {
1019 for (
size_t j = 0; j < nThreads; j++) {
1020 batches.reserve(nThreads);
1021 batches.push_back(trainingData.GetBatch());
1023 if (settings.momentum > 0.0) {
1024 minimizer.
StepMomentum(net, nets, batches, settings.momentum);
1026 minimizer.
Step(net, nets, batches);
1034 for (
auto batch : testData) {
1035 auto inputMatrix = batch.GetInput();
1036 auto outputMatrix = batch.GetOutput();
1037 testError += testNet.Loss(inputMatrix, outputMatrix);
1039 testError /= (
Double_t) (nTestSamples / settings.batchSize);
1044 end = std::chrono::system_clock::now();
1048 for (
auto batch : trainingData) {
1049 auto inputMatrix = batch.GetInput();
1050 auto outputMatrix = batch.GetOutput();
1051 trainingError += net.Loss(inputMatrix, outputMatrix);
1053 trainingError /= (
Double_t) (nTrainingSamples / settings.batchSize);
1055 fTrainHistory.AddValue(
"trainingError",stepCount,trainingError);
1058 std::chrono::duration<double> elapsed_seconds = end - start;
1059 double seconds = elapsed_seconds.count();
1060 double nFlops = (
double) (settings.testInterval * batchesInEpoch);
1061 nFlops *= net.GetNFlops() * 1
e-9;
1064 start = std::chrono::system_clock::now();
1067 fInteractive->AddPoint(stepCount, trainingError, testError);
1072 Log() << std::setw(10) << stepCount <<
" | "
1073 << std::setw(12) << trainingError
1074 << std::setw(12) << testError
1075 << std::setw(12) << nFlops / seconds
1083 for (
size_t l = 0;
l < net.GetDepth();
l++) {
1091 Log() << kFATAL <<
"CUDA backend not enabled. Please make sure "
1092 "you have CUDA installed and it was successfully "
1093 "detected by CMAKE." <<
Endl;
1103 Log() << kINFO <<
"Start of neural network training on CPU." <<
Endl <<
Endl;
1107 size_t nTestSamples = nValidationSamples;
1109 Log() << kDEBUG <<
"Using " << nValidationSamples <<
" validation samples." <<
Endl;
1110 Log() << kDEBUG <<
"Using " << nTestSamples <<
" training samples." <<
Endl;
1114 size_t trainingPhase = 1;
1121 Log() <<
"Training phase " << trainingPhase <<
" of "
1126 net.SetWeightDecay(settings.weightDecay);
1127 net.SetRegularization(settings.regularization);
1130 std::vector<Double_t> dropoutVector(settings.dropoutProbabilities);
1131 for (
auto & p : dropoutVector) {
1134 net.SetDropoutProbabilities(dropoutVector);
1135 net.InitializeGradients();
1136 auto testNet = net.CreateClone(settings.batchSize);
1142 const std::vector<Event *> trainingInputData =
1143 std::vector<Event *>(allData.begin(), allData.begin() + nTrainingSamples);
1144 const std::vector<Event *> testInputData =
1145 std::vector<Event *>(allData.begin() + nTrainingSamples, allData.end());
1147 if (trainingInputData.size() != nTrainingSamples) {
1148 Log() << kFATAL <<
"Inconsistent training sample size" <<
Endl;
1150 if (testInputData.size() != nTestSamples) {
1151 Log() << kFATAL <<
"Inconsistent test sample size" <<
Endl;
1154 size_t nThreads = 1;
1157 DataLoader_t trainingData(trainingTuple, nTrainingSamples,
1158 net.GetBatchSize(), net.GetInputWidth(),
1159 net.GetOutputWidth(), nThreads);
1160 DataLoader_t testData(testTuple, nTestSamples, testNet.GetBatchSize(),
1161 net.GetInputWidth(), net.GetOutputWidth(),
1164 settings.convergenceSteps,
1165 settings.testInterval);
1167 std::vector<TNet<TCpu<>>> nets{};
1168 std::vector<TBatch<TCpu<>>> batches{};
1169 nets.reserve(nThreads);
1170 for (
size_t i = 0;
i < nThreads;
i++) {
1171 nets.push_back(net);
1172 for (
size_t j = 0; j < net.GetDepth(); j++)
1174 auto &masterLayer = net.GetLayer(j);
1175 auto &layer = nets.back().GetLayer(j);
1177 masterLayer.GetWeights());
1179 masterLayer.GetBiases());
1183 bool converged =
false;
1184 size_t stepCount = 0;
1185 size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
1187 std::chrono::time_point<std::chrono::system_clock> start, end;
1188 start = std::chrono::system_clock::now();
1191 Log() << std::setw(10) <<
"Epoch" <<
" | "
1192 << std::setw(12) <<
"Train Err."
1193 << std::setw(12) <<
"Test Err."
1194 << std::setw(12) <<
"GFLOP/s"
1195 << std::setw(12) <<
"Conv. Steps" <<
Endl;
1196 std::string separator(62,
'-');
1204 trainingData.Shuffle();
1205 for (
size_t i = 0;
i < batchesInEpoch;
i += nThreads) {
1207 for (
size_t j = 0; j < nThreads; j++) {
1208 batches.reserve(nThreads);
1209 batches.push_back(trainingData.GetBatch());
1211 if (settings.momentum > 0.0) {
1212 minimizer.
StepMomentum(net, nets, batches, settings.momentum);
1214 minimizer.
Step(net, nets, batches);
1222 for (
auto batch : testData) {
1223 auto inputMatrix = batch.GetInput();
1224 auto outputMatrix = batch.GetOutput();
1225 auto weightMatrix = batch.GetWeights();
1226 testError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
1228 testError /= (
Double_t) (nTestSamples / settings.batchSize);
1233 end = std::chrono::system_clock::now();
1237 for (
auto batch : trainingData) {
1238 auto inputMatrix = batch.GetInput();
1239 auto outputMatrix = batch.GetOutput();
1240 auto weightMatrix = batch.GetWeights();
1241 trainingError += net.Loss(inputMatrix, outputMatrix, weightMatrix);
1243 trainingError /= (
Double_t) (nTrainingSamples / settings.batchSize);
1246 fTrainHistory.AddValue(
"trainingError",stepCount,trainingError);
1249 fInteractive->AddPoint(stepCount, trainingError, testError);
1255 std::chrono::duration<double> elapsed_seconds = end - start;
1256 double seconds = elapsed_seconds.count();
1257 double nFlops = (
double) (settings.testInterval * batchesInEpoch);
1258 nFlops *= net.GetNFlops() * 1
e-9;
1261 start = std::chrono::system_clock::now();
1264 fInteractive->AddPoint(stepCount, trainingError, testError);
1269 Log() << std::setw(10) << stepCount <<
" | "
1270 << std::setw(12) << trainingError
1271 << std::setw(12) << testError
1272 << std::setw(12) << nFlops / seconds
1282 for (
size_t l = 0;
l < net.GetDepth();
l++) {
1283 auto & layer =
fNet.GetLayer(
l);
1290 Log() << kFATAL <<
"Multi-core CPU backend not enabled. Please make sure "
1291 "you have a BLAS implementation and it was successfully "
1292 "detected by CMake as well that the imt CMake flag is set." <<
Endl;
1300 size_t nVariables =
GetEvent()->GetNVariables();
1304 const std::vector<Float_t>& inputValues =
GetEvent()->GetValues();
1305 for (
size_t i = 0;
i < nVariables;
i++) {
1306 X(0,
i) = inputValues[
i];
1317 size_t nVariables =
GetEvent()->GetNVariables();
1321 const std::vector<Float_t>& inputValues = ev->
GetValues();
1322 for (
size_t i = 0;
i < nVariables;
i++) {
1323 X(0,
i) = inputValues[
i];
1326 size_t nTargets = std::max(1u, ev->
GetNTargets());
1328 std::vector<Float_t>
output(nTargets);
1329 auto net =
fNet.CreateClone(1);
1332 for (
size_t i = 0;
i < nTargets;
i++)
1341 for (
size_t i = 0;
i < nTargets; ++
i) {
1346 for (
size_t i = 0;
i < nTargets; ++
i) {
1355 size_t nVariables =
GetEvent()->GetNVariables();
1362 const std::vector<Float_t>& inputValues =
GetEvent()->GetValues();
1363 for (
size_t i = 0;
i < nVariables;
i++) {
1364 X(0,
i) = inputValues[
i];
1368 for (
size_t i = 0;
i < (size_t) YHat.GetNcols();
i++) {
1369 (*fMulticlassReturnVal)[
i] = YHat(0,
i);
1379 Int_t inputWidth =
fNet.GetInputWidth();
1381 char lossFunction =
static_cast<char>(
fNet.GetLossFunction());
1383 gTools().StringFromInt(inputWidth));
1389 for (
Int_t i = 0;
i < depth;
i++) {
1390 const auto& layer =
fNet.GetLayer(
i);
1392 int activationFunction =
static_cast<int>(layer.GetActivationFunction());
1410 fNet.SetBatchSize(1);
1412 size_t inputWidth, depth;
1415 char lossFunctionChar;
1417 char outputFunctionChar;
1420 fNet.SetInputWidth(inputWidth);
1424 size_t previousWidth = inputWidth;
1426 for (
size_t i = 0;
i < depth;
i++) {
1444 fNet.GetLayer(
i).GetWeights() = weights;
1445 fNet.GetLayer(
i).GetBiases() = biases;
1448 previousWidth =
width;
1488 Log() << col <<
"--- Short description:" << colres <<
Endl;
1490 Log() <<
"The DNN neural network is a feedforward" <<
Endl;
1491 Log() <<
"multilayer perceptron implementation. The DNN has a user-" <<
Endl;
1492 Log() <<
"defined hidden layer architecture, where the number of input (output)" <<
Endl;
1493 Log() <<
"nodes is determined by the input variables (output classes, i.e., " <<
Endl;
1494 Log() <<
"signal and one background, regression or multiclass). " <<
Endl;
1496 Log() << col <<
"--- Performance optimisation:" << colres <<
Endl;
1499 const char* txt =
"The DNN supports various options to improve performance in terms of training speed and \n \
1500reduction of overfitting: \n \
1502 - different training settings can be stacked. Such that the initial training \n\
1503 is done with a large learning rate and a large drop out fraction whilst \n \
1504 in a later stage learning rate and drop out can be reduced. \n \
1507 initial training stage: 0.0 for the first layer, 0.5 for later layers. \n \
1508 later training stage: 0.1 or 0.0 for all layers \n \
1509 final training stage: 0.0] \n \
1510 Drop out is a technique where a at each training cycle a fraction of arbitrary \n \
1511 nodes is disabled. This reduces co-adaptation of weights and thus reduces overfitting. \n \
1512 - L1 and L2 regularization are available \n \
1514 [recommended 10 - 150] \n \
1515 Arbitrary mini-batch sizes can be chosen. \n \
1516 - Multithreading \n \
1517 [recommended: True] \n \
1518 Multithreading can be turned on. The minibatches are distributed to the available \n \
1519 cores. The algorithm is lock-free (\"Hogwild!\"-style) for each cycle. \n \
1523 - example: \"TANH|(N+30)*2,TANH|(N+30),LINEAR\" \n \
1525 . two hidden layers (separated by \",\") \n \
1526 . the activation function is TANH (other options: RELU, SOFTSIGN, LINEAR) \n \
1527 . the activation function for the output layer is LINEAR \n \
1528 . the first hidden layer has (N+30)*2 nodes where N is the number of input neurons \n \
1529 . the second hidden layer has N+30 nodes, where N is the number of input neurons \n \
1530 . the number of nodes in the output layer is determined by the number of output nodes \n \
1531 and can therefore not be chosen freely. \n \
1533 \"ErrorStrategy\": \n \
1535 The error of the neural net is determined by a sum-of-squares error function \n \
1536 For regression, this is the only possible choice. \n \
1538 The error of the neural net is determined by a cross entropy function. The \n \
1539 output values are automatically (internally) transformed into probabilities \n \
1540 using a sigmoid function. \n \
1541 For signal/background classification this is the default choice. \n \
1542 For multiclass using cross entropy more than one or no output classes \n \
1543 can be equally true or false (e.g. Event 0: A and B are true, Event 1: \n \
1544 A and C is true, Event 2: C is true, ...) \n \
1545 - MUTUALEXCLUSIVE \n \
1546 In multiclass settings, exactly one of the output classes can be true (e.g. either A or B or C) \n \
1548 \"WeightInitialization\" \n \
1551 \"Xavier Glorot & Yoshua Bengio\"-style of initializing the weights. The weights are chosen randomly \n \
1552 such that the variance of the values of the nodes is preserved for each layer. \n \
1553 - XAVIERUNIFORM \n \
1554 The same as XAVIER, but with uniformly distributed weights instead of gaussian weights \n \
1556 Random values scaled by the layer size \n \
1558 \"TrainingStrategy\" \n \
1559 - example: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5|LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFraction=0.0,DropRepetitions=5\" \n \
1560 - explanation: two stacked training settings separated by \"|\" \n \
1561 . first training setting: \"LearningRate=1e-1,Momentum=0.3,ConvergenceSteps=50,BatchSize=30,TestRepetitions=7,WeightDecay=0.0,Renormalize=L2,DropConfig=0.0,DropRepetitions=5\" \n \
1562 . second training setting : \"LearningRate=1e-4,Momentum=0.3,ConvergenceSteps=50,BatchSize=20,TestRepetitions=7,WeightDecay=0.001,Renormalize=L2,DropFractions=0.0,DropRepetitions=5\" \n \
1563 . LearningRate : \n \
1564 - recommended for classification: 0.1 initially, 1e-4 later \n \
1565 - recommended for regression: 1e-4 and less \n \
1567 preserve a fraction of the momentum for the next training batch [fraction = 0.0 - 1.0] \n \
1568 . Repetitions : \n \
1569 train \"Repetitions\" repetitions with the same minibatch before switching to the next one \n \
1570 . ConvergenceSteps : \n \
1571 Assume that convergence is reached after \"ConvergenceSteps\" cycles where no improvement \n \
1572 of the error on the test samples has been found. (Mind that only at each \"TestRepetitions\" \n \
1573 cycle the test samples are evaluated and thus the convergence is checked) \n \
1575 Size of the mini-batches. \n \
1576 . TestRepetitions \n \
1577 Perform testing the neural net on the test samples each \"TestRepetitions\" cycle \n \
1579 If \"Renormalize\" is set to L1 or L2, \"WeightDecay\" provides the renormalization factor \n \
1581 NONE, L1 (|w|) or L2 (w^2) \n \
1583 Drop a fraction of arbitrary nodes of each of the layers according to the values given \n \
1584 in the DropConfig. \n \
1585 [example: DropConfig=0.0+0.5+0.3 \n \
1586 meaning: drop no nodes in layer 0 (input layer), half of the nodes in layer 1 and 30% of the nodes \n \
1588 recommended: leave all the nodes turned on for the input layer (layer 0) \n \
1589 turn off half of the nodes in later layers for the initial training; leave all nodes \n \
1590 turned on (0.0) in later training stages] \n \
1591 . DropRepetitions \n \
1592 Each \"DropRepetitions\" cycle the configuration of which nodes are dropped is changed \n \
1593 [recommended : 1] \n \
1594 . Multithreading \n \
1595 turn on multithreading [recommended: True] \n \
#define REGISTER_METHOD(CLASS)
for example
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t r
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void value
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
Bool_t WriteOptionsReference() const
Layer defines the layout of a layer.
Settings for the training of the neural net.
Steepest Gradient Descent algorithm (SGD)
static void Copy(Matrix_t &B, const Matrix_t &A)
static void Copy(Matrix_t &B, const Matrix_t &A)
bool HasConverged()
Increases the minimization step counter by the test error evaluation period and uses the current internal value of the test error to determine whether the minimization has converged.
void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights)
Perform a single optimization step on a given batch.
size_t GetTestInterval() const
void StepMomentum(Net_t &master, std::vector< Net_t > &nets, std::vector< TBatch< Architecture_t > > &batches, Scalar_t momentum)
Same as the Step(...) method for multiple batches but uses momentum.
size_t GetConvergenceCount() const
size_t GetConvergenceSteps() const
Generic neural network class.
Class that contains all the data information.
void SetTarget(UInt_t itgt, Float_t value)
set the target value (dimension itgt) to value
UInt_t GetNTargets() const
accessor to the number of targets
std::vector< Float_t > & GetValues()
Float_t GetTarget(UInt_t itgt) const
Virtual base class for all MVA methods.
const char * GetName() const
Bool_t IgnoreEventsWithNegWeightsInTraining() const
const std::vector< TMVA::Event * > & GetEventCollection(Types::ETreeType type)
Returns the event collection for the requested tree type.
UInt_t GetNTargets() const
std::vector< Float_t > * fRegressionReturnVal
std::vector< Float_t > * fMulticlassReturnVal
const Event * GetEvent() const
DataSetInfo & DataInfo() const
UInt_t GetNVariables() const
Types::EAnalysisType fAnalysisType
TransformationHandler & GetTransformationHandler(Bool_t takeReroutedIfAvailable=true)
const TString & GetInputLabel(Int_t i) const
TrainingHistory fTrainHistory
IPythonInteractive * fInteractive
temporary dataset used when evaluating on different data (used by MethodCategory::GetMvaValues)
Deep Neural Network Implementation.
virtual Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
virtual const std::vector< Float_t > & GetMulticlassValues()
UInt_t GetNumValidationSamples()
void ReadWeightsFromXML(void *wghtnode)
typename Architecture_t::Matrix_t Matrix_t
TString fTrainingStrategyString
KeyValueVector_t fSettings
void ReadWeightsFromStream(std::istream &i)
LayoutVector_t ParseLayoutString(TString layerSpec)
static void WriteMatrixXML(void *parent, const char *name, const TMatrixT< Double_t > &X)
void MakeClassSpecific(std::ostream &, const TString &) const
MethodDNN(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption)
virtual Double_t GetMvaValue(Double_t *err=nullptr, Double_t *errUpper=nullptr)
TString fWeightInitializationString
std::vector< std::map< TString, TString > > KeyValueVector_t
DNN::EInitialization fWeightInitialization
std::vector< TTrainingSettings > fTrainingSettings
TString fArchitectureString
const Ranking * CreateRanking()
KeyValueVector_t ParseKeyValueString(TString parseString, TString blockDelim, TString tokenDelim)
DNN::EOutputFunction fOutputFunction
void AddWeightsXMLTo(void *parent) const
void GetHelpMessage() const
static void ReadMatrixXML(void *xml, const char *name, TMatrixT< Double_t > &X)
virtual const std::vector< Float_t > & GetRegressionValues()
Ranking for variables in method (implementation)
Singleton class for Global types used by TMVA.
Collectable string class.
const TString & GetString() const
Int_t Atoi() const
Return integer value of string.
TSubString Strip(EStripType s=kTrailing, char c=' ') const
Return a substring of self stripped at beginning and/or end.
Bool_t IsFloat() const
Returns kTRUE if string contains a floating point or integer number.
const char * Data() const
void ToUpper()
Change string to upper case.
TObjArray * Tokenize(const TString &delim) const
This function is used to isolate sequential tokens in a TString.
Bool_t BeginsWith(const char *s, ECaseCompare cmp=kExact) const
static TString Itoa(Int_t value, Int_t base)
Converts an Int_t to a TString with respect to the base specified (2-36).
XMLNodePointer_t NewChild(XMLNodePointer_t parent, XMLNsPointer_t ns, const char *name, const char *content=nullptr)
create new child element for parent node
XMLNodePointer_t GetChild(XMLNodePointer_t xmlnode, Bool_t realnode=kTRUE)
returns first child of xmlnode
XMLAttrPointer_t NewAttr(XMLNodePointer_t xmlnode, XMLNsPointer_t, const char *name, const char *value)
creates new attribute for xmlnode, namespaces are not supported for attributes
EOutputFunction
Enum that represents output functions.
auto regularization(const typename Architecture_t::Matrix_t &A, ERegularization R) -> decltype(Architecture_t::L1Regularization(A))
Evaluate the regularization functional for a given weight matrix.
EActivationFunction
Enum that represents layer activation functions.
ELossFunction
Enum that represents objective (loss) functions for the net.
std::tuple< const std::vector< Event * > &, const DataSetInfo & > TMVAInput_t
create variable transformations
TString fetchValue(const std::map< TString, TString > &keyValueMap, TString key)
MsgLogger & Endl(MsgLogger &ml)
Double_t Log(Double_t x)
Returns the natural logarithm of x.
DNN::ERegularization regularization
std::vector< Double_t > dropoutProbabilities