#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>
#include <vector>

namespace TMVA {
namespace DNN {
/** \class TGradientDescent
 *
 *  Gradient descent minimizer for the TMVA deep neural network implementation.
 *  Performs (optionally multi-threaded) stochastic gradient descent with plain
 *  or Nesterov momentum and monitors convergence through the loss evaluated on
 *  a held-out test set.
 */
template <typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of epochs without considerable decrease in the test error for convergence.
   size_t   fConvergenceCount; ///< Current number of epochs without considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$.
   Scalar_t fMinimumError;     ///< The minimum loss achieved on the test set during the current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval);

   /// Reset minimizer object to default state.
   void Reset()
   {
      fMinimumError     = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount        = 0;
   }

   /// Train the given net using the given training input data (events), training
   /// output data (labels), test input data (events) and test output data (labels).
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /// Same as Train(...) but uses the given momentum.
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /// Perform a single optimization step on a given batch.
   template <typename Net_t>
   void Step(Net_t & net, Matrix_t & input, const Matrix_t & output, const Matrix_t & weights);

   /// Same as Step(...) but also evaluate the loss on the given training data.
   template <typename Net_t>
   Scalar_t StepLoss(Net_t & net, Matrix_t & input, const Matrix_t & output, const Matrix_t & weights);

   /// Perform multiple optimization steps simultaneously, one per (net, batch) pair.
   template <typename Net_t>
   void Step(Net_t & master, std::vector<Net_t> & nets,
             std::vector<TBatch<Architecture_t>> & batches);

   /// Same as the Step(...) method for multiple batches but uses momentum.
   template <typename Net_t>
   void StepMomentum(Net_t & master, std::vector<Net_t> & nets,
                     std::vector<TBatch<Architecture_t>> & batches, Scalar_t momentum);

   /// Same as the Step(...) method for multiple batches but uses Nesterov momentum.
   template <typename Net_t>
   void StepNesterov(Net_t & master, std::vector<Net_t> & nets,
                     std::vector<TBatch<Architecture_t>> & batches, Scalar_t momentum);

   /// Does not evaluate the loss and therefore does not trigger a possible
   /// synchronization with the device.
   template <typename Net_t>
   void StepReducedWeights(Net_t & net, Matrix_t & input, const Matrix_t & output);

   /// Similar to StepReducedWeights(...) but also evaluates the loss.
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t & net, Matrix_t & input,
                                   const Matrix_t & output, const Matrix_t & weights);

   /// Increases the minimization step counter by the test error evaluation period
   /// and uses the current internal value of the test error to check for convergence.
   bool HasConverged();
   /// Same as above, but using the given test error.
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const { return fConvergenceCount; }
   size_t   GetConvergenceSteps() const { return fConvergenceSteps; }
   Scalar_t GetTrainingError() const    { return fTrainingError; }
   Scalar_t GetTestError() const        { return fTestError; }
   size_t   GetTestInterval() const     { return fTestInterval; }

   void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
   void SetTestInterval(size_t interval)  { fTestInterval = interval; }
   void SetLearningRate(Scalar_t rate)    { fLearningRate = rate; }
   void SetBatchSize(Scalar_t rate)       { fBatchSize = rate; }
};
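// A minimal usage sketch. The backend alias, the net and the data objects
// below are illustrative assumptions (any Architecture_t / Net_t / Data_t
// combination understood by TDataLoader works the same way):
//
// \code
// TGradientDescent<Architecture_t> minimizer(0.001, // learning rate
//                                            10,    // convergence steps
//                                            5);    // test interval (epochs)
// // net, trainData and testData are assumed to exist; four worker threads.
// auto minTestError = minimizer.Train(trainData, nTrainingSamples,
//                                     testData, nTestSamples, net, 4);
// \endcode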
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0),
     fTrainingError(0), fTestError(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
                                                   size_t convergenceSteps,
                                                   size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fTrainingError(0), fTestError(0),
     fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training and test data in batch-wise loaders. The test net is a
   // clone of the master net with batch size equal to the full test set.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // One worker net per thread, with weights synchronized to the master net.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),  masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs between convergence checks.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      // Evaluate the test error on the single full-size test batch.
      auto b = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);

   } while (!HasConverged());

   return fMinimumError;
}
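// The training scheme above is synchronous data-parallel SGD: each of the
// nThreads worker nets processes its own batch, Step(net, nets, batches)
// folds all worker gradients into the master net, and the updated master
// weights are copied back so the workers never drift apart. Convergence is
// checked every fTestInterval epochs on a single test batch of size
// nTestSamples.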
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training and test data in batch-wise loaders. Here the test net
   // uses the training batch size, so the test error is averaged over batches.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // The master gradient matrices hold the momentum term; clear them.
   net.InitializeGradients();

   // One worker net per thread, with weights synchronized to the master net.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),  masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs between convergence checks.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      // Evaluate the test error as the average loss over test batches.
      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t) batchesInEpoch;

   } while (!HasConverged());

   return fMinimumError;
}
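// TrainMomentum differs from Train in two ways: a momentum of exactly 0.0
// falls back to the plain Step update, and the test error is computed as an
// average over batch-sized chunks of the test set instead of on one single
// full-size test batch.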
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
                                                   Matrix_t &input,
                                                   const Matrix_t &output,
                                                   const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),  layer.GetBiasGradients(),   -fLearningRate);
   }
}
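// The loop above is the plain SGD update applied layer by layer. With
// ScaleAdd(A, B, beta) computing A += beta * B, it amounts to
//
//    W  <-  W - alpha * dL/dW,      theta  <-  theta - alpha * dL/dtheta
//
// for weights W, biases theta and learning rate alpha = fLearningRate.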
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net,
                                                       Matrix_t &input,
                                                       const Matrix_t &output,
                                                       const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),  layer.GetBiasGradients(),   -fLearningRate);
   }
   return loss;
}
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & master,
                                                   std::vector<Net_t> & nets,
                                                   std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward pass, layer by layer and in lockstep over all worker nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward pass.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply each worker's gradients to the master net and copy the updated
   // master weights back to the workers.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer       = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(), layer.GetBiasGradients(), -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }
}
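// Running the forward and backward passes layer by layer across all worker
// nets keeps the workers in lockstep, which lets asynchronous backends
// overlap the corresponding kernel launches. The sequential ScaleAdd/Copy
// pairs at the end make the step equivalent to nets.size() consecutive SGD
// updates of the master net, one per batch.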
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(Net_t & master,
                                                           std::vector<Net_t> & nets,
                                                           std::vector<TBatch<Architecture_t>> & batches,
                                                           Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward pass, layer by layer and in lockstep over all worker nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward pass: accumulate the worker gradients into the master gradient
   // matrices, which hold the momentum term.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }
   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Apply the accumulated momentum term to the master weights and copy them
   // back to the worker nets.
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(), masterLayer.GetWeightGradients(), 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),  masterLayer.GetBiasGradients(),  1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),  masterLayer.GetBiases());
      }
   }
}
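// The master gradient matrices double as the momentum velocity v. Per layer,
// the sequence "v += -(alpha/momentum) * g_j for each worker j, then
// v += (momentum - 1) * v, then W += v" collapses to the familiar update
//
//    v  <-  momentum * v - alpha * sum_j g_j
//    W  <-  W + v
//
// with alpha = fLearningRate. Note that momentum must be non-zero here;
// TrainMomentum guards this by dispatching to Step() when momentum == 0.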
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(Net_t & master,
                                                           std::vector<Net_t> & nets,
                                                           std::vector<TBatch<Architecture_t>> & batches,
                                                           Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward pass, layer by layer and in lockstep over all worker nets. The
   // workers hold the look-ahead weights, so the gradients are evaluated there.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward pass.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      // Park the workers at the look-ahead position: master weights plus the
      // current momentum term.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),  masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(), masterLayer.GetWeightGradients(), 1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),  masterLayer.GetBiasGradients(),  1.0);
      }
      // Update the momentum term with the look-ahead gradients and apply it
      // to the master weights.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(), masterLayer.GetWeightGradients(), 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),  masterLayer.GetBiasGradients(),  1.0);
   }
}
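// Nesterov variant: the worker nets are kept at the look-ahead position
// W + v rather than at W itself, so the gradients g_j entering the velocity
// update are evaluated ahead of the current master weights:
//
//    v  <-  momentum * v - alpha * sum_j g_j(W + v)
//    W  <-  W + v
//
// which corresponds to the usual Nesterov accelerated-gradient form of the
// momentum update above.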
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(Net_t & net,
                                                                 Matrix_t &input,
                                                                 const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(), layer.GetBiasGradients(), -fLearningRate);
      }
   }
}
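// Evaluating the loss reduces a matrix to a single host-side scalar, which
// on asynchronous device backends (e.g. a GPU architecture) forces the host
// to wait for the device. StepReducedWeights skips that reduction so the
// step can stay fully asynchronous; StepReducedWeightsLoss below trades this
// back for an up-to-date training error.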
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t & net,
                                                                     Matrix_t &input,
                                                                     const Matrix_t &output,
                                                                     const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   fTrainingError = loss;
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(), layer.GetBiasGradients(), -fLearningRate);
      }
   }
   return loss;
}
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount++;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
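// The factor 0.999 demands a relative improvement of at least 0.1% in the
// best test error to reset the counter. This overload advances
// fConvergenceCount by one per check, while the overload below advances it
// by fTestInterval, i.e. it counts training epochs as the member
// documentation describes.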
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}

} // namespace DNN
} // namespace TMVA

#endif