#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h" // TDataLoader, TBatch
#include "Functions.h"  // evaluateGradients

#include <limits>
#include <vector>

namespace TMVA {
namespace DNN {
/** \class TGradientDescent
 *
 *  Generic gradient descent minimizer, parametrized by the architecture
 *  (device) type. Training proceeds in epochs; every fTestInterval epochs the
 *  loss on the test set is evaluated, and training stops once the test error
 *  has shown no considerable decrease for fConvergenceSteps consecutive
 *  evaluations.
 */
template <typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of training epochs without considerable
                               ///< decrease in the test error for convergence.
   size_t   fConvergenceCount; ///< Current number of training epochs without
                               ///< considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$.
   Scalar_t fMinimumError;     ///< The minimum loss achieved on the test set
                               ///< during the current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount = 0;
   }

   /** Train the given net using the given training input data (events),
    *  training output data (labels), test input data (events) and test output
    *  data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on the given batch. */
   template <typename Net_t>
   void Step(Net_t & net, Matrix_t & input,
             const Matrix_t & output, const Matrix_t & weights);

   /** Same as Step(...) but also evaluate the loss on the given training data. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t & net, Matrix_t & input,
                     const Matrix_t & output, const Matrix_t & weights);

   /** Perform multiple optimization steps simultaneously: run backpropagation
    *  on the batches in \p batches using the networks in \p nets. Forward and
    *  backward passes are interleaved across nets to expose batch-level
    *  parallelism to asynchronous device calls. */
   template <typename Net_t>
   void Step(Net_t & master,
             std::vector<Net_t> & nets,
             std::vector<TBatch<Architecture_t>> & batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t & master,
                     std::vector<Net_t> & nets,
                     std::vector<TBatch<Architecture_t>> & batches,
                     Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t & master,
                     std::vector<Net_t> & nets,
                     std::vector<TBatch<Architecture_t>> & batches,
                     Scalar_t momentum);

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. Trains the weights of each layer, but
    *  only the bias terms of the first layer. */
   template <typename Net_t>
   void StepReducedWeights(Net_t & net, Matrix_t & input, const Matrix_t & output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t & net, Matrix_t & input,
                                   const Matrix_t & output, const Matrix_t & weights);

   /** Increases the minimization step counter and uses the current internal
    *  value of the test error to determine whether the minimization has
    *  converged. */
   bool HasConverged();

   /** Same as HasConverged() but first sets the test error to the given value
    *  and increases the counter by the test error evaluation period. */
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const { return fConvergenceCount; }
   size_t   GetConvergenceSteps() const { return fConvergenceSteps; }
   Scalar_t GetTrainingError() const    { return fTrainingError; }
   Scalar_t GetTestError() const        { return fTestError; }
   size_t   GetTestInterval() const     { return fTestInterval; }

   void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
   void SetTestInterval(size_t interval)  { fTestInterval = interval; }
   void SetLearningRate(Scalar_t rate)    { fLearningRate = rate; }
   void SetBatchSize(Scalar_t rate)       { fBatchSize = rate; }
};
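// Usage sketch (illustrative, not part of the original header). The
// architecture, net and data objects below are assumptions about client code;
// only the minimizer calls are defined in this file:
//
//    using Architecture_t = TMVA::DNN::TReference<Double_t>; // assumed
//    TGradientDescent<Architecture_t> minimizer(
//        0.001, // learning rate
//        10,    // convergence steps: stop after 10 test evaluations
//               // without considerable improvement
//        5);    // test interval: evaluate the test error every 5 epochs
//    auto minimumTestError = minimizer.Train(trainingData, nTrainingSamples,
//                                            testData, nTestSamples,
//                                            net, /* nThreads = */ 2);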
//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0),
     fTestInterval(0), fTrainingError(0), fTestError(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
                                                   size_t convergenceSteps,
                                                   size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fTrainingError(0), fTestError(0),
     fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare the training and test data loaders. The test net is a clone of
   // the master net whose batch size spans the whole test set.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // One clone of the net per thread, with weights and biases synchronized
   // to the master net.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Perform fTestInterval epochs of training between test error evaluations.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      // Evaluate the test error on the whole test set in a single batch.
      auto b            = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);

   } while (!HasConverged());

   return fMinimumError;
}
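// A note on the bookkeeping above (not from the original source):
// batchesInEpoch is computed by integer division, so a trailing incomplete
// batch is skipped. With nTrainingSamples = 1000 and a batch size of 32,
// batchesInEpoch = 1000 / 32 = 31, i.e. 992 of the 1000 samples are used per
// epoch, and fTestInterval * 31 update calls are made between two test error
// evaluations.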
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare the training and test data loaders. Here the test net uses the
   // same batch size as the master net, so the test error is accumulated
   // batch by batch.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      // Accumulate the test error batch by batch and average.
      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b            = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t)batchesInEpoch;

   } while (!HasConverged());

   return fMinimumError;
}
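// Unlike Train(...), which evaluates the test loss on one batch spanning the
// whole test set, the loop above averages the loss over n = batchesInEpoch
// batches of size net.GetBatchSize() (a summary of the loop, not part of the
// original source):
//
//    \f$ \mathrm{fTestError} = \frac{1}{n} \sum_{i=1}^{n} L(\mathrm{batch}_i) \f$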
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
                                                   Matrix_t & input,
                                                   const Matrix_t & output,
                                                   const Matrix_t & weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}
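// The ScaleAdd(A, B, beta) primitive, as used throughout this file,
// accumulates A += beta * B, so a single Step performs plain stochastic
// gradient descent on every layer (a summary of the loop above, not part of
// the original source):
//
//    \f$ W \leftarrow W - \alpha \nabla_{W} L \f$,
//    \f$ b \leftarrow b - \alpha \nabla_{b} L \f$,
//
// with \f$\alpha\f$ the learning rate fLearningRate.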
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net,
                                                       Matrix_t & input,
                                                       const Matrix_t & output,
                                                       const Matrix_t & weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over nets to expose batch-level
   // parallelism to asynchronous device calls.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply the accumulated gradients to the master net and resynchronize the
   // worker nets with the updated master weights.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer       = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
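// Summary of the multi-batch step above (not part of the original source):
// since all gradients are computed before the update loop, the batch
// gradients of the worker nets are applied to the master weights one after
// another,
//
//    \f$ W \leftarrow W - \alpha \sum_j \nabla_{W} L_j \f$,
//
// and each worker is then resynchronized with the master via Copy, so one
// call advances the master net by nets.size() batches.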
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation; accumulate the per-net gradients into the gradient
   // (velocity) matrices of the master net.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }
   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Apply the velocity update to the master weights and resynchronize the
   // worker nets.
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
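// The scale factors above realize the classical momentum update (a derivation
// from the ScaleAdd calls, not stated in the original source). Writing v for
// the master gradient (velocity) matrices, \f$\mu\f$ for momentum and
// \f$\alpha\f$ for fLearningRate: each worker contributes
// \f$-(\alpha/\mu)\,g_j\f$ to v, and ScaleAdd(v, v, momentum - 1.0) then
// scales v by \f$\mu\f$, giving
//
//    \f$ v \leftarrow \mu v - \alpha \sum_j g_j \f$,
//    \f$ W \leftarrow W + v \f$.
//
// The division by \f$\mu\f$ in the accumulation factor exists only to cancel
// against the final scaling by \f$\mu\f$.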
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      // Reposition the worker nets at the look-ahead point: master weights
      // plus the current velocity.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      // Accumulate the new gradients into the velocity and update the master
      // weights.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}
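// Interpretation (a sketch inferred from the update order, not from the
// original source): in contrast to StepMomentum, the worker nets are moved to
// the look-ahead point W + v before the velocity is refreshed, so the
// gradients entering the next call are evaluated at the look-ahead weights,
//
//    \f$ g_j = \nabla L_j(W + v) \f$,
//    \f$ v \leftarrow \mu v - \alpha \sum_j g_j \f$,
//    \f$ W \leftarrow W + v \f$,
//
// which approximates the Nesterov accelerated gradient scheme named in the
// method's documentation.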
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(Net_t & net,
                                                                 Matrix_t & input,
                                                                 const Matrix_t & output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the bias terms of the first layer are updated.
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t & net,
                                                                     Matrix_t & input,
                                                                     const Matrix_t & output,
                                                                     const Matrix_t & weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   fTrainingError = loss;
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the bias terms of the first layer are updated.
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}
//______________________________________________________________________________
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}
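// Worked example of the convergence criterion (not from the original source):
// an evaluation resets the counter only if the test error improves on the
// best value so far by at least 0.1% (the factor 0.999). With
// fConvergenceSteps = 10 and fTestInterval = 5, the overload above stops
// Train(...) after 10 stagnant evaluations, i.e. 50 epochs, whereas the
// overload below adds fTestInterval per call and would stop after 2 stagnant
// evaluations, measuring fConvergenceSteps in epochs rather than evaluations.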
//______________________________________________________________________________
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError     = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}

} // namespace DNN
} // namespace TMVA

#endif // TMVA_DNN_MINIMIZERS