Minimizers.h
// @(#)root/tmva $Id$
// Author: Simon Pfreundschuh 21/06/16

/*************************************************************************
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>
#include <vector>

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
//
// Generic Gradient Descent Class
//______________________________________________________________________________
//
/** \class TGradientDescent
 *
 * Generic implementation of gradient descent minimization.
 *
 * The TGradientDescent class provides an architecture- and input-data-
 * independent implementation of the gradient descent minimization algorithm.
 *
 * It provides the Train(...) and TrainMomentum(...) functions that perform a
 * complete training of a neural network. These are mainly used for testing,
 * since production use requires finer-grained control of the training
 * process, which is provided by the Step(...), StepMomentum(...) and
 * StepNesterov(...) functions that each perform a single minimization step.
 *
 * The main training characteristics are defined by the provided learning
 * rate, the test interval, and the number of convergence steps required for
 * convergence. The test interval defines how often the error on the
 * validation set is computed, and it is also the value by which the step
 * counter is increased each time the HasConverged() member function is
 * called. A convergence step is defined as a step in which the test error is
 * NOT less than 0.999 times the current minimal test error that has been
 * reached. If between two subsequent calls to HasConverged(Double_t) the test
 * error has not been sufficiently reduced, it is assumed that a number of
 * convergence steps equal to the test interval has been performed.
 *
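 * A minimal usage sketch (illustrative only; the concrete Architecture_t, the
 * net, testNet and the data loaders are assumed to be set up elsewhere):
 *
 * \code
 * using Scalar_t = typename Architecture_t::Scalar_t;
 *
 * TGradientDescent<Architecture_t> minimizer(0.001, // learning rate
 *                                            10,    // convergence steps
 *                                            5);    // test interval
 * Scalar_t testError = 0.0;
 * do {
 *    // One gradient descent step per training batch.
 *    for (size_t i = 0; i < minimizer.GetTestInterval(); i++) {
 *       auto batch = trainLoader.GetBatch();
 *       minimizer.Step(net, batch.GetInput(), batch.GetOutput(), batch.GetWeights());
 *    }
 *    // Re-evaluate the loss on the test set and update the convergence state.
 *    auto testBatch = *testLoader.begin();
 *    testError = testNet.Loss(testBatch.GetInput(), testBatch.GetOutput(),
 *                             testBatch.GetWeights());
 * } while (!minimizer.HasConverged(testError));
 * \endcode
 *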
 */
template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of training epochs without considerable
                               ///< decrease in the test error for convergence.
   size_t   fConvergenceCount; ///< Current number of training epochs without
                               ///< considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$
   Scalar_t fMinimumError;     ///< The minimum loss achieved on the training set
                               ///< during the current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount = 0;
   };

   /** Train the given net using the given training input data (events), training
       output data (labels), test input data (events), test output data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on a given batch. Propagates the input
       matrix forward through the net, evaluates the loss and propagates the gradients
       backward through the net. The computed gradients are scaled by the learning
       rate \f$\alpha\f$ and subtracted from the weights and bias values of each
       layer. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Same as Step(...) but also evaluate the loss on the given training data.
    *  Note that this requires synchronization between host and device. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Perform multiple optimization steps simultaneously. Performs the
    *  backprop algorithm on the input batches given in \p batches on
    *  the neural networks given in \p nets. The forward and backward propagation
    *  steps are executed in an interleaving manner in order to exploit potential
    *  batch-level parallelism for asynchronous device calls.
    */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. Trains the weights of each layer, but
    *  only the bias terms of the first layer, for compatibility with the
    *  previous implementation. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
    *  synchronization with the device. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the current internal value of the test error to
    *  determine if the minimization has converged. */
   bool HasConverged();

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the provided test error value to determine if the
    *  minimization has converged. */
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const {return fConvergenceCount;}
   size_t   GetConvergenceSteps() const {return fConvergenceSteps;}
   Scalar_t GetTrainingError() const    {return fTrainingError;}
   Scalar_t GetTestError() const        {return fTestError;}
   size_t   GetTestInterval() const     {return fTestInterval;}

   void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;}
   void SetTestInterval(size_t interval)  {fTestInterval = interval;}
   void SetLearningRate(Scalar_t rate)    {fLearningRate = rate;}
   void SetBatchSize(size_t batchSize)    {fBatchSize = batchSize;}
};

//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0),
     fTrainingError(0), fTestError(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fTrainingError(0), fTestError(0),
     fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

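   // Between two evaluations of the test error, run fTestInterval training
   // epochs, each consisting of batchesInEpoch gradient descent steps.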
   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      auto b = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);

   } while (!HasConverged());

   return fMinimumError;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

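   // The master net's gradient matrices double as the momentum (velocity)
   // buffers in StepMomentum(...), so they are (re)initialized here.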
   net.InitializeGradients();
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

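      // Evaluate the test error as the average loss over one epoch's worth
      // of test batches.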
      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t)batchesInEpoch;
   } while (!HasConverged());
   return fMinimumError;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                   const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);
   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                       const Matrix_t &weights) -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
inline void TGradientDescent<Architecture_t>::Step(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

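   // Apply the gradients accumulated in each worker net to the master net and
   // copy the updated master weights back, keeping all worker nets in sync.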
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
inline void TGradientDescent<Architecture_t>::StepMomentum(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
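         // Together with the (momentum - 1.0) rescaling below, these ScaleAdd
         // calls realize the momentum update v <- momentum * v - learningRate * g
         // on the master gradient buffers v: first v += -(learningRate/momentum) * g_j
         // for each worker gradient g_j, then v += (momentum - 1.0) * v.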
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }

   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
inline void TGradientDescent<Architecture_t>::StepNesterov(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

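   // Nesterov-style update: each worker net is placed at a look-ahead position
   // (the master weights shifted by the momentum term) so that the next
   // forward/backward pass evaluates gradients ahead of the master weights,
   // which are themselves advanced by the updated momentum term below.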
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
inline void TGradientDescent<Architecture_t>::StepReducedWeights(
    Net_t & net,
    Matrix_t &input,
    const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                                                     const Matrix_t &output, const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
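   // Progress means the test error dropped below 0.999 times the best value
   // seen so far (a relative improvement of at least 0.1%); otherwise the
   // convergence counter advances.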
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
} // namespace DNN
} // namespace TMVA

#endif