Minimizers.h
// @(#)root/tmva $Id$
// Author: Simon Pfreundschuh 21/06/16

/*************************************************************************
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>
#include <vector>

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
//
// Generic Gradient Descent Class
//______________________________________________________________________________
//
/** \class TGradientDescent
 *
 * Generic implementation of gradient descent minimization.
 *
 * The TGradientDescent class implements a gradient descent minimizer that is
 * independent of the architecture and of the input data.
 *
 * It provides Train(...) and TrainMomentum(...) functions that perform a
 * complete training of a neural network. Those are mainly used for testing,
 * since for production a more fine-grained control of the training process
 * is desirable. That control is provided by the Step(...), StepMomentum(...)
 * and StepNesterov(...) functions, each of which performs a single
 * minimization step.
 *
 * The main training characteristics are defined by the provided learning
 * rate, the test interval, and the number of convergence steps required for
 * convergence. The test interval defines how often the error on the
 * validation set is computed, and is also the value by which the
 * convergence-step counter is increased each time the HasConverged() member
 * function is called. A convergence step is defined as a step in which the
 * test error is NOT less than 0.999 times the current minimal test error
 * that has been reached. If between two subsequent calls to
 * HasConverged(Double_t) the test error has not been sufficiently reduced,
 * it is assumed that a number of convergence steps equal to the test
 * interval has been performed.
 */
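// A minimal usage sketch (illustrative, not part of this header): train a
// network with plain gradient descent on the reference architecture. The
// net and the data objects (trainingData, testData, net) are assumed to be
// set up as in the TMVA DNN tests.
//
//     using Architecture_t = TReference<double>;
//     using Scalar_t       = Architecture_t::Scalar_t;
//
//     TGradientDescent<Architecture_t> minimizer(
//         0.001, // learning rate
//         10,    // convergence steps without sufficient improvement
//         5);    // test interval: epochs between test-error evaluations
//
//     // Runs until HasConverged() reports convergence and returns the
//     // minimal test error reached during training.
//     Scalar_t minError = minimizer.Train(trainingData, nTrainingSamples,
//                                         testData, nTestSamples, net);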
template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of training epochs without considerable
                               ///< decrease in the test error for convergence.
   size_t   fConvergenceCount; ///< Current number of training epochs without
                               ///< considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$.
   Scalar_t fMinimumError;     ///< The minimum test loss achieved during the
                               ///< current training session.
public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount = 0;
   };
87
88 /** Train the given net using the given training input data (events), training
89 output data (labels), test input data (events), test output data (labels). */
90 template <typename Data_t, typename Net_t>
91 Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
92 const Data_t & TestDataIn, size_t nTestSamples,
93 Net_t & net, size_t nThreads = 1);
94
95 /** Same as Train(...) but uses the given momentum.*/
96 template <typename Data_t, typename Net_t>
97 Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
98 const Data_t & TestDataIn, size_t nTestSamples,
99 Net_t & net, Scalar_t momentum, size_t nThreads = 1);
100
101 /** Perform a single optimization step on a given batch. Propagates the input
102 matrix forward through the net, evaluates the loss and propagates the gradients
103 backward through the net. The computed gradients are scaled by the learning
104 rate \f$\alpha\f$ and subtracted from the weights and bias values of each
105 layer. */
106 template <typename Net_t>
107 void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);
108
109 /** Same as Step(...) but also evaluate the loss on the given training data.
110 * Note that this requires synchronization between host and device. */
111 template <typename Net_t>
112 Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);
113
114 /** Perform multiple optimization steps simultaneously. Performs the
115 * backprop algorithm on the input batches given in \p batches on
116 * the neural networks given in \p nets. The forward and backward propagation
117 * steps are executed in an interleaving manner in order to exploit potential
118 * batch-level parallelism for asynchronous device calls.
119 */
120 template <typename Net_t>
121 void Step(Net_t &master,
122 std::vector<Net_t> &nets,
123 std::vector<TBatch<Architecture_t>> &batches);
124
125 /** Same as the Step(...) method for multiple batches but uses momentum. */
126 template <typename Net_t>
127 void StepMomentum(Net_t &master,
128 std::vector<Net_t> &nets,
129 std::vector<TBatch<Architecture_t>> &batches,
130 Scalar_t momentum);
131 template <typename Net_t>
132
133 /** Same as the Step(...) method for multiple batches but uses Nesterov
134 * momentum. */
135 void StepNesterov(Net_t &master,
136 std::vector<Net_t> &nets,
137 std::vector<TBatch<Architecture_t>> &batches,
138 Scalar_t momentum);
139
140 /** Does not evaluate the loss and therefore not trigger a possible synchronization
141 * with the device. Trains the weights of each layer, but only the bias terms of
142 * the first layer for compatibility with the previous implementation. */
143 template <typename Net_t>
144 void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);
145
146 /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
147 * synchronization with the device. */
148 template <typename Net_t>
149 Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);
150 /** Increases the minimization step counter by the test error evaluation
151 * period and uses the current internal value of the test error to
152 * determine if the minimization has converged. */
153 bool HasConverged();
154 /** Increases the minimization step counter by the test error evaluation
155 * period and uses the provided test error value to determine if the
156 * minimization has converged. */
157 bool HasConverged(Scalar_t testError);
158
159 size_t GetConvergenceCount() const {return fConvergenceCount;}
160 size_t GetConvergenceSteps() const {return fConvergenceSteps;}
163 size_t GetTestInterval() const {return fTestInterval;}
164
165 void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;}
166 void SetTestInterval(size_t interval) {fTestInterval = interval;}
168 void SetBatchSize(Scalar_t rate) {fBatchSize = rate;}
169};

//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0),
     fTrainingError(0), fTestError(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fTrainingError(0), fTestError(0),
     fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      auto b = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);

   } while (!HasConverged());

   return fMinimumError;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t)batchesInEpoch;
   } while (!HasConverged());
   return fMinimumError;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::Step(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                            const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}
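// For reference (an assumption about the backend contract, stated here for
// readability): each architecture backend is expected to provide
//
//     ScaleAdd(A, B, beta);   // A += beta * B, elementwise
//
// so the loop above implements the plain gradient descent update
// W <- W - alpha * dL/dW for the weights and biases of every layer, with
// alpha = fLearningRate and the gradients filled in by Backward().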

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto TGradientDescent<Architecture_t>::StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                const Matrix_t &weights) -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::Step(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
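// Net effect of the update loop above, under the ScaleAdd semantics noted
// earlier: the master net accumulates every worker's gradient,
//
//     W_master <- W_master - alpha * sum_j dL_j/dW ,
//
// and each worker is then re-synchronized with a copy of the updated master
// weights, so the nThreads batches act as one combined batch per step.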

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepMomentum(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }

   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
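// The ScaleAdd sequence above realizes the classic momentum update. Writing
// v for the master gradient buffers (which persist between calls and act as
// the velocity), g_j for the per-worker gradients, mu for the momentum and
// alpha for the learning rate:
//
//     v <- mu * v - alpha * sum_j g_j   (accumulate at -alpha/mu, then
//                                        scale by mu via ScaleAdd(v, v, mu-1))
//     W <- W + v
//
// This is why the gradient buffers are zero-initialized via
// net.InitializeGradients() in TrainMomentum before the first call.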

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepNesterov(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}
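// The difference to StepMomentum above: each worker net is repositioned at
// the look-ahead point W + v (copy of the master weights plus the master
// gradient buffers, which hold the velocity) before its next gradient is
// evaluated. This gives the Nesterov accelerated form of the momentum update,
//
//     v <- mu * v - alpha * sum_j dL_j/dW evaluated at (W + v)
//     W <- W + v .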

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepReducedWeights(
    Net_t & net,
    Matrix_t &input,
    const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                                              const Matrix_t &output, const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
bool TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template<typename Architecture_t>
bool TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
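// Worked example of the convergence accounting above: with fTestInterval = 5
// and fConvergenceSteps = 15, each call to HasConverged(testError) that fails
// to improve on 0.999 * fMinimumError adds 5 to fConvergenceCount, so
// minimization stops after three consecutive test evaluations (15 epochs)
// without sufficient improvement; a single improving evaluation resets the
// count to zero.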
} // namespace DNN
} // namespace TMVA

#endif