Minimizers.h
// @(#)root/tmva $Id$
// Author: Simon Pfreundschuh 21/06/16

/*************************************************************************
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>
#include <vector>

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
//
// Generic Gradient Descent Class
//______________________________________________________________________________
//

/** \class TGradientDescent
*
* Generic implementation of gradient descent minimization.
*
* The TGradientDescent class provides an architecture- and input-data-independent
* implementation of the gradient descent minimization algorithm.
*
* It provides Train(...) and TrainMomentum(...) functions that perform a complete
* training of a neural network. Those are mainly used for testing, since for
* production a more fine-grained control of the training process is desirable.
* This is provided by the Step(...), StepMomentum(...) and StepNesterov(...)
* functions that perform a single minimization step.
*
* The main training characteristics are defined by the provided learning rate,
* the test interval, and the number of convergence steps required for
* convergence. The test interval defines how often the error on the validation
* set is computed, and also the value by which the step counter is increased
* each time the HasConverged() member function is called. A convergence step is
* defined as a step in which the test error is NOT less than 0.999 times the
* current minimal test error that has been reached. If between two subsequent
* calls to HasConverged(Double_t) the test error has not been sufficiently
* reduced, it is assumed that a number of convergence steps equal to the test
* interval has been performed.
*
*/
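//
// A minimal usage sketch (illustrative only: `net` and the data objects are
// hypothetical placeholders; only the TGradientDescent calls below are part
// of this header):
//
//     TGradientDescent<Architecture_t> minimizer(0.0001, // learning rate
//                                                10,     // convergence steps
//                                                5);     // test interval
//     // Train until the test error stagnates for 10 convergence steps.
//     auto minError = minimizer.Train(trainData, nTrainSamples,
//                                     testData, nTestSamples, net);
//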
template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t fBatchSize;        ///< Batch size to use for the training.
   size_t fStepCount;        ///< Number of steps performed in the current training session.
   size_t fConvergenceSteps; ///< Number of training epochs without considerable
                             ///< decrease in the test error for convergence.
   size_t fConvergenceCount; ///< Current number of training epochs without
                             ///< considerable decrease in the test error.
   size_t fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;  ///< Holds the most recently computed training loss.
   Scalar_t fTestError;      ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;   ///< Learning rate \f$\alpha\f$
   Scalar_t fMinimumError;   ///< The minimum test error achieved during the
                             ///< current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount = 0;
   };

   /** Train the given net using the given training input data (events), training
       output data (labels), test input data (events), test output data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on a given batch. Propagates the input
       matrix forward through the net, evaluates the loss and propagates the
       gradients backward through the net. The computed gradients are scaled by
       the learning rate \f$\alpha\f$ and subtracted from the weights and bias
       values of each layer. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Same as Step(...) but also evaluate the loss on the given training data.
    *  Note that this requires synchronization between host and device. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Perform multiple optimization steps simultaneously. Performs the
    *  backprop algorithm on the input batches given in \p batches on
    *  the neural networks given in \p nets. The forward and backward propagation
    *  steps are executed in an interleaving manner in order to exploit potential
    *  batch-level parallelism for asynchronous device calls.
    */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Performs an optimization step like Step(...), but for compatibility with
    *  the previous implementation updates the weights of each layer while
    *  updating only the bias terms of the first layer. Does not evaluate the
    *  loss and therefore does not trigger a possible synchronization with the
    *  device. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
    *  synchronization with the device. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the current internal value of the test error to
    *  determine if the minimization has converged. */
   bool HasConverged();

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the provided test error value to determine if the
    *  minimization has converged. */
   bool HasConverged(Scalar_t testError);

   size_t GetConvergenceCount() const {return fConvergenceCount;}
   size_t GetConvergenceSteps() const {return fConvergenceSteps;}
   Scalar_t GetTrainingError() const {return fTrainingError;}
   Scalar_t GetTestError() const {return fTestError;}
   size_t GetTestInterval() const {return fTestInterval;}

   void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;}
   void SetTestInterval(size_t interval) {fTestInterval = interval;}
   void SetLearningRate(Scalar_t rate) {fLearningRate = rate;}
   void SetBatchSize(size_t batchSize) {fBatchSize = batchSize;}
};

//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      auto b = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);

   } while (!HasConverged());

   return fMinimumError;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t)batchesInEpoch;
   } while (!HasConverged());
   return fMinimumError;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                   const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}
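
// Note: the update above assumes Architecture_t::ScaleAdd(A, B, beta)
// computes A += beta * B, consistent with its use throughout this file.
// Each layer thus performs the plain gradient descent update
//    W     <- W     - alpha * dL/dW,
//    theta <- theta - alpha * dL/dtheta,
// with alpha = fLearningRate, W the weights, and theta the biases.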

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                       const Matrix_t &weights) -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
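
// In the multi-batch Step above, the gradients of all worker nets are
// evaluated at the same master weights; the final loop then applies them to
// the master one worker at a time and copies the result back, so the
// nThreads batches amount to nThreads sequential gradient descent updates
// that share a single interleaved forward/backward schedule.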

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }

   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
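
// The two-step ScaleAdd idiom above implements the classical momentum
// update. Writing v for the master gradient buffers, mu for the momentum,
// and g_j for the gradients of worker net j (assuming ScaleAdd(A, B, beta)
// computes A += beta * B), the calls
//    v += (-fLearningRate / mu) * g_j   (once per worker)
//    v += (mu - 1) * v                  (i.e. v <- mu * v)
// combine to v <- mu * v - fLearningRate * sum_j g_j, and the final loop
// applies W <- W + v to the master before copying it back to the workers.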

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}
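
// StepNesterov differs from StepMomentum in where the next gradients are
// evaluated: each worker net is first moved to the lookahead point, i.e.
// the master weights plus the current velocity buffer (the Copy followed by
// ScaleAdd with factor 1.0), before the velocity and the master weights are
// updated. The next forward/backward pass therefore differentiates the loss
// at the lookahead position, the defining feature of Nesterov momentum.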

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(
    Net_t & net,
    Matrix_t &input,
    const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                                              const Matrix_t &output, const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
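
// Example of the convergence bookkeeping: with fTestInterval = 5 and
// fConvergenceSteps = 20, HasConverged(testError) adds 5 to
// fConvergenceCount whenever the test error fails to drop below 0.999 times
// the best value seen so far; four such stagnating test evaluations in a
// row (4 * 5 >= 20) end the training.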

} // namespace DNN
} // namespace TMVA

#endif