Minimizers.h
// @(#)root/tmva $Id$
// Author: Simon Pfreundschuh 21/06/16

/*************************************************************************
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
//
// Generic Gradient Descent Class
//______________________________________________________________________________
//

/** \class TGradientDescent
*
* Generic implementation of gradient descent minimization.
*
* The TGradientDescent class implements an architecture- and input-data-
* independent version of the gradient descent minimization algorithm.
*
* It provides Train(...) and TrainMomentum(...) functions that perform a
* complete training of a neural network. Those are mainly used for testing,
* since for production a more fine-grained control of the training process
* is desirable. This is provided by the Step(...), StepMomentum(...) and
* StepNesterov(...) functions that perform a single minimization step.
*
* The main training characteristics are defined by the provided learning rate,
* the test interval, and the number of convergence steps required for
* convergence. The test interval defines how often the error on the validation
* set is computed, and it is also the amount by which the step counter is
* increased each time the HasConverged() member function is called. A
* convergence step is defined as a step in which the test error is NOT less
* than 0.999 times the current minimal test error that has been reached. If
* between two subsequent calls to HasConverged(Double_t) the test error has
* not been sufficiently reduced, it is assumed that a number of convergence
* steps equal to the test interval has been performed.
*
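* A minimal usage sketch (illustrative only: the backend type, the data
* containers and the net object are placeholders, not defined in this file):
* \code
* using Architecture_t = TCpu<double>;              // any architecture backend
* TGradientDescent<Architecture_t> minimizer(0.001, // learning rate
*                                            10,    // convergence steps
*                                            5);    // test interval
* minimizer.Train(trainingData, nTrainingSamples,
*                 testData, nTestSamples, net, 1);
* \endcode
*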
*/
template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t fBatchSize;        ///< Batch size to use for the training.
   size_t fStepCount;        ///< Number of steps performed in the current training session.
   size_t fConvergenceSteps; ///< Number of training epochs without considerable
                             ///< decrease in the test error for convergence.
   size_t fConvergenceCount; ///< Current number of training epochs without
                             ///< considerable decrease in the test error.
   size_t fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;  ///< Holds the most recently computed training loss.
   Scalar_t fTestError;      ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;   ///< Learning rate \f$\alpha\f$
   Scalar_t fMinimumError;   ///< The minimum loss achieved on the test set
                             ///< during the current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount = 0;
   };

   /** Train the given net using the given training input data (events), training
       output data (labels), test input data (events), test output data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on a given batch. Propagates the input
       matrix forward through the net, evaluates the loss and propagates the
       gradients backward through the net. The computed gradients are scaled by
       the learning rate \f$\alpha\f$ and subtracted from the weights and bias
       values of each layer.
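       Written out as a formula (restating the rule above), for each layer's
       weights \f$W\f$ and biases \f$\theta\f$:
       \f$W \leftarrow W - \alpha \frac{\partial L}{\partial W}\f$,
       \f$\theta \leftarrow \theta - \alpha \frac{\partial L}{\partial \theta}\f$. */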
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Same as Step(...) but also evaluate the loss on the given training data.
    *  Note that this requires synchronization between host and device. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Perform multiple optimization steps simultaneously. Performs the
    *  backprop algorithm on the input batches given in \p batches on
    *  the neural networks given in \p nets. The forward and backward propagation
    *  steps are executed in an interleaving manner in order to exploit potential
    *  batch-level parallelism for asynchronous device calls.
    */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. Trains the weights of each layer, but
    *  only the bias terms of the first layer, for compatibility with the
    *  previous implementation. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
    *  synchronization with the device. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the current internal value of the test error to
    *  determine if the minimization has converged. */
   bool HasConverged();

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the provided test error value to determine if the
    *  minimization has converged. */
   bool HasConverged(Scalar_t testError);

   size_t GetConvergenceCount() const {return fConvergenceCount;}
   size_t GetConvergenceSteps() const {return fConvergenceSteps;}
   Scalar_t GetTrainingError() const {return fTrainingError;}
   Scalar_t GetTestError() const {return fTestError;}
   size_t GetTestInterval() const {return fTestInterval;}

   void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;}
   void SetTestInterval(size_t interval) {fTestInterval = interval;}
   void SetLearningRate(Scalar_t rate) {fLearningRate = rate;}
   void SetBatchSize(size_t batchSize) {fBatchSize = batchSize;}
};

//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      auto b = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
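      // The test net was created as a clone with batch size nTestSamples, so
      // this single batch spans the full test set.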
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);

   } while (!HasConverged());

   return fMinimumError;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t)batchesInEpoch;
   } while (!HasConverged());
   return fMinimumError;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                   const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                       const Matrix_t &weights) -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::Step(
   Net_t & master,
   std::vector<Net_t> & nets,
   std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

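   // Weight update. Assuming Architecture_t::ScaleAdd(A, B, beta) computes
   // A += beta * B (as its use in the single-batch Step above suggests), the
   // gradients accumulated in each worker net are applied directly to the
   // master weights, which are then copied back to the workers.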
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepMomentum(
   Net_t & master,
   std::vector<Net_t> & nets,
   std::vector<TBatch<Architecture_t>> & batches,
   Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
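      // Assuming ScaleAdd(A, B, beta) performs A += beta * B, the
      // (momentum - 1.0) self-add below multiplies the accumulated gradient
      // by momentum, so together with the -fLearningRate / momentum terms
      // above the master gradients hold v <- momentum * v - fLearningRate * sum_j g_j.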
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }

   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepNesterov(
   Net_t & master,
   std::vector<Net_t> & nets,
   std::vector<TBatch<Architecture_t>> & batches,
   Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

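   // Weight update (assuming ScaleAdd(A, B, beta) performs A += beta * B):
   // each worker net is first moved to the look-ahead point, i.e. the master
   // weights plus the previous accumulated update v held in the master
   // gradients, which is where the next step's gradients will be evaluated.
   // The freshly computed gradients are then folded into
   // v <- momentum * v - fLearningRate * sum_j g_j, which is finally added
   // to the master weights.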
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepReducedWeights(
   Net_t & net,
   Matrix_t &input,
   const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                                              const Matrix_t &output, const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}

//______________________________________________________________________________
template <typename Architecture_t>
bool TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount++;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template <typename Architecture_t>
bool TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
} // namespace DNN
} // namespace TMVA

#endif