// @(#)root/tmva $Id$
// Author: Simon Pfreundschuh 21/06/16

/*************************************************************************
 * All rights reserved.                                                 *
 *                                                                      *
 * For the licensing terms see $ROOTSYS/LICENSE.                        *
 * For the list of contributors see $ROOTSYS/README/CREDITS.            *
 *************************************************************************/

#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>

namespace TMVA {
namespace DNN {

//______________________________________________________________________________
//
// Generic Gradient Descent Class
//______________________________________________________________________________
//

/** \class TGradientDescent
 *
 * Generic implementation of gradient descent minimization.
 *
 * The TGradientDescent class provides an architecture- and input-data-independent
 * implementation of the gradient descent minimization algorithm.
 *
 * It provides Train(...) and TrainMomentum(...) functions that perform a complete
 * training of a neural network. Those are mainly used for testing, since for
 * production a more fine-grained control of the training process is desirable.
 * That control is provided by the Step(...), StepMomentum(...) and StepNesterov(...)
 * functions, which perform a single minimization step.
 *
 * The main training characteristics are defined by the provided learning rate,
 * the test interval, and the number of convergence steps required for
 * convergence. The test interval defines how often the error on the validation
 * set is computed; it is also the value by which the step counter is increased
 * each time the HasConverged() member function is called. A convergence step is
 * defined as a step in which the test error is NOT less than 0.999 times the
 * current minimal test error that has been reached. If between two subsequent
 * calls to HasConverged(Double_t) the test error has not been sufficiently
 * reduced, it is assumed that a number of convergence steps equal to the test
 * interval has been performed.
 *
 */
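/* A minimal usage sketch (illustrative only): the backend `Architecture_t`,
 * the network `net` and the two data sets are placeholders for concrete types
 * and objects; only the TGradientDescent interface declared below is taken as
 * given.
 *
 * \code
 * TGradientDescent<Architecture_t> minimizer(0.01, // learning rate
 *                                            10,   // convergence steps
 *                                            5);   // test interval
 * auto minTestError = minimizer.Train(trainingData, nTrainingSamples,
 *                                     testData, nTestSamples, net);
 * \endcode
 */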
template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t fBatchSize;        ///< Batch size to use for the training.
   size_t fStepCount;        ///< Number of steps performed in the current training session.
   size_t fConvergenceSteps; ///< Number of training epochs without considerable
                             ///< decrease in the test error for convergence.
   size_t fConvergenceCount; ///< Current number of training epochs without
                             ///< considerable decrease in the test error.
   size_t fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;  ///< Holds the most recently computed training loss.
   Scalar_t fTestError;      ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;   ///< Learning rate \f$\alpha\f$.
   Scalar_t fMinimumError;   ///< The minimum loss achieved on the test set
                             ///< during the current training session.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount = 0;
   }

   /** Train the given net using the given training input data (events), training
       output data (labels), test input data (events), test output data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on a given batch. Propagates the input
       matrix forward through the net, evaluates the loss and propagates the gradients
       backward through the net. The computed gradients are scaled by the learning
       rate \f$\alpha\f$ and subtracted from the weights and bias values of each
       layer. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Same as Step(...) but also evaluate the loss on the given training data.
    *  Note that this requires synchronization between host and device. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Perform multiple optimization steps simultaneously. Performs the
    *  backprop algorithm on the input batches given in \p batches on
    *  the neural networks given in \p nets. The forward and backward propagation
    *  steps are executed in an interleaving manner in order to exploit potential
    *  batch-level parallelism for asynchronous device calls.
    */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov
    *  momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. Trains the weights of each layer, but
    *  only the bias terms of the first layer, for compatibility with the
    *  previous implementation. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. May trigger
    *  synchronization with the device. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the current internal value of the test error to
    *  determine if the minimization has converged. */
   bool HasConverged();

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the provided test error value to determine if the
    *  minimization has converged. */
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const {return fConvergenceCount;}
   size_t   GetConvergenceSteps() const {return fConvergenceSteps;}
   Scalar_t GetTrainingError() const    {return fTrainingError;}
   Scalar_t GetTestError() const        {return fTestError;}
   size_t   GetTestInterval() const     {return fTestInterval;}

   void SetConvergenceSteps(size_t steps) {fConvergenceSteps = steps;}
   void SetTestInterval(size_t interval)  {fTestInterval = interval;}
   void SetLearningRate(Scalar_t rate)    {fLearningRate = rate;}
   void SetBatchSize(size_t batchSize)    {fBatchSize = batchSize;}
};

//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
    -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   // The test net operates on a single batch containing the full test set.
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // One worker copy of the net per thread; copy the master weights into each.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs before evaluating the test error.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      // Evaluate the loss on the test set.
      auto b = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);

   } while (!HasConverged());

   return fMinimumError;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
    -> Scalar_t
{
   Reset();

   // Prepare training data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();
   // One worker copy of the net per thread; copy the master weights into each.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs before evaluating the test error.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      // Average the test loss over as many test batches as there are batches
      // in a training epoch.
      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t)batchesInEpoch;
   } while (!HasConverged());
   return fMinimumError;
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                   const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}

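/* For the fine-grained control mentioned in the class documentation, a training
 * loop can also be assembled directly from Step(...) and HasConverged(...). A
 * sketch, assuming the caller has prepared `minimizer`, `net`, a `trainLoader`
 * and the test matrices (all placeholders), mirroring what Train(...) does:
 *
 * \code
 * do {
 *    for (size_t i = 0; i < batchesInEpoch; i++) {
 *       auto batch = trainLoader.GetBatch();
 *       auto input = batch.GetInput();
 *       minimizer.Step(net, input, batch.GetOutput(), batch.GetWeights());
 *    }
 * } while (!minimizer.HasConverged(net.Loss(testInput, testOutput, testWeights)));
 * \endcode
 */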
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                       const Matrix_t &weights) -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::Step(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply each worker's gradients to the master net and copy the updated
   // weights back into the workers.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}

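/* The batch-parallel Step(...) above is driven as in Train(...): one worker
 * copy of the net per thread, one batch per worker, one interleaved step. A
 * sketch, assuming the worker nets were cloned and synchronized with the
 * master exactly as in Train(...):
 *
 * \code
 * batches.clear();
 * for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
 * minimizer.Step(net, nets, batches); // net acts as the master net
 * \endcode
 */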
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepMomentum(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }
   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }
   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         // Accumulate the worker gradients into the master's gradient buffer.
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      // Scale the accumulated buffer by the momentum factor.
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               - fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               - fLearningRate / momentum);
   }

   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Apply the accumulated update to the master net and synchronize the workers.
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}

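/* The ScaleAdd sequence in StepMomentum(...) implements the usual momentum
 * update. Writing \f$v\f$ for the master net's persistent gradient buffers and
 * \f$g_j\f$ for the gradients of worker \f$j\f$: adding
 * \f$-\frac{\alpha}{\mu} g_j\f$ for each worker and then adding
 * \f$(\mu - 1)\f$ times the buffer to itself yields
 *
 * \f[ v \leftarrow \mu v - \alpha \sum_j g_j, \qquad W \leftarrow W + v, \f]
 *
 * i.e. gradient descent with learning rate \f$\alpha\f$ and momentum \f$\mu\f$.
 */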
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepNesterov(
    Net_t & master,
    std::vector<Net_t> & nets,
    std::vector<TBatch<Architecture_t>> & batches,
    Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }

   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradients
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      // Move the workers to the look-ahead point: master weights plus the
      // current (not yet updated) gradient buffer.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  1.0);
      }
      // Accumulate the worker gradients into the master's gradient buffer
      // and scale it by the momentum factor.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  - fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  - fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      // Apply the updated buffer to the master weights.
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}

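/* StepNesterov(...) differs from StepMomentum(...) in where the gradients are
 * evaluated: the worker nets are synchronized to the look-ahead point
 * \f$ W + v \f$ (master weights plus the current update vector) before the
 * gradients for the next step are computed, giving the Nesterov-style update
 *
 * \f[ v \leftarrow \mu v - \alpha \sum_j g_j(W + v), \qquad W \leftarrow W + v. \f]
 */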
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void TGradientDescent<Architecture_t>::StepReducedWeights(
    Net_t & net,
    Matrix_t &input,
    const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the first layer's bias terms are updated, for compatibility
      // with the previous implementation.
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}

//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                                              const Matrix_t &output, const Matrix_t &weights)
    -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   fTrainingError = loss;
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the first layer's bias terms are updated, for compatibility
      // with the previous implementation.
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }

   return (fConvergenceCount >= fConvergenceSteps);
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
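
/* Example of the convergence bookkeeping above: with fTestInterval = 5 and
 * fConvergenceSteps = 10, every test evaluation that fails to improve on the
 * current minimum by at least 0.1% adds 5 to fConvergenceCount, so
 * HasConverged(Scalar_t) reports convergence after two consecutive
 * evaluations without sufficient progress. */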
} // namespace DNN
} // namespace TMVA

#endif