SGD.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a ROOT-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TSGD                                                                  *
 * Web    : http://tmva.sourceforge.net                                          *
 *                                                                                *
 * Description:                                                                   *
 *      Stochastic Batch Gradient Descent Optimizer Class                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (http://tmva.sourceforge.net/LICENSE)                                          *
 **********************************************************************************/

#ifndef TMVA_DNN_SGD
#define TMVA_DNN_SGD

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA {
namespace DNN {

/** \class TSGD
 * Stochastic Batch Gradient Descent Optimizer class
 *
 * This class represents the Stochastic Batch Gradient Descent Optimizer with options for applying momentum
 * and Nesterov momentum. With momentum enabled, each step first updates a velocity term
 * \f$ V_t = \mathrm{momentum} \cdot V_{t-1} + \nabla_\theta J \f$
 * and then the parameters
 * \f$ \theta_t = \theta_{t-1} - \mathrm{learningRate} \cdot V_t \f$.
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TSGD : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fMomentum; ///< The momentum used for training.
   std::vector<std::vector<Matrix_t>>
      fPastWeightGradients; ///< The accumulated (momentum-weighted) past weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastBiasGradients; ///< The accumulated (momentum-weighted) past bias gradients associated with the deep net.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum);

   /*! Destructor. */
   ~TSGD() = default;

   /*! Getters */
   Scalar_t GetMomentum() const { return fMomentum; }

   std::vector<std::vector<Matrix_t>> &GetPastWeightGradients() { return fPastWeightGradients; }
   std::vector<Matrix_t> &GetPastWeightGradientsAt(size_t i) { return fPastWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastBiasGradients() { return fPastBiasGradients; }
   std::vector<Matrix_t> &GetPastBiasGradientsAt(size_t i) { return fPastBiasGradients[i]; }
};
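
// A minimal usage sketch (illustrative only, not part of this header). The concrete
// backend (TCpu<Double_t>), the elided TDeepNet constructor arguments, and the call
// into the inherited VOptimizer step are assumptions about the surrounding TMVA
// training code rather than something this file defines:
//
//    TDeepNet<TCpu<Double_t>> net(/* batch size, input/output dimensions, ... */);
//    // ... add layers, run forward/backward passes to fill the gradients ...
//    TSGD<TCpu<Double_t>> optimizer(/*learningRate=*/0.01, net, /*momentum=*/0.9);
//    optimizer.Step(); // applies UpdateWeights/UpdateBiases to every layer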

//
//
// The Stochastic Gradient Descent Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   size_t layersNSlices = layers.size();
   fPastWeightGradients.resize(layersNSlices);
   fPastBiasGradients.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      size_t weightsNSlices = (layers[i]->GetWeights()).size();

      for (size_t j = 0; j < weightsNSlices; j++) {
         Matrix_t &currentWeights = layers[i]->GetWeightsAt(j);
         size_t weightsNRows = currentWeights.GetNrows();
         size_t weightsNCols = currentWeights.GetNcols();

         fPastWeightGradients[i].emplace_back(weightsNRows, weightsNCols);
         initialize<Architecture_t>(fPastWeightGradients[i][j], EInitialization::kZero);
      }

      size_t biasesNSlices = (layers[i]->GetBiases()).size();

      for (size_t j = 0; j < biasesNSlices; j++) {
         Matrix_t &currentBiases = layers[i]->GetBiasesAt(j);
         size_t biasesNRows = currentBiases.GetNrows();
         size_t biasesNCols = currentBiases.GetNcols();

         fPastBiasGradients[i].emplace_back(biasesNRows, biasesNCols);
         initialize<Architecture_t>(fPastBiasGradients[i][j], EInitialization::kZero);
      }
   }
}
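
// The storage created above mirrors the net's parameter layout one-to-one, e.g. for a
// hypothetical net with two dense layers (shapes here are illustrative assumptions):
//
//    fPastWeightGradients[0][0] : nOutputs_0 x nInputs_0 zero matrix (layer 0 weights)
//    fPastWeightGradients[1][0] : nOutputs_1 x nInputs_1 zero matrix (layer 1 weights)
//    fPastBiasGradients[i][j]   : same shape as layers[i]->GetBiasesAt(j)
//
// so the element-wise ConstMult/ScaleAdd calls in the update methods below need no reshaping.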

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                             const std::vector<Matrix_t> &weightGradients) -> void
{
   // accumulating the current layer past weight gradients to include the current weight gradients.
   // Vt = momentum * Vt-1 + currentGradients

   std::vector<Matrix_t> &currentLayerPastWeightGradients = this->GetPastWeightGradientsAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastWeightGradients.size(); k++) {
      Architecture_t::ConstMult(currentLayerPastWeightGradients[k], this->GetMomentum());
      Architecture_t::ScaleAdd(currentLayerPastWeightGradients[k], weightGradients[k], 1.0);
   }

   // updating the weights.
   // theta = theta - learningRate * Vt
   for (size_t i = 0; i < weights.size(); i++) {
      Architecture_t::ScaleAdd(weights[i], currentLayerPastWeightGradients[i], -this->GetLearningRate());
   }
}
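
// Illustrative scalar trace of the update above (values are assumptions): with
// momentum = 0.9, learningRate = 0.1, V_0 = 0 and gradients g_1 = 1.0, g_2 = 1.0:
//
//    V_1 = 0.9 * 0.0 + 1.0 = 1.0,   theta -= 0.1 * 1.0
//    V_2 = 0.9 * 1.0 + 1.0 = 1.9,   theta -= 0.1 * 1.9
//
// i.e. V_t is the exponentially weighted sum  sum_{i>=0} momentum^i * g_{t-i},
// which is exactly what ConstMult (decay) followed by ScaleAdd (accumulate) computes in place.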

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                            const std::vector<Matrix_t> &biasGradients) -> void
{
   // accumulating the current layer past bias gradients to include the current bias gradients.
   // Vt = momentum * Vt-1 + currentGradients

   std::vector<Matrix_t> &currentLayerPastBiasGradients = this->GetPastBiasGradientsAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastBiasGradients.size(); k++) {
      Architecture_t::ConstMult(currentLayerPastBiasGradients[k], this->GetMomentum());
      Architecture_t::ScaleAdd(currentLayerPastBiasGradients[k], biasGradients[k], 1.0);
   }

   // updating the biases
   // theta = theta - learningRate * Vt
   for (size_t i = 0; i < biases.size(); i++) {
      Architecture_t::ScaleAdd(biases[i], currentLayerPastBiasGradients[i], -this->GetLearningRate());
   }
}

} // namespace DNN
} // namespace TMVA

#endif