Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RMSProp.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Ravi Kiran S
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TRMSProp *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * RMSProp Optimizer Class *
12 * *
13 * Authors (alphabetical): *
14 * Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
15 * *
16 * Copyright (c) 2005-2018: *
17 * CERN, Switzerland *
18 * U. of Victoria, Canada *
19 * MPI-K Heidelberg, Germany *
20 * U. of Bonn, Germany *
21 * *
22 * Redistribution and use in source and binary forms, with or without *
23 * modification, are permitted according to the terms listed in LICENSE *
24 * (http://tmva.sourceforge.net/LICENSE) *
25 **********************************************************************************/
26
27#ifndef TMVA_DNN_RMSPROP
28#define TMVA_DNN_RMSPROP
29
30#include "TMatrix.h"
31#include "TMVA/DNN/Optimizer.h"
32#include "TMVA/DNN/Functions.h"
33#include <vector>
34
35namespace TMVA {
36namespace DNN {
37
38/** \class TRMSProp
39 * RMSProp Optimizer class
40 *
41 * This class represents the RMSProp Optimizer with options for applying momentum.
42 */
43template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
44 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
45class TRMSProp : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
46public:
47 using Matrix_t = typename Architecture_t::Matrix_t;
48 using Scalar_t = typename Architecture_t::Scalar_t;
49
50protected:
51 Scalar_t fMomentum; ///< The momentum used for training.
52 Scalar_t fRho; ///< The Rho constant used by the optimizer.
53 Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.
54 std::vector<std::vector<Matrix_t>>
55 fPastSquaredWeightGradients; ///< The sum of the square of the past weight gradients associated with the deep net.
56 std::vector<std::vector<Matrix_t>>
57 fPastSquaredBiasGradients; ///< The sum of the square of the past bias gradients associated with the deep net.
58
59 std::vector<std::vector<Matrix_t>> fWeightUpdates; ///< The accumulation of the past Weights for performing updates.
60 std::vector<std::vector<Matrix_t>> fBiasUpdates; ///< The accumulation of the past Biases for performing updates.
61 std::vector<std::vector<Matrix_t>>
62 fWorkWeightTensor1; ///< working tensor used to keep a temporary copy of weights or weight gradients
63 std::vector<std::vector<Matrix_t>>
64 fWorkBiasTensor1; ///< working tensor used to keep a temporary copy of bias or bias gradients
65 std::vector<std::vector<Matrix_t>>
66 fWorkWeightTensor2; ///< working tensor used to keep a temporary copy of weights or weight gradients
67 std::vector<std::vector<Matrix_t>>
68 fWorkBiasTensor2; ///< working tensor used to keep a temporary copy of bias or bias gradients
69
70 /*! Update the weights, given the current weight gradients. */
71 void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);
72
73 /*! Update the biases, given the current bias gradients. */
74 void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);
75
76public:
77 /*! Constructor. */
78 TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t momentum = 0.0, Scalar_t rho = 0.9,
79 Scalar_t epsilon = 1e-7);
80
81 /*! Destructor. */
82 ~TRMSProp() = default;
83
84 /*! Getters */
85 Scalar_t GetMomentum() const { return fMomentum; }
86 Scalar_t GetRho() const { return fRho; }
87 Scalar_t GetEpsilon() const { return fEpsilon; }
88
89 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
90 std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }
91
92 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
93 std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
94
95 std::vector<std::vector<Matrix_t>> &GetWeightUpdates() { return fWeightUpdates; }
96 std::vector<Matrix_t> &GetWeightUpdatesAt(size_t i) { return fWeightUpdates[i]; }
97
98 std::vector<std::vector<Matrix_t>> &GetBiasUpdates() { return fBiasUpdates; }
99 std::vector<Matrix_t> &GetBiasUpdatesAt(size_t i) { return fBiasUpdates[i]; }
100};
101
102//
103//
104// The RMSProp Optimizer Class - Implementation
105//_________________________________________________________________________________________________
106template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
109 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum), fRho(rho),
110 fEpsilon(epsilon)
111{
112 std::vector<Layer_t *> &layers = deepNet.GetLayers();
113 const size_t layersNSlices = layers.size();
114 fPastSquaredWeightGradients.resize(layersNSlices);
115 fPastSquaredBiasGradients.resize(layersNSlices);
116 fWeightUpdates.resize(layersNSlices);
117 fBiasUpdates.resize(layersNSlices);
118 fWorkWeightTensor1.resize(layersNSlices);
119 fWorkBiasTensor1.resize(layersNSlices);
120 fWorkWeightTensor2.resize(layersNSlices);
121 fWorkBiasTensor2.resize(layersNSlices);
122
123 for (size_t i = 0; i < layersNSlices; i++) {
124 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
125
126 Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
127 Architecture_t::CreateWeightTensors(fWeightUpdates[i], layers[i]->GetWeights());
128
129 for (size_t j = 0; j < weightsNSlices; j++) {
130 initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
131 initialize<Architecture_t>(fWeightUpdates[i][j], EInitialization::kZero);
132 }
133
134 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
135
136 Architecture_t::CreateWeightTensors( fPastSquaredBiasGradients[i], layers[i]->GetBiases());
137 Architecture_t::CreateWeightTensors( fBiasUpdates[i], layers[i]->GetBiases());
138
139 for (size_t j = 0; j < biasesNSlices; j++) {
140 initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
141 initialize<Architecture_t>(fBiasUpdates[i][j], EInitialization::kZero);
142 }
143 Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
144 Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
145 Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
146 Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
147 }
148}
149
150//_________________________________________________________________________________________________
151template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
152auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
153 const std::vector<Matrix_t> &weightGradients) -> void
154{
155 std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
156 std::vector<Matrix_t> &currentLayerWeightUpdates = this->GetWeightUpdatesAt(layerIndex);
157
158 for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {
159
160 // accumulation matrix used for temporary storing of the current accumulation
161 auto &accumulation = fWorkWeightTensor1[layerIndex][k];
162 auto &currentSquaredWeightGradients = fWorkWeightTensor2[layerIndex][k];
163
164 // Vt = rho * Vt-1 + (1-rho) * currentSquaredWeightGradients
165 initialize<Architecture_t>(accumulation, EInitialization::kZero);
166
167 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
168 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
169 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
170 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
171 Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);
172
173 // Wt = momentum * Wt-1 + (learningRate * currentWeightGradients) / (sqrt(Vt + epsilon))
174 initialize<Architecture_t>(accumulation, EInitialization::kZero);
175 auto &dummy = fWorkWeightTensor2[layerIndex][k]; // reuse working tensor
176 Architecture_t::Copy(dummy, currentLayerPastSquaredWeightGradients[k]);
177 Architecture_t::ConstAdd(dummy, this->GetEpsilon());
178 Architecture_t::SqrtElementWise(dummy);
179 Architecture_t::ReciprocalElementWise(dummy);
180 Architecture_t::Hadamard(dummy, weightGradients[k]);
181
182 Architecture_t::ScaleAdd(accumulation, currentLayerWeightUpdates[k], this->GetMomentum());
183 Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
184 Architecture_t::Copy(currentLayerWeightUpdates[k], accumulation);
185 }
186
187 // updating the weights.
188 // theta = theta - Wt
189 for (size_t i = 0; i < weights.size(); i++) {
190 Architecture_t::ScaleAdd(weights[i], currentLayerWeightUpdates[i], -1.0);
191 }
192}
193
194//_________________________________________________________________________________________________
195template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
196auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
197 const std::vector<Matrix_t> &biasGradients) -> void
198{
199 std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
200 std::vector<Matrix_t> &currentLayerBiasUpdates = this->GetBiasUpdatesAt(layerIndex);
201
202 for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {
203
204 // accumulation matrix used for temporary storing of the current accumulation
205 auto &accumulation = fWorkBiasTensor1[layerIndex][k];
206 auto &currentSquaredBiasGradients = fWorkBiasTensor2[layerIndex][k];
207
208 // Vt = rho * Vt-1 + (1-rho) * currentSquaredBiasGradients
209 initialize<Architecture_t>(accumulation, EInitialization::kZero);
210 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
211 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
212 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
213 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
214 Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);
215
216 // Wt = momentum * Wt-1 + (learningRate * currentBiasGradients) / (sqrt(Vt + epsilon))
217 initialize<Architecture_t>(accumulation, EInitialization::kZero);
218 auto &dummy = fWorkBiasTensor2[layerIndex][k]; // reuse working tensor
219
220 Architecture_t::Copy(dummy, currentLayerPastSquaredBiasGradients[k]);
221 Architecture_t::ConstAdd(dummy, this->GetEpsilon());
222 Architecture_t::SqrtElementWise(dummy);
223 Architecture_t::ReciprocalElementWise(dummy);
224 Architecture_t::Hadamard(dummy, biasGradients[k]);
225
226 Architecture_t::ScaleAdd(accumulation, currentLayerBiasUpdates[k], this->GetMomentum());
227 Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
228 Architecture_t::Copy(currentLayerBiasUpdates[k], accumulation);
229 }
230
231 // updating the Biases.
232 // theta = theta - Wt
233 for (size_t i = 0; i < biases.size(); i++) {
234 Architecture_t::ScaleAdd(biases[i], currentLayerBiasUpdates[i], -1.0);
235 }
236}
237
238} // namespace DNN
239} // namespace TMVA
240
241#endif
#define e(i)
Definition RSha256.hxx:103
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
RMSProp Optimizer class.
Definition RMSProp.h:45
Scalar_t fRho
The Rho constant used by the optimizer.
Definition RMSProp.h:52
typename Architecture_t::Scalar_t Scalar_t
Definition RMSProp.h:48
void UpdateWeights(size_t layerIndex, std::vector< Matrix_t > &weights, const std::vector< Matrix_t > &weightGradients)
Update the weights, given the current weight gradients.
Definition RMSProp.h:152
~TRMSProp()=default
Destructor.
std::vector< Matrix_t > & GetPastSquaredWeightGradientsAt(size_t i)
Definition RMSProp.h:90
std::vector< std::vector< Matrix_t > > fWorkBiasTensor2
working tensor used to keep a temporary copy of bias or bias gradients
Definition RMSProp.h:68
Scalar_t GetRho() const
Definition RMSProp.h:86
std::vector< std::vector< Matrix_t > > fPastSquaredWeightGradients
The exponentially decaying average of the squared past weight gradients, kept per layer of the deep net.
Definition RMSProp.h:55
std::vector< std::vector< Matrix_t > > & GetBiasUpdates()
Definition RMSProp.h:98
std::vector< std::vector< Matrix_t > > fWorkWeightTensor2
working tensor used to keep a temporary copy of weights or weight gradients
Definition RMSProp.h:66
Scalar_t GetEpsilon() const
Definition RMSProp.h:87
std::vector< std::vector< Matrix_t > > fWorkBiasTensor1
working tensor used to keep a temporary copy of bias or bias gradients
Definition RMSProp.h:64
Scalar_t fMomentum
The momentum used for training.
Definition RMSProp.h:51
std::vector< std::vector< Matrix_t > > & GetPastSquaredBiasGradients()
Definition RMSProp.h:92
Scalar_t fEpsilon
The Smoothing term used to avoid division by zero.
Definition RMSProp.h:53
TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate=0.001, Scalar_t momentum=0.0, Scalar_t rho=0.9, Scalar_t epsilon=1e-7)
Constructor.
Definition RMSProp.h:107
std::vector< std::vector< Matrix_t > > fPastSquaredBiasGradients
The exponentially decaying average of the squared past bias gradients, kept per layer of the deep net.
Definition RMSProp.h:57
std::vector< std::vector< Matrix_t > > fWeightUpdates
The momentum-accumulated update steps that are subtracted from the weights.
Definition RMSProp.h:59
typename Architecture_t::Matrix_t Matrix_t
Definition RMSProp.h:47
void UpdateBiases(size_t layerIndex, std::vector< Matrix_t > &biases, const std::vector< Matrix_t > &biasGradients)
Update the biases, given the current bias gradients.
Definition RMSProp.h:196
std::vector< Matrix_t > & GetBiasUpdatesAt(size_t i)
Definition RMSProp.h:99
std::vector< std::vector< Matrix_t > > & GetWeightUpdates()
Definition RMSProp.h:95
std::vector< std::vector< Matrix_t > > fWorkWeightTensor1
working tensor used to keep a temporary copy of weights or weight gradients
Definition RMSProp.h:62
std::vector< Matrix_t > & GetWeightUpdatesAt(size_t i)
Definition RMSProp.h:96
std::vector< std::vector< Matrix_t > > & GetPastSquaredWeightGradients()
Definition RMSProp.h:89
std::vector< std::vector< Matrix_t > > fBiasUpdates
The momentum-accumulated update steps that are subtracted from the biases.
Definition RMSProp.h:60
Scalar_t GetMomentum() const
Getters.
Definition RMSProp.h:85
std::vector< Matrix_t > & GetPastSquaredBiasGradientsAt(size_t i)
Definition RMSProp.h:93
Generic Optimizer class.
Definition Optimizer.h:45
std::vector< Layer_t * > & GetLayers()
Definition Optimizer.h:82
create variable transformations
double epsilon
Definition triangle.c:618