Logo ROOT   6.18/05
Reference Guide
Adadelta.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Ravi Kiran S
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TAdadelta *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Adadelta Optimizer Class *
12 * *
13 * Authors (alphabetical): *
14 * Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
15 * *
16 * Copyright (c) 2005-2018: *
17 * CERN, Switzerland *
18 * U. of Victoria, Canada *
19 * MPI-K Heidelberg, Germany *
20 * U. of Bonn, Germany *
21 * *
22 * Redistribution and use in source and binary forms, with or without *
23 * modification, are permitted according to the terms listed in LICENSE *
24 * (http://tmva.sourceforge.net/LICENSE) *
25 **********************************************************************************/
26
27#ifndef TMVA_DNN_ADADELTA
28#define TMVA_DNN_ADADELTA
29
30#include "TMatrix.h"
31#include "TMVA/DNN/Optimizer.h"
32#include "TMVA/DNN/Functions.h"
33
34namespace TMVA {
35namespace DNN {
36
37/** \class TAdadelta
38 * Adadelta Optimizer class
39 *
40 * This class represents the Adadelta Optimizer.
41 */
42template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
43 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
44class TAdadelta : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
45public:
46 using Matrix_t = typename Architecture_t::Matrix_t;
47 using Scalar_t = typename Architecture_t::Scalar_t;
48
49protected:
50 Scalar_t fRho; ///< The Rho constant used by the optimizer.
51 Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.
52 std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< The accumulation of the square of the past
53 /// weight gradients associated with the deep net.
54 std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients; ///< The accumulation of the square of the past bias
55 /// gradients associated with the deep net.
56
57 std::vector<std::vector<Matrix_t>> fPastSquaredWeightUpdates; ///< The accumulation of the square of the past weight
58 /// updates associated with the deep net.
59 std::vector<std::vector<Matrix_t>> fPastSquaredBiasUpdates; ///< The accumulation of the square of the past bias
60 /// updates associated with the deep net.
61
62 /*! Update the weights, given the current weight gradients. */
63 void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);
64
65 /*! Update the biases, given the current bias gradients. */
66 void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);
67
68public:
69 /*! Constructor. */
70 TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate = 1.0, Scalar_t rho = 0.95, Scalar_t epsilon = 1e-8);
71
72 /*! Destructor. */
73 ~TAdadelta() = default;
74
75 /*! Getters */
76 Scalar_t GetRho() const { return fRho; }
77 Scalar_t GetEpsilon() const { return fEpsilon; }
78
79 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
80 std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }
81
82 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
83 std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
84
85 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightUpdates() { return fPastSquaredWeightUpdates; }
86 std::vector<Matrix_t> &GetPastSquaredWeightUpdatesAt(size_t i) { return fPastSquaredWeightUpdates[i]; }
87
88 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasUpdates() { return fPastSquaredBiasUpdates; }
89 std::vector<Matrix_t> &GetPastSquaredBiasUpdatesAt(size_t i) { return fPastSquaredBiasUpdates[i]; }
90};
91
92//
93//
94// The Adadelta Optimizer Class - Implementation
95//_________________________________________________________________________________________________
96template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
99 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(epsilon)
100{
101 std::vector<Layer_t *> &layers = deepNet.GetLayers();
102 const size_t layersNSlices = layers.size();
103 fPastSquaredWeightGradients.resize(layersNSlices);
104 fPastSquaredBiasGradients.resize(layersNSlices);
105 fPastSquaredWeightUpdates.resize(layersNSlices);
106 fPastSquaredBiasUpdates.resize(layersNSlices);
107
108 for (size_t i = 0; i < layersNSlices; i++) {
109 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
110
111 for (size_t j = 0; j < weightsNSlices; j++) {
112 Matrix_t &currentWeights = layers[i]->GetWeightsAt(j);
113 const size_t weightsNRows = currentWeights.GetNrows();
114 const size_t weightsNCols = currentWeights.GetNcols();
115
116 fPastSquaredWeightGradients[i].emplace_back(weightsNRows, weightsNCols);
117 fPastSquaredWeightUpdates[i].emplace_back(weightsNRows, weightsNCols);
118 initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
119 initialize<Architecture_t>(fPastSquaredWeightUpdates[i][j], EInitialization::kZero);
120 }
121
122 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
123
124 for (size_t j = 0; j < biasesNSlices; j++) {
125 Matrix_t &currentBiases = layers[i]->GetBiasesAt(j);
126 const size_t biasesNRows = currentBiases.GetNrows();
127 const size_t biasesNCols = currentBiases.GetNcols();
128
129 fPastSquaredBiasGradients[i].emplace_back(biasesNRows, biasesNCols);
130 fPastSquaredBiasUpdates[i].emplace_back(biasesNRows, biasesNCols);
131 initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
132 initialize<Architecture_t>(fPastSquaredBiasUpdates[i][j], EInitialization::kZero);
133 }
134 }
135}
136
137//_________________________________________________________________________________________________
138template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
139auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
140 const std::vector<Matrix_t> &weightGradients) -> void
141{
142 std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
143 std::vector<Matrix_t> &currentLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);
144
145 for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {
146
147 // accumulation matrix used for temporary storing of the current accumulation
148 Matrix_t accumulation(currentLayerPastSquaredWeightGradients[k].GetNrows(),
149 currentLayerPastSquaredWeightGradients[k].GetNcols());
150
151 // Vt = rho * Vt-1 + (1-rho) * currentSquaredWeightGradients
152 initialize<Architecture_t>(accumulation, EInitialization::kZero);
153 Matrix_t currentSquaredWeightGradients(weightGradients[k].GetNrows(), weightGradients[k].GetNcols());
154 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
155 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
156 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
157 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
158 Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);
159 }
160
161 // updating the weights.
162 for (size_t i = 0; i < weights.size(); i++) {
163
164 // currentWeightUpdates = sqrt(Wt + epsilon) * currentGradients / sqrt(Vt + epsilon)
165
166 // dummy1 = sqrt(Wt + epsilon)
167 Matrix_t dummy1(currentLayerPastSquaredWeightUpdates[i].GetNrows(),
168 currentLayerPastSquaredWeightUpdates[i].GetNcols());
169 Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
170 Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
171 Architecture_t::SqrtElementWise(dummy1);
172
173 Matrix_t currentWeightUpdates(currentLayerPastSquaredWeightGradients[i].GetNrows(),
174 currentLayerPastSquaredWeightGradients[i].GetNcols());
175 Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
176 Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
177 Architecture_t::SqrtElementWise(currentWeightUpdates);
178 Architecture_t::ReciprocalElementWise(currentWeightUpdates);
179 Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
180 Architecture_t::Hadamard(currentWeightUpdates, dummy1);
181
182 // theta = theta - learningRate * currentWeightUpdates
183 Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
184
185 // accumulation matrix used for temporary storing of the current accumulation
186 Matrix_t accumulation(currentLayerPastSquaredWeightUpdates[i].GetNrows(),
187 currentLayerPastSquaredWeightUpdates[i].GetNcols());
188
189 // Wt = rho * Wt-1 + (1-rho) * currentSquaredWeightUpdates
190 initialize<Architecture_t>(accumulation, EInitialization::kZero);
191 Matrix_t currentSquaredWeightUpdates(currentWeightUpdates.GetNrows(), currentWeightUpdates.GetNcols());
192 Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
193 Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
194 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
195 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
196 Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
197 }
198}
199
200//_________________________________________________________________________________________________
201template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
202auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
203 const std::vector<Matrix_t> &biasGradients) -> void
204{
205 std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
206 std::vector<Matrix_t> &currentLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);
207
208 for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {
209
210 // accumulation matrix used for temporary storing of the current accumulation
211 Matrix_t accumulation(currentLayerPastSquaredBiasGradients[k].GetNrows(),
212 currentLayerPastSquaredBiasGradients[k].GetNcols());
213
214 // Vt = rho * Vt-1 + (1-rho) * currentSquaredBiasGradients
215 initialize<Architecture_t>(accumulation, EInitialization::kZero);
216 Matrix_t currentSquaredBiasGradients(biasGradients[k].GetNrows(), biasGradients[k].GetNcols());
217 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
218 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
219 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
220 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
221 Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);
222 }
223
224 // updating the biases.
225 for (size_t i = 0; i < biases.size(); i++) {
226
227 // currentBiasUpdates = sqrt(Wt + epsilon) * currentGradients / sqrt(Vt + epsilon)
228
229 // dummy1 = sqrt(Wt + epsilon)
230 Matrix_t dummy1(currentLayerPastSquaredBiasUpdates[i].GetNrows(),
231 currentLayerPastSquaredBiasUpdates[i].GetNcols());
232 Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
233 Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
234 Architecture_t::SqrtElementWise(dummy1);
235
236 Matrix_t currentBiasUpdates(currentLayerPastSquaredBiasGradients[i].GetNrows(),
237 currentLayerPastSquaredBiasGradients[i].GetNcols());
238 Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
239 Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
240 Architecture_t::SqrtElementWise(currentBiasUpdates);
241 Architecture_t::ReciprocalElementWise(currentBiasUpdates);
242 Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
243 Architecture_t::Hadamard(currentBiasUpdates, dummy1);
244
245 // theta = theta - learningRate * currentBiasUpdates
246 Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
247
248 // accumulation matrix used for temporary storing of the current accumulation
249 Matrix_t accumulation(currentLayerPastSquaredBiasUpdates[i].GetNrows(),
250 currentLayerPastSquaredBiasUpdates[i].GetNcols());
251
252 // Wt = rho * Wt-1 + (1-rho) * currentSquaredBiasUpdates
253 initialize<Architecture_t>(accumulation, EInitialization::kZero);
254 Matrix_t currentSquaredBiasUpdates(currentBiasUpdates.GetNrows(), currentBiasUpdates.GetNcols());
255 Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
256 Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
257 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
258 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
259 Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);
260 }
261}
262
263} // namespace DNN
264} // namespace TMVA
265
266#endif
#define e(i)
Definition: RSha256.hxx:103
Adadelta Optimizer class.
Definition: Adadelta.h:44
Scalar_t GetRho() const
Getters.
Definition: Adadelta.h:76
Scalar_t fEpsilon
The Smoothing term used to avoid division by zero.
Definition: Adadelta.h:51
std::vector< std::vector< Matrix_t > > & GetPastSquaredWeightUpdates()
Definition: Adadelta.h:85
void UpdateWeights(size_t layerIndex, std::vector< Matrix_t > &weights, const std::vector< Matrix_t > &weightGradients)
Update the weights, given the current weight gradients.
Definition: Adadelta.h:139
std::vector< Matrix_t > & GetPastSquaredBiasGradientsAt(size_t i)
Definition: Adadelta.h:83
std::vector< Matrix_t > & GetPastSquaredWeightGradientsAt(size_t i)
Definition: Adadelta.h:80
Scalar_t fRho
The Rho constant used by the optimizer.
Definition: Adadelta.h:50
std::vector< std::vector< Matrix_t > > & GetPastSquaredBiasUpdates()
Definition: Adadelta.h:88
std::vector< Matrix_t > & GetPastSquaredWeightUpdatesAt(size_t i)
Definition: Adadelta.h:86
std::vector< std::vector< Matrix_t > > fPastSquaredBiasUpdates
The accumulation of the square of the past bias updates associated with the deep net.
Definition: Adadelta.h:59
void UpdateBiases(size_t layerIndex, std::vector< Matrix_t > &biases, const std::vector< Matrix_t > &biasGradients)
Update the biases, given the current bias gradients.
Definition: Adadelta.h:202
TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate=1.0, Scalar_t rho=0.95, Scalar_t epsilon=1e-8)
Constructor.
Definition: Adadelta.h:97
Scalar_t GetEpsilon() const
Definition: Adadelta.h:77
std::vector< std::vector< Matrix_t > > & GetPastSquaredBiasGradients()
Definition: Adadelta.h:82
~TAdadelta()=default
Destructor.
std::vector< std::vector< Matrix_t > > fPastSquaredWeightUpdates
The accumulation of the square of the past weight updates associated with the deep net.
Definition: Adadelta.h:57
std::vector< std::vector< Matrix_t > > & GetPastSquaredWeightGradients()
Definition: Adadelta.h:79
typename Architecture_t::Scalar_t Scalar_t
Definition: Adadelta.h:47
typename Architecture_t::Matrix_t Matrix_t
Definition: Adadelta.h:46
std::vector< std::vector< Matrix_t > > fPastSquaredWeightGradients
The accumulation of the square of the past weight gradients associated with the deep net.
Definition: Adadelta.h:52
std::vector< std::vector< Matrix_t > > fPastSquaredBiasGradients
The accumulation of the square of the past bias gradients associated with the deep net.
Definition: Adadelta.h:54
std::vector< Matrix_t > & GetPastSquaredBiasUpdatesAt(size_t i)
Definition: Adadelta.h:89
Generic Optimizer class.
Definition: Optimizer.h:44
std::vector< Layer_t * > & GetLayers()
Definition: Optimizer.h:78
void Copy(void *source, void *dest)
create variable transformations
REAL epsilon
Definition: triangle.c:617