Adadelta.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TAdadelta                                                             *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Adadelta Optimizer Class                                                  *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (http://tmva.sourceforge.net/LICENSE)                                          *
 **********************************************************************************/

#ifndef TMVA_DNN_ADADELTA
#define TMVA_DNN_ADADELTA

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <vector>
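#include <cassert> // assert() is used in the Update methods below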

namespace TMVA {
namespace DNN {

/** \class TAdadelta
 * Adadelta Optimizer class
 *
 * This class represents the Adadelta Optimizer.
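 *
 * One optimizer step applies, element-wise and per parameter matrix, the
 * Adadelta rule (M. D. Zeiler, "ADADELTA: An Adaptive Learning Rate Method",
 * arXiv:1212.5701), written here in the notation of the comments in the
 * update methods below (V accumulates squared gradients, W accumulates
 * squared updates):
 *
 *    V_t      = rho * V_{t-1} + (1 - rho) * g_t^2
 *    update_t = sqrt(W_{t-1} + epsilon) / sqrt(V_t + epsilon) * g_t
 *    theta_t  = theta_{t-1} - learningRate * update_t
 *    W_t      = rho * W_{t-1} + (1 - rho) * update_t^2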
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdadelta : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fRho;     ///< The Rho constant used by the optimizer.
   Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.
   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< The accumulation of the square of the past
                                                                   ///< weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< The accumulation of the square of the past bias
                                                                   ///< gradients associated with the deep net.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightUpdates;   ///< The accumulation of the square of the past weight
                                                                   ///< updates associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasUpdates;     ///< The accumulation of the square of the past bias
                                                                   ///< updates associated with the deep net.
   std::vector<std::vector<Matrix_t>> fWorkWeightTensor1; ///< working tensor used to keep a temporary copy of weights or weight gradients
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor1;   ///< working tensor used to keep a temporary copy of bias or bias gradients
   std::vector<std::vector<Matrix_t>> fWorkWeightTensor2; ///< working tensor used to keep a temporary copy of weights or weight gradients
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor2;   ///< working tensor used to keep a temporary copy of bias or bias gradients

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate = 1.0, Scalar_t rho = 0.95, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdadelta() = default;

   /*! Getters */
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightUpdates() { return fPastSquaredWeightUpdates; }
   std::vector<Matrix_t> &GetPastSquaredWeightUpdatesAt(size_t i) { return fPastSquaredWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasUpdates() { return fPastSquaredBiasUpdates; }
   std::vector<Matrix_t> &GetPastSquaredBiasUpdatesAt(size_t i) { return fPastSquaredBiasUpdates[i]; }
};

//
//
// The Adadelta Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdadelta<Architecture_t, Layer_t, DeepNet_t>::TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t rho,
                                                         Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fPastSquaredWeightUpdates.resize(layersNSlices);
   fPastSquaredBiasUpdates.resize(layersNSlices);
   fWorkWeightTensor1.resize(layersNSlices);
   fWorkBiasTensor1.resize(layersNSlices);
   fWorkWeightTensor2.resize(layersNSlices);
   fWorkBiasTensor2.resize(layersNSlices);

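   // For each layer, allocate one state matrix per weight/bias matrix and
   // zero-initialize it: Adadelta starts from V_0 = 0 and W_0 = 0.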
   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fPastSquaredWeightUpdates[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredWeightUpdates[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fPastSquaredBiasUpdates[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredBiasUpdates[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                  const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {
      // accumulation matrix used for temporary storing of the current accumulation
      auto &accumulation = fWorkWeightTensor1[layerIndex][i];
      auto &currentSquaredWeightGradients = fWorkWeightTensor2[layerIndex][i];

      // Vt = rho * Vt-1 + (1-rho) * currentSquaredWeightGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[i], accumulation);

      // updating the weights.
      // currentWeightUpdates = sqrt(Wt + epsilon) * currentGradients / sqrt(Vt + epsilon)

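      // NOTE: the work tensors are deliberately reused below: `dummy1` aliases
      // `accumulation` (fWorkWeightTensor1) and `currentWeightUpdates` aliases
      // `currentSquaredWeightGradients` (fWorkWeightTensor2). This is safe
      // because the values they previously held have already been copied into
      // the persistent accumulators.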
      // dummy1 = sqrt(Wt + epsilon)
      auto &dummy1 = fWorkWeightTensor1[layerIndex][i]; // reuse working tensor
      Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      auto &currentWeightUpdates = fWorkWeightTensor2[layerIndex][i]; // reuse the work tensor for the weight updates now

      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::Hadamard(currentWeightUpdates, dummy1);

      // theta = theta - learningRate * currentWeightUpdates
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());

      // Wt = rho * Wt-1 + (1-rho) * currentSquaredWeightUpdates
      // re-use accumulation matrix used for temporary storing of the current accumulation
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredWeightUpdates = fWorkWeightTensor2[layerIndex][i]; // reuse work tensor
      Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
      Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                 const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
   for (size_t i = 0; i < biasesNSlices; i++) {

      // accumulation matrix used for temporary storing of the current accumulation
      auto &accumulation = fWorkBiasTensor1[layerIndex][i];

      // Vt = rho * Vt-1 + (1-rho) * currentSquaredBiasGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      auto &currentSquaredBiasGradients = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[i], accumulation);

      // updating the biases.

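      // The same work-tensor reuse as in UpdateWeights applies below:
      // `dummy1` aliases `accumulation`, and `currentBiasUpdates` aliases
      // `currentSquaredBiasGradients`.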
      // currentBiasUpdates = sqrt(Wt + epsilon) * currentGradients / sqrt(Vt + epsilon)
      // dummy1 = sqrt(Wt + epsilon)
      auto &dummy1 = fWorkBiasTensor1[layerIndex][i]; // reuse working tensor
      Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      auto &currentBiasUpdates = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::Hadamard(currentBiasUpdates, dummy1);

      // theta = theta - learningRate * currentBiasUpdates
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());

      // Wt = rho * Wt-1 + (1-rho) * currentSquaredBiasUpdates
      // re-use accumulation matrix used for temporary storing of the current accumulation
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredBiasUpdates = fWorkBiasTensor2[layerIndex][i]; // reuse work tensor
      Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
      Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);
   }
}

} // namespace DNN
} // namespace TMVA

#endif
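
Usage sketch (not part of Adadelta.h). The following is a minimal example of
how this optimizer is typically driven, assuming a fully constructed
TMVA::DNN::TDeepNet named `deepNet` (network construction, data handling and
the forward/backward passes that fill the layer gradients are omitted), the
reference CPU architecture TCpu, and the Step() method provided by the
VOptimizer base class in Optimizer.h:

   #include "TMVA/DNN/Architectures/Cpu.h"
   #include "TMVA/DNN/DeepNet.h"
   #include "TMVA/DNN/Adadelta.h"

   using Architecture_t = TMVA::DNN::TCpu<Double_t>;
   using DeepNet_t = TMVA::DNN::TDeepNet<Architecture_t>;

   void AdadeltaStepSketch(DeepNet_t &deepNet)
   {
      // Defaults mirror the constructor: learningRate = 1.0, rho = 0.95,
      // epsilon = 1e-8.
      TMVA::DNN::TAdadelta<Architecture_t> optimizer(deepNet, 1.0, 0.95, 1e-8);

      // After a forward/backward pass has filled the layer gradients, one
      // optimizer step applies the Adadelta update to all weights and biases
      // in place.
      optimizer.Step();
   }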