#ifndef TMVA_DNN_ADADELTA
#define TMVA_DNN_ADADELTA

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA {
namespace DNN {
/** \class TAdadelta
 *  Adadelta Optimizer class.
 *
 *  This class represents the Adadelta Optimizer.
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdadelta : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fRho;     ///< The Rho constant used by the optimizer.
   Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< The accumulation of the square of the past weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< The accumulation of the square of the past bias gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredWeightUpdates;   ///< The accumulation of the square of the past weight updates associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasUpdates;     ///< The accumulation of the square of the past bias updates associated with the deep net.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate = 1.0, Scalar_t rho = 0.95, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdadelta() = default;

   /*! Getters */
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightUpdates() { return fPastSquaredWeightUpdates; }
   std::vector<Matrix_t> &GetPastSquaredWeightUpdatesAt(size_t i) { return fPastSquaredWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasUpdates() { return fPastSquaredBiasUpdates; }
   std::vector<Matrix_t> &GetPastSquaredBiasUpdatesAt(size_t i) { return fPastSquaredBiasUpdates[i]; }
};
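/*
   For reference (drawn from the implementation below, not part of the original
   header): the Adadelta update rule of Zeiler (2012), which UpdateWeights and
   UpdateBiases realize element-wise with the Architecture_t matrix primitives:

      V_t     = rho * V_{t-1} + (1 - rho) * g_t^2              (squared-gradient average)
      delta_t = sqrt(W_{t-1} + eps) / sqrt(V_t + eps) * g_t    (rescaled step)
      theta_t = theta_{t-1} - learningRate * delta_t
      W_t     = rho * W_{t-1} + (1 - rho) * delta_t^2          (squared-update average)

   V is stored in fPastSquaredWeightGradients / fPastSquaredBiasGradients and
   W in fPastSquaredWeightUpdates / fPastSquaredBiasUpdates.
*/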
//
//  The Adadelta Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdadelta<Architecture_t, Layer_t, DeepNet_t>::TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t rho,
                                                         Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fPastSquaredWeightUpdates.resize(layersNSlices);
   fPastSquaredBiasUpdates.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      for (size_t j = 0; j < weightsNSlices; j++) {
         Matrix_t &currentWeights = layers[i]->GetWeightsAt(j);
         const size_t weightsNRows = currentWeights.GetNrows();
         const size_t weightsNCols = currentWeights.GetNcols();

         // Zero-initialize the accumulation state for this weight matrix.
         fPastSquaredWeightGradients[i].emplace_back(weightsNRows, weightsNCols);
         fPastSquaredWeightUpdates[i].emplace_back(weightsNRows, weightsNCols);
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredWeightUpdates[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      for (size_t j = 0; j < biasesNSlices; j++) {
         Matrix_t &currentBiases = layers[i]->GetBiasesAt(j);
         const size_t biasesNRows = currentBiases.GetNrows();
         const size_t biasesNCols = currentBiases.GetNcols();

         // Zero-initialize the accumulation state for this bias matrix.
         fPastSquaredBiasGradients[i].emplace_back(biasesNRows, biasesNCols);
         fPastSquaredBiasUpdates[i].emplace_back(biasesNRows, biasesNCols);
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredBiasUpdates[i][j], EInitialization::kZero);
      }
   }
}
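// Usage sketch (illustrative, not part of the original header). It assumes a
// fully constructed TDeepNet `net` whose gradients have already been filled
// by a backward pass, and that the VOptimizer base class provides Step(),
// which dispatches to the UpdateWeights/UpdateBiases overrides below:
//
//    TAdadelta<Architecture_t> optimizer(net, /*learningRate=*/1.0,
//                                        /*rho=*/0.95, /*epsilon=*/1e-8);
//    optimizer.Step(); // one Adadelta update of every layer's weights and biases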
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                  const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {

      // accumulation matrix used for temporarily storing the current accumulation
      Matrix_t accumulation(currentLayerPastSquaredWeightGradients[k].GetNrows(),
                            currentLayerPastSquaredWeightGradients[k].GetNcols());
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredWeightGradients
      Matrix_t currentSquaredWeightGradients(weightGradients[k].GetNrows(), weightGradients[k].GetNcols());
      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);
   }

   for (size_t i = 0; i < weights.size(); i++) {

      // dummy1 = sqrt(Wt-1 + epsilon), the RMS of the past weight updates
      Matrix_t dummy1(currentLayerPastSquaredWeightUpdates[i].GetNrows(),
                      currentLayerPastSquaredWeightUpdates[i].GetNcols());
      Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      // currentWeightUpdates = sqrt(Wt-1 + epsilon) / sqrt(Vt + epsilon) * currentWeightGradients
      Matrix_t currentWeightUpdates(currentLayerPastSquaredWeightGradients[i].GetNrows(),
                                    currentLayerPastSquaredWeightGradients[i].GetNcols());
      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::Hadamard(currentWeightUpdates, dummy1);

      // theta_t = theta_t-1 - learningRate * currentWeightUpdates
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());

      // accumulation matrix used for temporarily storing the current accumulation
      Matrix_t accumulation(currentLayerPastSquaredWeightUpdates[i].GetNrows(),
                            currentLayerPastSquaredWeightUpdates[i].GetNcols());
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      // Wt = rho * Wt-1 + (1 - rho) * currentSquaredWeightUpdates
      Matrix_t currentSquaredWeightUpdates(currentWeightUpdates.GetNrows(), currentWeightUpdates.GetNcols());
      Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
      Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
   }
}
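// Scalar equivalent of UpdateWeights above (illustrative only): for a single
// weight w with gradient g, running averages V (squared gradients) and
// W (squared updates), rho = fRho and eps = fEpsilon, each element undergoes
//
//    V = rho * V + (1 - rho) * g * g;
//    double dw = std::sqrt(W + eps) / std::sqrt(V + eps) * g;
//    w -= learningRate * dw;
//    W = rho * W + (1 - rho) * dw * dw;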
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                 const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {

      // accumulation matrix used for temporarily storing the current accumulation
      Matrix_t accumulation(currentLayerPastSquaredBiasGradients[k].GetNrows(),
                            currentLayerPastSquaredBiasGradients[k].GetNcols());
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredBiasGradients
      Matrix_t currentSquaredBiasGradients(biasGradients[k].GetNrows(), biasGradients[k].GetNcols());
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);
   }

   for (size_t i = 0; i < biases.size(); i++) {

      // dummy1 = sqrt(Wt-1 + epsilon), the RMS of the past bias updates
      Matrix_t dummy1(currentLayerPastSquaredBiasUpdates[i].GetNrows(),
                      currentLayerPastSquaredBiasUpdates[i].GetNcols());
      Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      // currentBiasUpdates = sqrt(Wt-1 + epsilon) / sqrt(Vt + epsilon) * currentBiasGradients
      Matrix_t currentBiasUpdates(currentLayerPastSquaredBiasGradients[i].GetNrows(),
                                  currentLayerPastSquaredBiasGradients[i].GetNcols());
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::Hadamard(currentBiasUpdates, dummy1);

      // theta_t = theta_t-1 - learningRate * currentBiasUpdates
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());

      // accumulation matrix used for temporarily storing the current accumulation
      Matrix_t accumulation(currentLayerPastSquaredBiasUpdates[i].GetNrows(),
                            currentLayerPastSquaredBiasUpdates[i].GetNcols());
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      // Wt = rho * Wt-1 + (1 - rho) * currentSquaredBiasUpdates
      Matrix_t currentSquaredBiasUpdates(currentBiasUpdates.GetNrows(), currentBiasUpdates.GetNcols());
      Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
      Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);
   }
}

} // namespace DNN
} // namespace TMVA

#endif