#ifndef TMVA_DNN_ADAGRAD
#define TMVA_DNN_ADAGRAD

#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

#include <vector>
#include <cassert>

namespace TMVA {
namespace DNN {

/** \class TAdagrad
 *  Adagrad Optimizer class
 *
 *  This class represents the Adagrad optimizer.
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdagrad : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fEpsilon; ///< The smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< The sum of the square of the past weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< The sum of the square of the past bias gradients associated with the deep net.

   std::vector<std::vector<Matrix_t>> fWorkWeightTensor; ///< working tensor used to keep a temporary copy of weights or weight gradients
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor;   ///< working tensor used to keep a temporary copy of bias or bias gradients

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                      const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                     const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdagrad() = default;

   /*! Getters */
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
};
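
// For reference, a sketch of the update rule realized below (standard AdaGrad;
// the notation here is illustrative, not from the original header): for each
// weight or bias matrix theta with gradient g_t and accumulated squared
// gradients V_t,
//
//    V_t   = V_{t-1} + g_t * g_t                               (element-wise)
//    theta = theta - learningRate * g_t / sqrt(V_t + epsilon)  (element-wise)
//
// where epsilon (fEpsilon) is the smoothing term that prevents division by
// zero. Note that epsilon is added under the square root, matching the
// ConstAdd/SqrtElementWise call order in UpdateWeights and UpdateBiases.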

template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdagrad<Architecture_t, Layer_t, DeepNet_t>::TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fWorkWeightTensor.resize(layersNSlices);
   fWorkBiasTensor.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      // allocate the past squared weight gradient accumulators, one matrix per
      // weight slice, and start them from zero
      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fWorkWeightTensor[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor[i], layers[i]->GetBiases());
   }
}

template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                 const std::vector<Matrix_t> &weightGradients) -> void
{
   auto &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {
      // Vt = Vt-1 + gradient^2 : accumulate the squared gradients element-wise
      auto &currentSquaredWeightGradients = fWorkWeightTensor[layerIndex][i];
      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[i], currentSquaredWeightGradients, 1.0);

      // theta = theta - learningRate * gradient / sqrt(Vt + epsilon)
      auto &currentWeightUpdates = fWorkWeightTensor[layerIndex][i]; // the work tensor can be reused here
      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
   }
}
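
// The sequence above builds the update purely from the architecture's
// element-wise primitives: Copy + SquareElementWise + ScaleAdd accumulate V_t,
// then Copy, ConstAdd(epsilon), SqrtElementWise, ReciprocalElementWise and
// Hadamard turn V_t into g_t / sqrt(V_t + epsilon), which the final ScaleAdd
// applies with factor -learningRate. Reusing fWorkWeightTensor[layerIndex][i]
// for both temporaries is safe because the accumulation into the past-gradient
// tensor completes before the second Copy overwrites the work matrix.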

template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
   for (size_t i = 0; i < biasesNSlices; i++) {
      // Vt = Vt-1 + gradient^2 : accumulate the squared bias gradients element-wise
      auto &currentSquaredBiasGradients = fWorkBiasTensor[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[i], currentSquaredBiasGradients, 1.0);

      // theta = theta - learningRate * gradient / sqrt(Vt + epsilon)
      auto &currentBiasUpdates = fWorkBiasTensor[layerIndex][i]; // the work tensor can be reused here
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
   }
}
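
// A minimal usage sketch (illustrative only; it assumes the CPU architecture
// TCpu<Double_t>, a fully configured TDeepNet, and that the base-class
// VOptimizer::Step() drives the per-layer UpdateWeights/UpdateBiases calls):
//
//    using Architecture_t = TMVA::DNN::TCpu<Double_t>;
//    TMVA::DNN::TDeepNet<Architecture_t> net(/* ... network definition ... */);
//    TMVA::DNN::TAdagrad<Architecture_t> optimizer(net, /*learningRate=*/0.01,
//                                                  /*epsilon=*/1e-8);
//    // per training batch: forward pass, backward pass, then
//    optimizer.Step();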

} // namespace DNN
} // namespace TMVA

#endif