Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
Adagrad.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Ravi Kiran S
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TAdagrad *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Adagrad Optimizer Class *
12 * *
13 * Authors (alphabetical): *
14 * Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
15 * *
16 * Copyright (c) 2005-2018: *
17 * CERN, Switzerland *
18 * U. of Victoria, Canada *
19 * MPI-K Heidelberg, Germany *
20 * U. of Bonn, Germany *
21 * *
22 * Redistribution and use in source and binary forms, with or without *
23 * modification, are permitted according to the terms listed in LICENSE *
24 * (http://tmva.sourceforge.net/LICENSE) *
25 **********************************************************************************/
26
27#ifndef TMVA_DNN_ADAGRAD
28#define TMVA_DNN_ADAGRAD
29
#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <cassert>
#include <vector>
34
35namespace TMVA {
36namespace DNN {
37
38/** \class TAdagrad
39 * Adagrad Optimizer class
40 *
41 * This class represents the Adagrad Optimizer.
42 */
43template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
44 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
45class TAdagrad : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
46public:
47 using Matrix_t = typename Architecture_t::Matrix_t;
48 using Scalar_t = typename Architecture_t::Scalar_t;
49
50protected:
51 Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.
52
53 std::vector<std::vector<Matrix_t>>
54 fPastSquaredWeightGradients; ///< The sum of the square of the past weight gradients associated with the deep net.
55 std::vector<std::vector<Matrix_t>>
56 fPastSquaredBiasGradients; ///< The sum of the square of the past bias gradients associated with the deep net.
57 std::vector<std::vector<Matrix_t>>
58 fWorkWeightTensor; ///< working tensor used to keep a temporary copy of weights or weight gradients
59 std::vector<std::vector<Matrix_t>>
60 fWorkBiasTensor; ///< working tensor used to keep a temporary copy of bias or bias gradients
61
62 /*! Update the weights, given the current weight gradients. */
63 void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);
64
65 /*! Update the biases, given the current bias gradients. */
66 void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);
67
68public:
69 /*! Constructor. */
70 TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);
71
72 /*! Destructor. */
73 ~TAdagrad() = default;
74
75 /*! Getters */
76 Scalar_t GetEpsilon() const { return fEpsilon; }
77
78 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
79 std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }
80
81 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
82 std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
83};
84
85//
86//
87// The Adagrad Optimizer Class - Implementation
88//_________________________________________________________________________________________________
89template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
91 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
92{
93 std::vector<Layer_t *> &layers = deepNet.GetLayers();
94 const size_t layersNSlices = layers.size();
95 fPastSquaredWeightGradients.resize(layersNSlices);
96 fPastSquaredBiasGradients.resize(layersNSlices);
97 fWorkWeightTensor.resize(layersNSlices);
98 fWorkBiasTensor.resize(layersNSlices);
99
100 for (size_t i = 0; i < layersNSlices; i++) {
101 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
102
103 // weight and weight gradients tensors should have same
104 Architecture_t::CreateWeightTensors( fPastSquaredWeightGradients[i], layers[i]->GetWeights());
105
106 for (size_t j = 0; j < weightsNSlices; j++) {
107 initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
108 }
109
110 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
111
112 Architecture_t::CreateWeightTensors( fPastSquaredBiasGradients[i], layers[i]->GetBiases());
113
114 for (size_t j = 0; j < biasesNSlices; j++) {
115 initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
116 }
117
118 Architecture_t::CreateWeightTensors(fWorkWeightTensor[i], layers[i]->GetWeights());
119 Architecture_t::CreateWeightTensors(fWorkBiasTensor[i], layers[i]->GetBiases());
120
121 }
122}
123
124//_________________________________________________________________________________________________
125template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
126auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
127 const std::vector<Matrix_t> &weightGradients) -> void
128{
129 auto &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
130
131
132 const size_t weightsNSlices = weights.size();
133 assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);
134
135 for (size_t i = 0; i < weightsNSlices; i++) {
136
137 auto &currentSquaredWeightGradients = fWorkWeightTensor[layerIndex][i];
138 // Vt = Vt-1 + currentSquaredWeightGradients
139 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
140 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
141 Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[i], currentSquaredWeightGradients, 1.0);
142
143 // updating the weights.
144 // theta = theta - learningRate * currentWeightGradients / (sqrt(Vt + epsilon))
145
146 auto &currentWeightUpdates = fWorkWeightTensor[layerIndex][i]; // reuse the work tensor for the weight updates now
147 Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
148 Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
149 Architecture_t::SqrtElementWise(currentWeightUpdates);
150 Architecture_t::ReciprocalElementWise(currentWeightUpdates);
151 Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
152 Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());
153 }
154}
155
156//_________________________________________________________________________________________________
157template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
158auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
159 const std::vector<Matrix_t> &biasGradients) -> void
160{
161 std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
162
163 const size_t biasesNSlices = biases.size();
164 assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
165 for (size_t i = 0; i < biasesNSlices; i++) {
166
167 // Vt = Vt-1 + currentSquaredBiasGradients
168 auto &currentSquaredBiasGradients = fWorkBiasTensor[layerIndex][i];
169 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
170 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
171 Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[i], currentSquaredBiasGradients, 1.0);
172
173 // updating the biases.
174 // theta = theta - learningRate * currentBiasGradients / (sqrt(Vt + epsilon))
175
176 auto &currentBiasUpdates = fWorkBiasTensor[layerIndex][i];
177 Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
178 Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
179 Architecture_t::SqrtElementWise(currentBiasUpdates);
180 Architecture_t::ReciprocalElementWise(currentBiasUpdates);
181 Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
182 Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());
183 }
184}
185
186} // namespace DNN
187} // namespace TMVA
188
189#endif
#define e(i)
Definition RSha256.hxx:103
Adagrad Optimizer class.
Definition Adagrad.h:45
void UpdateWeights(size_t layerIndex, std::vector< Matrix_t > &weights, const std::vector< Matrix_t > &weightGradients)
Update the weights, given the current weight gradients.
Definition Adagrad.h:126
void UpdateBiases(size_t layerIndex, std::vector< Matrix_t > &biases, const std::vector< Matrix_t > &biasGradients)
Update the biases, given the current bias gradients.
Definition Adagrad.h:158
std::vector< std::vector< Matrix_t > > & GetPastSquaredBiasGradients()
Definition Adagrad.h:81
std::vector< std::vector< Matrix_t > > fPastSquaredBiasGradients
The sum of the square of the past bias gradients associated with the deep net.
Definition Adagrad.h:56
Scalar_t GetEpsilon() const
Getters.
Definition Adagrad.h:76
TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate=0.01, Scalar_t epsilon=1e-8)
Constructor.
Definition Adagrad.h:90
std::vector< std::vector< Matrix_t > > fPastSquaredWeightGradients
The sum of the square of the past weight gradients associated with the deep net.
Definition Adagrad.h:54
typename Architecture_t::Matrix_t Matrix_t
Definition Adagrad.h:47
typename Architecture_t::Scalar_t Scalar_t
Definition Adagrad.h:48
Scalar_t fEpsilon
The Smoothing term used to avoid division by zero.
Definition Adagrad.h:51
std::vector< std::vector< Matrix_t > > & GetPastSquaredWeightGradients()
Definition Adagrad.h:78
std::vector< Matrix_t > & GetPastSquaredBiasGradientsAt(size_t i)
Definition Adagrad.h:82
std::vector< std::vector< Matrix_t > > fWorkWeightTensor
working tensor used to keep a temporary copy of weights or weight gradients
Definition Adagrad.h:58
std::vector< std::vector< Matrix_t > > fWorkBiasTensor
working tensor used to keep a temporary copy of bias or bias gradients
Definition Adagrad.h:60
~TAdagrad()=default
Destructor.
std::vector< Matrix_t > & GetPastSquaredWeightGradientsAt(size_t i)
Definition Adagrad.h:79
Generic Optimizer class.
Definition Optimizer.h:45
std::vector< Layer_t * > & GetLayers()
Definition Optimizer.h:82
create variable transformations
REAL epsilon
Definition triangle.c:617