Logo ROOT  
Reference Guide
Adam.h
Go to the documentation of this file.
1// @(#)root/tmva/tmva/dnn:$Id$
2// Author: Ravi Kiran S
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : TAdam *
8 * Web : http://tmva.sourceforge.net *
9 * *
10 * Description: *
11 * Adam Optimizer Class *
12 * *
13 * Authors (alphabetical): *
14 * Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland *
15 * *
16 * Copyright (c) 2005-2018: *
17 * CERN, Switzerland *
18 * U. of Victoria, Canada *
19 * MPI-K Heidelberg, Germany *
20 * U. of Bonn, Germany *
21 * *
22 * Redistribution and use in source and binary forms, with or without *
23 * modification, are permitted according to the terms listed in LICENSE *
24 * (http://tmva.sourceforge.net/LICENSE) *
25 **********************************************************************************/
26
27#ifndef TMVA_DNN_ADAM
28#define TMVA_DNN_ADAM
29
#include "TMatrix.h"

#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

#include <cmath>
#include <vector>
34
35namespace TMVA {
36namespace DNN {
37
38/** \class TAdam
39 * Adam Optimizer class
40 *
41 * This class represents the Adam Optimizer.
42 */
43template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
44 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
45class TAdam : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
46public:
47 using Matrix_t = typename Architecture_t::Matrix_t;
48 using Scalar_t = typename Architecture_t::Scalar_t;
49
50protected:
51 Scalar_t fBeta1; ///< The Beta1 constant used by the optimizer.
52 Scalar_t fBeta2; ///< The Beta2 constant used by the optimizer.
53 Scalar_t fEpsilon; ///< The Smoothing term used to avoid division by zero.
54
55 std::vector<std::vector<Matrix_t>> fFirstMomentWeights; ///< The decaying average of the first moment of the past
56 /// weight gradients associated with the deep net.
57 std::vector<std::vector<Matrix_t>> fFirstMomentBiases; ///< The decaying average of the first moment of the past bias
58 /// gradients associated with the deep net.
59
60 std::vector<std::vector<Matrix_t>> fSecondMomentWeights; ///< The decaying average of the second moment of the past
61 /// weight gradients associated with the deep net.
62 std::vector<std::vector<Matrix_t>> fSecondMomentBiases; ///< The decaying average of the second moment of the past
63 /// bias gradients associated with the deep net.
64
65 /*! Update the weights, given the current weight gradients. */
66 void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);
67
68 /*! Update the biases, given the current bias gradients. */
69 void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);
70
71public:
72 /*! Constructor. */
73 TAdam(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t beta1 = 0.9, Scalar_t beta2 = 0.999,
74 Scalar_t epsilon = 1e-7);
75
76 /*! Destructor. */
77 ~TAdam() = default;
78
79 /*! Getters */
80 Scalar_t GetBeta1() const { return fBeta1; }
81 Scalar_t GetBeta2() const { return fBeta2; }
82 Scalar_t GetEpsilon() const { return fEpsilon; }
83
84 std::vector<std::vector<Matrix_t>> &GetFirstMomentWeights() { return fFirstMomentWeights; }
85 std::vector<Matrix_t> &GetFirstMomentWeightsAt(size_t i) { return fFirstMomentWeights[i]; }
86
87 std::vector<std::vector<Matrix_t>> &GetFirstMomentBiases() { return fFirstMomentBiases; }
88 std::vector<Matrix_t> &GetFirstMomentBiasesAt(size_t i) { return fFirstMomentBiases[i]; }
89
90 std::vector<std::vector<Matrix_t>> &GetSecondMomentWeights() { return fSecondMomentWeights; }
91 std::vector<Matrix_t> &GetSecondMomentWeightsAt(size_t i) { return fSecondMomentWeights[i]; }
92
93 std::vector<std::vector<Matrix_t>> &GetSecondMomentBiases() { return fSecondMomentBiases; }
94 std::vector<Matrix_t> &GetSecondMomentBiasesAt(size_t i) { return fSecondMomentBiases[i]; }
95};
96
97//
98//
99// The Adam Optimizer Class - Implementation
100//_________________________________________________________________________________________________
101template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
104 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fBeta1(beta1), fBeta2(beta2),
105 fEpsilon(epsilon)
106{
107 std::vector<Layer_t *> &layers = deepNet.GetLayers();
108 const size_t layersNSlices = layers.size();
109 fFirstMomentWeights.resize(layersNSlices);
110 fFirstMomentBiases.resize(layersNSlices);
111 fSecondMomentWeights.resize(layersNSlices);
112 fSecondMomentBiases.resize(layersNSlices);
113
114
115 for (size_t i = 0; i < layersNSlices; i++) {
116
117 Architecture_t::CreateWeightTensors( fFirstMomentWeights[i], layers[i]->GetWeights());
118 Architecture_t::CreateWeightTensors( fSecondMomentWeights[i], layers[i]->GetWeights());
119
120 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
121
122 for (size_t j = 0; j < weightsNSlices; j++) {
123 initialize<Architecture_t>(fFirstMomentWeights[i][j], EInitialization::kZero);
124 initialize<Architecture_t>(fSecondMomentWeights[i][j], EInitialization::kZero);
125 }
126
127 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
128
129 Architecture_t::CreateWeightTensors( fFirstMomentBiases[i], layers[i]->GetBiases());
130 Architecture_t::CreateWeightTensors( fSecondMomentBiases[i], layers[i]->GetBiases());
131
132 for (size_t j = 0; j < biasesNSlices; j++) {
133 initialize<Architecture_t>(fFirstMomentBiases[i][j], EInitialization::kZero);
134 initialize<Architecture_t>(fSecondMomentBiases[i][j], EInitialization::kZero);
135 }
136 }
137}
138
139//_________________________________________________________________________________________________
140template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
141auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
142 const std::vector<Matrix_t> &weightGradients) -> void
143{
144 // update of weights using Adam algorithm
145 // we use the formulation defined before section 2.1 in the original paper
146 // 'Adam: A method for stochastic optimization, D. Kingma, J. Ba, see https://arxiv.org/abs/1412.6980
147
148 std::vector<Matrix_t> &currentLayerFirstMomentWeights = this->GetFirstMomentWeightsAt(layerIndex);
149 std::vector<Matrix_t> &currentLayerSecondMomentWeights = this->GetSecondMomentWeightsAt(layerIndex);
150
151 // alpha = learningRate * sqrt(1 - beta2^t) / (1-beta1^t)
152 Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
153 (1 - pow(this->GetBeta1(), this->GetGlobalStep()));
154
155 /// Adam update of first and second momentum of the weights
156 for (size_t i = 0; i < weights.size(); i++) {
157 // Mt = beta1 * Mt-1 + (1-beta1) * WeightGradients
158 Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentWeights[i], weightGradients[i], this->GetBeta1() );
159 // Vt = beta2 * Vt-1 + (1-beta2) * WeightGradients^2
160 Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentWeights[i], weightGradients[i], this->GetBeta2() );
161 // Weight = Weight - alpha * Mt / (sqrt(Vt) + epsilon)
162 Architecture_t::AdamUpdate(weights[i], currentLayerFirstMomentWeights[i], currentLayerSecondMomentWeights[i],
163 alpha, this->GetEpsilon() );
164 }
165}
166
167//_________________________________________________________________________________________________
168template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
169auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
170 const std::vector<Matrix_t> &biasGradients) -> void
171{
172 std::vector<Matrix_t> &currentLayerFirstMomentBiases = this->GetFirstMomentBiasesAt(layerIndex);
173 std::vector<Matrix_t> &currentLayerSecondMomentBiases = this->GetSecondMomentBiasesAt(layerIndex);
174
175 // alpha = learningRate * sqrt(1 - beta2^t) / (1-beta1^t)
176 Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
177 (1 - pow(this->GetBeta1(), this->GetGlobalStep()));
178
179 // updating of the biases.
180 for (size_t i = 0; i < biases.size(); i++) {
181 // Mt = beta1 * Mt-1 + (1-beta1) * BiasGradients
182 Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentBiases[i], biasGradients[i], this->GetBeta1() );
183 // Vt = beta2 * Vt-1 + (1-beta2) * BiasGradients^2
184 Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentBiases[i], biasGradients[i], this->GetBeta2() );
185 // theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
186 Architecture_t::AdamUpdate(biases[i], currentLayerFirstMomentBiases[i], currentLayerSecondMomentBiases[i],
187 alpha, this->GetEpsilon() );
188 }
189}
190
191} // namespace DNN
192} // namespace TMVA
193
194#endif
#define e(i)
Definition: RSha256.hxx:103
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix
Adam Optimizer class.
Definition: Adam.h:45
std::vector< std::vector< Matrix_t > > fSecondMomentWeights
The decaying average of the second moment of the past weight gradients associated with the deep net.
Definition: Adam.h:60
std::vector< Matrix_t > & GetSecondMomentBiasesAt(size_t i)
Definition: Adam.h:94
Scalar_t GetEpsilon() const
Definition: Adam.h:82
typename Architecture_t::Matrix_t Matrix_t
Definition: Adam.h:47
std::vector< Matrix_t > & GetFirstMomentBiasesAt(size_t i)
Definition: Adam.h:88
Scalar_t fBeta2
The Beta2 constant used by the optimizer.
Definition: Adam.h:52
std::vector< std::vector< Matrix_t > > fSecondMomentBiases
The decaying average of the second moment of the past bias gradients associated with the deep net.
Definition: Adam.h:62
std::vector< std::vector< Matrix_t > > & GetSecondMomentBiases()
Definition: Adam.h:93
~TAdam()=default
Destructor.
std::vector< Matrix_t > & GetFirstMomentWeightsAt(size_t i)
Definition: Adam.h:85
std::vector< std::vector< Matrix_t > > & GetSecondMomentWeights()
Definition: Adam.h:90
Scalar_t GetBeta2() const
Definition: Adam.h:81
std::vector< std::vector< Matrix_t > > fFirstMomentBiases
The decaying average of the first moment of the past bias gradients associated with the deep net.
Definition: Adam.h:57
Scalar_t fEpsilon
The Smoothing term used to avoid division by zero.
Definition: Adam.h:53
void UpdateWeights(size_t layerIndex, std::vector< Matrix_t > &weights, const std::vector< Matrix_t > &weightGradients)
Update the weights, given the current weight gradients.
Definition: Adam.h:141
std::vector< std::vector< Matrix_t > > & GetFirstMomentWeights()
Definition: Adam.h:84
TAdam(DeepNet_t &deepNet, Scalar_t learningRate=0.001, Scalar_t beta1=0.9, Scalar_t beta2=0.999, Scalar_t epsilon=1e-7)
Constructor.
Definition: Adam.h:102
std::vector< Matrix_t > & GetSecondMomentWeightsAt(size_t i)
Definition: Adam.h:91
std::vector< std::vector< Matrix_t > > fFirstMomentWeights
The decaying average of the first moment of the past weight gradients associated with the deep net.
Definition: Adam.h:55
Scalar_t fBeta1
The Beta1 constant used by the optimizer.
Definition: Adam.h:51
void UpdateBiases(size_t layerIndex, std::vector< Matrix_t > &biases, const std::vector< Matrix_t > &biasGradients)
Update the biases, given the current bias gradients.
Definition: Adam.h:169
std::vector< std::vector< Matrix_t > > & GetFirstMomentBiases()
Definition: Adam.h:87
typename Architecture_t::Scalar_t Scalar_t
Definition: Adam.h:48
Scalar_t GetBeta1() const
Getters.
Definition: Adam.h:80
Generic Optimizer class.
Definition: Optimizer.h:45
std::vector< Layer_t * > & GetLayers()
Definition: Optimizer.h:82
RVec< PromoteTypes< T0, T1 > > pow(const T0 &x, const RVec< T1 > &v)
Definition: RVec.hxx:1753
VecExpr< UnaryOp< Sqrt< T >, VecExpr< A, T, D >, T >, T, D > sqrt(const VecExpr< A, T, D > &rhs)
create variable transformations
double epsilon
Definition: triangle.c:618