SGD.h
// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a ROOT-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : TSGD                                                                  *
 *                                                                                *
 * Description:                                                                   *
 *      Stochastic Batch Gradient Descent Optimizer Class                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S <sravikiran0606@gmail.com> - CERN, Switzerland               *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (see tmva/doc/LICENSE)                                                        *
 **********************************************************************************/

#ifndef TMVA_DNN_SGD
#define TMVA_DNN_SGD

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TSGD
 * Stochastic Batch Gradient Descent Optimizer class
 *
 * This class represents the Stochastic Batch Gradient Descent Optimizer with options for applying momentum
 * and Nesterov momentum.
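 *
 * The update rule implemented in UpdateWeights / UpdateBiases below is
 *
 *     V_t   = momentum * V_{t-1} + currentGradients
 *     theta = theta - learningRate * V_t
 *
 * where the running V_t is stored in fPastWeightGradients / fPastBiasGradients.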
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TSGD : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fMomentum; ///< The momentum used for training.
   std::vector<std::vector<Matrix_t>>
      fPastWeightGradients; ///< The sum of the past weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastBiasGradients; ///< The sum of the past bias gradients associated with the deep net.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum);

   /*! Destructor. */
   ~TSGD() = default;

   /*! Getters */
   Scalar_t GetMomentum() const { return fMomentum; }

   std::vector<std::vector<Matrix_t>> &GetPastWeightGradients() { return fPastWeightGradients; }
   std::vector<Matrix_t> &GetPastWeightGradientsAt(size_t i) { return fPastWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastBiasGradients() { return fPastBiasGradients; }
   std::vector<Matrix_t> &GetPastBiasGradientsAt(size_t i) { return fPastBiasGradients[i]; }
};
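
// A minimal usage sketch (illustration only, not part of this header). It assumes a
// deep net built for a concrete architecture such as TCpu<Double_t>, and the Step()
// method inherited from the VOptimizer base class (see TMVA/DNN/Optimizer.h):
//
//   TDeepNet<TCpu<Double_t>> deepNet = /* ... network with layers attached ... */;
//   TSGD<TCpu<Double_t>> sgd(0.01, deepNet, 0.9); // learningRate = 0.01, momentum = 0.9
//   sgd.Step();                                   // one momentum-SGD update of every layer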

//
//
// The Stochastic Gradient Descent Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   size_t layersNSlices = layers.size();
   fPastWeightGradients.resize(layersNSlices);
   fPastBiasGradients.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      Architecture_t::CreateWeightTensors(fPastWeightGradients[i], layers[i]->GetWeights());
      size_t weightsNSlices = fPastWeightGradients[i].size();
      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastWeightGradients[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fPastBiasGradients[i], layers[i]->GetBiases());
      size_t biasesNSlices = fPastBiasGradients[i].size();
      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastBiasGradients[i][j], EInitialization::kZero);
      }
   }
}
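
// Note: the past gradients are zero-initialized above (EInitialization::kZero), so the
// first optimizer step reduces to plain gradient descent, theta = theta - learningRate * g.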

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                             const std::vector<Matrix_t> &weightGradients) -> void
{
   // Accumulate the current weight gradients into the layer's past weight gradients:
   // V_t = momentum * V_{t-1} + currentGradients

   std::vector<Matrix_t> &currentLayerPastWeightGradients = this->GetPastWeightGradientsAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastWeightGradients.size(); k++) {
      Architecture_t::ConstMult(currentLayerPastWeightGradients[k], this->GetMomentum());
      Architecture_t::ScaleAdd(currentLayerPastWeightGradients[k], weightGradients[k], 1.0);
   }

   // Update the weights:
   // theta = theta - learningRate * V_t
   for (size_t i = 0; i < weights.size(); i++) {
      Architecture_t::ScaleAdd(weights[i], currentLayerPastWeightGradients[i], -this->GetLearningRate());
   }
}
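
// A worked scalar example of the two updates above (illustrative numbers): with
// momentum = 0.9, learningRate = 0.1 and a constant gradient g = 1.0,
//   step 1: V_1 = 0.9 * 0.0 + 1.0 = 1.0,  theta decreases by 0.1 * 1.0 = 0.10
//   step 2: V_2 = 0.9 * 1.0 + 1.0 = 1.9,  theta decreases by 0.1 * 1.9 = 0.19
// so repeated gradients of the same sign build up velocity.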

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                            const std::vector<Matrix_t> &biasGradients) -> void
{
   // Accumulate the current bias gradients into the layer's past bias gradients:
   // V_t = momentum * V_{t-1} + currentGradients

   std::vector<Matrix_t> &currentLayerPastBiasGradients = this->GetPastBiasGradientsAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastBiasGradients.size(); k++) {
      Architecture_t::ConstMult(currentLayerPastBiasGradients[k], this->GetMomentum());
      Architecture_t::ScaleAdd(currentLayerPastBiasGradients[k], biasGradients[k], 1.0);
   }

   // Update the biases:
   // theta = theta - learningRate * V_t
   for (size_t i = 0; i < biases.size(); i++) {
      Architecture_t::ScaleAdd(biases[i], currentLayerPastBiasGradients[i], -this->GetLearningRate());
   }
}

} // namespace DNN
} // namespace TMVA

#endif