doc/latest-stable/RMSProp_8h_source.html

// @(#)root/tmva/tmva/dnn:$Id$

// Author: Ravi Kiran S


/**********************************************************************************

 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *

 * Package: TMVA                                                                  *

 * Class  : TRMSProp                                                              *

 *                                             *

 *                                                                                *

 * Description:                                                                   *

 *      RMSProp Optimizer Class                                                   *

 *                                                                                *

 * Authors (alphabetical):                                                        *

 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *

 *                                                                                *

 * Copyright (c) 2005-2018:                                                       *

 *      CERN, Switzerland                                                         *

 *      U. of Victoria, Canada                                                    *

 *      MPI-K Heidelberg, Germany                                                 *

 *      U. of Bonn, Germany                                                       *

 *                                                                                *

 * Redistribution and use in source and binary forms, with or without             *

 * modification, are permitted according to the terms listed in LICENSE           *

 * (see tmva/doc/LICENSE)                                          *

 **********************************************************************************/


#ifndef TMVA_DNN_RMSPROP

#define TMVA_DNN_RMSPROP


#include "TMatrix.h"

#include "TMVA/DNN/Optimizer.h"

#include "TMVA/DNN/Functions.h"

#include <vector>


namespace TMVA {

namespace DNN {


/** \class TRMSProp
 *  RMSProp optimizer with optional momentum.
 *
 *  For each weight / bias tensor of each layer the optimizer maintains
 *
 *      Vt = rho * Vt-1 + (1 - rho) * g^2                          (decaying average of squared gradients)
 *      Wt = momentum * Wt-1 + learningRate * g / sqrt(Vt + epsilon)   (step to apply)
 *
 *  and then applies theta = theta - Wt (see UpdateWeights / UpdateBiases below).
 *
 *  All per-parameter containers are indexed as [layer][tensor-within-layer]; the
 *  constructor sizes them from the layers of the deep net it is given.
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TRMSProp : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fMomentum; ///< Momentum factor applied to the previous step (Wt-1).
   Scalar_t fRho;      ///< Decay factor of the squared-gradient running average.
   Scalar_t fEpsilon;  ///< Smoothing term added to Vt before the square root to avoid division by zero.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< Vt for the weights: decaying average of the squared weight gradients.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< Vt for the biases: decaying average of the squared bias gradients.

   std::vector<std::vector<Matrix_t>> fWeightUpdates; ///< Wt for the weights: last step, kept so momentum can reuse it.
   std::vector<std::vector<Matrix_t>> fBiasUpdates;   ///< Wt for the biases: last step, kept so momentum can reuse it.

   std::vector<std::vector<Matrix_t>> fWorkWeightTensor1; ///< Scratch: accumulator used while building Vt and Wt for the weights.
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor1;   ///< Scratch: accumulator used while building Vt and Wt for the biases.
   std::vector<std::vector<Matrix_t>> fWorkWeightTensor2; ///< Scratch: squared weight gradients, then reused for g / sqrt(Vt + epsilon).
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor2;   ///< Scratch: squared bias gradients, then reused for g / sqrt(Vt + epsilon).

   /*! Refresh Vt and Wt of layer \p layerIndex from \p weightGradients and subtract Wt from \p weights. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Refresh Vt and Wt of layer \p layerIndex from \p biasGradients and subtract Wt from \p biases. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Build the optimizer for \p deepNet; allocates and zeroes the per-layer state tensors. */
   TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t momentum = 0.0, Scalar_t rho = 0.9,
            Scalar_t epsilon = 1e-7);

   /*! Destructor. */
   ~TRMSProp() = default;

   /*! Hyper-parameter accessors. */
   Scalar_t GetMomentum() const { return fMomentum; }
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   /*! Mutable access to the squared-gradient averages of the weights (all layers / layer \p i). */
   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   /*! Mutable access to the squared-gradient averages of the biases (all layers / layer \p i). */
   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   /*! Mutable access to the stored weight steps (all layers / layer \p i). */
   std::vector<std::vector<Matrix_t>> &GetWeightUpdates() { return fWeightUpdates; }
   std::vector<Matrix_t> &GetWeightUpdatesAt(size_t i) { return fWeightUpdates[i]; }

   /*! Mutable access to the stored bias steps (all layers / layer \p i). */
   std::vector<std::vector<Matrix_t>> &GetBiasUpdates() { return fBiasUpdates; }
   std::vector<Matrix_t> &GetBiasUpdatesAt(size_t i) { return fBiasUpdates[i]; }
};


//

//

//  The RMSProp Optimizer Class - Implementation

//_________________________________________________________________________________________________

/*! Construct the optimizer for \p deepNet.
 *
 *  Forwards \p learningRate and \p deepNet to the VOptimizer base, stores the
 *  RMSProp hyper-parameters, and allocates one entry per layer in every state
 *  and scratch container. The running averages and the stored steps are
 *  zero-initialised; the scratch tensors are only allocated here, since the
 *  update methods overwrite them before reading them.
 */
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TRMSProp<Architecture_t, Layer_t, DeepNet_t>::TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t momentum,
                                                       Scalar_t rho, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum), fRho(rho),
     fEpsilon(epsilon)
{
   std::vector<Layer_t *> &netLayers = deepNet.GetLayers();
   const size_t nLayers = netLayers.size();

   // Every per-parameter container gets one slot per layer.
   for (auto *perLayerStore :
        {&fPastSquaredWeightGradients, &fPastSquaredBiasGradients, &fWeightUpdates, &fBiasUpdates,
         &fWorkWeightTensor1, &fWorkBiasTensor1, &fWorkWeightTensor2, &fWorkBiasTensor2}) {
      perLayerStore->resize(nLayers);
   }

   for (size_t layerIdx = 0; layerIdx < nLayers; ++layerIdx) {
      Layer_t *const layer = netLayers[layerIdx];

      // --- persistent state for the weights: allocate, then zero ---
      const size_t nWeightTensors = (layer->GetWeights()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[layerIdx], layer->GetWeights());
      Architecture_t::CreateWeightTensors(fWeightUpdates[layerIdx], layer->GetWeights());

      for (size_t w = 0; w < nWeightTensors; ++w) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[layerIdx][w], EInitialization::kZero);
         initialize<Architecture_t>(fWeightUpdates[layerIdx][w], EInitialization::kZero);
      }

      // --- persistent state for the biases: allocate, then zero ---
      const size_t nBiasTensors = (layer->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[layerIdx], layer->GetBiases());
      Architecture_t::CreateWeightTensors(fBiasUpdates[layerIdx], layer->GetBiases());

      for (size_t b = 0; b < nBiasTensors; ++b) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[layerIdx][b], EInitialization::kZero);
         initialize<Architecture_t>(fBiasUpdates[layerIdx][b], EInitialization::kZero);
      }

      // --- scratch tensors: allocated only, contents are set on first use ---
      Architecture_t::CreateWeightTensors(fWorkWeightTensor1[layerIdx], layer->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor1[layerIdx], layer->GetBiases());
      Architecture_t::CreateWeightTensors(fWorkWeightTensor2[layerIdx], layer->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor2[layerIdx], layer->GetBiases());
   }
}


//_________________________________________________________________________________________________

/*! Apply one RMSProp step to the weights of layer \p layerIndex.
 *
 *  For every weight tensor of the layer:
 *    Vt = rho * Vt-1 + (1 - rho) * g^2
 *    Wt = momentum * Wt-1 + learningRate * g / sqrt(Vt + epsilon)
 *  followed by theta = theta - Wt for every entry of \p weights.
 *
 *  \param layerIndex       index of the layer whose optimizer state is used
 *  \param weights          weight tensors of that layer, updated in place
 *  \param weightGradients  current gradients, indexed like the stored state
 */
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
void TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                 const std::vector<Matrix_t> &weightGradients)
{
   std::vector<Matrix_t> &squaredGradAverages = fPastSquaredWeightGradients[layerIndex];
   std::vector<Matrix_t> &storedSteps = fWeightUpdates[layerIndex];

   const Scalar_t rho = fRho;
   const Scalar_t momentum = fMomentum;
   const Scalar_t epsilon = fEpsilon;

   const size_t nTensors = squaredGradAverages.size();
   for (size_t t = 0; t < nTensors; ++t) {
      Matrix_t &sum = fWorkWeightTensor1[layerIndex][t];     // accumulator for Vt, later for Wt
      Matrix_t &scratch = fWorkWeightTensor2[layerIndex][t]; // g^2 first, then g / sqrt(Vt + epsilon)

      // ---- Vt = rho * Vt-1 + (1 - rho) * g^2 ----
      initialize<Architecture_t>(sum, EInitialization::kZero);

      Architecture_t::Copy(scratch, weightGradients[t]);
      Architecture_t::SquareElementWise(scratch);
      Architecture_t::ScaleAdd(sum, squaredGradAverages[t], rho);
      Architecture_t::ScaleAdd(sum, scratch, 1 - rho);
      Architecture_t::Copy(squaredGradAverages[t], sum);

      // ---- Wt = momentum * Wt-1 + learningRate * g / sqrt(Vt + epsilon) ----
      initialize<Architecture_t>(sum, EInitialization::kZero);

      // The squared gradients are no longer needed, so the same scratch
      // tensor is overwritten with the normalised gradient.
      Architecture_t::Copy(scratch, squaredGradAverages[t]);
      Architecture_t::ConstAdd(scratch, epsilon);
      Architecture_t::SqrtElementWise(scratch);
      Architecture_t::ReciprocalElementWise(scratch);
      Architecture_t::Hadamard(scratch, weightGradients[t]);

      Architecture_t::ScaleAdd(sum, storedSteps[t], momentum);
      Architecture_t::ScaleAdd(sum, scratch, this->GetLearningRate());
      Architecture_t::Copy(storedSteps[t], sum);
   }

   // theta = theta - Wt
   const size_t nWeights = weights.size();
   for (size_t w = 0; w < nWeights; ++w) {
      Architecture_t::ScaleAdd(weights[w], storedSteps[w], -1.0);
   }
}


//_________________________________________________________________________________________________

/*! Apply one RMSProp step to the biases of layer \p layerIndex.
 *
 *  For every bias tensor of the layer:
 *    Vt = rho * Vt-1 + (1 - rho) * g^2
 *    Wt = momentum * Wt-1 + learningRate * g / sqrt(Vt + epsilon)
 *  followed by theta = theta - Wt for every entry of \p biases.
 *
 *  \param layerIndex     index of the layer whose optimizer state is used
 *  \param biases         bias tensors of that layer, updated in place
 *  \param biasGradients  current gradients, indexed like the stored state
 */
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
void TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                const std::vector<Matrix_t> &biasGradients)
{
   std::vector<Matrix_t> &squaredGradAverages = fPastSquaredBiasGradients[layerIndex];
   std::vector<Matrix_t> &storedSteps = fBiasUpdates[layerIndex];

   const Scalar_t rho = fRho;
   const Scalar_t momentum = fMomentum;
   const Scalar_t epsilon = fEpsilon;

   const size_t nTensors = squaredGradAverages.size();
   for (size_t t = 0; t < nTensors; ++t) {
      Matrix_t &sum = fWorkBiasTensor1[layerIndex][t];     // accumulator for Vt, later for Wt
      Matrix_t &scratch = fWorkBiasTensor2[layerIndex][t]; // g^2 first, then g / sqrt(Vt + epsilon)

      // ---- Vt = rho * Vt-1 + (1 - rho) * g^2 ----
      initialize<Architecture_t>(sum, EInitialization::kZero);

      Architecture_t::Copy(scratch, biasGradients[t]);
      Architecture_t::SquareElementWise(scratch);
      Architecture_t::ScaleAdd(sum, squaredGradAverages[t], rho);
      Architecture_t::ScaleAdd(sum, scratch, 1 - rho);
      Architecture_t::Copy(squaredGradAverages[t], sum);

      // ---- Wt = momentum * Wt-1 + learningRate * g / sqrt(Vt + epsilon) ----
      initialize<Architecture_t>(sum, EInitialization::kZero);

      // The squared gradients are no longer needed, so the same scratch
      // tensor is overwritten with the normalised gradient.
      Architecture_t::Copy(scratch, squaredGradAverages[t]);
      Architecture_t::ConstAdd(scratch, epsilon);
      Architecture_t::SqrtElementWise(scratch);
      Architecture_t::ReciprocalElementWise(scratch);
      Architecture_t::Hadamard(scratch, biasGradients[t]);

      Architecture_t::ScaleAdd(sum, storedSteps[t], momentum);
      Architecture_t::ScaleAdd(sum, scratch, this->GetLearningRate());
      Architecture_t::Copy(storedSteps[t], sum);
   }

   // theta = theta - Wt
   const size_t nBiases = biases.size();
   for (size_t b = 0; b < nBiases; ++b) {
      Architecture_t::ScaleAdd(biases[b], storedSteps[b], -1.0);
   }
}


} // namespace DNN

} // namespace TMVA


#endif

Optimizer.h

e
#define e(i)
Definition RSha256.hxx:103

size
size_t size(const MatrixT &matrix)
retrieve the size of a square matrix

TRangeDynCast
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Definition TCollection.h:358

TMatrix.h

ROOT::Detail::TRangeCast
Definition TCollection.h:311

TMVA::DNN::TRMSProp
RMSProp Optimizer class.
Definition RMSProp.h:45

TMVA::DNN::TRMSProp::fRho
Scalar_t fRho
The Rho constant used by the optimizer.
Definition RMSProp.h:52

TMVA::DNN::TRMSProp::Scalar_t
typename Architecture_t::Scalar_t Scalar_t
Definition RMSProp.h:48

TMVA::DNN::TRMSProp::UpdateWeights
void UpdateWeights(size_t layerIndex, std::vector< Matrix_t > &weights, const std::vector< Matrix_t > &weightGradients)
Update the weights, given the current weight gradients.
Definition RMSProp.h:152

TMVA::DNN::TRMSProp::~TRMSProp
~TRMSProp()=default
Destructor.

TMVA::DNN::TRMSProp::GetPastSquaredWeightGradientsAt
std::vector< Matrix_t > & GetPastSquaredWeightGradientsAt(size_t i)
Definition RMSProp.h:90

TMVA::DNN::TRMSProp::fWorkBiasTensor2
std::vector< std::vector< Matrix_t > > fWorkBiasTensor2
working tensor used to keep a temporary copy of bias or bias gradients
Definition RMSProp.h:68

TMVA::DNN::TRMSProp::GetRho
Scalar_t GetRho() const
Definition RMSProp.h:86

TMVA::DNN::TRMSProp::fPastSquaredWeightGradients
std::vector< std::vector< Matrix_t > > fPastSquaredWeightGradients
The sum of the square of the past weight gradients associated with the deep net.
Definition RMSProp.h:55

TMVA::DNN::TRMSProp::GetBiasUpdates
std::vector< std::vector< Matrix_t > > & GetBiasUpdates()
Definition RMSProp.h:98

TMVA::DNN::TRMSProp::fWorkWeightTensor2
std::vector< std::vector< Matrix_t > > fWorkWeightTensor2
working tensor used to keep a temporary copy of weights or weight gradients
Definition RMSProp.h:66

TMVA::DNN::TRMSProp::GetEpsilon
Scalar_t GetEpsilon() const
Definition RMSProp.h:87

TMVA::DNN::TRMSProp::fWorkBiasTensor1
std::vector< std::vector< Matrix_t > > fWorkBiasTensor1
working tensor used to keep a temporary copy of bias or bias gradients
Definition RMSProp.h:64

TMVA::DNN::TRMSProp::fMomentum
Scalar_t fMomentum
The momentum used for training.
Definition RMSProp.h:51

TMVA::DNN::TRMSProp::GetPastSquaredBiasGradients
std::vector< std::vector< Matrix_t > > & GetPastSquaredBiasGradients()
Definition RMSProp.h:92

TMVA::DNN::TRMSProp::fEpsilon
Scalar_t fEpsilon
The Smoothing term used to avoid division by zero.
Definition RMSProp.h:53

TMVA::DNN::TRMSProp::TRMSProp
TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate=0.001, Scalar_t momentum=0.0, Scalar_t rho=0.9, Scalar_t epsilon=1e-7)
Constructor.
Definition RMSProp.h:107

TMVA::DNN::TRMSProp::fPastSquaredBiasGradients
std::vector< std::vector< Matrix_t > > fPastSquaredBiasGradients
The sum of the square of the past bias gradients associated with the deep net.
Definition RMSProp.h:57

TMVA::DNN::TRMSProp::fWeightUpdates
std::vector< std::vector< Matrix_t > > fWeightUpdates
The accumulation of the past Weights for performing updates.
Definition RMSProp.h:59

TMVA::DNN::TRMSProp::Matrix_t
typename Architecture_t::Matrix_t Matrix_t
Definition RMSProp.h:47

TMVA::DNN::TRMSProp::UpdateBiases
void UpdateBiases(size_t layerIndex, std::vector< Matrix_t > &biases, const std::vector< Matrix_t > &biasGradients)
Update the biases, given the current bias gradients.
Definition RMSProp.h:196

TMVA::DNN::TRMSProp::GetBiasUpdatesAt
std::vector< Matrix_t > & GetBiasUpdatesAt(size_t i)
Definition RMSProp.h:99

TMVA::DNN::TRMSProp::GetWeightUpdates
std::vector< std::vector< Matrix_t > > & GetWeightUpdates()
Definition RMSProp.h:95

TMVA::DNN::TRMSProp::fWorkWeightTensor1
std::vector< std::vector< Matrix_t > > fWorkWeightTensor1
working tensor used to keep a temporary copy of weights or weight gradients
Definition RMSProp.h:62

TMVA::DNN::TRMSProp::GetWeightUpdatesAt
std::vector< Matrix_t > & GetWeightUpdatesAt(size_t i)
Definition RMSProp.h:96

TMVA::DNN::TRMSProp::GetPastSquaredWeightGradients
std::vector< std::vector< Matrix_t > > & GetPastSquaredWeightGradients()
Definition RMSProp.h:89

TMVA::DNN::TRMSProp::fBiasUpdates
std::vector< std::vector< Matrix_t > > fBiasUpdates
The accumulation of the past Biases for performing updates.
Definition RMSProp.h:60

TMVA::DNN::TRMSProp::GetMomentum
Scalar_t GetMomentum() const
Getters.
Definition RMSProp.h:85

TMVA::DNN::TRMSProp::GetPastSquaredBiasGradientsAt
std::vector< Matrix_t > & GetPastSquaredBiasGradientsAt(size_t i)
Definition RMSProp.h:93

TMVA::DNN::VOptimizer
Generic Optimizer class.
Definition Optimizer.h:45

TMVA::DNN::EInitialization::kZero
@ kZero

TMVA
create variable transformations
Definition GeneticMinimizer.h:22

Functions.h