30#ifndef TMVA_DNN_LSTM_LAYER
31#define TMVA_DNN_LSTM_LAYER
55template<
typename Architecture_t>
61 using Matrix_t =
typename Architecture_t::Matrix_t;
62 using Scalar_t =
typename Architecture_t::Scalar_t;
63 using Tensor_t =
typename Architecture_t::Tensor_t;
147 TBasicLSTMLayer(
size_t batchSize,
size_t stateSize,
size_t inputSize,
size_t timeSteps,
bool rememberState =
false,
148 bool returnSequence =
false,
174 const Tensor_t &activations_backward)
override;
183 const Matrix_t & precStateActivations,
const Matrix_t & precCellActivations,
202 void Print()
const override;
340template <
typename Architecture_t>
346 batchSize, 1, timeSteps, inputSize, 1, (returnSequence) ? timeSteps : 1, stateSize, 8,
347 {stateSize, stateSize, stateSize, stateSize, stateSize, stateSize, stateSize, stateSize},
348 {inputSize, inputSize, inputSize, inputSize, stateSize, stateSize, stateSize, stateSize}, 4,
349 {stateSize, stateSize, stateSize, stateSize}, {1, 1, 1, 1}, batchSize, (returnSequence) ? timeSteps : 1,
351 fStateSize(stateSize), fCellSize(stateSize), fTimeSteps(timeSteps), fRememberState(rememberState),
352 fReturnSequence(returnSequence), fF1(
f1), fF2(f2), fInputValue(batchSize, stateSize),
353 fCandidateValue(batchSize, stateSize), fForgetValue(batchSize, stateSize), fOutputValue(batchSize, stateSize),
354 fState(batchSize, stateSize), fCell(batchSize, stateSize), fWeightsInputGate(this->GetWeightsAt(0)),
355 fWeightsInputGateState(this->GetWeightsAt(4)), fInputGateBias(this->GetBiasesAt(0)),
356 fWeightsForgetGate(this->GetWeightsAt(1)), fWeightsForgetGateState(this->GetWeightsAt(5)),
357 fForgetGateBias(this->GetBiasesAt(1)), fWeightsCandidate(this->GetWeightsAt(2)),
358 fWeightsCandidateState(this->GetWeightsAt(6)), fCandidateBias(this->GetBiasesAt(2)),
359 fWeightsOutputGate(this->GetWeightsAt(3)), fWeightsOutputGateState(this->GetWeightsAt(7)),
360 fOutputGateBias(this->GetBiasesAt(3)), fWeightsInputGradients(this->GetWeightGradientsAt(0)),
361 fWeightsInputStateGradients(this->GetWeightGradientsAt(4)), fInputBiasGradients(this->GetBiasGradientsAt(0)),
362 fWeightsForgetGradients(this->GetWeightGradientsAt(1)),
363 fWeightsForgetStateGradients(this->GetWeightGradientsAt(5)), fForgetBiasGradients(this->GetBiasGradientsAt(1)),
364 fWeightsCandidateGradients(this->GetWeightGradientsAt(2)),
365 fWeightsCandidateStateGradients(this->GetWeightGradientsAt(6)),
366 fCandidateBiasGradients(this->GetBiasGradientsAt(2)), fWeightsOutputGradients(this->GetWeightGradientsAt(3)),
367 fWeightsOutputStateGradients(this->GetWeightGradientsAt(7)), fOutputBiasGradients(this->GetBiasGradientsAt(3))
369 for (
size_t i = 0; i < timeSteps; ++i) {
370 fDerivativesInput.emplace_back(batchSize, stateSize);
371 fDerivativesForget.emplace_back(batchSize, stateSize);
372 fDerivativesCandidate.emplace_back(batchSize, stateSize);
373 fDerivativesOutput.emplace_back(batchSize, stateSize);
374 input_gate_value.emplace_back(batchSize, stateSize);
375 forget_gate_value.emplace_back(batchSize, stateSize);
376 candidate_gate_value.emplace_back(batchSize, stateSize);
377 output_gate_value.emplace_back(batchSize, stateSize);
378 cell_value.emplace_back(batchSize, stateSize);
380 Architecture_t::InitializeLSTMTensors(
this);
384template <
typename Architecture_t>
464 Architecture_t::InitializeLSTMTensors(
this);
468template <
typename Architecture_t>
473 Architecture_t::InitializeLSTMDescriptors(
fDescriptors,
this);
478template <
typename Architecture_t>
496template <
typename Architecture_t>
514template <
typename Architecture_t>
532template <
typename Architecture_t>
552template <
typename Architecture_t>
558 if (Architecture_t::IsCudnn()) {
561 assert(input.GetStrides()[1] == this->GetInputSize());
565 Architecture_t::Rearrange(
x, input);
575 auto &cx = this->
fCell;
578 auto &cy = this->
fCell;
583 Architecture_t::RNNForward(
x, hx, cx, weights,
y, hy, cy, rnnDesc, rnnWork, isTraining);
586 Architecture_t::Rearrange(this->
GetOutput(),
y);
589 Tensor_t tmp = (
y.At(
y.GetShape()[0] - 1)).Reshape({
y.GetShape()[1], 1,
y.GetShape()[2]});
590 Architecture_t::Copy(this->
GetOutput(), tmp);
606 Architecture_t::Rearrange(arrInput, input);
631 Matrix_t arrOutputMt = arrOutput[t];
632 Architecture_t::Copy(arrOutputMt,
fState);
638 Architecture_t::Rearrange(this->
GetOutput(), arrOutput);
644 tmp = tmp.Reshape( {tmp.GetShape()[0], tmp.GetShape()[1], 1});
645 assert(tmp.GetSize() == this->GetOutput().GetSize());
646 assert( tmp.GetShape()[0] == this->GetOutput().GetShape()[2]);
647 Architecture_t::Rearrange(this->
GetOutput(), tmp);
654template <
typename Architecture_t>
661 Architecture_t::Hadamard(
fCell, forgetGateValues);
662 Architecture_t::Hadamard(inputGateValues, candidateValues);
663 Architecture_t::ScaleAdd(
fCell, inputGateValues);
666 Architecture_t::Copy(cache,
fCell);
675 Architecture_t::Copy(
fState, cache);
676 Architecture_t::Hadamard(
fState, outputGateValues);
680template <
typename Architecture_t>
682 const Tensor_t &activations_backward)
687 if (Architecture_t::IsCudnn()) {
695 assert(activations_backward.GetStrides()[1] == this->GetInputSize());
697 Architecture_t::Rearrange(
x, activations_backward);
702 Architecture_t::InitializeZero(dy);
707 Tensor_t tmp2 = dy.At(dy.GetShape()[0] - 1).Reshape({dy.GetShape()[1], 1, dy.GetShape()[2]});
712 Architecture_t::Rearrange(
y, this->
GetOutput());
723 Architecture_t::InitializeZero(weightGradients);
738 Architecture_t::RNNBackward(
x, hx, cx,
y, dy, dhy, dcy, weights, dx, dhx, dcx, weightGradients, rnnDesc, rnnWork);
742 if (gradients_backward.GetSize() != 0)
743 Architecture_t::Rearrange(gradients_backward, dx);
761 if (gradients_backward.GetSize() == 0 || gradients_backward[0].GetNrows() == 0 || gradients_backward[0].GetNcols() == 0) {
773 Architecture_t::Rearrange(arr_activations_backward, activations_backward);
787 Architecture_t::Rearrange(arr_output, this->
GetOutput());
792 Architecture_t::InitializeZero(arr_actgradients);
795 assert(tmp_grad.GetSize() == this->GetActivationGradients().GetSize());
796 assert(tmp_grad.GetShape()[0] == this->GetActivationGradients().GetShape()[2]);
827 Architecture_t::ScaleAdd(state_gradients_backward, arr_actgradients[t-1]);
829 const Matrix_t &prevStateActivations = arr_output[t-2];
832 Matrix_t dx = arr_gradients_backward[t-1];
833 CellBackward(state_gradients_backward, cell_gradients_backward,
834 prevStateActivations, prevCellActivations,
837 arr_activations_backward[t-1], dx,
841 const Matrix_t &prevStateActivations = initState;
842 const Matrix_t &prevCellActivations = initState;
843 Matrix_t dx = arr_gradients_backward[t-1];
844 CellBackward(state_gradients_backward, cell_gradients_backward,
845 prevStateActivations, prevCellActivations,
848 arr_activations_backward[t-1], dx,
855 Architecture_t::Rearrange(gradients_backward, arr_gradients_backward );
862template <
typename Architecture_t>
865 const Matrix_t & precStateActivations,
const Matrix_t & precCellActivations,
887 return Architecture_t::LSTMLayerBackward(state_gradients_backward, cell_gradients_backward,
892 precStateActivations, precCellActivations,
893 input_gate, forget_gate, candidate_gate, output_gate,
897 cell_gradient, cell_tanh);
901template <
typename Architecture_t>
910template<
typename Architecture_t>
914 std::cout <<
" LSTM Layer: \t ";
917 std::cout <<
", NTime = " << this->
GetTimeSteps() <<
" )";
918 std::cout <<
"\tOutput = ( " << this->
GetOutput().GetFirstSize() <<
" , " << this->
GetOutput()[0].GetNrows() <<
" , " << this->
GetOutput()[0].GetNcols() <<
" )\n";
922template <
typename Architecture_t>
952template <
typename Architecture_t>
void InputGate(const Matrix_t &input, Matrix_t &di)
Decides the values we'll update (NN with Sigmoid).
const Matrix_t & GetForgetGateTensorAt(size_t i) const
Matrix_t & GetWeightsOutputGateState()
const std::vector< Matrix_t > & GetOutputGateTensor() const
Tensor_t fWeightsTensor
Tensor for all weights.
const std::vector< Matrix_t > & GetInputGateTensor() const
std::vector< Matrix_t > & GetDerivativesOutput()
const Matrix_t & GetWeigthsForgetStateGradients() const
Matrix_t & GetWeightsForgetGate()
typename Architecture_t::Matrix_t Matrix_t
Matrix_t & GetCandidateGateTensorAt(size_t i)
void InitState(DNN::EInitialization m=DNN::EInitialization::kZero)
Initializes the hidden state and the cell state.
Matrix_t & fWeightsCandidateGradients
Gradients w.r.t the candidate gate - input weights.
const Matrix_t & GetOutputGateBias() const
Matrix_t & GetWeightsCandidateStateGradients()
Matrix_t & GetWeightsInputGate()
Matrix_t & GetWeightsInputGateState()
const std::vector< Matrix_t > & GetCandidateGateTensor() const
const Matrix_t & GetInputGateTensorAt(size_t i) const
std::vector< Matrix_t > & GetForgetGateTensor()
std::vector< Matrix_t > cell_value
cell value for every time step
void Backward(Tensor_t &gradients_backward, const Tensor_t &activations_backward) override
Backpropagates the error.
Matrix_t & fWeightsOutputGradients
Gradients w.r.t the output gate - input weights.
Matrix_t & GetOutputGateBias()
Matrix_t & fOutputBiasGradients
Gradients w.r.t the output gate - bias weights.
void Initialize() override
Initialize the weights according to the given initialization method.
DNN::EActivationFunction fF1
Activation function: sigmoid.
Tensor_t fDy
cached activation gradient (input of backward) as T x B x S
Matrix_t & fWeightsOutputGate
Output Gate weights for input, fWeights[3].
Matrix_t & GetForgetGateBias()
Matrix_t & fWeightsCandidateStateGradients
Gradients w.r.t the candidate gate - hidden state weights.
const Matrix_t & GetInputGateBias() const
typename Architecture_t::Scalar_t Scalar_t
size_t GetInputSize() const
Getters.
Matrix_t & GetForgetGateTensorAt(size_t i)
const Matrix_t & GetOutputGateTensorAt(size_t i) const
const Matrix_t & GetCellTensorAt(size_t i) const
Tensor_t fX
cached input tensor as T x B x I
DNN::EActivationFunction GetActivationFunctionF2() const
Matrix_t & GetCellTensorAt(size_t i)
Matrix_t & fWeightsInputStateGradients
Gradients w.r.t the input gate - hidden state weights.
void CellForward(Matrix_t &inputGateValues, const Matrix_t &forgetGateValues, const Matrix_t &candidateValues, const Matrix_t &outputGateValues)
Forward for a single cell (time unit).
Matrix_t & CellBackward(Matrix_t &state_gradients_backward, Matrix_t &cell_gradients_backward, const Matrix_t &precStateActivations, const Matrix_t &precCellActivations, const Matrix_t &input_gate, const Matrix_t &forget_gate, const Matrix_t &candidate_gate, const Matrix_t &output_gate, const Matrix_t &input, Matrix_t &input_gradient, Matrix_t &di, Matrix_t &df, Matrix_t &dc, Matrix_t &dout, size_t t)
Backward pass for a single time unit, matching the corresponding call to Forward(...).
const Matrix_t & GetWeightsInputStateGradients() const
std::vector< Matrix_t > fDerivativesOutput
First derivatives of the output gate activations.
size_t GetStateSize() const
void ReadWeightsFromXML(void *parent) override
Read the information and the weights about the layer from XML node.
Matrix_t & fWeightsForgetGateState
Forget Gate weights for prev state, fWeights[5].
Matrix_t & fOutputGateBias
Output Gate bias.
std::vector< Matrix_t > fDerivativesCandidate
First derivatives of the candidate gate activations.
const Matrix_t & GetInputDerivativesAt(size_t i) const
Matrix_t & fWeightsForgetGate
Forget Gate weights for input, fWeights[1].
Matrix_t & fWeightsInputGradients
Gradients w.r.t the input gate - input weights.
typename Architecture_t::Tensor_t Tensor_t
const std::vector< Matrix_t > & GetDerivativesInput() const
Matrix_t & GetWeightsCandidate()
Matrix_t & fForgetGateBias
Forget Gate bias.
Matrix_t & GetWeightsInputGradients()
Matrix_t & GetCandidateBiasGradients()
Matrix_t & GetWeightsOutputGradients()
Matrix_t & fCandidateBias
Candidate Gate bias.
Matrix_t fCandidateValue
Computed candidate values.
Tensor_t & GetWeightGradientsTensor()
bool DoesRememberState() const
const Matrix_t & GetWeightsOutputGradients() const
typename Architecture_t::RecurrentDescriptor_t LayerDescriptor_t
const Matrix_t & GetWeightsInputGradients() const
Matrix_t & GetWeightsCandidateState()
Matrix_t & GetInputBiasGradients()
const Matrix_t & GetInputBiasGradients() const
size_t GetTimeSteps() const
DNN::EActivationFunction fF2
Activation function: tanh.
void AddWeightsXMLTo(void *parent) override
Writes the information and the weights about the layer in an XML node.
Matrix_t & fInputBiasGradients
Gradients w.r.t the input gate - bias weights.
Matrix_t & GetWeightsOutputStateGradients()
Matrix_t & fWeightsCandidateState
Candidate Gate weights for prev state, fWeights[6].
Matrix_t & GetForgetGateValue()
std::vector< Matrix_t > fDerivativesForget
First derivatives of the forget gate activations.
const Tensor_t & GetWeightGradientsTensor() const
Matrix_t & GetForgetDerivativesAt(size_t i)
const Matrix_t & GetWeightsInputGateState() const
Matrix_t & GetWeightsInputStateGradients()
typename Architecture_t::DropoutDescriptor_t HelperDescriptor_t
Matrix_t & fForgetBiasGradients
Gradients w.r.t the forget gate - bias weights.
const Matrix_t & GetCandidateBias() const
std::vector< Matrix_t > output_gate_value
output gate value for every time step
const std::vector< Matrix_t > & GetDerivativesCandidate() const
size_t fStateSize
Hidden state size for LSTM.
void CandidateValue(const Matrix_t &input, Matrix_t &dc)
Decides the new candidate values (NN with Tanh).
std::vector< Matrix_t > fDerivativesInput
First derivatives of the input gate activations.
const Matrix_t & GetWeightsForgetGateState() const
Matrix_t & GetWeightsForgetGateState()
const Matrix_t & GetWeightsInputGate() const
const Matrix_t & GetInputGateValue() const
void Update(const Scalar_t learningRate)
bool DoesReturnSequence() const
Tensor_t fDx
cached gradient on the input (output of backward) as T x B x I
typename Architecture_t::RNNWorkspace_t RNNWorkspace_t
Matrix_t & GetOutputGateValue()
TBasicLSTMLayer(size_t batchSize, size_t stateSize, size_t inputSize, size_t timeSteps, bool rememberState=false, bool returnSequence=false, DNN::EActivationFunction f1=DNN::EActivationFunction::kSigmoid, DNN::EActivationFunction f2=DNN::EActivationFunction::kTanh, bool training=true, DNN::EInitialization fA=DNN::EInitialization::kZero)
Constructor.
Matrix_t & GetWeightsForgetStateGradients()
const Matrix_t & GetOutputBiasGradients() const
typename Architecture_t::TensorDescriptor_t TensorDescriptor_t
const Matrix_t & GetWeightsOutputStateGradients() const
Matrix_t & fWeightsOutputStateGradients
Gradients w.r.t the output gate - hidden state weights.
bool fReturnSequence
Return in output full sequence or just last element.
Matrix_t & GetWeightsForgetGradients()
Matrix_t & GetWeightsCandidateGradients()
void Forward(Tensor_t &input, bool isTraining=true) override
Computes the next hidden state and next cell state with given input matrix.
const Matrix_t & GetWeightsForgetGradients() const
Matrix_t fCell
Cell state of LSTM.
std::vector< Matrix_t > & GetDerivativesCandidate()
const Matrix_t & GetForgetBiasGradients() const
std::vector< Matrix_t > & GetOutputGateTensor()
Matrix_t & GetCandidateValue()
const Matrix_t & GetForgetDerivativesAt(size_t i) const
Matrix_t fState
Hidden state of LSTM.
void OutputGate(const Matrix_t &input, Matrix_t &dout)
Computes output values (NN with Sigmoid).
const Matrix_t & GetForgetGateValue() const
std::vector< Matrix_t > candidate_gate_value
candidate gate value for every time step
Matrix_t & GetInputGateValue()
const Matrix_t & GetState() const
const Matrix_t & GetWeightsCandidateState() const
Matrix_t & GetCandidateBias()
const std::vector< Matrix_t > & GetForgetGateTensor() const
const std::vector< Matrix_t > & GetDerivativesOutput() const
const std::vector< Matrix_t > & GetCellTensor() const
const Tensor_t & GetWeightsTensor() const
Matrix_t & fWeightsInputGate
Input Gate weights for input, fWeights[0].
std::vector< Matrix_t > & GetCandidateGateTensor()
const Matrix_t & GetOutputDerivativesAt(size_t i) const
const Matrix_t & GetCell() const
Matrix_t & fWeightsForgetStateGradients
Gradients w.r.t the forget gate - hidden state weights.
const Matrix_t & GetCandidateGateTensorAt(size_t i) const
Matrix_t fOutputValue
Computed output gate values.
size_t fCellSize
Cell state size of LSTM.
Matrix_t & GetOutputDerivativesAt(size_t i)
Matrix_t & GetInputGateTensorAt(size_t i)
std::vector< Matrix_t > & GetDerivativesInput()
Matrix_t & fWeightsOutputGateState
Output Gate weights for prev state, fWeights[7].
const std::vector< Matrix_t > & GetDerivativesForget() const
Matrix_t & GetForgetBiasGradients()
const Matrix_t & GetForgetGateBias() const
const Matrix_t & GetCandidateDerivativesAt(size_t i) const
Matrix_t & GetInputGateBias()
Matrix_t & GetOutputGateTensorAt(size_t i)
size_t fTimeSteps
Timesteps for LSTM.
const Matrix_t & GetCandidateBiasGradients() const
const Matrix_t & GetCandidateValue() const
typename Architecture_t::FilterDescriptor_t WeightsDescriptor_t
Matrix_t & fInputGateBias
Input Gate bias.
const Matrix_t & GetWeightsForgetGate() const
std::vector< Matrix_t > input_gate_value
input gate value for every time step
const Matrix_t & GetWeightsCandidateStateGradients() const
Tensor_t & GetWeightsTensor()
Matrix_t & fWeightsForgetGradients
Gradients w.r.t the forget gate - input weights.
std::vector< Matrix_t > & GetDerivativesForget()
const Matrix_t & GetWeightsOutputGate() const
void ForgetGate(const Matrix_t &input, Matrix_t &df)
Forgets the past values (NN with Sigmoid).
std::vector< Matrix_t > & GetInputGateTensor()
Matrix_t & GetOutputBiasGradients()
void Print() const override
Prints the info about the layer.
const Matrix_t & GetOutputGateValue() const
const Matrix_t & GetWeightsOutputGateState() const
Matrix_t & GetCandidateDerivativesAt(size_t i)
Matrix_t fInputValue
Computed input gate values.
Matrix_t & GetWeightsOutputGate()
const Matrix_t & GetWeightsCandidate() const
const Matrix_t & GetWeightsCandidateGradients() const
Tensor_t fWeightGradientsTensor
Tensor for all weight gradients.
Matrix_t & GetInputDerivativesAt(size_t i)
typename Architecture_t::RNNDescriptors_t RNNDescriptors_t
DNN::EActivationFunction GetActivationFunctionF1() const
Tensor_t fY
cached output tensor as T x B x S
std::vector< Matrix_t > forget_gate_value
forget gate value for every time step
Matrix_t & fWeightsCandidate
Candidate Gate weights for input, fWeights[2].
bool fRememberState
Remember state in next pass.
Matrix_t & fWeightsInputGateState
Input Gate weights for prev state, fWeights[4].
TDescriptors * fDescriptors
Keeps all the RNN descriptors.
std::vector< Matrix_t > & GetCellTensor()
size_t GetCellSize() const
Matrix_t & fCandidateBiasGradients
Gradients w.r.t the candidate gate - bias weights.
Matrix_t fForgetValue
Computed forget gate values.
const Matrix_t & GetWeightsAt(size_t i) const
virtual void Initialize()
Initialize the weights and biases according to the given initialization method.
const Tensor_t & GetOutput() const
void WriteMatrixToXML(void *node, const char *name, const Matrix_t &matrix)
const Tensor_t & GetActivationGradients() const
const Matrix_t & GetBiasesAt(size_t i) const
const Matrix_t & GetBiasGradientsAt(size_t i) const
size_t GetBatchSize() const
Getters.
void ReadMatrixXML(void *node, const char *name, Matrix_t &matrix)
const Matrix_t & GetWeightGradientsAt(size_t i) const
VGeneralLayer(size_t BatchSize, size_t InputDepth, size_t InputHeight, size_t InputWidth, size_t Depth, size_t Height, size_t Width, size_t WeightsNSlices, size_t WeightsNRows, size_t WeightsNCols, size_t BiasesNSlices, size_t BiasesNRows, size_t BiasesNCols, size_t OutputNSlices, size_t OutputNRows, size_t OutputNCols, EInitialization Init)
Constructor.
size_t GetInputWidth() const
XMLNodePointer_t NewChild(XMLNodePointer_t parent, XMLNsPointer_t ns, const char *name, const char *content=nullptr)
create new child element for parent node
XMLAttrPointer_t NewAttr(XMLNodePointer_t xmlnode, XMLNsPointer_t, const char *name, const char *value)
creates new attribute for xmlnode, namespaces are not supported for attributes
void evaluateDerivativeMatrix(typename Architecture_t::Matrix_t &B, EActivationFunction f, const typename Architecture_t::Matrix_t &A)
void evaluateMatrix(typename Architecture_t::Matrix_t &A, EActivationFunction f)
EActivationFunction
Enum that represents layer activation functions.
void initialize(typename Architecture_t::Matrix_t &A, EInitialization m)
create variable transformations