GRULayer.h
// @(#)root/tmva/tmva/dnn/gru:$Id$
// Author: Surya S Dwivedi 03/07/19

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : BasicGRULayer                                                         *
 *                                                                                *
 * Description:                                                                   *
 *       NeuralNetwork                                                            *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *       Surya S Dwivedi <surya2191997@gmail.com> - IIT Kharagpur, India          *
 *                                                                                *
 * Copyright (c) 2005-2019:                                                       *
 *       All rights reserved.                                                     *
 *       CERN, Switzerland                                                        *
 *                                                                                *
 * For the licensing terms see $ROOTSYS/LICENSE.                                  *
 * For the list of contributors see $ROOTSYS/README/CREDITS.                      *
 **********************************************************************************/

//#pragma once

//////////////////////////////////////////////////////////////////////
// This class implements the GRU layer. GRU is a variant of vanilla
// RNN which is capable of learning long range dependencies.
//////////////////////////////////////////////////////////////////////
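//
// Sketch of the recurrence implemented below (standard GRU notation;
// the symbols are not identifiers from this file): for input x_t and
// previous hidden state h_(t-1),
//
//    r_t = sigmoid(W_r . x_t + U_r . h_(t-1) + b_r)         reset gate
//    u_t = sigmoid(W_u . x_t + U_u . h_(t-1) + b_u)         update gate
//    c_t = tanh(W_c . x_t + U_c . (r_t o h_(t-1)) + b_c)    candidate state
//    h_t = u_t o h_(t-1) + (1 - u_t) o c_t                  new hidden state
//
// where "o" denotes the element-wise (Hadamard) product; compare with
// ResetGate(), UpdateGate(), CandidateValue() and CellForward() below.
//////////////////////////////////////////////////////////////////////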

#ifndef TMVA_DNN_GRU_LAYER
#define TMVA_DNN_GRU_LAYER

#include <cmath>
#include <iostream>
#include <vector>

#include "TMatrix.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA
{
namespace DNN
{
namespace RNN
{

//______________________________________________________________________________
//
// Basic GRU Layer
//______________________________________________________________________________

/** \class BasicGRULayer
    Generic implementation
*/
template<typename Architecture_t>
class TBasicGRULayer : public VGeneralLayer<Architecture_t>
{

public:

   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Tensor_t = typename Architecture_t::Tensor_t;

   using LayerDescriptor_t   = typename Architecture_t::RecurrentDescriptor_t;
   using WeightsDescriptor_t = typename Architecture_t::FilterDescriptor_t;
   using TensorDescriptor_t  = typename Architecture_t::TensorDescriptor_t;
   using HelperDescriptor_t  = typename Architecture_t::DropoutDescriptor_t;

   using RNNWorkspace_t = typename Architecture_t::RNNWorkspace_t;
   using RNNDescriptors_t = typename Architecture_t::RNNDescriptors_t;

private:

   size_t fStateSize;                           ///< Hidden state size for GRU
   size_t fTimeSteps;                           ///< Timesteps for GRU

   bool fRememberState;                         ///< Remember state in next pass
   bool fReturnSequence = false;                ///< Return in output full sequence or just last element
   bool fResetGateAfter = false;                ///< GRU variant to apply the reset gate multiplication afterwards (used by cuDNN)

   DNN::EActivationFunction fF1;                ///< Activation function: sigmoid
   DNN::EActivationFunction fF2;                ///< Activation function: tanh

   Matrix_t fResetValue;                        ///< Computed reset gate values
   Matrix_t fUpdateValue;                       ///< Computed update gate values
   Matrix_t fCandidateValue;                    ///< Computed candidate values
   Matrix_t fState;                             ///< Hidden state of GRU

   Matrix_t &fWeightsResetGate;                 ///< Reset Gate weights for input, fWeights[0]
   Matrix_t &fWeightsResetGateState;            ///< Reset Gate weights for prev state, fWeights[3]
   Matrix_t &fResetGateBias;                    ///< Reset Gate bias

   Matrix_t &fWeightsUpdateGate;                ///< Update Gate weights for input, fWeights[1]
   Matrix_t &fWeightsUpdateGateState;           ///< Update Gate weights for prev state, fWeights[4]
   Matrix_t &fUpdateGateBias;                   ///< Update Gate bias

   Matrix_t &fWeightsCandidate;                 ///< Candidate Gate weights for input, fWeights[2]
   Matrix_t &fWeightsCandidateState;            ///< Candidate Gate weights for prev state, fWeights[5]
   Matrix_t &fCandidateBias;                    ///< Candidate Gate bias

   std::vector<Matrix_t> reset_gate_value;      ///< Reset gate value for every time step
   std::vector<Matrix_t> update_gate_value;     ///< Update gate value for every time step
   std::vector<Matrix_t> candidate_gate_value;  ///< Candidate gate value for every time step

   std::vector<Matrix_t> fDerivativesReset;     ///< First derivatives of the activations, reset gate
   std::vector<Matrix_t> fDerivativesUpdate;    ///< First derivatives of the activations, update gate
   std::vector<Matrix_t> fDerivativesCandidate; ///< First derivatives of the activations, candidate gate

   Matrix_t &fWeightsResetGradients;            ///< Gradients w.r.t the reset gate - input weights
   Matrix_t &fWeightsResetStateGradients;       ///< Gradients w.r.t the reset gate - hidden state weights
   Matrix_t &fResetBiasGradients;               ///< Gradients w.r.t the reset gate - bias weights
   Matrix_t &fWeightsUpdateGradients;           ///< Gradients w.r.t the update gate - input weights
   Matrix_t &fWeightsUpdateStateGradients;      ///< Gradients w.r.t the update gate - hidden state weights
   Matrix_t &fUpdateBiasGradients;              ///< Gradients w.r.t the update gate - bias weights
   Matrix_t &fWeightsCandidateGradients;        ///< Gradients w.r.t the candidate gate - input weights
   Matrix_t &fWeightsCandidateStateGradients;   ///< Gradients w.r.t the candidate gate - hidden state weights
   Matrix_t &fCandidateBiasGradients;           ///< Gradients w.r.t the candidate gate - bias weights

   Matrix_t fCell;                              ///< Empty matrix for GRU; a GRU has no cell state

   // Tensor representing all weights (used by cuDNN)
   Tensor_t fWeightsTensor;                     ///< Tensor for all weights
   Tensor_t fWeightGradientsTensor;             ///< Tensor for all weight gradients

   // Tensors used internally for the forward and backward pass
   Tensor_t fX;  ///< cached input tensor as T x B x I
   Tensor_t fY;  ///< cached output tensor as T x B x S
   Tensor_t fDx; ///< cached gradient on the input (output of backward) as T x B x I
   Tensor_t fDy; ///< cached activation gradient (input of backward) as T x B x S

   TDescriptors *fDescriptors = nullptr; ///< Keeps all the RNN descriptors
   TWorkspace *fWorkspace = nullptr;     ///< Workspace needed for GPU computation (cuDNN)

public:

   /*! Constructor */
   TBasicGRULayer(size_t batchSize, size_t stateSize, size_t inputSize,
                  size_t timeSteps, bool rememberState = false, bool returnSequence = false,
                  bool resetGateAfter = false,
                  DNN::EActivationFunction f1 = DNN::EActivationFunction::kSigmoid,
                  DNN::EActivationFunction f2 = DNN::EActivationFunction::kTanh,
                  bool training = true, DNN::EInitialization fA = DNN::EInitialization::kZero);

   /*! Copy Constructor */
   TBasicGRULayer(const TBasicGRULayer &);

   /*! Initialize the weights according to the given initialization
    *  method. */
   virtual void Initialize();

   /*! Initialize the hidden state and cell state method. */
   void InitState(DNN::EInitialization m = DNN::EInitialization::kZero);

   /*! Computes the next hidden state
    *  with the given input tensor. */
   void Forward(Tensor_t &input, bool isTraining = true);

   /*! Forward for a single cell (time unit) */
   void CellForward(Matrix_t &updateGateValues, Matrix_t &candidateValues);

   /*! Backpropagates the error. Must only be called directly after the
    *  corresponding call to Forward(...). */
   void Backward(Tensor_t &gradients_backward,
                 const Tensor_t &activations_backward);

   /* Updates weights and biases, given the learning rate */
   void Update(const Scalar_t learningRate);

   /*! Backward for a single time unit
    *  at the corresponding call to Forward(...). */
   Matrix_t & CellBackward(Matrix_t & state_gradients_backward,
                           const Matrix_t & precStateActivations,
                           const Matrix_t & reset_gate, const Matrix_t & update_gate,
                           const Matrix_t & candidate_gate,
                           const Matrix_t & input, Matrix_t & input_gradient,
                           Matrix_t &dr, Matrix_t &du, Matrix_t &dc);

   /*! Computes the reset gate values (NN with Sigmoid) */
   void ResetGate(const Matrix_t &input, Matrix_t &dr);

   /*! Computes the update gate values, which control how much of the
    *  past state is kept (NN with Sigmoid) */
   void UpdateGate(const Matrix_t &input, Matrix_t &du);

   /*! Computes the new candidate values (NN with Tanh) */
   void CandidateValue(const Matrix_t &input, Matrix_t &dc);

   /*! Prints the info about the layer */
   void Print() const;

   /*! Writes the information and the weights about the layer in an XML node. */
   void AddWeightsXMLTo(void *parent);

   /*! Read the information and the weights about the layer from XML node. */
   void ReadWeightsFromXML(void *parent);

   /*! Getters */
   size_t GetInputSize() const { return this->GetInputWidth(); }
   size_t GetTimeSteps() const { return fTimeSteps; }
   size_t GetStateSize() const { return fStateSize; }

   inline bool DoesRememberState() const { return fRememberState; }
   inline bool DoesReturnSequence() const { return fReturnSequence; }

   inline DNN::EActivationFunction GetActivationFunctionF1() const { return fF1; }
   inline DNN::EActivationFunction GetActivationFunctionF2() const { return fF2; }

   const Matrix_t & GetResetGateValue() const { return fResetValue; }
   Matrix_t & GetResetGateValue() { return fResetValue; }
   const Matrix_t & GetCandidateValue() const { return fCandidateValue; }
   Matrix_t & GetCandidateValue() { return fCandidateValue; }
   const Matrix_t & GetUpdateGateValue() const { return fUpdateValue; }
   Matrix_t & GetUpdateGateValue() { return fUpdateValue; }

   const Matrix_t & GetState() const { return fState; }
   Matrix_t & GetState() { return fState; }
   const Matrix_t & GetCell() const { return fCell; }
   Matrix_t & GetCell() { return fCell; }

   const Matrix_t & GetWeightsResetGate() const { return fWeightsResetGate; }
   Matrix_t & GetWeightsResetGate() { return fWeightsResetGate; }
   const Matrix_t & GetWeightsCandidate() const { return fWeightsCandidate; }
   Matrix_t & GetWeightsCandidate() { return fWeightsCandidate; }
   const Matrix_t & GetWeightsUpdateGate() const { return fWeightsUpdateGate; }
   Matrix_t & GetWeightsUpdateGate() { return fWeightsUpdateGate; }

   const Matrix_t & GetWeightsResetGateState() const { return fWeightsResetGateState; }
   Matrix_t & GetWeightsResetGateState() { return fWeightsResetGateState; }
   const Matrix_t & GetWeightsUpdateGateState() const { return fWeightsUpdateGateState; }
   Matrix_t & GetWeightsUpdateGateState() { return fWeightsUpdateGateState; }
   const Matrix_t & GetWeightsCandidateState() const { return fWeightsCandidateState; }
   Matrix_t & GetWeightsCandidateState() { return fWeightsCandidateState; }

   const std::vector<Matrix_t> & GetDerivativesReset() const { return fDerivativesReset; }
   std::vector<Matrix_t> & GetDerivativesReset() { return fDerivativesReset; }
   const Matrix_t & GetResetDerivativesAt(size_t i) const { return fDerivativesReset[i]; }
   Matrix_t & GetResetDerivativesAt(size_t i) { return fDerivativesReset[i]; }
   const std::vector<Matrix_t> & GetDerivativesUpdate() const { return fDerivativesUpdate; }
   std::vector<Matrix_t> & GetDerivativesUpdate() { return fDerivativesUpdate; }
   const Matrix_t & GetUpdateDerivativesAt(size_t i) const { return fDerivativesUpdate[i]; }
   Matrix_t & GetUpdateDerivativesAt(size_t i) { return fDerivativesUpdate[i]; }
   const std::vector<Matrix_t> & GetDerivativesCandidate() const { return fDerivativesCandidate; }
   std::vector<Matrix_t> & GetDerivativesCandidate() { return fDerivativesCandidate; }
   const Matrix_t & GetCandidateDerivativesAt(size_t i) const { return fDerivativesCandidate[i]; }
   Matrix_t & GetCandidateDerivativesAt(size_t i) { return fDerivativesCandidate[i]; }

   const std::vector<Matrix_t> & GetResetGateTensor() const { return reset_gate_value; }
   std::vector<Matrix_t> & GetResetGateTensor() { return reset_gate_value; }
   const Matrix_t & GetResetGateTensorAt(size_t i) const { return reset_gate_value[i]; }
   Matrix_t & GetResetGateTensorAt(size_t i) { return reset_gate_value[i]; }
   const std::vector<Matrix_t> & GetUpdateGateTensor() const { return update_gate_value; }
   std::vector<Matrix_t> & GetUpdateGateTensor() { return update_gate_value; }
   const Matrix_t & GetUpdateGateTensorAt(size_t i) const { return update_gate_value[i]; }
   Matrix_t & GetUpdateGateTensorAt(size_t i) { return update_gate_value[i]; }
   const std::vector<Matrix_t> & GetCandidateGateTensor() const { return candidate_gate_value; }
   std::vector<Matrix_t> & GetCandidateGateTensor() { return candidate_gate_value; }
   const Matrix_t & GetCandidateGateTensorAt(size_t i) const { return candidate_gate_value[i]; }
   Matrix_t & GetCandidateGateTensorAt(size_t i) { return candidate_gate_value[i]; }

   const Matrix_t & GetResetGateBias() const { return fResetGateBias; }
   Matrix_t & GetResetGateBias() { return fResetGateBias; }
   const Matrix_t & GetUpdateGateBias() const { return fUpdateGateBias; }
   Matrix_t & GetUpdateGateBias() { return fUpdateGateBias; }
   const Matrix_t & GetCandidateBias() const { return fCandidateBias; }
   Matrix_t & GetCandidateBias() { return fCandidateBias; }

   const Matrix_t & GetWeightsResetGradients() const { return fWeightsResetGradients; }
   Matrix_t & GetWeightsResetGradients() { return fWeightsResetGradients; }
   const Matrix_t & GetWeightsResetStateGradients() const { return fWeightsResetStateGradients; }
   Matrix_t & GetWeightsResetStateGradients() { return fWeightsResetStateGradients; }
   const Matrix_t & GetResetBiasGradients() const { return fResetBiasGradients; }
   Matrix_t & GetResetBiasGradients() { return fResetBiasGradients; }
   const Matrix_t & GetWeightsUpdateGradients() const { return fWeightsUpdateGradients; }
   Matrix_t & GetWeightsUpdateGradients() { return fWeightsUpdateGradients; }
   const Matrix_t & GetWeightsUpdateStateGradients() const { return fWeightsUpdateStateGradients; }
   Matrix_t & GetWeightsUpdateStateGradients() { return fWeightsUpdateStateGradients; }
   const Matrix_t & GetUpdateBiasGradients() const { return fUpdateBiasGradients; }
   Matrix_t & GetUpdateBiasGradients() { return fUpdateBiasGradients; }
   const Matrix_t & GetWeightsCandidateGradients() const { return fWeightsCandidateGradients; }
   Matrix_t & GetWeightsCandidateGradients() { return fWeightsCandidateGradients; }
   const Matrix_t & GetWeightsCandidateStateGradients() const { return fWeightsCandidateStateGradients; }
   Matrix_t & GetWeightsCandidateStateGradients() { return fWeightsCandidateStateGradients; }
   const Matrix_t & GetCandidateBiasGradients() const { return fCandidateBiasGradients; }
   Matrix_t & GetCandidateBiasGradients() { return fCandidateBiasGradients; }

   Tensor_t & GetWeightsTensor() { return fWeightsTensor; }
   const Tensor_t & GetWeightsTensor() const { return fWeightsTensor; }
   Tensor_t & GetWeightGradientsTensor() { return fWeightGradientsTensor; }
   const Tensor_t & GetWeightGradientsTensor() const { return fWeightGradientsTensor; }

   Tensor_t & GetX() { return fX; }
   Tensor_t & GetY() { return fY; }
   Tensor_t & GetDX() { return fDx; }
   Tensor_t & GetDY() { return fDy; }
};
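
// Example (illustrative sketch, not part of the original header): building a
// GRU layer directly. Assumes the CPU backend TCpu from
// "TMVA/DNN/Architectures/Cpu.h"; the sizes are arbitrary.
//
//    using Arch_t = TMVA::DNN::TCpu<double>;
//    TMVA::DNN::RNN::TBasicGRULayer<Arch_t> gru(/*batchSize=*/32, /*stateSize=*/64,
//                                               /*inputSize=*/16, /*timeSteps=*/10,
//                                               /*rememberState=*/false,
//                                               /*returnSequence=*/true);
//    gru.Initialize();
//    Arch_t::Tensor_t input(32, 10, 16); // B x T x D
//    gru.Forward(input);
//    // gru.GetOutput() now holds the output sequence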


//______________________________________________________________________________
//
// Basic GRU-Layer Implementation
//______________________________________________________________________________

template <typename Architecture_t>
TBasicGRULayer<Architecture_t>::TBasicGRULayer(size_t batchSize, size_t stateSize, size_t inputSize, size_t timeSteps,
                                               bool rememberState, bool returnSequence, bool resetGateAfter, DNN::EActivationFunction f1,
                                               DNN::EActivationFunction f2, bool /* training */,
                                               DNN::EInitialization fA)
   : VGeneralLayer<Architecture_t>(batchSize, 1, timeSteps, inputSize, 1, (returnSequence) ? timeSteps : 1, stateSize,
                                   6, {stateSize, stateSize, stateSize, stateSize, stateSize, stateSize},
                                   {inputSize, inputSize, inputSize, stateSize, stateSize, stateSize}, 3,
                                   {stateSize, stateSize, stateSize}, {1, 1, 1}, batchSize,
                                   (returnSequence) ? timeSteps : 1, stateSize, fA),
     fStateSize(stateSize), fTimeSteps(timeSteps), fRememberState(rememberState), fReturnSequence(returnSequence), fResetGateAfter(resetGateAfter),
     fF1(f1), fF2(f2), fResetValue(batchSize, stateSize), fUpdateValue(batchSize, stateSize),
     fCandidateValue(batchSize, stateSize), fState(batchSize, stateSize), fWeightsResetGate(this->GetWeightsAt(0)),
     fWeightsResetGateState(this->GetWeightsAt(3)), fResetGateBias(this->GetBiasesAt(0)),
     fWeightsUpdateGate(this->GetWeightsAt(1)), fWeightsUpdateGateState(this->GetWeightsAt(4)),
     fUpdateGateBias(this->GetBiasesAt(1)), fWeightsCandidate(this->GetWeightsAt(2)),
     fWeightsCandidateState(this->GetWeightsAt(5)), fCandidateBias(this->GetBiasesAt(2)),
     fWeightsResetGradients(this->GetWeightGradientsAt(0)), fWeightsResetStateGradients(this->GetWeightGradientsAt(3)),
     fResetBiasGradients(this->GetBiasGradientsAt(0)), fWeightsUpdateGradients(this->GetWeightGradientsAt(1)),
     fWeightsUpdateStateGradients(this->GetWeightGradientsAt(4)), fUpdateBiasGradients(this->GetBiasGradientsAt(1)),
     fWeightsCandidateGradients(this->GetWeightGradientsAt(2)),
     fWeightsCandidateStateGradients(this->GetWeightGradientsAt(5)),
     fCandidateBiasGradients(this->GetBiasGradientsAt(2))
{
   for (size_t i = 0; i < timeSteps; ++i) {
      fDerivativesReset.emplace_back(batchSize, stateSize);
      fDerivativesUpdate.emplace_back(batchSize, stateSize);
      fDerivativesCandidate.emplace_back(batchSize, stateSize);
      reset_gate_value.emplace_back(batchSize, stateSize);
      update_gate_value.emplace_back(batchSize, stateSize);
      candidate_gate_value.emplace_back(batchSize, stateSize);
   }
   Architecture_t::InitializeGRUTensors(this);
}
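
// Note on storage layout (summarizing the constructor above): the layer owns
// 6 weight matrices and 3 bias vectors through VGeneralLayer. fWeights[0..2]
// are the input weights of the reset, update and candidate gates,
// fWeights[3..5] the corresponding recurrent (previous-state) weights, and
// the biases at indices 0..2 belong to the reset, update and candidate gates.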

//______________________________________________________________________________
template <typename Architecture_t>
TBasicGRULayer<Architecture_t>::TBasicGRULayer(const TBasicGRULayer &layer)
   : VGeneralLayer<Architecture_t>(layer),
     fStateSize(layer.fStateSize),
     fTimeSteps(layer.fTimeSteps),
     fRememberState(layer.fRememberState),
     fReturnSequence(layer.fReturnSequence),
     fResetGateAfter(layer.fResetGateAfter),
     fF1(layer.GetActivationFunctionF1()),
     fF2(layer.GetActivationFunctionF2()),
     fResetValue(layer.GetBatchSize(), layer.GetStateSize()),
     fUpdateValue(layer.GetBatchSize(), layer.GetStateSize()),
     fCandidateValue(layer.GetBatchSize(), layer.GetStateSize()),
     fState(layer.GetBatchSize(), layer.GetStateSize()),
     fWeightsResetGate(this->GetWeightsAt(0)),
     fWeightsResetGateState(this->GetWeightsAt(3)),
     fResetGateBias(this->GetBiasesAt(0)),
     fWeightsUpdateGate(this->GetWeightsAt(1)),
     fWeightsUpdateGateState(this->GetWeightsAt(4)),
     fUpdateGateBias(this->GetBiasesAt(1)),
     fWeightsCandidate(this->GetWeightsAt(2)),
     fWeightsCandidateState(this->GetWeightsAt(5)),
     fCandidateBias(this->GetBiasesAt(2)),
     fWeightsResetGradients(this->GetWeightGradientsAt(0)),
     fWeightsResetStateGradients(this->GetWeightGradientsAt(3)),
     fResetBiasGradients(this->GetBiasGradientsAt(0)),
     fWeightsUpdateGradients(this->GetWeightGradientsAt(1)),
     fWeightsUpdateStateGradients(this->GetWeightGradientsAt(4)),
     fUpdateBiasGradients(this->GetBiasGradientsAt(1)),
     fWeightsCandidateGradients(this->GetWeightGradientsAt(2)),
     fWeightsCandidateStateGradients(this->GetWeightGradientsAt(5)),
     fCandidateBiasGradients(this->GetBiasGradientsAt(2))
{
   for (size_t i = 0; i < fTimeSteps; ++i) {
      fDerivativesReset.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(fDerivativesReset[i], layer.GetResetDerivativesAt(i));

      fDerivativesUpdate.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(fDerivativesUpdate[i], layer.GetUpdateDerivativesAt(i));

      fDerivativesCandidate.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(fDerivativesCandidate[i], layer.GetCandidateDerivativesAt(i));

      reset_gate_value.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(reset_gate_value[i], layer.GetResetGateTensorAt(i));

      update_gate_value.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(update_gate_value[i], layer.GetUpdateGateTensorAt(i));

      candidate_gate_value.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(candidate_gate_value[i], layer.GetCandidateGateTensorAt(i));
   }

   // Gradient matrices are not copied
   Architecture_t::Copy(fState, layer.GetState());

   // Copy each gate's values.
   Architecture_t::Copy(fResetValue, layer.GetResetGateValue());
   Architecture_t::Copy(fCandidateValue, layer.GetCandidateValue());
   Architecture_t::Copy(fUpdateValue, layer.GetUpdateGateValue());

   Architecture_t::InitializeGRUTensors(this);
}

//______________________________________________________________________________
template <typename Architecture_t>
void TBasicGRULayer<Architecture_t>::Initialize()
{
   VGeneralLayer<Architecture_t>::Initialize();

   Architecture_t::InitializeGRUDescriptors(fDescriptors, this);
   Architecture_t::InitializeGRUWorkspace(fWorkspace, fDescriptors, this);

   // cuDNN only supports the resetGate-after variant
   if (Architecture_t::IsCudnn())
      fResetGateAfter = true;
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::ResetGate(const Matrix_t &input, Matrix_t &dr)
-> void
{
   /*! Computes the reset gate values according to the equation:
    *  reset = act(W_input . input + W_state . state + bias)
    *  activation function: sigmoid. */
   const DNN::EActivationFunction fRst = this->GetActivationFunctionF1();
   Matrix_t tmpState(fResetValue.GetNrows(), fResetValue.GetNcols());
   Architecture_t::MultiplyTranspose(tmpState, fState, fWeightsResetGateState);
   Architecture_t::MultiplyTranspose(fResetValue, input, fWeightsResetGate);
   Architecture_t::ScaleAdd(fResetValue, tmpState);
   Architecture_t::AddRowWise(fResetValue, fResetGateBias);
   DNN::evaluateDerivativeMatrix<Architecture_t>(dr, fRst, fResetValue);
   DNN::evaluateMatrix<Architecture_t>(fResetValue, fRst);
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::UpdateGate(const Matrix_t &input, Matrix_t &du)
-> void
{
   /*! Computes the update gate values according to the equation:
    *  update = act(W_input . input + W_state . state + bias)
    *  activation function: sigmoid. */
   const DNN::EActivationFunction fUpd = this->GetActivationFunctionF1();
   Matrix_t tmpState(fUpdateValue.GetNrows(), fUpdateValue.GetNcols());
   Architecture_t::MultiplyTranspose(tmpState, fState, fWeightsUpdateGateState);
   Architecture_t::MultiplyTranspose(fUpdateValue, input, fWeightsUpdateGate);
   Architecture_t::ScaleAdd(fUpdateValue, tmpState);
   Architecture_t::AddRowWise(fUpdateValue, fUpdateGateBias);
   DNN::evaluateDerivativeMatrix<Architecture_t>(du, fUpd, fUpdateValue);
   DNN::evaluateMatrix<Architecture_t>(fUpdateValue, fUpd);
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::CandidateValue(const Matrix_t &input, Matrix_t &dc)
-> void
{
   /*! Vanilla GRU:
    *    candidate_value = act(W_input . input + W_state . (reset * state) + bias)
    *
    *  cuDNN instead uses the faster reset-after variant (with bias mode = input),
    *  which applies the reset gate multiplication after the matrix multiplication:
    *    candidate_value = act(W_input . input + reset * (W_state . state) + bias)
    *
    *  activation function: tanh. */
   const DNN::EActivationFunction fCan = this->GetActivationFunctionF2();
   Matrix_t tmp(fCandidateValue.GetNrows(), fCandidateValue.GetNcols());
   if (!fResetGateAfter) {
      Matrix_t tmpState(fResetValue); // note: tmpState may share the fResetValue buffer
      Architecture_t::Hadamard(tmpState, fState);
      Architecture_t::MultiplyTranspose(tmp, tmpState, fWeightsCandidateState);
   } else {
      // GRU variant used in cuDNN, slightly faster
      Architecture_t::MultiplyTranspose(tmp, fState, fWeightsCandidateState);
      Architecture_t::Hadamard(tmp, fResetValue);
   }
   Architecture_t::MultiplyTranspose(fCandidateValue, input, fWeightsCandidate);
   Architecture_t::ScaleAdd(fCandidateValue, tmp);
   Architecture_t::AddRowWise(fCandidateValue, fCandidateBias);
   DNN::evaluateDerivativeMatrix<Architecture_t>(dc, fCan, fCandidateValue);
   DNN::evaluateMatrix<Architecture_t>(fCandidateValue, fCan);
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::Forward(Tensor_t &input, bool isTraining)
-> void
{
   // cuDNN path
   if (Architecture_t::IsCudnn()) {

      // input size is stride[1] of the input tensor, which is B x T x inputSize
      assert(input.GetStrides()[1] == this->GetInputSize());

      Tensor_t &x = this->fX;
      Tensor_t &y = this->fY;
      Architecture_t::Rearrange(x, input);

      const auto &weights = this->GetWeightsAt(0);

      auto &hx = this->fState;
      auto &cx = this->fCell;
      // use the same for hy and cy
      auto &hy = this->fState;
      auto &cy = this->fCell;

      auto rnnDesc = static_cast<RNNDescriptors_t &>(*fDescriptors);
      auto rnnWork = static_cast<RNNWorkspace_t &>(*fWorkspace);

      Architecture_t::RNNForward(x, hx, cx, weights, y, hy, cy, rnnDesc, rnnWork, isTraining);

      if (fReturnSequence) {
         Architecture_t::Rearrange(this->GetOutput(), y); // swap B and T from y to Output
      } else {
         // tmp is a reference to y (the full cuDNN output)
         Tensor_t tmp = (y.At(y.GetShape()[0] - 1)).Reshape({y.GetShape()[1], 1, y.GetShape()[2]});
         Architecture_t::Copy(this->GetOutput(), tmp);
      }

      return;
   }

   // D : input size
   // H : state size
   // T : time size
   // B : batch size

   Tensor_t arrInput(fTimeSteps, this->GetBatchSize(), this->GetInputWidth()); // T x B x D
   Architecture_t::Rearrange(arrInput, input); // input is B x T x D

   Tensor_t arrOutput(fTimeSteps, this->GetBatchSize(), fStateSize); // T x B x H

   if (!this->fRememberState) {
      InitState(DNN::EInitialization::kZero);
   }

   /*! Pass each gate's values to CellForward() to calculate
    *  the next hidden state. */
   for (size_t t = 0; t < fTimeSteps; ++t) {
      /* Feed-forward network: the value of each gate is computed at each timestep t. */
      ResetGate(arrInput[t], fDerivativesReset[t]);
      Architecture_t::Copy(this->GetResetGateTensorAt(t), fResetValue);
      UpdateGate(arrInput[t], fDerivativesUpdate[t]);
      Architecture_t::Copy(this->GetUpdateGateTensorAt(t), fUpdateValue);

      CandidateValue(arrInput[t], fDerivativesCandidate[t]);
      Architecture_t::Copy(this->GetCandidateGateTensorAt(t), fCandidateValue);

      CellForward(fUpdateValue, fCandidateValue);

      Matrix_t arrOutputMt = arrOutput[t];
      Architecture_t::Copy(arrOutputMt, fState);
   }

   if (fReturnSequence)
      Architecture_t::Rearrange(this->GetOutput(), arrOutput); // B x T x D
   else {
      // get T[end]
      Tensor_t tmp = arrOutput.At(fTimeSteps - 1); // take the last time step
      // shape of tmp is B x D for CPU (column-wise); reshape it to B x D x 1
      // and transpose it to 1 x D x B (this is how the output is expected in column-major format)
      tmp = tmp.Reshape({tmp.GetShape()[0], tmp.GetShape()[1], 1});
      assert(tmp.GetSize() == this->GetOutput().GetSize());
      assert(tmp.GetShape()[0] == this->GetOutput().GetShape()[2]); // B is the last dim in output and the first in tmp
      Architecture_t::Rearrange(this->GetOutput(), tmp);
      // keep the full sequence for the backward pass
      fY = arrOutput;
   }
}
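
// Shape summary for the non-cuDNN Forward() above: the B x T x D input is
// rearranged into a T x B x D working tensor; each time step updates the
// B x H hidden state fState. With fReturnSequence set, the full T x B x H
// sequence is rearranged back into the output; otherwise only the last time
// step is copied out and the full sequence is cached in fY for Backward().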

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::CellForward(Matrix_t &updateGateValues, Matrix_t &candidateValues)
-> void
{
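   // Sketch of what follows (standard GRU notation, not identifiers from this
   // file): computes h_t = u_t o h_(t-1) + (1 - u_t) o c_t in place, where
   // fState holds h_(t-1) on entry, updateGateValues holds u_t and
   // candidateValues holds c_t; both arguments are overwritten below.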
   Architecture_t::Hadamard(fState, updateGateValues);

   // this will reuse the content of updateGateValues
   Matrix_t tmp(updateGateValues); // B x H
   for (size_t j = 0; j < (size_t) tmp.GetNcols(); j++) {
      for (size_t i = 0; i < (size_t) tmp.GetNrows(); i++) {
         tmp(i, j) = 1 - tmp(i, j);
      }
   }

   // Update the state
   Architecture_t::Hadamard(candidateValues, tmp);
   Architecture_t::ScaleAdd(fState, candidateValues);
}

//____________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::Backward(Tensor_t &gradients_backward,        // B x T x D
                                                     const Tensor_t &activations_backward) // B x T x D
-> void
{
   // BACKWARD for cuDNN
   if (Architecture_t::IsCudnn()) {

      Tensor_t &x = this->fX;
      Tensor_t &y = this->fY;
      Tensor_t &dx = this->fDx;
      Tensor_t &dy = this->fDy;

      // input size is stride[1] of the input tensor, which is B x T x inputSize
      assert(activations_backward.GetStrides()[1] == this->GetInputSize());

      Architecture_t::Rearrange(x, activations_backward);

      if (!fReturnSequence) {

         Architecture_t::InitializeZero(dy);

         // Tensor_t tmp1 = y.At(y.GetShape()[0] - 1).Reshape({y.GetShape()[1], 1, y.GetShape()[2]});
         Tensor_t tmp2 = dy.At(dy.GetShape()[0] - 1).Reshape({dy.GetShape()[1], 1, dy.GetShape()[2]});

         // Architecture_t::Copy(tmp1, this->GetOutput());
         Architecture_t::Copy(tmp2, this->GetActivationGradients());
      } else {
         Architecture_t::Rearrange(y, this->GetOutput());
         Architecture_t::Rearrange(dy, this->GetActivationGradients());
      }

      // for cuDNN, Matrix_t and Tensor_t are the same type
      const auto &weights = this->GetWeightsTensor();
      auto &weightGradients = this->GetWeightGradientsTensor();

      // note that cudnnRNNBackwardWeights accumulates the weight gradients,
      // so the tensor needs to be initialized to zero every time
      Architecture_t::InitializeZero(weightGradients);

      // hx is fState
      auto &hx = this->GetState();
      auto &cx = this->GetCell();
      // use the same for hy and cy
      auto &dhy = hx;
      auto &dcy = cx;
      auto &dhx = hx;
      auto &dcx = cx;

      auto rnnDesc = static_cast<RNNDescriptors_t &>(*fDescriptors);
      auto rnnWork = static_cast<RNNWorkspace_t &>(*fWorkspace);

      Architecture_t::RNNBackward(x, hx, cx, y, dy, dhy, dcy, weights, dx, dhx, dcx, weightGradients, rnnDesc, rnnWork);

      if (gradients_backward.GetSize() != 0)
         Architecture_t::Rearrange(gradients_backward, dx);

      return;
   }

   // gradients_backward is the activationGradients of the previous layer (the input layer).
   // Currently, gradients_backward is for the input (x) and not for the state.
   // For the state it can be:
   Matrix_t state_gradients_backward(this->GetBatchSize(), fStateSize); // B x H
   DNN::initialize<Architecture_t>(state_gradients_backward, DNN::EInitialization::kZero); // B x H

   // if dummy is false, gradients_backward will be written back to the matrix
   bool dummy = false;
   if (gradients_backward.GetSize() == 0 || gradients_backward[0].GetNrows() == 0 || gradients_backward[0].GetNcols() == 0) {
      dummy = true;
   }

   Tensor_t arr_gradients_backward(fTimeSteps, this->GetBatchSize(), this->GetInputSize());

   // Architecture_t::Rearrange(arr_gradients_backward, gradients_backward); // B x T x D
   // activations_backward is the input.
   Tensor_t arr_activations_backward(fTimeSteps, this->GetBatchSize(), this->GetInputSize());

   Architecture_t::Rearrange(arr_activations_backward, activations_backward); // B x T x D

   /*! For backpropagation we need the layer outputs, which were obtained
    *  during forward propagation; they are placed in the arr_output tensor. */
   Tensor_t arr_output(fTimeSteps, this->GetBatchSize(), fStateSize);

   Matrix_t initState(this->GetBatchSize(), fStateSize); // B x H
   DNN::initialize<Architecture_t>(initState, DNN::EInitialization::kZero); // B x H

   // This will take the partial derivative of state[t] w.r.t state[t-1]
   Tensor_t arr_actgradients(fTimeSteps, this->GetBatchSize(), fStateSize);

   if (fReturnSequence) {
      Architecture_t::Rearrange(arr_output, this->GetOutput());
      Architecture_t::Rearrange(arr_actgradients, this->GetActivationGradients());
   } else {
      arr_output = fY;
      Architecture_t::InitializeZero(arr_actgradients);
      // need to reshape to pad a time dimension = 1 (note: these are column-major tensors)
      Tensor_t tmp_grad = arr_actgradients.At(fTimeSteps - 1).Reshape({this->GetBatchSize(), fStateSize, 1});
      assert(tmp_grad.GetSize() == this->GetActivationGradients().GetSize());
      assert(tmp_grad.GetShape()[0] ==
             this->GetActivationGradients().GetShape()[2]); // B is [0] in tmp and [2] in the input act. gradients

      Architecture_t::Rearrange(tmp_grad, this->GetActivationGradients());
   }

   /*! There are in total 6 weight-gradient matrices and 3 bias-gradient vectors.
    *  Re-initialize them to zero so the gradients accumulate from a defined
    *  state (they must not contain garbage values). */

   // Reset Gate.
   fWeightsResetGradients.Zero();
   fWeightsResetStateGradients.Zero();
   fResetBiasGradients.Zero();

   // Update Gate.
   fWeightsUpdateGradients.Zero();
   fWeightsUpdateStateGradients.Zero();
   fUpdateBiasGradients.Zero();

   // Candidate Gate.
   fWeightsCandidateGradients.Zero();
   fWeightsCandidateStateGradients.Zero();
   fCandidateBiasGradients.Zero();

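   // Backpropagation through time: walk the time steps in reverse, adding
   // the gradient arriving from the output at each step to the running state
   // gradient before propagating it through the cell.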
   for (size_t t = fTimeSteps; t > 0; t--) {
      // Store the sum of gradients obtained at each timestep during the backward pass.
      Architecture_t::ScaleAdd(state_gradients_backward, arr_actgradients[t-1]);
      if (t > 1) {
         const Matrix_t &prevStateActivations = arr_output[t-2];
         Matrix_t dx = arr_gradients_backward[t-1];
         // Uses the gate values and activation derivatives cached during forward propagation.
         CellBackward(state_gradients_backward, prevStateActivations,
                      this->GetResetGateTensorAt(t-1), this->GetUpdateGateTensorAt(t-1),
                      this->GetCandidateGateTensorAt(t-1),
                      arr_activations_backward[t-1], dx,
                      fDerivativesReset[t-1], fDerivativesUpdate[t-1],
                      fDerivativesCandidate[t-1]);
      } else {
         const Matrix_t &prevStateActivations = initState;
         Matrix_t dx = arr_gradients_backward[t-1];
         CellBackward(state_gradients_backward, prevStateActivations,
                      this->GetResetGateTensorAt(t-1), this->GetUpdateGateTensorAt(t-1),
                      this->GetCandidateGateTensorAt(t-1),
                      arr_activations_backward[t-1], dx,
                      fDerivativesReset[t-1], fDerivativesUpdate[t-1],
                      fDerivativesCandidate[t-1]);
      }
   }

   if (!dummy) {
      Architecture_t::Rearrange(gradients_backward, arr_gradients_backward);
   }
}


//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::CellBackward(Matrix_t & state_gradients_backward,
                                                         const Matrix_t & precStateActivations,
                                                         const Matrix_t & reset_gate, const Matrix_t & update_gate,
                                                         const Matrix_t & candidate_gate,
                                                         const Matrix_t & input, Matrix_t & input_gradient,
                                                         Matrix_t &dr, Matrix_t &du, Matrix_t &dc)
-> Matrix_t &
{
   /*! Delegates to GRULayerBackward(), passing the gate values and activation
    *  derivatives obtained during forward propagation. */
   return Architecture_t::GRULayerBackward(state_gradients_backward,
                                           fWeightsResetGradients, fWeightsUpdateGradients, fWeightsCandidateGradients,
                                           fWeightsResetStateGradients, fWeightsUpdateStateGradients,
                                           fWeightsCandidateStateGradients, fResetBiasGradients, fUpdateBiasGradients,
                                           fCandidateBiasGradients, dr, du, dc,
                                           precStateActivations,
                                           reset_gate, update_gate, candidate_gate,
                                           fWeightsResetGate, fWeightsUpdateGate, fWeightsCandidate,
                                           fWeightsResetGateState, fWeightsUpdateGateState, fWeightsCandidateState,
                                           input, input_gradient, fResetGateAfter);
}


//______________________________________________________________________________
template <typename Architecture_t>
auto TBasicGRULayer<Architecture_t>::InitState(DNN::EInitialization /* m */)
-> void
{
   DNN::initialize<Architecture_t>(this->GetState(), DNN::EInitialization::kZero);
}

//______________________________________________________________________________
template<typename Architecture_t>
auto TBasicGRULayer<Architecture_t>::Print() const
-> void
{
   std::cout << " GRU Layer: \t ";
   std::cout << " (NInput = " << this->GetInputSize();  // input size
   std::cout << ", NState = " << this->GetStateSize();  // hidden state size
   std::cout << ", NTime = " << this->GetTimeSteps() << " )";  // time size
   std::cout << "\tOutput = ( " << this->GetOutput().GetFirstSize() << " , " << this->GetOutput()[0].GetNrows() << " , " << this->GetOutput()[0].GetNcols() << " )\n";
}

//______________________________________________________________________________
template <typename Architecture_t>
auto TBasicGRULayer<Architecture_t>::AddWeightsXMLTo(void *parent)
-> void
{
   auto layerxml = gTools().xmlengine().NewChild(parent, 0, "GRULayer");

   // Write all other info like outputSize, cellSize, inputSize, timeSteps, rememberState
   gTools().xmlengine().NewAttr(layerxml, 0, "StateSize", gTools().StringFromInt(this->GetStateSize()));
   gTools().xmlengine().NewAttr(layerxml, 0, "InputSize", gTools().StringFromInt(this->GetInputSize()));
   gTools().xmlengine().NewAttr(layerxml, 0, "TimeSteps", gTools().StringFromInt(this->GetTimeSteps()));
   gTools().xmlengine().NewAttr(layerxml, 0, "RememberState", gTools().StringFromInt(this->DoesRememberState()));
   gTools().xmlengine().NewAttr(layerxml, 0, "ReturnSequence", gTools().StringFromInt(this->DoesReturnSequence()));
   gTools().xmlengine().NewAttr(layerxml, 0, "ResetGateAfter", gTools().StringFromInt(this->fResetGateAfter));

   // write weights and bias matrices
   this->WriteMatrixToXML(layerxml, "ResetWeights", this->GetWeightsAt(0));
   this->WriteMatrixToXML(layerxml, "ResetStateWeights", this->GetWeightsAt(1));
   this->WriteMatrixToXML(layerxml, "ResetBiases", this->GetBiasesAt(0));
   this->WriteMatrixToXML(layerxml, "UpdateWeights", this->GetWeightsAt(2));
   this->WriteMatrixToXML(layerxml, "UpdateStateWeights", this->GetWeightsAt(3));
   this->WriteMatrixToXML(layerxml, "UpdateBiases", this->GetBiasesAt(1));
   this->WriteMatrixToXML(layerxml, "CandidateWeights", this->GetWeightsAt(4));
   this->WriteMatrixToXML(layerxml, "CandidateStateWeights", this->GetWeightsAt(5));
   this->WriteMatrixToXML(layerxml, "CandidateBiases", this->GetBiasesAt(2));
}

//______________________________________________________________________________
template <typename Architecture_t>
auto TBasicGRULayer<Architecture_t>::ReadWeightsFromXML(void *parent)
-> void
{
   // Read weights and biases
   this->ReadMatrixXML(parent, "ResetWeights", this->GetWeightsAt(0));
   this->ReadMatrixXML(parent, "ResetStateWeights", this->GetWeightsAt(1));
   this->ReadMatrixXML(parent, "ResetBiases", this->GetBiasesAt(0));
   this->ReadMatrixXML(parent, "UpdateWeights", this->GetWeightsAt(2));
   this->ReadMatrixXML(parent, "UpdateStateWeights", this->GetWeightsAt(3));
   this->ReadMatrixXML(parent, "UpdateBiases", this->GetBiasesAt(1));
   this->ReadMatrixXML(parent, "CandidateWeights", this->GetWeightsAt(4));
   this->ReadMatrixXML(parent, "CandidateStateWeights", this->GetWeightsAt(5));
   this->ReadMatrixXML(parent, "CandidateBiases", this->GetBiasesAt(2));
}

} // namespace RNN
} // namespace DNN
} // namespace TMVA

#endif // TMVA_DNN_GRU_LAYER