GRULayer.h
// @(#)root/tmva/tmva/dnn/gru:$Id$
// Author: Surya S Dwivedi 03/07/19

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : BasicGRULayer                                                         *
 *                                                                                *
 * Description:                                                                   *
 *       NeuralNetwork                                                            *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *       Surya S Dwivedi <surya2191997@gmail.com> - IIT Kharagpur, India          *
 *                                                                                *
 * Copyright (c) 2005-2019:                                                       *
 *       All rights reserved.                                                     *
 *       CERN, Switzerland                                                        *
 *                                                                                *
 * For the licensing terms see $ROOTSYS/LICENSE.                                  *
 * For the list of contributors see $ROOTSYS/README/CREDITS.                      *
 **********************************************************************************/

//#pragma once

//////////////////////////////////////////////////////////////////////
// This class implements the GRU layer. GRU is a variant of vanilla
// RNN which is capable of learning long-range dependencies.
//////////////////////////////////////////////////////////////////////
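//
// For orientation, the forward pass below corresponds to the standard GRU
// equations (a sketch; W = input weights, U = state weights, b = biases,
// * = element-wise product), matching ResetGate(), UpdateGate(),
// CandidateValue() and CellForward() in this file:
//
//   r_t = sigmoid(W_r x_t + U_r h_{t-1} + b_r)        (reset gate)
//   u_t = sigmoid(W_u x_t + U_u h_{t-1} + b_u)        (update gate)
//   c_t = tanh(W_c x_t + U_c (r_t * h_{t-1}) + b_c)   (candidate)
//   h_t = u_t * h_{t-1} + (1 - u_t) * c_t             (new state)
//
// In the cuDNN "reset-gate-after" variant the candidate instead uses
// r_t * (U_c h_{t-1}); see CandidateValue() below.
//////////////////////////////////////////////////////////////////////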

#ifndef TMVA_DNN_GRU_LAYER
#define TMVA_DNN_GRU_LAYER

#include <cmath>
#include <iostream>
#include <vector>

#include "TMatrix.h"
#include "TMVA/DNN/Functions.h"

namespace TMVA
{
namespace DNN
{
namespace RNN
{

//______________________________________________________________________________
//
// Basic GRU Layer
//______________________________________________________________________________

/** \class TBasicGRULayer
    Generic implementation of the GRU layer.
*/
template<typename Architecture_t>
class TBasicGRULayer : public VGeneralLayer<Architecture_t>
{

public:

   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Tensor_t = typename Architecture_t::Tensor_t;

   using LayerDescriptor_t   = typename Architecture_t::RecurrentDescriptor_t;
   using WeightsDescriptor_t = typename Architecture_t::FilterDescriptor_t;
   using TensorDescriptor_t  = typename Architecture_t::TensorDescriptor_t;
   using HelperDescriptor_t  = typename Architecture_t::DropoutDescriptor_t;

   using RNNWorkspace_t   = typename Architecture_t::RNNWorkspace_t;
   using RNNDescriptors_t = typename Architecture_t::RNNDescriptors_t;

private:

   size_t fStateSize;              ///< Hidden state size for GRU
   size_t fTimeSteps;              ///< Timesteps for GRU

   bool fRememberState;            ///< Remember state in next pass
   bool fReturnSequence = false;   ///< Return the full sequence in the output, or just the last element
   bool fResetGateAfter = false;   ///< GRU variant that applies the reset gate multiplication afterwards (used by cuDNN)

   DNN::EActivationFunction fF1;   ///< Activation function: sigmoid
   DNN::EActivationFunction fF2;   ///< Activation function: tanh

   Matrix_t fResetValue;           ///< Computed reset gate values
   Matrix_t fUpdateValue;          ///< Computed update gate values
   Matrix_t fCandidateValue;       ///< Computed candidate values
   Matrix_t fState;                ///< Hidden state of GRU


   Matrix_t &fWeightsResetGate;        ///< Reset Gate weights for input, fWeights[0]
   Matrix_t &fWeightsResetGateState;   ///< Reset Gate weights for prev state, fWeights[3]
   Matrix_t &fResetGateBias;           ///< Reset Gate bias

   Matrix_t &fWeightsUpdateGate;       ///< Update Gate weights for input, fWeights[1]
   Matrix_t &fWeightsUpdateGateState;  ///< Update Gate weights for prev state, fWeights[4]
   Matrix_t &fUpdateGateBias;          ///< Update Gate bias

   Matrix_t &fWeightsCandidate;        ///< Candidate Gate weights for input, fWeights[2]
   Matrix_t &fWeightsCandidateState;   ///< Candidate Gate weights for prev state, fWeights[5]
   Matrix_t &fCandidateBias;           ///< Candidate Gate bias


   std::vector<Matrix_t> reset_gate_value;       ///< Reset gate value for every time step
   std::vector<Matrix_t> update_gate_value;      ///< Update gate value for every time step
   std::vector<Matrix_t> candidate_gate_value;   ///< Candidate gate value for every time step

   std::vector<Matrix_t> fDerivativesReset;      ///< First derivatives of the activations, reset gate
   std::vector<Matrix_t> fDerivativesUpdate;     ///< First derivatives of the activations, update gate
   std::vector<Matrix_t> fDerivativesCandidate;  ///< First derivatives of the activations, candidate gate

   Matrix_t &fWeightsResetGradients;           ///< Gradients w.r.t the reset gate - input weights
   Matrix_t &fWeightsResetStateGradients;      ///< Gradients w.r.t the reset gate - hidden state weights
   Matrix_t &fResetBiasGradients;              ///< Gradients w.r.t the reset gate - bias weights
   Matrix_t &fWeightsUpdateGradients;          ///< Gradients w.r.t the update gate - input weights
   Matrix_t &fWeightsUpdateStateGradients;     ///< Gradients w.r.t the update gate - hidden state weights
   Matrix_t &fUpdateBiasGradients;             ///< Gradients w.r.t the update gate - bias weights
   Matrix_t &fWeightsCandidateGradients;       ///< Gradients w.r.t the candidate gate - input weights
   Matrix_t &fWeightsCandidateStateGradients;  ///< Gradients w.r.t the candidate gate - hidden state weights
   Matrix_t &fCandidateBiasGradients;          ///< Gradients w.r.t the candidate gate - bias weights

   Matrix_t fCell;   ///< Empty matrix for GRU (placeholder for the cell state expected by the cuDNN RNN API)

   // Tensor representing all weights (used by cuDNN)
   Tensor_t fWeightsTensor;          ///< Tensor for all weights
   Tensor_t fWeightGradientsTensor;  ///< Tensor for all weight gradients

   // tensors used internally for the forward and backward pass
   Tensor_t fX;   ///< cached input tensor as T x B x I
   Tensor_t fY;   ///< cached output tensor as T x B x S
   Tensor_t fDx;  ///< cached gradient on the input (output of backward) as T x B x I
   Tensor_t fDy;  ///< cached activation gradient (input of backward) as T x B x S

   TDescriptors *fDescriptors = nullptr;  ///< Keeps all the RNN descriptors
   TWorkspace *fWorkspace = nullptr;      ///< Workspace needed for GPU computation (cuDNN)

public:

   /*! Constructor */
   TBasicGRULayer(size_t batchSize, size_t stateSize, size_t inputSize,
                  size_t timeSteps, bool rememberState = false, bool returnSequence = false,
                  bool resetGateAfter = false,
                  DNN::EActivationFunction f1 = DNN::EActivationFunction::kSigmoid,
                  DNN::EActivationFunction f2 = DNN::EActivationFunction::kTanh,
                  bool training = true, DNN::EInitialization fA = DNN::EInitialization::kZero);

   /*! Copy Constructor */
   TBasicGRULayer(const TBasicGRULayer &);

   /*! Initialize the weights according to the given initialization
    *  method. */
   virtual void Initialize();

   /*! Initialize the hidden state and the cell state. */
   void InitState(DNN::EInitialization m = DNN::EInitialization::kZero);

   /*! Computes the next hidden state
    *  and next cell state with the given input matrix. */
   void Forward(Tensor_t &input, bool isTraining = true);

   /*! Forward for a single cell (time unit) */
   void CellForward(Matrix_t &updateGateValues, Matrix_t &candidateValues);

   /*! Backpropagates the error. Must only be called directly after the corresponding
    *  call to Forward(...). */
   void Backward(Tensor_t &gradients_backward,
                 const Tensor_t &activations_backward);

   /* Updates weights and biases, given the learning rate */
   void Update(const Scalar_t learningRate);

   /*! Backward for a single time unit, matching the corresponding
    *  call to Forward(...). */
   Matrix_t & CellBackward(Matrix_t & state_gradients_backward,
                           const Matrix_t & precStateActivations,
                           const Matrix_t & reset_gate, const Matrix_t & update_gate,
                           const Matrix_t & candidate_gate,
                           const Matrix_t & input, Matrix_t & input_gradient,
                           Matrix_t &dr, Matrix_t &du, Matrix_t &dc);

   /*! Computes the reset gate values (NN with Sigmoid) */
   void ResetGate(const Matrix_t &input, Matrix_t &di);

   /*! Computes the update gate values (NN with Sigmoid) */
   void UpdateGate(const Matrix_t &input, Matrix_t &df);

   /*! Computes the new candidate values (NN with Tanh) */
   void CandidateValue(const Matrix_t &input, Matrix_t &dc);

   /*! Prints the info about the layer */
   void Print() const;

   /*! Writes the information and the weights about the layer in an XML node. */
   void AddWeightsXMLTo(void *parent);

   /*! Read the information and the weights about the layer from XML node. */
   void ReadWeightsFromXML(void *parent);

   /*! Getters */
   size_t GetInputSize() const { return this->GetInputWidth(); }
   size_t GetTimeSteps() const { return fTimeSteps; }
   size_t GetStateSize() const { return fStateSize; }

   inline bool DoesRememberState() const { return fRememberState; }
   inline bool DoesReturnSequence() const { return fReturnSequence; }

   inline DNN::EActivationFunction GetActivationFunctionF1() const { return fF1; }
   inline DNN::EActivationFunction GetActivationFunctionF2() const { return fF2; }

   const Matrix_t & GetResetGateValue() const { return fResetValue; }
   Matrix_t & GetResetGateValue() { return fResetValue; }
   const Matrix_t & GetCandidateValue() const { return fCandidateValue; }
   Matrix_t & GetCandidateValue() { return fCandidateValue; }
   const Matrix_t & GetUpdateGateValue() const { return fUpdateValue; }
   Matrix_t & GetUpdateGateValue() { return fUpdateValue; }

   const Matrix_t & GetState() const { return fState; }
   Matrix_t & GetState() { return fState; }
   const Matrix_t & GetCell() const { return fCell; }
   Matrix_t & GetCell() { return fCell; }

   const Matrix_t & GetWeightsResetGate() const { return fWeightsResetGate; }
   Matrix_t & GetWeightsResetGate() { return fWeightsResetGate; }
   const Matrix_t & GetWeightsCandidate() const { return fWeightsCandidate; }
   Matrix_t & GetWeightsCandidate() { return fWeightsCandidate; }
   const Matrix_t & GetWeightsUpdateGate() const { return fWeightsUpdateGate; }
   Matrix_t & GetWeightsUpdateGate() { return fWeightsUpdateGate; }

   const Matrix_t & GetWeightsResetGateState() const { return fWeightsResetGateState; }
   Matrix_t & GetWeightsResetGateState() { return fWeightsResetGateState; }
   const Matrix_t & GetWeightsUpdateGateState() const { return fWeightsUpdateGateState; }
   Matrix_t & GetWeightsUpdateGateState() { return fWeightsUpdateGateState; }
   const Matrix_t & GetWeightsCandidateState() const { return fWeightsCandidateState; }
   Matrix_t & GetWeightsCandidateState() { return fWeightsCandidateState; }

   const std::vector<Matrix_t> & GetDerivativesReset() const { return fDerivativesReset; }
   std::vector<Matrix_t> & GetDerivativesReset() { return fDerivativesReset; }
   const Matrix_t & GetResetDerivativesAt(size_t i) const { return fDerivativesReset[i]; }
   Matrix_t & GetResetDerivativesAt(size_t i) { return fDerivativesReset[i]; }
   const std::vector<Matrix_t> & GetDerivativesUpdate() const { return fDerivativesUpdate; }
   std::vector<Matrix_t> & GetDerivativesUpdate() { return fDerivativesUpdate; }
   const Matrix_t & GetUpdateDerivativesAt(size_t i) const { return fDerivativesUpdate[i]; }
   Matrix_t & GetUpdateDerivativesAt(size_t i) { return fDerivativesUpdate[i]; }
   const std::vector<Matrix_t> & GetDerivativesCandidate() const { return fDerivativesCandidate; }
   std::vector<Matrix_t> & GetDerivativesCandidate() { return fDerivativesCandidate; }
   const Matrix_t & GetCandidateDerivativesAt(size_t i) const { return fDerivativesCandidate[i]; }
   Matrix_t & GetCandidateDerivativesAt(size_t i) { return fDerivativesCandidate[i]; }

   const std::vector<Matrix_t> & GetResetGateTensor() const { return reset_gate_value; }
   std::vector<Matrix_t> & GetResetGateTensor() { return reset_gate_value; }
   const Matrix_t & GetResetGateTensorAt(size_t i) const { return reset_gate_value[i]; }
   Matrix_t & GetResetGateTensorAt(size_t i) { return reset_gate_value[i]; }
   const std::vector<Matrix_t> & GetUpdateGateTensor() const { return update_gate_value; }
   std::vector<Matrix_t> & GetUpdateGateTensor() { return update_gate_value; }
   const Matrix_t & GetUpdateGateTensorAt(size_t i) const { return update_gate_value[i]; }
   Matrix_t & GetUpdateGateTensorAt(size_t i) { return update_gate_value[i]; }
   const std::vector<Matrix_t> & GetCandidateGateTensor() const { return candidate_gate_value; }
   std::vector<Matrix_t> & GetCandidateGateTensor() { return candidate_gate_value; }
   const Matrix_t & GetCandidateGateTensorAt(size_t i) const { return candidate_gate_value[i]; }
   Matrix_t & GetCandidateGateTensorAt(size_t i) { return candidate_gate_value[i]; }

   const Matrix_t & GetResetGateBias() const { return fResetGateBias; }
   Matrix_t & GetResetGateBias() { return fResetGateBias; }
   const Matrix_t & GetUpdateGateBias() const { return fUpdateGateBias; }
   Matrix_t & GetUpdateGateBias() { return fUpdateGateBias; }
   const Matrix_t & GetCandidateBias() const { return fCandidateBias; }
   Matrix_t & GetCandidateBias() { return fCandidateBias; }

   const Matrix_t & GetWeightsResetGradients() const { return fWeightsResetGradients; }
   Matrix_t & GetWeightsResetGradients() { return fWeightsResetGradients; }
   const Matrix_t & GetWeightsResetStateGradients() const { return fWeightsResetStateGradients; }
   Matrix_t & GetWeightsResetStateGradients() { return fWeightsResetStateGradients; }
   const Matrix_t & GetResetBiasGradients() const { return fResetBiasGradients; }
   Matrix_t & GetResetBiasGradients() { return fResetBiasGradients; }
   const Matrix_t & GetWeightsUpdateGradients() const { return fWeightsUpdateGradients; }
   Matrix_t & GetWeightsUpdateGradients() { return fWeightsUpdateGradients; }
   const Matrix_t & GetWeigthsUpdateStateGradients() const { return fWeightsUpdateStateGradients; }
   Matrix_t & GetWeightsUpdateStateGradients() { return fWeightsUpdateStateGradients; }
   const Matrix_t & GetUpdateBiasGradients() const { return fUpdateBiasGradients; }
   Matrix_t & GetUpdateBiasGradients() { return fUpdateBiasGradients; }
   const Matrix_t & GetWeightsCandidateGradients() const { return fWeightsCandidateGradients; }
   Matrix_t & GetWeightsCandidateGradients() { return fWeightsCandidateGradients; }
   const Matrix_t & GetWeightsCandidateStateGradients() const { return fWeightsCandidateStateGradients; }
   Matrix_t & GetWeightsCandidateStateGradients() { return fWeightsCandidateStateGradients; }
   const Matrix_t & GetCandidateBiasGradients() const { return fCandidateBiasGradients; }
   Matrix_t & GetCandidateBiasGradients() { return fCandidateBiasGradients; }

   Tensor_t & GetWeightsTensor() { return fWeightsTensor; }
   const Tensor_t & GetWeightsTensor() const { return fWeightsTensor; }
   Tensor_t & GetWeightGradientsTensor() { return fWeightGradientsTensor; }
   const Tensor_t & GetWeightGradientsTensor() const { return fWeightGradientsTensor; }

   Tensor_t &GetX() { return fX; }
   Tensor_t &GetY() { return fY; }
   Tensor_t &GetDX() { return fDx; }
   Tensor_t &GetDY() { return fDy; }
};
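
/** A minimal usage sketch (not part of TMVA): it assumes the CPU backend
    TCpu<float> from "TMVA/DNN/Architectures/Cpu.h"; the function name
    runGRUExample and all sizes below are illustrative only.
    \code
    #include "TMVA/DNN/Architectures/Cpu.h"
    #include "TMVA/DNN/GRULayer.h"

    void runGRUExample()
    {
       using Architecture_t = TMVA::DNN::TCpu<float>;
       using GRULayer_t     = TMVA::DNN::RNN::TBasicGRULayer<Architecture_t>;

       const size_t batchSize = 4, stateSize = 8, inputSize = 3, timeSteps = 5;

       GRULayer_t gru(batchSize, stateSize, inputSize, timeSteps);
       gru.Initialize();

       // Forward() expects the input tensor in B x T x D layout.
       Architecture_t::Tensor_t input(batchSize, timeSteps, inputSize);
       gru.Forward(input, /*isTraining=*/false);
       gru.Print();
    }
    \endcode
*/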

//______________________________________________________________________________
//
// Basic GRU-Layer Implementation
//______________________________________________________________________________

template <typename Architecture_t>
TBasicGRULayer<Architecture_t>::TBasicGRULayer(size_t batchSize, size_t stateSize, size_t inputSize, size_t timeSteps,
                                               bool rememberState, bool returnSequence, bool resetGateAfter, DNN::EActivationFunction f1,
                                               DNN::EActivationFunction f2, bool /* training */,
                                               DNN::EInitialization fA)
   : VGeneralLayer<Architecture_t>(batchSize, 1, timeSteps, inputSize, 1, (returnSequence) ? timeSteps : 1, stateSize,
                                   6, {stateSize, stateSize, stateSize, stateSize, stateSize, stateSize},
                                   {inputSize, inputSize, inputSize, stateSize, stateSize, stateSize}, 3,
                                   {stateSize, stateSize, stateSize}, {1, 1, 1}, batchSize,
                                   (returnSequence) ? timeSteps : 1, stateSize, fA),
     fStateSize(stateSize), fTimeSteps(timeSteps), fRememberState(rememberState), fReturnSequence(returnSequence), fResetGateAfter(resetGateAfter),
     fF1(f1), fF2(f2), fResetValue(batchSize, stateSize), fUpdateValue(batchSize, stateSize),
     fCandidateValue(batchSize, stateSize), fState(batchSize, stateSize), fWeightsResetGate(this->GetWeightsAt(0)),
     fWeightsResetGateState(this->GetWeightsAt(3)), fResetGateBias(this->GetBiasesAt(0)),
     fWeightsUpdateGate(this->GetWeightsAt(1)), fWeightsUpdateGateState(this->GetWeightsAt(4)),
     fUpdateGateBias(this->GetBiasesAt(1)), fWeightsCandidate(this->GetWeightsAt(2)),
     fWeightsCandidateState(this->GetWeightsAt(5)), fCandidateBias(this->GetBiasesAt(2)),
     fWeightsResetGradients(this->GetWeightGradientsAt(0)), fWeightsResetStateGradients(this->GetWeightGradientsAt(3)),
     fResetBiasGradients(this->GetBiasGradientsAt(0)), fWeightsUpdateGradients(this->GetWeightGradientsAt(1)),
     fWeightsUpdateStateGradients(this->GetWeightGradientsAt(4)), fUpdateBiasGradients(this->GetBiasGradientsAt(1)),
     fWeightsCandidateGradients(this->GetWeightGradientsAt(2)),
     fWeightsCandidateStateGradients(this->GetWeightGradientsAt(5)),
     fCandidateBiasGradients(this->GetBiasGradientsAt(2))
{
   for (size_t i = 0; i < timeSteps; ++i) {
      fDerivativesReset.emplace_back(batchSize, stateSize);
      fDerivativesUpdate.emplace_back(batchSize, stateSize);
      fDerivativesCandidate.emplace_back(batchSize, stateSize);
      reset_gate_value.emplace_back(batchSize, stateSize);
      update_gate_value.emplace_back(batchSize, stateSize);
      candidate_gate_value.emplace_back(batchSize, stateSize);
   }
   Architecture_t::InitializeGRUTensors(this);
}
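
// Note on shapes (from the VGeneralLayer call above): weight matrices 0-2
// (the input weights of the reset, update and candidate gates) are
// stateSize x inputSize, weight matrices 3-5 (the corresponding state
// weights) are stateSize x stateSize, and each of the three biases is a
// stateSize x 1 column vector.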

//______________________________________________________________________________
template <typename Architecture_t>
TBasicGRULayer<Architecture_t>::TBasicGRULayer(const TBasicGRULayer &layer)
   : VGeneralLayer<Architecture_t>(layer),
     fStateSize(layer.fStateSize),
     fTimeSteps(layer.fTimeSteps),
     fRememberState(layer.fRememberState),
     fReturnSequence(layer.fReturnSequence),
     fResetGateAfter(layer.fResetGateAfter),
     fF1(layer.GetActivationFunctionF1()),
     fF2(layer.GetActivationFunctionF2()),
     fResetValue(layer.GetBatchSize(), layer.GetStateSize()),
     fUpdateValue(layer.GetBatchSize(), layer.GetStateSize()),
     fCandidateValue(layer.GetBatchSize(), layer.GetStateSize()),
     fState(layer.GetBatchSize(), layer.GetStateSize()),
     fWeightsResetGate(this->GetWeightsAt(0)),
     fWeightsResetGateState(this->GetWeightsAt(3)),
     fResetGateBias(this->GetBiasesAt(0)),
     fWeightsUpdateGate(this->GetWeightsAt(1)),
     fWeightsUpdateGateState(this->GetWeightsAt(4)),
     fUpdateGateBias(this->GetBiasesAt(1)),
     fWeightsCandidate(this->GetWeightsAt(2)),
     fWeightsCandidateState(this->GetWeightsAt(5)),
     fCandidateBias(this->GetBiasesAt(2)),
     fWeightsResetGradients(this->GetWeightGradientsAt(0)),
     fWeightsResetStateGradients(this->GetWeightGradientsAt(3)),
     fResetBiasGradients(this->GetBiasGradientsAt(0)),
     fWeightsUpdateGradients(this->GetWeightGradientsAt(1)),
     fWeightsUpdateStateGradients(this->GetWeightGradientsAt(4)),
     fUpdateBiasGradients(this->GetBiasGradientsAt(1)),
     fWeightsCandidateGradients(this->GetWeightGradientsAt(2)),
     fWeightsCandidateStateGradients(this->GetWeightGradientsAt(5)),
     fCandidateBiasGradients(this->GetBiasGradientsAt(2))
{
   for (size_t i = 0; i < fTimeSteps; ++i) {
      fDerivativesReset.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(fDerivativesReset[i], layer.GetResetDerivativesAt(i));

      fDerivativesUpdate.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(fDerivativesUpdate[i], layer.GetUpdateDerivativesAt(i));

      fDerivativesCandidate.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(fDerivativesCandidate[i], layer.GetCandidateDerivativesAt(i));

      reset_gate_value.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(reset_gate_value[i], layer.GetResetGateTensorAt(i));

      update_gate_value.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(update_gate_value[i], layer.GetUpdateGateTensorAt(i));

      candidate_gate_value.emplace_back(layer.GetBatchSize(), layer.GetStateSize());
      Architecture_t::Copy(candidate_gate_value[i], layer.GetCandidateGateTensorAt(i));
   }

   // Gradient matrices are not copied
   Architecture_t::Copy(fState, layer.GetState());

   // Copy each gate's values
   Architecture_t::Copy(fResetValue, layer.GetResetGateValue());
   Architecture_t::Copy(fCandidateValue, layer.GetCandidateValue());
   Architecture_t::Copy(fUpdateValue, layer.GetUpdateGateValue());

   Architecture_t::InitializeGRUTensors(this);
}

//______________________________________________________________________________
template <typename Architecture_t>
void TBasicGRULayer<Architecture_t>::Initialize()
{
   VGeneralLayer<Architecture_t>::Initialize();

   Architecture_t::InitializeGRUDescriptors(fDescriptors, this);
   Architecture_t::InitializeGRUWorkspace(fWorkspace, fDescriptors, this);

   // cuDNN only supports the reset-gate-after variant
   if (Architecture_t::IsCudnn())
      fResetGateAfter = true;
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::ResetGate(const Matrix_t &input, Matrix_t &dr)
-> void
{
   /*! Computes the reset gate values according to the equation:
    *  reset = act(W_input . input + W_state . state + bias)
    *  activation function: sigmoid. */
   const DNN::EActivationFunction fRst = this->GetActivationFunctionF1();
   Matrix_t tmpState(fResetValue.GetNrows(), fResetValue.GetNcols());
   Architecture_t::MultiplyTranspose(tmpState, fState, fWeightsResetGateState);
   Architecture_t::MultiplyTranspose(fResetValue, input, fWeightsResetGate);
   Architecture_t::ScaleAdd(fResetValue, tmpState);
   Architecture_t::AddRowWise(fResetValue, fResetGateBias);
   DNN::evaluateDerivativeMatrix<Architecture_t>(dr, fRst, fResetValue);
   DNN::evaluateMatrix<Architecture_t>(fResetValue, fRst);
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::UpdateGate(const Matrix_t &input, Matrix_t &du)
-> void
{
   /*! Computes the update gate values according to the equation:
    *  update = act(W_input . input + W_state . state + bias)
    *  activation function: sigmoid. */
   const DNN::EActivationFunction fUpd = this->GetActivationFunctionF1();
   Matrix_t tmpState(fUpdateValue.GetNrows(), fUpdateValue.GetNcols());
   Architecture_t::MultiplyTranspose(tmpState, fState, fWeightsUpdateGateState);
   Architecture_t::MultiplyTranspose(fUpdateValue, input, fWeightsUpdateGate);
   Architecture_t::ScaleAdd(fUpdateValue, tmpState);
   Architecture_t::AddRowWise(fUpdateValue, fUpdateGateBias);
   DNN::evaluateDerivativeMatrix<Architecture_t>(du, fUpd, fUpdateValue);
   DNN::evaluateMatrix<Architecture_t>(fUpdateValue, fUpd);
}

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::CandidateValue(const Matrix_t &input, Matrix_t &dc)
-> void
{
   /*!
      vanilla GRU:
      candidate_value = act(W_input . input + W_state . (reset * state) + bias)

      but cuDNN uses the reset_after variant, which is faster (with bias mode = input);
      it applies the reset gate multiplication after the matrix multiplication:
      candidate_value = act(W_input . input + reset * (W_state . state) + bias)

      activation function = tanh.
   */

   const DNN::EActivationFunction fCan = this->GetActivationFunctionF2();
   Matrix_t tmp(fCandidateValue.GetNrows(), fCandidateValue.GetNcols());
   if (!fResetGateAfter) {
      Matrix_t tmpState(fResetValue); // note: tmpState may share the fResetValue buffer
      Architecture_t::Hadamard(tmpState, fState);
      Architecture_t::MultiplyTranspose(tmp, tmpState, fWeightsCandidateState);
   } else {
      // GRU variant used in cuDNN, slightly faster
      Architecture_t::MultiplyTranspose(tmp, fState, fWeightsCandidateState);
      Architecture_t::Hadamard(tmp, fResetValue);
   }
   Architecture_t::MultiplyTranspose(fCandidateValue, input, fWeightsCandidate);
   Architecture_t::ScaleAdd(fCandidateValue, tmp);
   Architecture_t::AddRowWise(fCandidateValue, fCandidateBias);
   DNN::evaluateDerivativeMatrix<Architecture_t>(dc, fCan, fCandidateValue);
   DNN::evaluateMatrix<Architecture_t>(fCandidateValue, fCan);
}
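
// The two candidate variants above are not numerically equivalent:
// vanilla computes W_state . (reset * state) while reset-after computes
// reset * (W_state . state). A model trained with one setting of
// fResetGateAfter should therefore be evaluated with the same setting.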

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::Forward(Tensor_t &input, bool isTraining)
-> void
{
   // for cuDNN
   if (Architecture_t::IsCudnn()) {

      // input size is stride[1] of the input tensor, which is B x T x inputSize
      assert(input.GetStrides()[1] == this->GetInputSize());

      Tensor_t &x = this->fX;
      Tensor_t &y = this->fY;
      Architecture_t::Rearrange(x, input);

      const auto &weights = this->GetWeightsTensor();

      auto &hx = this->fState;
      auto &cx = this->fCell;
      // use the same matrices for hy and cy
      auto &hy = this->fState;
      auto &cy = this->fCell;

      auto & rnnDesc = static_cast<RNNDescriptors_t &>(*fDescriptors);
      auto & rnnWork = static_cast<RNNWorkspace_t &>(*fWorkspace);

      Architecture_t::RNNForward(x, hx, cx, weights, y, hy, cy, rnnDesc, rnnWork, isTraining);

      if (fReturnSequence) {
         Architecture_t::Rearrange(this->GetOutput(), y); // swap B and T from y to Output
      } else {
         // tmp is a reference to y (full cuDNN output)
         Tensor_t tmp = (y.At(y.GetShape()[0] - 1)).Reshape({y.GetShape()[1], 1, y.GetShape()[2]});
         Architecture_t::Copy(this->GetOutput(), tmp);
      }

      return;
   }

   // D : input size
   // H : state size
   // T : time size
   // B : batch size

   Tensor_t arrInput(fTimeSteps, this->GetBatchSize(), this->GetInputWidth()); // T x B x D
   Architecture_t::Rearrange(arrInput, input); // B x T x D

   Tensor_t arrOutput(fTimeSteps, this->GetBatchSize(), fStateSize); // T x B x H

   if (!this->fRememberState) {
      InitState(DNN::EInitialization::kZero);
   }

   /*! Pass the gate values to CellForward() to calculate the
    *  next hidden state and next cell state. */
   for (size_t t = 0; t < fTimeSteps; ++t) {
      /* Feed forward network: the value of each gate is computed at each timestep t. */
      ResetGate(arrInput[t], fDerivativesReset[t]);
      Architecture_t::Copy(this->GetResetGateTensorAt(t), fResetValue);
      UpdateGate(arrInput[t], fDerivativesUpdate[t]);
      Architecture_t::Copy(this->GetUpdateGateTensorAt(t), fUpdateValue);

      CandidateValue(arrInput[t], fDerivativesCandidate[t]);
      Architecture_t::Copy(this->GetCandidateGateTensorAt(t), fCandidateValue);

      CellForward(fUpdateValue, fCandidateValue);

      // Architecture_t::PrintTensor(Tensor_t(fState), "state output");

      Matrix_t arrOutputMt = arrOutput[t];
      Architecture_t::Copy(arrOutputMt, fState);
   }

   if (fReturnSequence)
      Architecture_t::Rearrange(this->GetOutput(), arrOutput); // B x T x D
   else {
      // get T[end]
      Tensor_t tmp = arrOutput.At(fTimeSteps - 1); // take the last time step
      // for CPU (column-wise) the shape of tmp is B x D; reshape it to B x D x 1
      // and transpose it to 1 x D x B (this is how the output is expected in column-major format)
      tmp = tmp.Reshape({tmp.GetShape()[0], tmp.GetShape()[1], 1});
      assert(tmp.GetSize() == this->GetOutput().GetSize());
      assert(tmp.GetShape()[0] == this->GetOutput().GetShape()[2]); // B is the last dim in output and the first in tmp
      Architecture_t::Rearrange(this->GetOutput(), tmp);
      // keep the array output
      fY = arrOutput;
   }
}
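
/** For reference, the axis swap performed by Architecture_t::Rearrange in the
    forward pass above, written out on raw row-major buffers (a sketch only;
    the real backends handle their own memory layouts):
    \code
    #include <cstddef>

    // out is T x B x D, in is B x T x D: out[t][b][d] = in[b][t][d]
    void rearrangeBTDToTBD(float *out, const float *in,
                           std::size_t B, std::size_t T, std::size_t D)
    {
       for (std::size_t t = 0; t < T; ++t)
          for (std::size_t b = 0; b < B; ++b)
             for (std::size_t d = 0; d < D; ++d)
                out[(t * B + b) * D + d] = in[(b * T + t) * D + d];
    }
    \endcode
*/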

//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::CellForward(Matrix_t &updateGateValues, Matrix_t &candidateValues)
-> void
{
   Architecture_t::Hadamard(fState, updateGateValues);

   // this will reuse the content of updateGateValues
   Matrix_t tmp(updateGateValues); // H x 1
   for (size_t j = 0; j < (size_t) tmp.GetNcols(); j++) {
      for (size_t i = 0; i < (size_t) tmp.GetNrows(); i++) {
         tmp(i, j) = 1 - tmp(i, j);
      }
   }

   // Update the state
   Architecture_t::Hadamard(candidateValues, tmp);
   Architecture_t::ScaleAdd(fState, candidateValues);
}
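
/** The convex-combination update performed by CellForward above, written out
    on plain arrays for clarity (a standalone sketch, independent of the
    Architecture_t types):
    \code
    #include <cstddef>

    // h[i] <- u[i] * h[i] + (1 - u[i]) * c[i]
    void gruStateUpdate(float *h, const float *u, const float *c, std::size_t n)
    {
       for (std::size_t i = 0; i < n; ++i)
          h[i] = u[i] * h[i] + (1.0f - u[i]) * c[i];
    }
    \endcode
    Note that CellForward overwrites candidateValues in the process; callers
    should rely only on fState afterwards.
*/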

//____________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::Backward(Tensor_t &gradients_backward,         // B x T x D
                                                     const Tensor_t &activations_backward) // B x T x D
-> void
{
   // BACKWARD for cuDNN
   if (Architecture_t::IsCudnn()) {

      Tensor_t &x = this->fX;
      Tensor_t &y = this->fY;
      Tensor_t &dx = this->fDx;
      Tensor_t &dy = this->fDy;

      // input size is stride[1] of the input tensor, which is B x T x inputSize
      assert(activations_backward.GetStrides()[1] == this->GetInputSize());

      Architecture_t::Rearrange(x, activations_backward);

      if (!fReturnSequence) {

         Architecture_t::InitializeZero(dy);

         // Tensor_t tmp1 = y.At(y.GetShape()[0] - 1).Reshape({y.GetShape()[1], 1, y.GetShape()[2]});
         Tensor_t tmp2 = dy.At(dy.GetShape()[0] - 1).Reshape({dy.GetShape()[1], 1, dy.GetShape()[2]});

         // Architecture_t::Copy(tmp1, this->GetOutput());
         Architecture_t::Copy(tmp2, this->GetActivationGradients());
      } else {
         Architecture_t::Rearrange(y, this->GetOutput());
         Architecture_t::Rearrange(dy, this->GetActivationGradients());
      }

      // Architecture_t::PrintTensor(this->GetOutput(), "output before bwd");

      // for cuDNN, Matrix_t and Tensor_t are the same type
      const auto &weights = this->GetWeightsTensor();
      auto &weightGradients = this->GetWeightGradientsTensor();

      // note that cudnnRNNBackwardWeights accumulates the weight gradients,
      // so we need to initialize the tensor to zero every time
      Architecture_t::InitializeZero(weightGradients);

      // hx is fState
      auto &hx = this->GetState();
      auto &cx = this->GetCell();
      // use the same matrices for the gradients dhy, dcy, dhx, dcx
      auto &dhy = hx;
      auto &dcy = cx;
      auto &dhx = hx;
      auto &dcx = cx;

      auto & rnnDesc = static_cast<RNNDescriptors_t &>(*fDescriptors);
      auto & rnnWork = static_cast<RNNWorkspace_t &>(*fWorkspace);

      Architecture_t::RNNBackward(x, hx, cx, y, dy, dhy, dcy, weights, dx, dhx, dcx, weightGradients, rnnDesc, rnnWork);

      // Architecture_t::PrintTensor(this->GetOutput(), "output after bwd");

      if (gradients_backward.GetSize() != 0)
         Architecture_t::Rearrange(gradients_backward, dx);

      return;
   }

   // gradients_backward is the activationGradients of the layer before this one, i.e. the input layer.
   // Currently, gradients_backward is for the input (x) and not for the state.
   // For the state it can be:
   Matrix_t state_gradients_backward(this->GetBatchSize(), fStateSize); // B x H
   DNN::initialize<Architecture_t>(state_gradients_backward, DNN::EInitialization::kZero); // B x H

   // if dummy is false, gradients_backward will be written back into the matrix
   bool dummy = false;
   if (gradients_backward.GetSize() == 0 || gradients_backward[0].GetNrows() == 0 || gradients_backward[0].GetNcols() == 0) {
      dummy = true;
   }

   Tensor_t arr_gradients_backward(fTimeSteps, this->GetBatchSize(), this->GetInputSize());

   //Architecture_t::Rearrange(arr_gradients_backward, gradients_backward); // B x T x D
   // activations_backward is the input.
   Tensor_t arr_activations_backward(fTimeSteps, this->GetBatchSize(), this->GetInputSize());

   Architecture_t::Rearrange(arr_activations_backward, activations_backward); // B x T x D

   /*! For backpropagation we need the loss, and for the loss the output must be known.
    *  We obtain the outputs during forward propagation and place the results in the arr_output tensor. */
   Tensor_t arr_output(fTimeSteps, this->GetBatchSize(), fStateSize);

   Matrix_t initState(this->GetBatchSize(), fStateSize); // B x H
   DNN::initialize<Architecture_t>(initState, DNN::EInitialization::kZero); // B x H

   // This will take the partial derivative of state[t] w.r.t. state[t-1]
   Tensor_t arr_actgradients(fTimeSteps, this->GetBatchSize(), fStateSize);

   if (fReturnSequence) {
      Architecture_t::Rearrange(arr_output, this->GetOutput());
      Architecture_t::Rearrange(arr_actgradients, this->GetActivationGradients());
   } else {
      arr_output = fY;
      Architecture_t::InitializeZero(arr_actgradients);
      // need to reshape to pad a time dimension = 1 (note: these are column-major tensors)
      Tensor_t tmp_grad = arr_actgradients.At(fTimeSteps - 1).Reshape({this->GetBatchSize(), fStateSize, 1});
      assert(tmp_grad.GetSize() == this->GetActivationGradients().GetSize());
      assert(tmp_grad.GetShape()[0] ==
             this->GetActivationGradients().GetShape()[2]); // B is [0] in tmp and [2] in the input act. gradients

      Architecture_t::Rearrange(tmp_grad, this->GetActivationGradients());
   }

   /*! There are in total 6 different weight matrices and 3 bias vectors.
    *  Re-initialize them to zero so they do not hold garbage values. */

   // Reset Gate
   fWeightsResetGradients.Zero();
   fWeightsResetStateGradients.Zero();
   fResetBiasGradients.Zero();

   // Update Gate
   fWeightsUpdateGradients.Zero();
   fWeightsUpdateStateGradients.Zero();
   fUpdateBiasGradients.Zero();

   // Candidate Gate
   fWeightsCandidateGradients.Zero();
   fWeightsCandidateStateGradients.Zero();
   fCandidateBiasGradients.Zero();


   for (size_t t = fTimeSteps; t > 0; t--) {
      // Accumulate the gradients obtained at each timestep during the backward pass.
      Architecture_t::ScaleAdd(state_gradients_backward, arr_actgradients[t-1]);
      if (t > 1) {
         const Matrix_t &prevStateActivations = arr_output[t-2];
         Matrix_t dx = arr_gradients_backward[t-1];
         // CellBackward computes the gradients using the gate values stored during the forward pass.
         CellBackward(state_gradients_backward, prevStateActivations,
                      this->GetResetGateTensorAt(t-1), this->GetUpdateGateTensorAt(t-1),
                      this->GetCandidateGateTensorAt(t-1),
                      arr_activations_backward[t-1], dx,
                      fDerivativesReset[t-1], fDerivativesUpdate[t-1],
                      fDerivativesCandidate[t-1]);
      } else {
         const Matrix_t &prevStateActivations = initState;
         Matrix_t dx = arr_gradients_backward[t-1];
         CellBackward(state_gradients_backward, prevStateActivations,
                      this->GetResetGateTensorAt(t-1), this->GetUpdateGateTensorAt(t-1),
                      this->GetCandidateGateTensorAt(t-1),
                      arr_activations_backward[t-1], dx,
                      fDerivativesReset[t-1], fDerivativesUpdate[t-1],
                      fDerivativesCandidate[t-1]);
      }
   }

   if (!dummy) {
      Architecture_t::Rearrange(gradients_backward, arr_gradients_backward);
   }
}


//______________________________________________________________________________
template <typename Architecture_t>
auto inline TBasicGRULayer<Architecture_t>::CellBackward(Matrix_t & state_gradients_backward,
                                                         const Matrix_t & precStateActivations,
                                                         const Matrix_t & reset_gate, const Matrix_t & update_gate,
                                                         const Matrix_t & candidate_gate,
                                                         const Matrix_t & input, Matrix_t & input_gradient,
                                                         Matrix_t &dr, Matrix_t &du, Matrix_t &dc)
-> Matrix_t &
{
   /*! Delegate to GRULayerBackward(), passing the gate values and activation
    *  derivatives stored during forward propagation. */
   return Architecture_t::GRULayerBackward(state_gradients_backward,
                                           fWeightsResetGradients, fWeightsUpdateGradients, fWeightsCandidateGradients,
                                           fWeightsResetStateGradients, fWeightsUpdateStateGradients,
                                           fWeightsCandidateStateGradients, fResetBiasGradients, fUpdateBiasGradients,
                                           fCandidateBiasGradients, dr, du, dc,
                                           precStateActivations,
                                           reset_gate, update_gate, candidate_gate,
                                           fWeightsResetGate, fWeightsUpdateGate, fWeightsCandidate,
                                           fWeightsResetGateState, fWeightsUpdateGateState, fWeightsCandidateState,
                                           input, input_gradient, fResetGateAfter);
}


//______________________________________________________________________________
template <typename Architecture_t>
auto TBasicGRULayer<Architecture_t>::InitState(DNN::EInitialization /* m */)
-> void
{
   DNN::initialize<Architecture_t>(this->GetState(), DNN::EInitialization::kZero);
}

//______________________________________________________________________________
template<typename Architecture_t>
auto TBasicGRULayer<Architecture_t>::Print() const
-> void
{
   std::cout << " GRU Layer: \t ";
   std::cout << " (NInput = " << this->GetInputSize();   // input size
   std::cout << ", NState = " << this->GetStateSize();   // hidden state size
   std::cout << ", NTime = " << this->GetTimeSteps() << " )";  // time size
   std::cout << "\tOutput = ( " << this->GetOutput().GetFirstSize() << " , "
             << this->GetOutput()[0].GetNrows() << " , " << this->GetOutput()[0].GetNcols() << " )\n";
}

//______________________________________________________________________________
template <typename Architecture_t>
auto TBasicGRULayer<Architecture_t>::AddWeightsXMLTo(void *parent)
-> void
{
   auto layerxml = gTools().xmlengine().NewChild(parent, nullptr, "GRULayer");

   // Write the layer meta-information: state size, input size, time steps and flags
   gTools().xmlengine().NewAttr(layerxml, nullptr, "StateSize", gTools().StringFromInt(this->GetStateSize()));
   gTools().xmlengine().NewAttr(layerxml, nullptr, "InputSize", gTools().StringFromInt(this->GetInputSize()));
   gTools().xmlengine().NewAttr(layerxml, nullptr, "TimeSteps", gTools().StringFromInt(this->GetTimeSteps()));
   gTools().xmlengine().NewAttr(layerxml, nullptr, "RememberState", gTools().StringFromInt(this->DoesRememberState()));
   gTools().xmlengine().NewAttr(layerxml, nullptr, "ReturnSequence", gTools().StringFromInt(this->DoesReturnSequence()));
   gTools().xmlengine().NewAttr(layerxml, nullptr, "ResetGateAfter", gTools().StringFromInt(this->fResetGateAfter));

   // write the weight and bias matrices
   this->WriteMatrixToXML(layerxml, "ResetWeights", this->GetWeightsAt(0));
   this->WriteMatrixToXML(layerxml, "ResetStateWeights", this->GetWeightsAt(1));
   this->WriteMatrixToXML(layerxml, "ResetBiases", this->GetBiasesAt(0));
   this->WriteMatrixToXML(layerxml, "UpdateWeights", this->GetWeightsAt(2));
   this->WriteMatrixToXML(layerxml, "UpdateStateWeights", this->GetWeightsAt(3));
   this->WriteMatrixToXML(layerxml, "UpdateBiases", this->GetBiasesAt(1));
   this->WriteMatrixToXML(layerxml, "CandidateWeights", this->GetWeightsAt(4));
   this->WriteMatrixToXML(layerxml, "CandidateStateWeights", this->GetWeightsAt(5));
   this->WriteMatrixToXML(layerxml, "CandidateBiases", this->GetBiasesAt(2));
}

//______________________________________________________________________________
template <typename Architecture_t>
auto TBasicGRULayer<Architecture_t>::ReadWeightsFromXML(void *parent)
-> void
{
   // Read the weights and biases
   this->ReadMatrixXML(parent, "ResetWeights", this->GetWeightsAt(0));
   this->ReadMatrixXML(parent, "ResetStateWeights", this->GetWeightsAt(1));
   this->ReadMatrixXML(parent, "ResetBiases", this->GetBiasesAt(0));
   this->ReadMatrixXML(parent, "UpdateWeights", this->GetWeightsAt(2));
   this->ReadMatrixXML(parent, "UpdateStateWeights", this->GetWeightsAt(3));
   this->ReadMatrixXML(parent, "UpdateBiases", this->GetBiasesAt(1));
   this->ReadMatrixXML(parent, "CandidateWeights", this->GetWeightsAt(4));
   this->ReadMatrixXML(parent, "CandidateStateWeights", this->GetWeightsAt(5));
   this->ReadMatrixXML(parent, "CandidateBiases", this->GetBiasesAt(2));
}

} // namespace RNN
} // namespace DNN
} // namespace TMVA

#endif // TMVA_DNN_GRU_LAYER