#ifndef TMVA_DNN_ARCHITECTURES_CUDA
#define TMVA_DNN_ARCHITECTURES_CUDA
template<typename AReal = Float_t>
static void CreateWeightTensors(std::vector<Matrix_t> &newWeights, const std::vector<Matrix_t> &weights)
{
   if (!newWeights.empty()) newWeights.clear();
   size_t n = weights.size();
   for (size_t i = 0; i < n; ++i)
      newWeights.emplace_back(weights[i].GetNrows(), weights[i].GetNcols());
}
Error("InitializeBNormDescriptors", "Batch normalization on GPU is supported only with Cudnn");
static void Backward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
                     Matrix_t &biasGradients, const Tensor_t &df,
                     const Tensor_t &activationGradients, const Matrix_t &weights,
                     const Tensor_t &activationBackward);
template<typename AMatrix_t>
static void CopyDiffArch(Matrix_t &B, const AMatrix_t &A);
template<typename ATensor_t>
static void CopyDiffArch(Tensor_t &A, const ATensor_t &B);
template<typename AMatrix_t>
static void CopyDiffArch(std::vector<Matrix_t> &A, const std::vector<AMatrix_t> &B);
static void ActivationFunctionForward(Tensor_t &X, EActivationFunction activFunct,
                                      const ActivationDescriptor_t activationDescr,
                                      const double coef = 0.0, const AFloat alpha = 1,
                                      const AFloat beta = 0);
static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride);
static void Im2col(Matrix_t &A, const Matrix_t &B, size_t imgHeight, size_t imgWidth,
                   size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols,
                   size_t zeroPaddingHeight, size_t zeroPaddingWidth);
static void Im2colIndices(std::vector<int> &V, const Matrix_t &B, size_t nLocalViews,
                          size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth,
                          size_t strideRows, size_t strideCols, size_t zeroPaddingHeight,
                          size_t zeroPaddingWidth);
static void RotateWeights(Matrix_t &A, const Matrix_t &B, size_t filterDepth,
                          size_t filterHeight, size_t filterWidth, size_t numFilters);
static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
                              Matrix_t &biasGradients, Tensor_t &df, Tensor_t &activationGradients,
                              const Matrix_t &weights, const Tensor_t &activationBackward,
                              const Tensor_t &outputTensor, EActivationFunction activFunc,
                              const ConvDescriptors_t &, ConvWorkspace_t &, size_t batchSize,
                              size_t inputHeight, size_t inputWidth, size_t depth, size_t height,
                              size_t width, size_t filterDepth, size_t filterHeight,
                              size_t filterWidth, size_t nLocalViews);
static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward,
                                             const Tensor_t &df, const Matrix_t &weights,
                                             size_t batchSize, size_t inputHeight,
                                             size_t inputWidth, size_t depth, size_t height,
                                             size_t width, size_t filterDepth,
                                             size_t filterHeight, size_t filterWidth);
static void CalculateConvWeightGradients(Matrix_t &weightGradients, const Tensor_t &df,
                                         const Tensor_t &activations_backward, size_t batchSize,
                                         size_t inputHeight, size_t inputWidth, size_t depth,
                                         size_t height, size_t width, size_t filterDepth,
                                         size_t filterHeight, size_t filterWidth,
                                         size_t nLocalViews);
static void CalculateConvBiasGradients(Matrix_t &biasGradients, const Tensor_t &df,
                                       size_t batchSize, size_t depth, size_t nLocalViews);
static void Downsample(Tensor_t &A, Tensor_t &B, const Tensor_t &C,
                       const PoolingDescriptors_t &, PoolingWorkspace_t &,
                       size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth,
                       size_t strideRows, size_t strideCols);
static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward,
                                 const Tensor_t &activationGradients, const Tensor_t &indexMatrix,
                                 const Tensor_t &, const Tensor_t &, const PoolingDescriptors_t &,
                                 PoolingWorkspace_t &, size_t imgHeight, size_t imgWidth,
                                 size_t fltHeight, size_t fltWidth, size_t strideRows,
                                 size_t strideCols, size_t nLocalViews);
Fatal("TCuda::LSTMLayerBackward", "Recurrent layers are not supported in the native Cuda architecture!");
return state_gradients_backward;
Fatal("TCuda::GRULayerBackward", "Recurrent layers are not supported in the native Cuda architecture!");
return state_gradients_backward;
template <typename AFloat>
template <typename AMatrix_t>
void TCuda<AFloat>::CopyDiffArch(TCudaMatrix<AFloat> &B, const AMatrix_t &A)
template <typename AFloat>
template <typename AMatrix_t>
void TCuda<AFloat>::CopyDiffArch(std::vector<TCudaMatrix<AFloat>> &B, const std::vector<AMatrix_t> &A)
{
   for (size_t i = 0; i < B.size(); ++i) {
      CopyDiffArch(B[i], A[i]);
   }
}
template <typename AFloat>
void TCuda<AFloat>::PrintTensor(const TCudaTensor<AFloat> &A, const std::string name, bool)
{
   std::cout << name << " size = " << A.GetSize() << " shape = { ";
   auto shape = A.GetShape();
   for (size_t k = 0; k < shape.size() - 1; ++k)
      std::cout << shape[k] << " , ";
   std::cout << shape.back() << " } ";
   std::cout << " strides = { ";
   auto strides = A.GetStrides();
   for (size_t k = 0; k < strides.size() - 1; ++k)
      std::cout << strides[k] << " , ";
   std::cout << strides.back() << " }\n";
   if (A.GetShape().size() == 2) {
      for (size_t i = 0; i < A.GetShape()[0]; ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetShape()[1]; ++j) {
            std::cout << A(i, j) << " ";
         }
         std::cout << " } " << std::endl;
      }
   } else if (A.GetShape().size() == 3) {
      for (size_t i = 0; i < A.GetFirstSize(); ++i) {
         std::cout << "{ ";
         for (size_t j = 0; j < A.GetHSize(); ++j) {
            std::cout << "{ ";
            for (size_t k = 0; k < A.GetWSize(); ++k) {
               std::cout << A(i, j, k) << " ";
            }
            std::cout << " } " << std::endl;
         }
         std::cout << " } " << std::endl;
      }
   }
}
void Error(const char *location, const char *msgfmt,...)
Use this function in case an error occurs.
void Fatal(const char *location, const char *msgfmt,...)
Use this function in case of a fatal error. It will abort the program.
Implementation of the CrossEntropy as separation criterion.
Generic Max Pooling Layer class.
Layer implementing Batch Normalization.
const Shape_t & GetShape() const
const AFloat * GetData() const
const Shape_t & GetStrides() const
size_t GetFirstSize() const
The TCuda architecture class.
static void RNNBackward(const Tensor_t &, const Matrix_t &, const Matrix_t &, const Tensor_t &, const Tensor_t &, const Matrix_t &, const Matrix_t &, const Tensor_t &, Tensor_t &, Matrix_t &, Matrix_t &, Tensor_t &, const RNNDescriptors_t &, RNNWorkspace_t &)
static void SetRandomSeed(size_t seed)
CNN::TCNNDescriptors< ConvLayer_t > ConvDescriptors_t
static void Backward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients, const Tensor_t &df, const Tensor_t &activationGradients, const Matrix_t &weights, const Tensor_t &activationBackward)
Perform the complete backward propagation step.
static void InitializeGRUWorkspace(TWorkspace *&, TDescriptors *&, GenLayer_t *)
TCudaMatrix< AFloat > Matrix_t
static Matrix_t & LSTMLayerBackward(Matrix_t &state_gradients_backward, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &)
static void FastTanh(Tensor_t &B)
static void InitializeGlorotUniform(Matrix_t &A)
Sample from a uniform distribution in the range [-lim, +lim], where lim = sqrt(6 / (N_in + N_out)).
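As an illustration of that formula, a minimal plain-C++ sketch (not the TCuda device code; the weight matrix is modeled as a flat buffer and fan-in/fan-out are passed explicitly):

#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

// Glorot/Xavier uniform: draw each weight from U(-lim, +lim),
// with lim = sqrt(6 / (N_in + N_out)).
void glorotUniform(std::vector<float> &w, size_t nIn, size_t nOut)
{
   const float lim = std::sqrt(6.0f / static_cast<float>(nIn + nOut));
   std::mt19937 gen(std::random_device{}());
   std::uniform_real_distribution<float> dist(-lim, lim);
   for (auto &x : w)
      x = dist(gen);
}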
static void SoftSignDerivative(Tensor_t &B, const Tensor_t &A)
static Scalar_t L1Regularization(const Matrix_t &W)
static void SymmetricReluDerivative(Tensor_t &B, const Tensor_t &A)
static void InitializeGlorotNormal(Matrix_t &A)
Truncated normal initialization (Glorot, also called Xavier normal). The values are sampled from a truncated normal distribution.
static void Im2colFast(Matrix_t &A, const Matrix_t &B, const std::vector< int > &V)
static Matrix_t & RecurrentLayerBackward(Matrix_t &state_gradients_backward, Matrix_t &input_weight_gradients, Matrix_t &state_weight_gradients, Matrix_t &bias_gradients, Matrix_t &df, const Matrix_t &state, const Matrix_t &weights_input, const Matrix_t &weights_state, const Matrix_t &input, Matrix_t &input_gradient)
Backward pass for Recurrent Networks.
static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients, Tensor_t &df, Tensor_t &activationGradients, const Matrix_t &weights, const Tensor_t &activationBackward, const Tensor_t &outputTensor, EActivationFunction activFunc, const ConvDescriptors_t &, ConvWorkspace_t &, size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width, size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t nLocalViews)
Perform the complete backward propagation step in a Convolutional Layer.
static void CalculateConvWeightGradients(Matrix_t &weightGradients, const Tensor_t &df, const Tensor_t &activations_backward, size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width, size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t nLocalViews)
Utility function for calculating the weight gradients of the convolutional layer.
static void IdentityDerivative(Tensor_t &B, const Tensor_t &A)
static void InitializeRNNDescriptors(TDescriptors *&, GenLayer_t *)
static Tensor_t CreateTensor(size_t b, size_t t, size_t w)
static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride)
Calculate how many neurons "fit" in the output layer, given the input as well as the layer's hyperparameters.
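The convolution arithmetic behind this helper, as a sketch (the standard formula; the actual implementation additionally rejects configurations where the division is not exact):

#include <cstddef>

// Number of output positions along one axis:
// out = (imgDim - fltDim + 2 * padding) / stride + 1
size_t outputDim(size_t imgDim, size_t fltDim, size_t padding, size_t stride)
{
   return (imgDim - fltDim + 2 * padding) / stride + 1;
}
// Example: imgDim = 32, fltDim = 5, padding = 2, stride = 1 -> 32 outputs.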
static void CopyDiffArch(Matrix_t &B, const AMatrix_t &A)
CNN::TCNNWorkspace< ConvLayer_t > ConvWorkspace_t
static void DropoutForward(Tensor_t &A, TDescriptors *descriptors, TWorkspace *workspace, Scalar_t p)
Apply dropout with activation probability p to the given tensor A and scale the result by the reciprocal of p.
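A plain-C++ sketch of the inverted-dropout idea described above (keep each unit with probability p and scale survivors by 1/p, so the expected activation is unchanged); the real version runs on the device:

#include <cstddef>
#include <random>
#include <vector>

void dropoutForward(std::vector<float> &a, float p)
{
   std::mt19937 gen(std::random_device{}());
   std::bernoulli_distribution keep(p);   // keep with probability p
   for (auto &x : a)
      x = keep(gen) ? x / p : 0.0f;       // scale kept units by 1/p
}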
static void InitializeGRUDescriptors(TDescriptors *&, GenLayer_t *)
static void ActivationFunctionForward(Tensor_t &X, EActivationFunction activFunct, const ActivationDescriptor_t activationDescr, const double coef=0.0, const AFloat alpha=1, const AFloat beta=0)
static void InitializeGRUTensors(GenLayer_t *)
static void FreeConvWorkspace(TWorkspace *&)
Only used for certain cudnn on-device memory.
static void ConvLayerForward(Tensor_t &output, Tensor_t &inputActivationFunc, const Tensor_t &input, const Matrix_t &weights, const Matrix_t &biases, const DNN::CNN::TConvParams ¶ms, EActivationFunction activFunc, Tensor_t &, const ConvDescriptors_t &, ConvWorkspace_t &)
Forward propagation in the Convolutional layer.
static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward, const Tensor_t &df, const Matrix_t &weights, size_t batchSize, size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width, size_t filterDepth, size_t filterHeight, size_t filterWidth)
Utility function for calculating the activation gradients of the layer before the convolutional layer...
static void Sigmoid(Matrix_t &YHat, const Matrix_t &)
static void InitializeZero(Tensor_t &A)
static void InitializeRNNWorkspace(TWorkspace *&, TDescriptors *&, GenLayer_t *)
static void InitializeUniform(Matrix_t &A)
static void InitializeBNormDescriptors(TDescriptors *&, BNormLayer_t *)
Initialize CNN data/operator descriptors.
static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon=0.1)
Check two matrices for equality, taking floating point arithmetic errors into account.
static void SqrtElementWise(Matrix_t &A)
Take the square root of each element of the matrix A and write the result into A.
static void SumRows(Matrix_t &B, const Matrix_t &A)
Extra functions defined only for the CPU architecture.
static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
static void Flatten(Tensor_t &A, const Tensor_t &B)
Flattens the tensor B such that each matrix is stretched into one row, resulting in the matrix A.
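The stretching can be sketched on plain buffers (illustrative only; each H x W matrix becomes one row of the output, assuming row-major storage):

#include <cstddef>
#include <vector>

// Stretch each H x W matrix of B into one row of A (B.size() x H*W).
std::vector<float> flatten(const std::vector<std::vector<float>> &B, size_t H, size_t W)
{
   std::vector<float> A;
   A.reserve(B.size() * H * W);
   for (const auto &m : B)             // one input matrix -> one output row
      for (size_t j = 0; j < H; ++j)
         for (size_t k = 0; k < W; ++k)
            A.push_back(m[j * W + k]);
   return A;
}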
static void Sigmoid(Tensor_t &B)
static void DropoutBackward(Tensor_t &, TDescriptors *, TWorkspace *)
static void InitializeLSTMTensors(GenLayer_t *)
static void FreeRNNWorkspace(TWorkspace *&)
static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t b, size_t t, size_t w)
static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients, const Tensor_t &indexMatrix, const Tensor_t &, const Tensor_t &, const PoolingDescriptors_t &, PoolingWorkspace_t &, size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols, size_t nLocalViews)
Perform the complete backward propagation step in a Pooling Layer.
static void InitializeLSTMWorkspace(TWorkspace *&, TDescriptors *&, GenLayer_t *)
static void PrintTensor(const Tensor_t &A, const std::string name="Cuda-tensor", bool=false)
static void Im2colIndices(std::vector< int > &V, const Matrix_t &B, size_t nLocalViews, size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols, size_t zeroPaddingHeight, size_t zeroPaddingWidth)
static void ActivationFunctionBackward(Tensor_t &dX, const Tensor_t &Y, const Tensor_t &dY, const Tensor_t &X, EActivationFunction activFunct, const ActivationDescriptor_t activationDescr, const AFloat alpha=1, const AFloat beta=0)
Computes the gradient of the activation function.
static void Copy(Tensor_t &A, const Tensor_t &B)
static void InitializeConvDescriptors(TDescriptors *&, ConvLayer_t *)
static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
static void InitializeGauss(Matrix_t &A)
static void InitializePoolDropoutWorkspace(TWorkspace *&, TDescriptors *&, const DNN::CNN::TConvParams &, PoolingLayer_t *)
static Scalar_t L2Regularization(const Matrix_t &W)
static void AddRowWise(Matrix_t &output, const Matrix_t &biases)
Add the vector of biases row-wise to the matrix output.
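The broadcast sketched on a plain rows x cols row-major buffer (illustrative; the TCuda version is a device kernel):

#include <cstddef>
#include <vector>

// Add the bias vector (length = cols) to every row of output (rows x cols).
void addRowWise(std::vector<float> &output, const std::vector<float> &biases,
                size_t rows, size_t cols)
{
   for (size_t i = 0; i < rows; ++i)
      for (size_t j = 0; j < cols; ++j)
         output[i * cols + j] += biases[j];
}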
static void ScaleAdd(Tensor_t &A, const Tensor_t &B, Scalar_t beta=1.0)
Above functions extended to vectors.
static TMVA::Experimental::MemoryLayout GetTensorLayout()
static void Multiply(Matrix_t &C, const Matrix_t &A, const Matrix_t &B)
Standard multiplication of two matrices A and B with the result being written into C.
static void Downsample(Tensor_t &A, Tensor_t &B, const Tensor_t &C, const PoolingDescriptors_t &, PoolingWorkspace_t &, size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols)
Downsample the matrix C to the matrix A, using the max operation, such that the winning indices are stored in matrix B.
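A reference sketch of that max-pooling step for a single 2-D image, recording the flat argmax index needed by MaxPoolLayerBackward (plain C++; the buffer layout is an assumption of the sketch):

#include <cstddef>
#include <vector>

// Max-pool C (imgHeight x imgWidth) into A; store each window's argmax in B.
void downsample(std::vector<float> &A, std::vector<int> &B, const std::vector<float> &C,
                size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth,
                size_t strideRows, size_t strideCols)
{
   size_t outH = (imgHeight - fltHeight) / strideRows + 1;
   size_t outW = (imgWidth - fltWidth) / strideCols + 1;
   A.assign(outH * outW, 0.0f);
   B.assign(outH * outW, 0);
   for (size_t i = 0; i < outH; ++i) {
      for (size_t j = 0; j < outW; ++j) {
         size_t best = (i * strideRows) * imgWidth + j * strideCols;
         for (size_t m = 0; m < fltHeight; ++m)
            for (size_t n = 0; n < fltWidth; ++n) {
               size_t idx = (i * strideRows + m) * imgWidth + (j * strideCols + n);
               if (C[idx] > C[best]) best = idx;
            }
         A[i * outW + j] = C[best];
         B[i * outW + j] = static_cast<int>(best);   // winning index for backprop
      }
   }
}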
static TRandom * fgRandomGen
CNN::TCNNDescriptors< PoolingLayer_t > PoolingDescriptors_t
static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps)
Adam updates.
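The element-wise rule this applies, sketched in plain C++ (M and V are the running first and second moments; see AdamUpdateFirstMom / AdamUpdateSecondMom for how they are maintained):

#include <cmath>
#include <cstddef>
#include <vector>

// theta <- theta - alpha * m / (sqrt(v) + eps), element-wise.
// The moments themselves follow m <- beta1*m + (1-beta1)*g and
// v <- beta2*v + (1-beta2)*g*g.
void adamUpdate(std::vector<float> &A, const std::vector<float> &M,
                const std::vector<float> &V, float alpha, float eps)
{
   for (size_t i = 0; i < A.size(); ++i)
      A[i] -= alpha * M[i] / (std::sqrt(V[i]) + eps);
}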
static void InitializeIdentity(Matrix_t &A)
static void RNNForward(const Tensor_t &, const Matrix_t &, const Matrix_t &, const Tensor_t &, Tensor_t &, Matrix_t &, Matrix_t &, const RNNDescriptors_t &, RNNWorkspace_t &, bool)
static void InitializePoolDescriptors(TDescriptors *&, PoolingLayer_t *)
static void Hadamard(Matrix_t &A, const Matrix_t &B)
static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w)
TCudaTensor< AFloat > Tensor_t
CNN::TCNNWorkspace< PoolingLayer_t > PoolingWorkspace_t
static void SumColumns(Matrix_t &B, const Matrix_t &A, Scalar_t alpha=1.0, Scalar_t beta=0.)
Sum columns of the (m x n) matrix A and write the results into the first m elements of B.
static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void RotateWeights(Matrix_t &A, const Matrix_t &B, size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t numFilters)
Rotates the matrix B, which represents the weights, and stores the result in the matrix A.
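The rotation is the 180-degree filter flip that lets the backward pass of a convolution be written as another convolution; a sketch under an assumed flat, filter-major layout (illustrative only):

#include <cstddef>
#include <vector>

// Rotate each filter of B by 180 degrees (flip rows and columns) and store in A.
void rotateWeights(std::vector<float> &A, const std::vector<float> &B,
                   size_t filterDepth, size_t filterHeight, size_t filterWidth,
                   size_t numFilters)
{
   size_t fltSize = filterHeight * filterWidth;
   A.resize(B.size());
   for (size_t f = 0; f < numFilters; ++f)
      for (size_t d = 0; d < filterDepth; ++d)
         for (size_t k = 0; k < fltSize; ++k) {
            size_t base = (f * filterDepth + d) * fltSize;
            A[base + k] = B[base + (fltSize - 1 - k)];   // 180-degree flip
         }
}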
static void ReluDerivative(Tensor_t &B, const Tensor_t &A)
static void Hadamard(Tensor_t &A, const Tensor_t &B)
In-place Hadamard (element-wise) product of matrices A and B with the result being written into A.
static void CreateWeightTensors(std::vector< Matrix_t > &newWeights, const std::vector< Matrix_t > &weights)
static void Im2col(Matrix_t &A, const Matrix_t &B, size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols, size_t zeroPaddingHeight, size_t zeroPaddingWidth)
Transform the matrix B into local view (im2col) format, suitable for convolution, and store it in matrix A.
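A single-channel reference sketch of the transformation (row v of A holds the receptive field of output position v; zero-padded positions stay 0). The actual kernel handles multiple channels and the device layout; convolution then reduces to one matrix multiplication against the weights:

#include <cstddef>
#include <vector>

void im2col(std::vector<float> &A, const std::vector<float> &B,
            size_t imgHeight, size_t imgWidth, size_t fltHeight, size_t fltWidth,
            size_t strideRows, size_t strideCols, size_t padH, size_t padW)
{
   size_t outH = (imgHeight - fltHeight + 2 * padH) / strideRows + 1;
   size_t outW = (imgWidth - fltWidth + 2 * padW) / strideCols + 1;
   A.assign(outH * outW * fltHeight * fltWidth, 0.0f);
   for (size_t v = 0; v < outH * outW; ++v) {
      long r0 = static_cast<long>((v / outW) * strideRows) - static_cast<long>(padH);
      long c0 = static_cast<long>((v % outW) * strideCols) - static_cast<long>(padW);
      for (size_t m = 0; m < fltHeight; ++m)
         for (size_t n = 0; n < fltWidth; ++n) {
            long r = r0 + static_cast<long>(m);
            long c = c0 + static_cast<long>(n);
            if (r >= 0 && c >= 0 && r < static_cast<long>(imgHeight) &&
                c < static_cast<long>(imgWidth))
               A[v * fltHeight * fltWidth + m * fltWidth + n] = B[r * imgWidth + c];
         }
   }
}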
static void Softmax(Matrix_t &YHat, const Matrix_t &)
static void InitializeActivationDescriptor(ActivationDescriptor_t &, EActivationFunction, double=0.0)
static void InitializeConvWorkspace(TWorkspace *&, TDescriptors *&, const DNN::CNN::TConvParams &, ConvLayer_t *)
static void InitializeLSTMDescriptors(TDescriptors *&, GenLayer_t *)
static void ReleaseBNormDescriptors(TDescriptors *&)
static void GaussDerivative(Tensor_t &B, const Tensor_t &A)
static void Relu(Tensor_t &B)
static void CalculateConvBiasGradients(Matrix_t &biasGradients, const Tensor_t &df, size_t batchSize, size_t depth, size_t nLocalViews)
Utility function for calculating the bias gradients of the convolutional layer.
static void PrepareInternals(Tensor_t &)
Dummy placeholder - preparation is currently only required for the CUDA architecture.
static void AddRowWise(Tensor_t &output, const Matrix_t &biases)
static void ReciprocalElementWise(Matrix_t &A)
Take the reciprocal of each element of the matrix A and write the result into A.
static void SquareElementWise(Matrix_t &A)
Square each element of the matrix A and write the result into A.
static void Deflatten(Tensor_t &A, const Tensor_t &B)
Transforms each row of B to a matrix and stores it in the tensor A.
static void InitializeRNNTensors(GenLayer_t *)
TCudaDeviceBuffer< AFloat > DeviceBuffer_t
static Scalar_t Sum(const Matrix_t &A)
Compute the sum of all elements in A.
static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights)
static void MultiplyTranspose(Matrix_t &output, const Matrix_t &input, const Matrix_t &weights)
Matrix-multiply input with the transpose of weights and write the results into output.
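On plain row-major buffers the operation reads as follows (a sketch of the fully-connected forward step output = input * W^T; the real version runs on the GPU):

#include <cstddef>
#include <vector>

// output(b, o) = sum_i input(b, i) * weights(o, i), i.e. output = input * W^T.
void multiplyTranspose(std::vector<float> &output, const std::vector<float> &input,
                       const std::vector<float> &weights,
                       size_t batch, size_t nIn, size_t nOut)
{
   output.assign(batch * nOut, 0.0f);
   for (size_t b = 0; b < batch; ++b)
      for (size_t o = 0; o < nOut; ++o)
         for (size_t i = 0; i < nIn; ++i)
            output[b * nOut + o] += input[b * nIn + i] * weights[o * nIn + i];
}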
static void SymmetricRelu(Tensor_t &B)
DummyCudaDataType TensorDescriptor_t
static void ReleaseConvDescriptors(TDescriptors *&)
Release CNN data/operator descriptors.
static void FreePoolDropoutWorkspace(TWorkspace *&)
static TRandom & GetRandomGenerator()
static void FastTanhDerivative(Tensor_t &B, const Tensor_t &A)
static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta, Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans, Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum, Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor)
The inputs from each batch are normalized during training to have zero mean and unit variance, and they are then scaled by the two learned parameters gamma and beta.
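A plain-C++ sketch of that training step for a batch x nFeat input (axis 0); the momentum convention used for the running statistics is an assumption of the sketch:

#include <cmath>
#include <cstddef>
#include <vector>

// y = gamma * (x - mean) / sqrt(var + eps) + beta, per feature column,
// with running statistics updated by an exponential moving average.
void batchNormTrain(std::vector<float> &y, const std::vector<float> &x,
                    const std::vector<float> &gamma, const std::vector<float> &beta,
                    std::vector<float> &runMean, std::vector<float> &runVar,
                    size_t batch, size_t nFeat, float momentum, float eps)
{
   for (size_t j = 0; j < nFeat; ++j) {
      float mean = 0, var = 0;
      for (size_t i = 0; i < batch; ++i) mean += x[i * nFeat + j];
      mean /= batch;
      for (size_t i = 0; i < batch; ++i) {
         float d = x[i * nFeat + j] - mean;
         var += d * d;
      }
      var /= batch;
      runMean[j] = momentum * runMean[j] + (1 - momentum) * mean;
      runVar[j]  = momentum * runVar[j]  + (1 - momentum) * var;
      float iStd = 1.0f / std::sqrt(var + eps);
      for (size_t i = 0; i < batch; ++i)
         y[i * nFeat + j] = gamma[j] * (x[i * nFeat + j] - mean) * iStd + beta[j];
   }
}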
static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx, Matrix_t &gamma, Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance, const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &)
static void Copy(Matrix_t &B, const Matrix_t &A)
static void SigmoidDerivative(Tensor_t &B, const Tensor_t &A)
static void ReleaseDescriptor(ActivationDescriptor_t &)
static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta, Tensor_t &y, const Matrix_t &runningMeans, const Matrix_t &runningVars, Scalar_t epsilon, const TensorDescriptor_t &)
During inference the inputs are not normalized using the batch mean, but with the running means and variances computed during training.
static void CopyDiffArch(std::vector< Matrix_t > &A, const std::vector< AMatrix_t > &B)
static void Rearrange(Tensor_t &out, const Tensor_t &in)
Rearrange data according to time: fill the B x T x D tensor out with the T x B x D tensor in.
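The permutation sketched on flat buffers (out is B x T x D, in is T x B x D; a plain-C++ illustration):

#include <cstddef>
#include <vector>

// out[b][t][d] = in[t][b][d]
void rearrange(std::vector<float> &out, const std::vector<float> &in,
               size_t B, size_t T, size_t D)
{
   out.resize(B * T * D);
   for (size_t t = 0; t < T; ++t)
      for (size_t b = 0; b < B; ++b)
         for (size_t d = 0; d < D; ++d)
            out[(b * T + t) * D + d] = in[(t * B + b) * D + d];
}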
static void ReleaseRNNDescriptors(TDescriptors *&)
static Matrix_t & GRULayerBackward(Matrix_t &state_gradients_backward, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, const Matrix_t &, Matrix_t &, bool)
Backward pass for GRU Network.
static void Reshape(Matrix_t &A, const Matrix_t &B)
Transform the matrix B into a matrix A with different dimensions.
static void ConstMult(Matrix_t &A, Scalar_t beta)
Multiply all the elements of matrix A by the constant beta and write the result into A.
static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void ConstAdd(Matrix_t &A, Scalar_t beta)
Add the constant beta to all the elements of matrix A and write the result into A.
static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
A softmax transformation is implicitly applied, thus output should hold the linear activations of the last layer.
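A sketch of the measure on plain buffers, with the usual max-shift for numerical stability (Y holds one-hot truth values, output the linear activations; equal event weights are assumed in the sketch):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Cross entropy with an implicit softmax over each row of linear activations.
float softmaxCrossEntropy(const std::vector<float> &Y, const std::vector<float> &output,
                          size_t nEvents, size_t nClasses)
{
   float loss = 0;
   for (size_t i = 0; i < nEvents; ++i) {
      float mx = output[i * nClasses];
      for (size_t j = 1; j < nClasses; ++j) mx = std::max(mx, output[i * nClasses + j]);
      float sum = 0;
      for (size_t j = 0; j < nClasses; ++j) sum += std::exp(output[i * nClasses + j] - mx);
      for (size_t j = 0; j < nClasses; ++j)   // -sum_j y_j * log softmax_j
         loss -= Y[i * nClasses + j] * (output[i * nClasses + j] - mx - std::log(sum));
   }
   return loss / nEvents;
}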
static void InitializeZero(Matrix_t &A)
static void AddConvBiases(Matrix_t &output, const Matrix_t &biases)
Add the biases in the Convolutional Layer.
static void DropoutForward(Matrix_t &A, Scalar_t p)
static void TransposeMultiply(Matrix_t &output, const Matrix_t &input, const Matrix_t &Weights, Scalar_t alpha=1.0, Scalar_t beta=0.)
Matrix multiplication of input with Weights^T (transposed), with the result being written into output.
static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
static void ScaleAdd(Matrix_t &A, const Matrix_t &B, Scalar_t beta=1.0)
Adds the elements of matrix B, scaled by beta, to the elements of matrix A.
static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y, const Matrix_t &output, const Matrix_t &weights)
static void TanhDerivative(Tensor_t &B, const Tensor_t &A)
static void CopyDiffArch(Tensor_t &A, const ATensor_t &B)
static void ReleasePoolDescriptors(TDescriptors *&)
static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w)
Generic General Layer class.
This is the base class for the ROOT Random number generators.
std::shared_ptr< std::function< double(double)> > Tanh
double weightDecay(double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
compute the weight decay for regularization (L1 or L2)
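The two penalties sketched in plain C++ (matching L1Regularization and L2Regularization above; conventions may differ by a constant factor):

#include <cmath>
#include <vector>

// L1 penalty: sum_i |w_i|
float l1Regularization(const std::vector<float> &w)
{
   float s = 0;
   for (float x : w) s += std::fabs(x);
   return s;
}

// L2 penalty: sum_i w_i^2
float l2Regularization(const std::vector<float> &w)
{
   float s = 0;
   for (float x : w) s += x * x;
   return s;
}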
EActivationFunction
Enum that represents layer activation functions.
std::shared_ptr< std::function< double(double)> > Gauss
std::shared_ptr< std::function< double(double)> > SoftSign
MemoryLayout
Memory layout type (copy from RTensor.hxx)