18#ifndef TMVA_DNN_ARCHITECTURES_CUDNN
19#define TMVA_DNN_ARCHITECTURES_CUDNN
21#include "RConfigure.h"
24#error This file can be compiled only when cudnn is available in ROOT
/// Empty placeholder descriptor. Used (via the `EmptyDescriptor_t` alias in
/// TCudnn) where the architecture interface requires a descriptor type but
/// there is no underlying cuDNN descriptor to manage.
struct TCudnnEmptyDescriptor {};
60template<
typename AFloat = Float_t>
67 using Scalar_t = AFloat;
68 using Matrix_t = TCudaTensor<AFloat>;
69 using Tensor_t = TCudaTensor<AFloat>;
70 using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
71 using HostBuffer_t = TCudaHostBuffer<AFloat>;
74 using ActivationDescriptor_t = cudnnActivationDescriptor_t;
75 using ConvolutionDescriptor_t = cudnnConvolutionDescriptor_t;
76 using DropoutDescriptor_t = cudnnDropoutDescriptor_t;
77 using FilterDescriptor_t = cudnnFilterDescriptor_t;
79 using PoolingDescriptor_t = cudnnPoolingDescriptor_t;
81 using AlgorithmForward_t = cudnnConvolutionFwdAlgo_t;
82 using AlgorithmBackward_t = cudnnConvolutionBwdDataAlgo_t;
83 using AlgorithmHelper_t = cudnnConvolutionBwdFilterAlgo_t;
84 using AlgorithmDataType_t = cudnnDataType_t;
85 using ReduceTensorDescriptor_t = cudnnReduceTensorDescriptor_t;
86 using TensorDescriptor_t = cudnnTensorDescriptor_t;
88 using EmptyDescriptor_t = TCudnnEmptyDescriptor;
90 using BNormLayer_t = TBatchNormLayer<TCudnn<AFloat>>;
91 using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;
93 using ConvLayer_t = CNN::TConvLayer<TCudnn<AFloat>>;
94 using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
95 using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
96 using PoolingLayer_t = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
97 using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
98 using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
107 static int ConvFwdAlgorithm;
108 static int ConvBwdDataAlgorithm;
109 static int ConvBwdFilterAlgorithm;
111 static Long_t ConvMaxWorkspaceSize;
117 static Tensor_t CreateTensor(
size_t n,
size_t c,
size_t h,
size_t w) {
118 return Tensor_t( {
n,
c,
h,w}, GetTensorLayout(), 0, 0);
121 static Tensor_t CreateTensor(DeviceBuffer_t buffer,
size_t n,
size_t c,
size_t h,
size_t w) {
122 return Tensor_t( buffer, {
n,
c,
h,w}, GetTensorLayout(), 0, 0);
127 static void CreateWeightTensors( std::vector<Matrix_t> & newWeights,
const std::vector<Matrix_t> & weights) {
128 if (!newWeights.empty()) newWeights.clear();
129 size_t n = weights.size();
130 for (
size_t i = 0; i <
n; ++i)
131 newWeights.emplace_back( weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
138 static void InitializeBNormDescriptors(TDescriptors * & descriptors,
139 BNormLayer_t *
L =
nullptr);
141 static void InitializeConvDescriptors(TDescriptors * & descriptors,
142 ConvLayer_t *
L =
nullptr);
144 static void InitializePoolDescriptors(TDescriptors * & descriptors,
145 PoolingLayer_t *
L =
nullptr);
147 static void InitializeActivationDescriptor(ActivationDescriptor_t & descriptors,
EActivationFunction activFunc,
double coef = 0.0);
149 static void ReleaseConvDescriptors(TDescriptors * descriptors );
150 static void ReleasePoolDescriptors(TDescriptors * descriptors );
151 static void ReleaseBNormDescriptors(TDescriptors * descriptors );
152 static void ReleaseDescriptor(EmptyDescriptor_t & emptyDescr) {}
153 static void ReleaseDescriptor(ActivationDescriptor_t & activationDescr);
154 static void ReleaseDescriptor(ConvolutionDescriptor_t & convolutionDescr);
155 static void ReleaseDescriptor(DropoutDescriptor_t & dropoutDescr);
156 static void ReleaseDescriptor(FilterDescriptor_t & filterDescr);
157 static void ReleaseDescriptor(PoolingDescriptor_t & poolingDescr);
158 static void ReleaseDescriptor(TensorDescriptor_t & tensorDescr);
161 static void InitializeConvWorkspace(TWorkspace * & workspace,
162 TDescriptors * & descriptors,
164 ConvLayer_t *
L =
nullptr);
165 static void InitializePoolDropoutWorkspace(TWorkspace * & workspace,
166 TDescriptors * & descriptors,
168 PoolingLayer_t *
L =
nullptr);
170 static void FreeConvWorkspace(TWorkspace * workspace, ConvLayer_t *
L =
nullptr);
171 static void FreePoolDropoutWorkspace(TWorkspace * workspace, PoolingLayer_t *
L =
nullptr);
184 static void MultiplyTranspose(Tensor_t &
output,
const Tensor_t &input,
const Matrix_t &weights);
187 static void AddRowWise(Tensor_t &
output,
const Matrix_t &biases);
202 static void Backward(Tensor_t & activationGradientsBackward,
203 Matrix_t & weightGradients,
204 Matrix_t & biasGradients,
206 const Tensor_t & activationGradients,
207 const Matrix_t & weights,
208 const Tensor_t & activationBackward);
211 static void ScaleAdd(Tensor_t &
A,
const Tensor_t &
B,
212 Scalar_t alpha = 1.0,
213 Scalar_t
beta = 1.0);
216 static void Copy(Tensor_t &
A,
const Tensor_t &
B);
219 template<
typename ATensor_t>
220 static void CopyDiffArch(Tensor_t &
A,
221 const ATensor_t &
B);
223 template <
typename ATensor_t>
224 static void CopyWeightsDiffArch(Tensor_t &
A,
const ATensor_t &
B);
227 static void CopyDiffArch(Tensor_t
A,
const Tensor_t &
B ) {
Copy(
A,
B); }
230 template<
typename AMatrix_t>
231 static void CopyDiffArch(std::vector<Tensor_t> &
A,
232 const std::vector<AMatrix_t> &
B);
247 static void Identity(Tensor_t & X) {}
248 static void IdentityDerivative(Tensor_t & dX, Tensor_t& X,
249 Tensor_t & Y, Tensor_t & dY,
250 ActivationDescriptor_t activationDescr,
251 const AFloat alpha = 1,
252 const AFloat
beta = 1) {}
255 const ActivationDescriptor_t activationDescr,
256 const double coef = 0.0,
const AFloat alpha = 1,
257 const AFloat
beta = 0);
260 static void ActivationFunctionForward(Tensor_t &Y,
const Tensor_t & X,
EActivationFunction activFunct,
261 const ActivationDescriptor_t activationDescr,
const double coef = 0.0,
262 const AFloat alpha = 1,
const AFloat
beta = 0);
265 static void ActivationFunctionBackward(Tensor_t & dX,
const Tensor_t & Y,
266 const Tensor_t & dY,
const Tensor_t & X,
268 const ActivationDescriptor_t activationDescr,
269 const AFloat alpha = 1,
270 const AFloat
beta = 0);
276 static void SymmetricReluDerivative(Tensor_t &
B,
277 const Tensor_t &
A) {}
280 static void SoftSignDerivative(Tensor_t &
B,
281 const Tensor_t &
A) {}
284 static void GaussDerivative(Tensor_t &
B,
285 const Tensor_t &
A) {}
302 static Scalar_t MeanSquaredError(
const Matrix_t &Y,
const Matrix_t &
output,
303 const Matrix_t &weights);
304 static void MeanSquaredErrorGradients(Matrix_t &dY,
const Matrix_t &Y,
305 const Matrix_t &
output,
const Matrix_t &weights);
309 static Scalar_t CrossEntropy(
const Matrix_t &Y,
const Matrix_t &
output,
310 const Matrix_t &weights);
312 static void CrossEntropyGradients(Matrix_t &dY,
const Matrix_t &Y,
313 const Matrix_t &
output,
const Matrix_t &weights);
317 static Scalar_t SoftmaxCrossEntropy(
const Matrix_t &Y,
const Matrix_t &
output,
318 const Matrix_t &weights);
319 static void SoftmaxCrossEntropyGradients(Matrix_t &dY,
const Matrix_t &Y,
320 const Matrix_t &
output,
const Matrix_t &weights);
336 static void Sigmoid(Matrix_t &YHat,
338 static void Softmax(Matrix_t &YHat,
355 static void DropoutForward(Tensor_t &
A,
356 TDescriptors * descriptors,
357 TWorkspace * workspace,
360 static void DropoutBackward(Tensor_t &
A,
361 TDescriptors * descriptors,
362 TWorkspace * workspace);
380 static void BatchNormLayerForwardTraining(
int axis,
const Tensor_t &
x, Tensor_t &
y, Matrix_t &
gamma, Matrix_t &
beta,
381 Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
382 Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
383 Scalar_t
epsilon,
const TensorDescriptor_t &bnParDescriptor);
388 static void BatchNormLayerForwardInference(
int axis,
const Tensor_t &
x, Matrix_t &
gamma, Matrix_t &
beta,
389 Tensor_t &
y,
const Matrix_t &runningMeans,
390 const Matrix_t &runningVars, Scalar_t
epsilon,
391 const TensorDescriptor_t &);
393 static void BatchNormLayerBackward(
int axis,
const Tensor_t &
x,
const Tensor_t &dy, Tensor_t &dx,
395 Matrix_t &dgamma, Matrix_t &dbeta,
const Matrix_t &mean,
const Matrix_t &variance,
396 const Matrix_t &iVariance, Scalar_t
epsilon,
const TensorDescriptor_t &);
411 static Scalar_t L1Regularization(
const Matrix_t &W)
413 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
416 static void AddL1RegularizationGradients(Matrix_t &
A,
const Matrix_t &W, Scalar_t
weightDecay)
418 TCudaMatrix<AFloat> mA(
A.GetDeviceBuffer(),
A.GetSize(), 1);
419 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
423 static Scalar_t L2Regularization(
const Matrix_t &W)
425 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
428 static void AddL2RegularizationGradients(Matrix_t &
A,
const Matrix_t &W, Scalar_t
weightDecay)
430 TCudaMatrix<AFloat> mA(
A.GetDeviceBuffer(),
A.GetSize(), 1);
431 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
448 static void InitializeGauss(Matrix_t &
A);
449 static void InitializeUniform(Matrix_t &
A);
450 static void InitializeIdentity(Matrix_t &
A);
451 static void InitializeZero(Matrix_t &
A);
452 static void InitializeGlorotNormal(Matrix_t &
A);
453 static void InitializeGlorotUniform(Matrix_t &
A);
457 static TRandom &GetRandomGenerator();
460 static void SetRandomSeed(
size_t seed);
474 static void Dropout(Tensor_t &
A, Scalar_t p) {}
488 static void AddConvBiases(Matrix_t &
output,
const Matrix_t &biases);
492 static void PrepareInternals(Tensor_t &) {}
495 static void ConvLayerForward(Tensor_t &
output,
496 Tensor_t &inputActivationFunc,
497 const Tensor_t &input,
const Matrix_t &weights,
const Matrix_t &biases,
499 Tensor_t & ,
const ConvDescriptors_t &descriptors,
500 ConvWorkspace_t &workspace);
516 static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
517 Matrix_t &biasGradients, Tensor_t &inputActivation, Tensor_t &activationGradients,
518 const Matrix_t &weights,
const Tensor_t &activationBackward,
520 const ConvDescriptors_t &descriptors, ConvWorkspace_t &workspace,
size_t ,
521 size_t ,
size_t ,
size_t ,
size_t ,
522 size_t ,
size_t ,
size_t ,
538 static void Downsample(Tensor_t &
A, Tensor_t & ,
const Tensor_t &
C,
const PoolingDescriptors_t &descriptors,
539 PoolingWorkspace_t &workspace,
size_t imgHeight,
size_t imgWidth,
size_t fltHeight,
540 size_t fltWidth,
size_t strideRows,
size_t strideCols);
550 static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward,
const Tensor_t &activationGradients,
551 const Tensor_t & ,
const Tensor_t &inputActivation,
552 const Tensor_t &outputTensor,
const PoolingDescriptors_t &descriptors,
553 PoolingWorkspace_t &workspace,
size_t imgHeight,
size_t imgWidth,
size_t fltHeight,
554 size_t fltWidth,
size_t strideRows,
size_t strideCols,
size_t nLocalViews);
571 static void Flatten(Tensor_t &
A,
const Tensor_t &
B);
575 static void Deflatten(Tensor_t &
A,
const Tensor_t &
B);
581 static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward,
582 Matrix_t & , Matrix_t &,
591 return state_gradients_backward;
610 static void Hadamard(Tensor_t &
A,
const Tensor_t &
B)
612 TCudaMatrix<AFloat> tmpA(
A.GetDeviceBuffer(), 1,
A.GetSize());
613 TCudaMatrix<AFloat> tmpB(
B.GetDeviceBuffer(), 1,
B.GetSize());
614 assert(
A.GetSize() ==
B.GetSize());
626 static Scalar_t
Sum(
const Matrix_t &
A, Scalar_t alpha = 1.0, Scalar_t
beta = 0.0);
634 static void ConstAdd(Matrix_t &
A, Scalar_t
beta) {
635 TCudaMatrix<AFloat> tmp(
A.GetDeviceBuffer(), 1,
A.GetSize());
642 static void ConstMult(Matrix_t &
A, Scalar_t
beta) {
643 TCudaMatrix<AFloat> tmp(
A.GetDeviceBuffer(), 1,
A.GetSize());
650 static void ReciprocalElementWise(Matrix_t &
A) {
651 TCudaMatrix<AFloat> tmp(
A.GetDeviceBuffer(), 1,
A.GetSize());
658 static void SquareElementWise(Matrix_t &
A) {
659 TCudaMatrix<AFloat> tmp(
A.GetDeviceBuffer(), 1,
A.GetSize());
667 static void SqrtElementWise(Matrix_t &
A) {
668 TCudaMatrix<AFloat> tmp(
A.GetDeviceBuffer(), 1,
A.GetSize());
673 static void AdamUpdate(Matrix_t &
A,
const Matrix_t & M,
const Matrix_t & V, Scalar_t alpha, Scalar_t eps) {
674 TCudaMatrix<AFloat> tmpA(
A.GetDeviceBuffer(),
A.GetSize(),1);
675 TCudaMatrix<AFloat> tmpM(M.GetDeviceBuffer(), M.GetSize(),1);
676 TCudaMatrix<AFloat> tmpV(V.GetDeviceBuffer(), V.GetSize(),1);
679 static void AdamUpdateFirstMom(Matrix_t &
A,
const Matrix_t &
B, Scalar_t
beta) {
680 TCudaMatrix<AFloat> tmpA(
A.GetDeviceBuffer(),
A.GetSize(),1);
681 TCudaMatrix<AFloat> tmpB(
B.GetDeviceBuffer(),
B.GetSize(),1);
684 static void AdamUpdateSecondMom(Matrix_t &
A,
const Matrix_t &
B, Scalar_t
beta) {
685 TCudaMatrix<AFloat> tmpA(
A.GetDeviceBuffer(),
A.GetSize(),1);
686 TCudaMatrix<AFloat> tmpB(
B.GetDeviceBuffer(),
B.GetSize(),1);
691 static void PrintTensor(
const Tensor_t &
A,
const std::string
name =
"tensor",
bool =
false);
702 static void SumRows(Matrix_t &
B,
const Matrix_t &
A);
709template <
typename AFloat>
710template <
typename ATensor>
711void TCudnn<AFloat>::CopyDiffArch(TCudaTensor<AFloat> &
B,
718 if (
B.GetLayout() == GetTensorLayout() ) {
719 assert(
B.GetShape().size() == 4);
720 for (
size_t i = 0; i <
A.GetFirstSize(); ++i) {
723 TCudaTensor<AFloat> tmpOut =
B.At(i);
725 TCudaTensor<AFloat> tmpIn(matIn.
GetMatrixArray(), tmpOut.GetShape(), tmpOut.GetLayout());
731 TCudaMatrix<AFloat> tmp2(tmp);
732 TCudaTensor<AFloat> tA(tmp2);
738template <
typename AFloat>
739template <
typename AMatrix>
740void TCudnn<AFloat>::CopyWeightsDiffArch(TCudaTensor<AFloat> &
B,
const AMatrix &
A)
746 if (
B.GetLayout() == GetTensorLayout() ) {
748 assert(
B.GetShape().size() == 4);
751 TCudaMatrix<AFloat> tmp2(tmp);
752 TCudaTensor<AFloat> tA(tmp2);
757template <
typename AFloat>
758template <
typename AMatrix_t>
759void TCudnn<AFloat>::CopyDiffArch(std::vector<Tensor_t> &
B,
760 const std::vector<AMatrix_t> &
A)
762 for (
size_t i = 0; i <
B.size(); ++i) {
763 CopyWeightsDiffArch(
B[i],
A[i]);
767template <
typename AFloat>
768void TCudnn<AFloat>::PrintTensor(
const typename TCudnn<AFloat>::Tensor_t &
A,
const std::string
name,
bool truncate )
770 std::cout <<
name <<
" size = " <<
A.GetSize() <<
" shape = { ";
771 auto shape =
A.GetShape();
772 for (
size_t k = 0; k < shape.size()-1; ++k)
773 std::cout << shape[k] <<
" , ";
774 std::cout << shape.back() <<
" } ";
775 std::cout <<
" strides = { ";
776 auto strides =
A.GetStrides();
777 for (
size_t k = 0; k < strides.size()-1; ++k)
778 std::cout << strides[k] <<
" , ";
779 std::cout << strides.back() <<
" }\n ";
781 if (
A.GetShape().size() == 2 ) {
782 for (
size_t i = 0; i <
A.GetShape()[0]; ++i) {
784 size_t n =
A.GetShape()[1];
785 if (truncate)
n = std::min(
n,
size_t(10));
786 for (
size_t j = 0; j <
n; ++j) {
787 std::cout <<
A(i,j) <<
" ";
790 if (truncate &&
n <
A.GetShape()[1]) std::cout <<
" ...... ";
791 std::cout <<
" } " << std::endl;
793 }
else if (
A.GetShape().size() == 3 ) {
794 for (
size_t i = 0; i <
A.GetFirstSize(); ++i) {
796 for (
size_t j = 0; j <
A.GetHSize(); ++j) {
798 size_t n =
A.GetWSize();
799 if (truncate)
n = std::min(
n,
size_t(10));
800 for (
size_t k = 0; k <
n; ++k) {
801 std::cout <<
A(i,j,k) <<
" ";
803 if (truncate &&
n <
A.GetWSize()) std::cout <<
" ...... ";
804 std::cout <<
" } " << std::endl;
806 std::cout <<
" } " << std::endl;
808 }
else if (
A.GetShape().size() == 4 ) {
809 for (
size_t i = 0; i <
A.GetShape()[0]; ++i) {
811 for (
size_t j = 0; j <
A.GetShape()[1]; ++j) {
813 for (
size_t k = 0; k <
A.GetShape()[2]; ++k) {
814 size_t n =
A.GetShape()[3];
815 if (truncate)
n = std::min(
n,
size_t(10));
816 for (
size_t l = 0;
l <
n; ++
l) {
817 std::cout <<
A(i,j,k,
l) <<
" ";
819 if (truncate &&
n <
A.GetShape()[3]) std::cout <<
" ...... ";
820 std::cout <<
" } " << std::endl;
822 std::cout <<
" } " << std::endl;
824 std::cout <<
" } " << std::endl;
828 for (
size_t l = 0;
l <
A.GetSize(); ++
l) {
829 std::cout <<
A.GetData()[
l] <<
" ";
847template <
typename AFloat>
848int TCudnn<AFloat>::CNNOptions::ConvFwdAlgorithm = -1;
849template <
typename AFloat>
850int TCudnn<AFloat>::CNNOptions::ConvBwdDataAlgorithm = -1;
851template <
typename AFloat>
852int TCudnn<AFloat>::CNNOptions::ConvBwdFilterAlgorithm = -1;
853template <
typename AFloat>
854Long_t TCudnn<AFloat>::CNNOptions::ConvMaxWorkspaceSize = -1;
static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps)
static void ReciprocalElementWise(Matrix_t &A)
Compute the reciprocal of each element of the matrix A and write the result into A.
static void ConstAdd(Matrix_t &A, Scalar_t beta)
Add the constant beta to all the elements of matrix A and write the result into A.
static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta)
static void SquareElementWise(Matrix_t &A)
Square each element of the matrix A and write the result into A.
static Scalar_t L2Regularization(const Matrix_t &W)
static void Hadamard(Tensor_t &A, const Tensor_t &B)
In-place Hadamard (element-wise) product of matrices A and B with the result being written into A.
static void SqrtElementWise(Matrix_t &A)
Compute the square root of each element of the matrix A and write the result into A.
static void ConstMult(Matrix_t &A, Scalar_t beta)
Multiply the constant beta to all the elements of matrix A and write the result into A.
static void Rearrange(Tensor_t &out, const Tensor_t &in)
Rearrange data according to time: fill the B x T x D tensor `out` from the T x B x D tensor `in`.
static Scalar_t L1Regularization(const Matrix_t &W)
virtual const Element * GetMatrixArray() const
TMatrixT< Element > & T()
This is the base class for the ROOT Random number generators.
double beta(double x, double y)
Calculates the beta function.
void Copy(void *source, void *dest)
T Sum(const RVec< T > &v)
Sum elements of an RVec.
static constexpr double L
struct TMVA::DNN::CNN::TConvParams TConvParams
double weightDecay(double error, ItWeight itWeight, ItWeight itWeightEnd, double factorWeightDecay, EnumRegularization eRegularization)
compute the weight decay for regularization (L1 or L2)
EActivationFunction
Enum that represents layer activation functions.
std::shared_ptr< std::function< double(double)> > Sigmoid
T Identity(T value)
Identity function f(x) = x.
MemoryLayout
Memory layout type (copy from RTensor.hxx)
create variable transformations
static void output(int code)